| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2018-2023 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <djwong@kernel.org> |
| */ |
| #include "xfs.h" |
| #include "xfs_fs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_defer.h" |
| #include "xfs_btree.h" |
| #include "xfs_bit.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans.h" |
| #include "xfs_sb.h" |
| #include "xfs_inode.h" |
| #include "xfs_icache.h" |
| #include "xfs_inode_buf.h" |
| #include "xfs_inode_fork.h" |
| #include "xfs_ialloc.h" |
| #include "xfs_da_format.h" |
| #include "xfs_reflink.h" |
| #include "xfs_alloc.h" |
| #include "xfs_rmap.h" |
| #include "xfs_rmap_btree.h" |
| #include "xfs_bmap.h" |
| #include "xfs_bmap_btree.h" |
| #include "xfs_bmap_util.h" |
| #include "xfs_dir2.h" |
| #include "xfs_dir2_priv.h" |
| #include "xfs_quota_defs.h" |
| #include "xfs_quota.h" |
| #include "xfs_ag.h" |
| #include "xfs_rtbitmap.h" |
| #include "xfs_attr_leaf.h" |
| #include "xfs_log_priv.h" |
| #include "xfs_health.h" |
| #include "xfs_symlink_remote.h" |
| #include "scrub/xfs_scrub.h" |
| #include "scrub/scrub.h" |
| #include "scrub/common.h" |
| #include "scrub/btree.h" |
| #include "scrub/trace.h" |
| #include "scrub/repair.h" |
| #include "scrub/iscan.h" |
| #include "scrub/readdir.h" |
| #include "scrub/tempfile.h" |
| |
| /* |
| * Inode Record Repair |
| * =================== |
| * |
| * Roughly speaking, inode problems can be classified based on whether or not |
| * they trip the dinode verifiers. If those trip, then we won't be able to |
| * xfs_iget ourselves the inode. |
| * |
| * Therefore, the xrep_dinode_* functions fix anything that will cause the |
| * inode buffer verifier or the dinode verifier to fail. The xrep_inode_* |
| * functions fix things on live incore inodes. The inode repair functions make |
| * decisions with security and usability implications when reviving a file: |
| * |
| * - Files with zero di_mode or a garbage di_mode are converted to a regular |
| * file that only root can read. This file may not actually contain user |
| * data, if the file was not previously a regular file. Setuid and setgid |
| * bits are cleared. |
| * |
| * - Zero-size directories can be truncated to look empty. It is necessary to |
| * run the bmapbtd and directory repair functions to fully rebuild the |
| * directory. |
| * |
| * - Zero-size symbolic link targets can be truncated to '?'. It is necessary |
| * to run the bmapbtd and symlink repair functions to salvage the symlink. |
| * |
| * - Invalid extent size hints will be removed. |
| * |
| * - Quotacheck will be scheduled if we repaired an inode that was so badly |
| * damaged that the ondisk inode had to be rebuilt. |
| * |
| * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. |
| * Setuid and setgid bits are cleared. |
| * |
| * - Data and attr forks are reset to extents format with zero extents if the |
| * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta |
| * repair functions to recover the space mapping. |
| * |
| * - ACLs will not be recovered if the attr fork is zapped or the extended |
| * attribute structure itself requires salvaging. |
| * |
| * - If the attr fork is zapped, the user and group ids are reset to root and |
| * the setuid and setgid bits are removed. |
| */ |
| |
| /* |
| * All the information we need to repair the ondisk inode if we can't iget the |
| * incore inode. We don't allocate this buffer unless we're going to perform |
| * a repair to the ondisk inode cluster buffer. |
| */ |
| struct xrep_inode { |
| /* Inode mapping that we saved from the initial lookup attempt. */ |
| struct xfs_imap imap; |
| |
| struct xfs_scrub *sc; |
| |
| /* Blocks in use on the data device by data extents or bmbt blocks. */ |
| xfs_rfsblock_t data_blocks; |
| |
| /* Blocks in use on the rt device. */ |
| xfs_rfsblock_t rt_blocks; |
| |
| /* Blocks in use by the attr fork. */ |
| xfs_rfsblock_t attr_blocks; |
| |
| /* Number of data device extents for the data fork. */ |
| xfs_extnum_t data_extents; |
| |
| /* |
| * Number of realtime device extents for the data fork. If |
| * data_extents and rt_extents indicate that the data fork has extents |
| * on both devices, we'll just back away slowly. |
| */ |
| xfs_extnum_t rt_extents; |
| |
| /* Number of (data device) extents for the attr fork. */ |
| xfs_aextnum_t attr_extents; |
| |
| /* Sick state to set after zapping parts of the inode. */ |
| unsigned int ino_sick_mask; |
| |
| /* Must we remove all access from this file? */ |
| bool zap_acls; |
| |
| /* Inode scanner to see if we can find the ftype from dirents */ |
| struct xchk_iscan ftype_iscan; |
| uint8_t alleged_ftype; |
| }; |
| |
| /* |
| * Setup function for inode repair. @imap contains the ondisk inode mapping |
| * information so that we can correct the ondisk inode cluster buffer if |
| * necessary to make iget work. |
| */ |
| int |
| xrep_setup_inode( |
| struct xfs_scrub *sc, |
| const struct xfs_imap *imap) |
| { |
| struct xrep_inode *ri; |
| |
| sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); |
| if (!sc->buf) |
| return -ENOMEM; |
| |
| ri = sc->buf; |
| memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); |
| ri->sc = sc; |
| return 0; |
| } |
| |
| /* |
| * Make sure this ondisk inode can pass the inode buffer verifier. This is |
| * not the same as the dinode verifier. |
| */ |
| STATIC void |
| xrep_dinode_buf_core( |
| struct xfs_scrub *sc, |
| struct xfs_buf *bp, |
| unsigned int ioffset) |
| { |
| struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); |
| struct xfs_trans *tp = sc->tp; |
| struct xfs_mount *mp = sc->mp; |
| xfs_agino_t agino; |
| bool crc_ok = false; |
| bool magic_ok = false; |
| bool unlinked_ok = false; |
| |
| agino = be32_to_cpu(dip->di_next_unlinked); |
| |
| if (xfs_verify_agino_or_null(bp->b_pag, agino)) |
| unlinked_ok = true; |
| |
| if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && |
| xfs_dinode_good_version(mp, dip->di_version)) |
| magic_ok = true; |
| |
| if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, |
| XFS_DINODE_CRC_OFF)) |
| crc_ok = true; |
| |
| if (magic_ok && unlinked_ok && crc_ok) |
| return; |
| |
| if (!magic_ok) { |
| dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); |
| dip->di_version = 3; |
| } |
| if (!unlinked_ok) |
| dip->di_next_unlinked = cpu_to_be32(NULLAGINO); |
| xfs_dinode_calc_crc(mp, dip); |
| xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); |
| xfs_trans_log_buf(tp, bp, ioffset, |
| ioffset + sizeof(struct xfs_dinode) - 1); |
| } |
| |
| /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ |
| STATIC void |
| xrep_dinode_buf( |
| struct xfs_scrub *sc, |
| struct xfs_buf *bp) |
| { |
| struct xfs_mount *mp = sc->mp; |
| int i; |
| int ni; |
| |
| ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; |
| for (i = 0; i < ni; i++) |
| xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); |
| } |
| |
| /* Reinitialize things that never change in an inode. */ |
| STATIC void |
| xrep_dinode_header( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip) |
| { |
| trace_xrep_dinode_header(sc, dip); |
| |
| dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); |
| if (!xfs_dinode_good_version(sc->mp, dip->di_version)) |
| dip->di_version = 3; |
| dip->di_ino = cpu_to_be64(sc->sm->sm_ino); |
| uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); |
| dip->di_gen = cpu_to_be32(sc->sm->sm_gen); |
| } |
| |
| /* |
| * If this directory entry points to the scrub target inode, then the directory |
| * we're scanning is the parent of the scrub target inode. |
| */ |
| STATIC int |
| xrep_dinode_findmode_dirent( |
| struct xfs_scrub *sc, |
| struct xfs_inode *dp, |
| xfs_dir2_dataptr_t dapos, |
| const struct xfs_name *name, |
| xfs_ino_t ino, |
| void *priv) |
| { |
| struct xrep_inode *ri = priv; |
| int error = 0; |
| |
| if (xchk_should_terminate(ri->sc, &error)) |
| return error; |
| |
| if (ino != sc->sm->sm_ino) |
| return 0; |
| |
| /* Ignore garbage directory entry names. */ |
| if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) |
| return -EFSCORRUPTED; |
| |
| /* Don't pick up dot or dotdot entries; we only want child dirents. */ |
| if (xfs_dir2_samename(name, &xfs_name_dotdot) || |
| xfs_dir2_samename(name, &xfs_name_dot)) |
| return 0; |
| |
| /* |
| * Uhoh, more than one parent for this inode and they don't agree on |
| * the file type? |
| */ |
| if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && |
| ri->alleged_ftype != name->type) { |
| trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, |
| ri->alleged_ftype); |
| return -EFSCORRUPTED; |
| } |
| |
| /* We found a potential parent; remember the ftype. */ |
| trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); |
| ri->alleged_ftype = name->type; |
| return 0; |
| } |
| |
| /* Try to lock a directory, or wait a jiffy. */ |
| static inline int |
| xrep_dinode_ilock_nowait( |
| struct xfs_inode *dp, |
| unsigned int lock_mode) |
| { |
| if (xfs_ilock_nowait(dp, lock_mode)) |
| return true; |
| |
| schedule_timeout_killable(1); |
| return false; |
| } |
| |
| /* |
| * Try to lock a directory to look for ftype hints. Since we already hold the |
| * AGI buffer, we cannot block waiting for the ILOCK because rename can take |
| * the ILOCK and then try to lock AGIs. |
| */ |
| STATIC int |
| xrep_dinode_trylock_directory( |
| struct xrep_inode *ri, |
| struct xfs_inode *dp, |
| unsigned int *lock_modep) |
| { |
| unsigned long deadline = jiffies + msecs_to_jiffies(30000); |
| unsigned int lock_mode; |
| int error = 0; |
| |
| do { |
| if (xchk_should_terminate(ri->sc, &error)) |
| return error; |
| |
| if (xfs_need_iread_extents(&dp->i_df)) |
| lock_mode = XFS_ILOCK_EXCL; |
| else |
| lock_mode = XFS_ILOCK_SHARED; |
| |
| if (xrep_dinode_ilock_nowait(dp, lock_mode)) { |
| *lock_modep = lock_mode; |
| return 0; |
| } |
| } while (!time_is_before_jiffies(deadline)); |
| return -EBUSY; |
| } |
| |
| /* |
| * If this is a directory, walk the dirents looking for any that point to the |
| * scrub target inode. |
| */ |
| STATIC int |
| xrep_dinode_findmode_walk_directory( |
| struct xrep_inode *ri, |
| struct xfs_inode *dp) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| unsigned int lock_mode; |
| int error = 0; |
| |
| /* Ignore temporary repair directories. */ |
| if (xrep_is_tempfile(dp)) |
| return 0; |
| |
| /* |
| * Scan the directory to see if there it contains an entry pointing to |
| * the directory that we are repairing. |
| */ |
| error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); |
| if (error) |
| return error; |
| |
| /* |
| * If this directory is known to be sick, we cannot scan it reliably |
| * and must abort. |
| */ |
| if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | |
| XFS_SICK_INO_BMBTD | |
| XFS_SICK_INO_DIR)) { |
| error = -EFSCORRUPTED; |
| goto out_unlock; |
| } |
| |
| /* |
| * We cannot complete our parent pointer scan if a directory looks as |
| * though it has been zapped by the inode record repair code. |
| */ |
| if (xchk_dir_looks_zapped(dp)) { |
| error = -EBUSY; |
| goto out_unlock; |
| } |
| |
| error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); |
| if (error) |
| goto out_unlock; |
| |
| out_unlock: |
| xfs_iunlock(dp, lock_mode); |
| return error; |
| } |
| |
| /* |
| * Try to find the mode of the inode being repaired by looking for directories |
| * that point down to this file. |
| */ |
| STATIC int |
| xrep_dinode_find_mode( |
| struct xrep_inode *ri, |
| uint16_t *mode) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| struct xfs_inode *dp; |
| int error; |
| |
| /* No ftype means we have no other metadata to consult. */ |
| if (!xfs_has_ftype(sc->mp)) { |
| *mode = S_IFREG; |
| return 0; |
| } |
| |
| /* |
| * Scan all directories for parents that might point down to this |
| * inode. Skip the inode being repaired during the scan since it |
| * cannot be its own parent. Note that we still hold the AGI locked |
| * so there's a real possibility that _iscan_iter can return EBUSY. |
| */ |
| xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); |
| xchk_iscan_set_agi_trylock(&ri->ftype_iscan); |
| ri->ftype_iscan.skip_ino = sc->sm->sm_ino; |
| ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; |
| while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { |
| if (S_ISDIR(VFS_I(dp)->i_mode)) |
| error = xrep_dinode_findmode_walk_directory(ri, dp); |
| xchk_iscan_mark_visited(&ri->ftype_iscan, dp); |
| xchk_irele(sc, dp); |
| if (error < 0) |
| break; |
| if (xchk_should_terminate(sc, &error)) |
| break; |
| } |
| xchk_iscan_iter_finish(&ri->ftype_iscan); |
| xchk_iscan_teardown(&ri->ftype_iscan); |
| |
| if (error == -EBUSY) { |
| if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { |
| /* |
| * If we got an EBUSY after finding at least one |
| * dirent, that means the scan found an inode on the |
| * inactivation list and could not open it. Accept the |
| * alleged ftype and install a new mode below. |
| */ |
| error = 0; |
| } else if (!(sc->flags & XCHK_TRY_HARDER)) { |
| /* |
| * Otherwise, retry the operation one time to see if |
| * the reason for the delay is an inode from the same |
| * cluster buffer waiting on the inactivation list. |
| */ |
| error = -EDEADLOCK; |
| } |
| } |
| if (error) |
| return error; |
| |
| /* |
| * Convert the discovered ftype into the file mode. If all else fails, |
| * return S_IFREG. |
| */ |
| switch (ri->alleged_ftype) { |
| case XFS_DIR3_FT_DIR: |
| *mode = S_IFDIR; |
| break; |
| case XFS_DIR3_FT_WHT: |
| case XFS_DIR3_FT_CHRDEV: |
| *mode = S_IFCHR; |
| break; |
| case XFS_DIR3_FT_BLKDEV: |
| *mode = S_IFBLK; |
| break; |
| case XFS_DIR3_FT_FIFO: |
| *mode = S_IFIFO; |
| break; |
| case XFS_DIR3_FT_SOCK: |
| *mode = S_IFSOCK; |
| break; |
| case XFS_DIR3_FT_SYMLINK: |
| *mode = S_IFLNK; |
| break; |
| default: |
| *mode = S_IFREG; |
| break; |
| } |
| return 0; |
| } |
| |
| /* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ |
| STATIC int |
| xrep_dinode_mode( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| uint16_t mode = be16_to_cpu(dip->di_mode); |
| int error; |
| |
| trace_xrep_dinode_mode(sc, dip); |
| |
| if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) |
| return 0; |
| |
| /* Try to fix the mode. If we cannot, then leave everything alone. */ |
| error = xrep_dinode_find_mode(ri, &mode); |
| switch (error) { |
| case -EINTR: |
| case -EBUSY: |
| case -EDEADLOCK: |
| /* temporary failure or fatal signal */ |
| return error; |
| case 0: |
| /* found mode */ |
| break; |
| default: |
| /* some other error, assume S_IFREG */ |
| mode = S_IFREG; |
| break; |
| } |
| |
| /* bad mode, so we set it to a file that only root can read */ |
| dip->di_mode = cpu_to_be16(mode); |
| dip->di_uid = 0; |
| dip->di_gid = 0; |
| ri->zap_acls = true; |
| return 0; |
| } |
| |
| /* Fix unused link count fields having nonzero values. */ |
| STATIC void |
| xrep_dinode_nlinks( |
| struct xfs_dinode *dip) |
| { |
| if (dip->di_version > 1) |
| dip->di_onlink = 0; |
| else |
| dip->di_nlink = 0; |
| } |
| |
| /* Fix any conflicting flags that the verifiers complain about. */ |
| STATIC void |
| xrep_dinode_flags( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip, |
| bool isrt) |
| { |
| struct xfs_mount *mp = sc->mp; |
| uint64_t flags2 = be64_to_cpu(dip->di_flags2); |
| uint16_t flags = be16_to_cpu(dip->di_flags); |
| uint16_t mode = be16_to_cpu(dip->di_mode); |
| |
| trace_xrep_dinode_flags(sc, dip); |
| |
| if (isrt) |
| flags |= XFS_DIFLAG_REALTIME; |
| else |
| flags &= ~XFS_DIFLAG_REALTIME; |
| |
| /* |
| * For regular files on a reflink filesystem, set the REFLINK flag to |
| * protect shared extents. A later stage will actually check those |
| * extents and clear the flag if possible. |
| */ |
| if (xfs_has_reflink(mp) && S_ISREG(mode)) |
| flags2 |= XFS_DIFLAG2_REFLINK; |
| else |
| flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); |
| if (flags & XFS_DIFLAG_REALTIME) |
| flags2 &= ~XFS_DIFLAG2_REFLINK; |
| if (!xfs_has_bigtime(mp)) |
| flags2 &= ~XFS_DIFLAG2_BIGTIME; |
| if (!xfs_has_large_extent_counts(mp)) |
| flags2 &= ~XFS_DIFLAG2_NREXT64; |
| if (flags2 & XFS_DIFLAG2_NREXT64) |
| dip->di_nrext64_pad = 0; |
| else if (dip->di_version >= 3) |
| dip->di_v3_pad = 0; |
| dip->di_flags = cpu_to_be16(flags); |
| dip->di_flags2 = cpu_to_be64(flags2); |
| } |
| |
| /* |
| * Blow out symlink; now it points nowhere. We don't have to worry about |
| * incore state because this inode is failing the verifiers. |
| */ |
| STATIC void |
| xrep_dinode_zap_symlink( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| char *p; |
| |
| trace_xrep_dinode_zap_symlink(sc, dip); |
| |
| dip->di_format = XFS_DINODE_FMT_LOCAL; |
| dip->di_size = cpu_to_be64(1); |
| p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); |
| *p = '?'; |
| ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; |
| } |
| |
| /* |
| * Blow out dir, make the parent point to the root. In the future repair will |
| * reconstruct this directory for us. Note that there's no in-core directory |
| * inode because the sf verifier tripped, so we don't have to worry about the |
| * dentry cache. |
| */ |
| STATIC void |
| xrep_dinode_zap_dir( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| struct xfs_mount *mp = sc->mp; |
| struct xfs_dir2_sf_hdr *sfp; |
| int i8count; |
| |
| trace_xrep_dinode_zap_dir(sc, dip); |
| |
| dip->di_format = XFS_DINODE_FMT_LOCAL; |
| i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; |
| sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); |
| sfp->count = 0; |
| sfp->i8count = i8count; |
| xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); |
| dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); |
| ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; |
| } |
| |
| /* Make sure we don't have a garbage file size. */ |
| STATIC void |
| xrep_dinode_size( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| uint64_t size = be64_to_cpu(dip->di_size); |
| uint16_t mode = be16_to_cpu(dip->di_mode); |
| |
| trace_xrep_dinode_size(sc, dip); |
| |
| switch (mode & S_IFMT) { |
| case S_IFIFO: |
| case S_IFCHR: |
| case S_IFBLK: |
| case S_IFSOCK: |
| /* di_size can't be nonzero for special files */ |
| dip->di_size = 0; |
| break; |
| case S_IFREG: |
| /* Regular files can't be larger than 2^63-1 bytes. */ |
| dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); |
| break; |
| case S_IFLNK: |
| /* |
| * Truncate ridiculously oversized symlinks. If the size is |
| * zero, reset it to point to the current directory. Both of |
| * these conditions trigger dinode verifier errors, so there |
| * is no in-core state to reset. |
| */ |
| if (size > XFS_SYMLINK_MAXLEN) |
| dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); |
| else if (size == 0) |
| xrep_dinode_zap_symlink(ri, dip); |
| break; |
| case S_IFDIR: |
| /* |
| * Directories can't have a size larger than 32G. If the size |
| * is zero, reset it to an empty directory. Both of these |
| * conditions trigger dinode verifier errors, so there is no |
| * in-core state to reset. |
| */ |
| if (size > XFS_DIR2_SPACE_SIZE) |
| dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); |
| else if (size == 0) |
| xrep_dinode_zap_dir(ri, dip); |
| break; |
| } |
| } |
| |
| /* Fix extent size hints. */ |
| STATIC void |
| xrep_dinode_extsize_hints( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_mount *mp = sc->mp; |
| uint64_t flags2 = be64_to_cpu(dip->di_flags2); |
| uint16_t flags = be16_to_cpu(dip->di_flags); |
| uint16_t mode = be16_to_cpu(dip->di_mode); |
| |
| xfs_failaddr_t fa; |
| |
| trace_xrep_dinode_extsize_hints(sc, dip); |
| |
| fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), |
| mode, flags); |
| if (fa) { |
| dip->di_extsize = 0; |
| dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | |
| XFS_DIFLAG_EXTSZINHERIT); |
| } |
| |
| if (dip->di_version < 3) |
| return; |
| |
| fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), |
| mode, flags, flags2); |
| if (fa) { |
| dip->di_cowextsize = 0; |
| dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); |
| } |
| } |
| |
| /* Count extents and blocks for an inode given an rmap. */ |
| STATIC int |
| xrep_dinode_walk_rmap( |
| struct xfs_btree_cur *cur, |
| const struct xfs_rmap_irec *rec, |
| void *priv) |
| { |
| struct xrep_inode *ri = priv; |
| int error = 0; |
| |
| if (xchk_should_terminate(ri->sc, &error)) |
| return error; |
| |
| /* We only care about this inode. */ |
| if (rec->rm_owner != ri->sc->sm->sm_ino) |
| return 0; |
| |
| if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { |
| ri->attr_blocks += rec->rm_blockcount; |
| if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) |
| ri->attr_extents++; |
| |
| return 0; |
| } |
| |
| ri->data_blocks += rec->rm_blockcount; |
| if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) |
| ri->data_extents++; |
| |
| return 0; |
| } |
| |
| /* Count extents and blocks for an inode from all AG rmap data. */ |
| STATIC int |
| xrep_dinode_count_ag_rmaps( |
| struct xrep_inode *ri, |
| struct xfs_perag *pag) |
| { |
| struct xfs_btree_cur *cur; |
| struct xfs_buf *agf; |
| int error; |
| |
| error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); |
| if (error) |
| return error; |
| |
| cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); |
| error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); |
| xfs_btree_del_cursor(cur, error); |
| xfs_trans_brelse(ri->sc->tp, agf); |
| return error; |
| } |
| |
| /* Count extents and blocks for a given inode from all rmap data. */ |
| STATIC int |
| xrep_dinode_count_rmaps( |
| struct xrep_inode *ri) |
| { |
| struct xfs_perag *pag; |
| xfs_agnumber_t agno; |
| int error; |
| |
| if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) |
| return -EOPNOTSUPP; |
| |
| for_each_perag(ri->sc->mp, agno, pag) { |
| error = xrep_dinode_count_ag_rmaps(ri, pag); |
| if (error) { |
| xfs_perag_rele(pag); |
| return error; |
| } |
| } |
| |
| /* Can't have extents on both the rt and the data device. */ |
| if (ri->data_extents && ri->rt_extents) |
| return -EFSCORRUPTED; |
| |
| trace_xrep_dinode_count_rmaps(ri->sc, |
| ri->data_blocks, ri->rt_blocks, ri->attr_blocks, |
| ri->data_extents, ri->rt_extents, ri->attr_extents); |
| return 0; |
| } |
| |
| /* Return true if this extents-format ifork looks like garbage. */ |
| STATIC bool |
| xrep_dinode_bad_extents_fork( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip, |
| unsigned int dfork_size, |
| int whichfork) |
| { |
| struct xfs_bmbt_irec new; |
| struct xfs_bmbt_rec *dp; |
| xfs_extnum_t nex; |
| bool isrt; |
| unsigned int i; |
| |
| nex = xfs_dfork_nextents(dip, whichfork); |
| if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) |
| return true; |
| |
| dp = XFS_DFORK_PTR(dip, whichfork); |
| |
| isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); |
| for (i = 0; i < nex; i++, dp++) { |
| xfs_failaddr_t fa; |
| |
| xfs_bmbt_disk_get_all(dp, &new); |
| fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, |
| &new); |
| if (fa) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Return true if this btree-format ifork looks like garbage. */ |
| STATIC bool |
| xrep_dinode_bad_bmbt_fork( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip, |
| unsigned int dfork_size, |
| int whichfork) |
| { |
| struct xfs_bmdr_block *dfp; |
| xfs_extnum_t nex; |
| unsigned int i; |
| unsigned int dmxr; |
| unsigned int nrecs; |
| unsigned int level; |
| |
| nex = xfs_dfork_nextents(dip, whichfork); |
| if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) |
| return true; |
| |
| if (dfork_size < sizeof(struct xfs_bmdr_block)) |
| return true; |
| |
| dfp = XFS_DFORK_PTR(dip, whichfork); |
| nrecs = be16_to_cpu(dfp->bb_numrecs); |
| level = be16_to_cpu(dfp->bb_level); |
| |
| if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size) |
| return true; |
| if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) |
| return true; |
| |
| dmxr = xfs_bmdr_maxrecs(dfork_size, 0); |
| for (i = 1; i <= nrecs; i++) { |
| struct xfs_bmbt_key *fkp; |
| xfs_bmbt_ptr_t *fpp; |
| xfs_fileoff_t fileoff; |
| xfs_fsblock_t fsbno; |
| |
| fkp = xfs_bmdr_key_addr(dfp, i); |
| fileoff = be64_to_cpu(fkp->br_startoff); |
| if (!xfs_verify_fileoff(sc->mp, fileoff)) |
| return true; |
| |
| fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr); |
| fsbno = be64_to_cpu(*fpp); |
| if (!xfs_verify_fsbno(sc->mp, fsbno)) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Check the data fork for things that will fail the ifork verifiers or the |
| * ifork formatters. |
| */ |
| STATIC bool |
| xrep_dinode_check_dfork( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip, |
| uint16_t mode) |
| { |
| void *dfork_ptr; |
| int64_t data_size; |
| unsigned int fmt; |
| unsigned int dfork_size; |
| |
| /* |
| * Verifier functions take signed int64_t, so check for bogus negative |
| * values first. |
| */ |
| data_size = be64_to_cpu(dip->di_size); |
| if (data_size < 0) |
| return true; |
| |
| fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); |
| switch (mode & S_IFMT) { |
| case S_IFIFO: |
| case S_IFCHR: |
| case S_IFBLK: |
| case S_IFSOCK: |
| if (fmt != XFS_DINODE_FMT_DEV) |
| return true; |
| break; |
| case S_IFREG: |
| if (fmt == XFS_DINODE_FMT_LOCAL) |
| return true; |
| fallthrough; |
| case S_IFLNK: |
| case S_IFDIR: |
| switch (fmt) { |
| case XFS_DINODE_FMT_LOCAL: |
| case XFS_DINODE_FMT_EXTENTS: |
| case XFS_DINODE_FMT_BTREE: |
| break; |
| default: |
| return true; |
| } |
| break; |
| default: |
| return true; |
| } |
| |
| dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); |
| dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); |
| |
| switch (fmt) { |
| case XFS_DINODE_FMT_DEV: |
| break; |
| case XFS_DINODE_FMT_LOCAL: |
| /* dir/symlink structure cannot be larger than the fork */ |
| if (data_size > dfork_size) |
| return true; |
| /* directory structure must pass verification. */ |
| if (S_ISDIR(mode) && |
| xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) |
| return true; |
| /* symlink structure must pass verification. */ |
| if (S_ISLNK(mode) && |
| xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) |
| return true; |
| break; |
| case XFS_DINODE_FMT_EXTENTS: |
| if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, |
| XFS_DATA_FORK)) |
| return true; |
| break; |
| case XFS_DINODE_FMT_BTREE: |
| if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, |
| XFS_DATA_FORK)) |
| return true; |
| break; |
| default: |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static void |
| xrep_dinode_set_data_nextents( |
| struct xfs_dinode *dip, |
| xfs_extnum_t nextents) |
| { |
| if (xfs_dinode_has_large_extent_counts(dip)) |
| dip->di_big_nextents = cpu_to_be64(nextents); |
| else |
| dip->di_nextents = cpu_to_be32(nextents); |
| } |
| |
| static void |
| xrep_dinode_set_attr_nextents( |
| struct xfs_dinode *dip, |
| xfs_extnum_t nextents) |
| { |
| if (xfs_dinode_has_large_extent_counts(dip)) |
| dip->di_big_anextents = cpu_to_be32(nextents); |
| else |
| dip->di_anextents = cpu_to_be16(nextents); |
| } |
| |
| /* Reset the data fork to something sane. */ |
| STATIC void |
| xrep_dinode_zap_dfork( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip, |
| uint16_t mode) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| |
| trace_xrep_dinode_zap_dfork(sc, dip); |
| |
| ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; |
| |
| xrep_dinode_set_data_nextents(dip, 0); |
| ri->data_blocks = 0; |
| ri->rt_blocks = 0; |
| |
| /* Special files always get reset to DEV */ |
| switch (mode & S_IFMT) { |
| case S_IFIFO: |
| case S_IFCHR: |
| case S_IFBLK: |
| case S_IFSOCK: |
| dip->di_format = XFS_DINODE_FMT_DEV; |
| dip->di_size = 0; |
| return; |
| } |
| |
| /* |
| * If we have data extents, reset to an empty map and hope the user |
| * will run the bmapbtd checker next. |
| */ |
| if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { |
| dip->di_format = XFS_DINODE_FMT_EXTENTS; |
| return; |
| } |
| |
| /* Otherwise, reset the local format to the minimum. */ |
| switch (mode & S_IFMT) { |
| case S_IFLNK: |
| xrep_dinode_zap_symlink(ri, dip); |
| break; |
| case S_IFDIR: |
| xrep_dinode_zap_dir(ri, dip); |
| break; |
| } |
| } |
| |
| /* |
| * Check the attr fork for things that will fail the ifork verifiers or the |
| * ifork formatters. |
| */ |
| STATIC bool |
| xrep_dinode_check_afork( |
| struct xfs_scrub *sc, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_attr_sf_hdr *afork_ptr; |
| size_t attr_size; |
| unsigned int afork_size; |
| |
| if (XFS_DFORK_BOFF(dip) == 0) |
| return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || |
| xfs_dfork_attr_extents(dip) != 0; |
| |
| afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); |
| afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); |
| |
| switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { |
| case XFS_DINODE_FMT_LOCAL: |
| /* Fork has to be large enough to extract the xattr size. */ |
| if (afork_size < sizeof(struct xfs_attr_sf_hdr)) |
| return true; |
| |
| /* xattr structure cannot be larger than the fork */ |
| attr_size = be16_to_cpu(afork_ptr->totsize); |
| if (attr_size > afork_size) |
| return true; |
| |
| /* xattr structure must pass verification. */ |
| return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; |
| case XFS_DINODE_FMT_EXTENTS: |
| if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, |
| XFS_ATTR_FORK)) |
| return true; |
| break; |
| case XFS_DINODE_FMT_BTREE: |
| if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, |
| XFS_ATTR_FORK)) |
| return true; |
| break; |
| default: |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Reset the attr fork to empty. Since the attr fork could have contained |
| * ACLs, make the file readable only by root. |
| */ |
| STATIC void |
| xrep_dinode_zap_afork( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip, |
| uint16_t mode) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| |
| trace_xrep_dinode_zap_afork(sc, dip); |
| |
| ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; |
| |
| dip->di_aformat = XFS_DINODE_FMT_EXTENTS; |
| xrep_dinode_set_attr_nextents(dip, 0); |
| ri->attr_blocks = 0; |
| |
| /* |
| * If the data fork is in btree format, removing the attr fork entirely |
| * might cause verifier failures if the next level down in the bmbt |
| * could now fit in the data fork area. |
| */ |
| if (dip->di_format != XFS_DINODE_FMT_BTREE) |
| dip->di_forkoff = 0; |
| dip->di_mode = cpu_to_be16(mode & ~0777); |
| dip->di_uid = 0; |
| dip->di_gid = 0; |
| } |
| |
| /* Make sure the fork offset is a sensible value. */ |
| STATIC void |
| xrep_dinode_ensure_forkoff( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip, |
| uint16_t mode) |
| { |
| struct xfs_bmdr_block *bmdr; |
| struct xfs_scrub *sc = ri->sc; |
| xfs_extnum_t attr_extents, data_extents; |
| size_t bmdr_minsz = xfs_bmdr_space_calc(1); |
| unsigned int lit_sz = XFS_LITINO(sc->mp); |
| unsigned int afork_min, dfork_min; |
| |
| trace_xrep_dinode_ensure_forkoff(sc, dip); |
| |
| /* |
| * Before calling this function, xrep_dinode_core ensured that both |
| * forks actually fit inside their respective literal areas. If this |
| * was not the case, the fork was reset to FMT_EXTENTS with zero |
| * records. If the rmapbt scan found attr or data fork blocks, this |
| * will be noted in the dinode_stats, and we must leave enough room |
| * for the bmap repair code to reconstruct the mapping structure. |
| * |
| * First, compute the minimum space required for the attr fork. |
| */ |
| switch (dip->di_aformat) { |
| case XFS_DINODE_FMT_LOCAL: |
| /* |
| * If we still have a shortform xattr structure at all, that |
| * means the attr fork area was exactly large enough to fit |
| * the sf structure. |
| */ |
| afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); |
| break; |
| case XFS_DINODE_FMT_EXTENTS: |
| attr_extents = xfs_dfork_attr_extents(dip); |
| if (attr_extents) { |
| /* |
| * We must maintain sufficient space to hold the entire |
| * extent map array in the data fork. Note that we |
| * previously zapped the fork if it had no chance of |
| * fitting in the inode. |
| */ |
| afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; |
| } else if (ri->attr_extents > 0) { |
| /* |
| * The attr fork thinks it has zero extents, but we |
| * found some xattr extents. We need to leave enough |
| * empty space here so that the incore attr fork will |
| * get created (and hence trigger the attr fork bmap |
| * repairer). |
| */ |
| afork_min = bmdr_minsz; |
| } else { |
| /* No extents on disk or found in rmapbt. */ |
| afork_min = 0; |
| } |
| break; |
| case XFS_DINODE_FMT_BTREE: |
| /* Must have space for btree header and key/pointers. */ |
| bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); |
| afork_min = xfs_bmap_broot_space(sc->mp, bmdr); |
| break; |
| default: |
| /* We should never see any other formats. */ |
| afork_min = 0; |
| break; |
| } |
| |
| /* Compute the minimum space required for the data fork. */ |
| switch (dip->di_format) { |
| case XFS_DINODE_FMT_DEV: |
| dfork_min = sizeof(__be32); |
| break; |
| case XFS_DINODE_FMT_UUID: |
| dfork_min = sizeof(uuid_t); |
| break; |
| case XFS_DINODE_FMT_LOCAL: |
| /* |
| * If we still have a shortform data fork at all, that means |
| * the data fork area was large enough to fit whatever was in |
| * there. |
| */ |
| dfork_min = be64_to_cpu(dip->di_size); |
| break; |
| case XFS_DINODE_FMT_EXTENTS: |
| data_extents = xfs_dfork_data_extents(dip); |
| if (data_extents) { |
| /* |
| * We must maintain sufficient space to hold the entire |
| * extent map array in the data fork. Note that we |
| * previously zapped the fork if it had no chance of |
| * fitting in the inode. |
| */ |
| dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; |
| } else if (ri->data_extents > 0 || ri->rt_extents > 0) { |
| /* |
| * The data fork thinks it has zero extents, but we |
| * found some data extents. We need to leave enough |
| * empty space here so that the data fork bmap repair |
| * will recover the mappings. |
| */ |
| dfork_min = bmdr_minsz; |
| } else { |
| /* No extents on disk or found in rmapbt. */ |
| dfork_min = 0; |
| } |
| break; |
| case XFS_DINODE_FMT_BTREE: |
| /* Must have space for btree header and key/pointers. */ |
| bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); |
| dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); |
| break; |
| default: |
| dfork_min = 0; |
| break; |
| } |
| |
| /* |
| * Round all values up to the nearest 8 bytes, because that is the |
| * precision of di_forkoff. |
| */ |
| afork_min = roundup(afork_min, 8); |
| dfork_min = roundup(dfork_min, 8); |
| bmdr_minsz = roundup(bmdr_minsz, 8); |
| |
| ASSERT(dfork_min <= lit_sz); |
| ASSERT(afork_min <= lit_sz); |
| |
| /* |
| * If the data fork was zapped and we don't have enough space for the |
| * recovery fork, move the attr fork up. |
| */ |
| if (dip->di_format == XFS_DINODE_FMT_EXTENTS && |
| xfs_dfork_data_extents(dip) == 0 && |
| (ri->data_extents > 0 || ri->rt_extents > 0) && |
| bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { |
| if (bmdr_minsz + afork_min > lit_sz) { |
| /* |
| * The attr for and the stub fork we need to recover |
| * the data fork won't both fit. Zap the attr fork. |
| */ |
| xrep_dinode_zap_afork(ri, dip, mode); |
| afork_min = bmdr_minsz; |
| } else { |
| void *before, *after; |
| |
| /* Otherwise, just slide the attr fork up. */ |
| before = XFS_DFORK_APTR(dip); |
| dip->di_forkoff = bmdr_minsz >> 3; |
| after = XFS_DFORK_APTR(dip); |
| memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); |
| } |
| } |
| |
| /* |
| * If the attr fork was zapped and we don't have enough space for the |
| * recovery fork, move the attr fork down. |
| */ |
| if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && |
| xfs_dfork_attr_extents(dip) == 0 && |
| ri->attr_extents > 0 && |
| bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { |
| if (dip->di_format == XFS_DINODE_FMT_BTREE) { |
| /* |
| * If the data fork is in btree format then we can't |
| * adjust forkoff because that runs the risk of |
| * violating the extents/btree format transition rules. |
| */ |
| } else if (bmdr_minsz + dfork_min > lit_sz) { |
| /* |
| * If we can't move the attr fork, too bad, we lose the |
| * attr fork and leak its blocks. |
| */ |
| xrep_dinode_zap_afork(ri, dip, mode); |
| } else { |
| /* |
| * Otherwise, just slide the attr fork down. The attr |
| * fork is empty, so we don't have any old contents to |
| * move here. |
| */ |
| dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; |
| } |
| } |
| } |
| |
| /* |
| * Zap the data/attr forks if we spot anything that isn't going to pass the |
| * ifork verifiers or the ifork formatters, because we need to get the inode |
| * into good enough shape that the higher level repair functions can run. |
| */ |
| STATIC void |
| xrep_dinode_zap_forks( |
| struct xrep_inode *ri, |
| struct xfs_dinode *dip) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| xfs_extnum_t data_extents; |
| xfs_extnum_t attr_extents; |
| xfs_filblks_t nblocks; |
| uint16_t mode; |
| bool zap_datafork = false; |
| bool zap_attrfork = ri->zap_acls; |
| |
| trace_xrep_dinode_zap_forks(sc, dip); |
| |
| mode = be16_to_cpu(dip->di_mode); |
| |
| data_extents = xfs_dfork_data_extents(dip); |
| attr_extents = xfs_dfork_attr_extents(dip); |
| nblocks = be64_to_cpu(dip->di_nblocks); |
| |
| /* Inode counters don't make sense? */ |
| if (data_extents > nblocks) |
| zap_datafork = true; |
| if (attr_extents > nblocks) |
| zap_attrfork = true; |
| if (data_extents + attr_extents > nblocks) |
| zap_datafork = zap_attrfork = true; |
| |
| if (!zap_datafork) |
| zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); |
| if (!zap_attrfork) |
| zap_attrfork = xrep_dinode_check_afork(sc, dip); |
| |
| /* Zap whatever's bad. */ |
| if (zap_attrfork) |
| xrep_dinode_zap_afork(ri, dip, mode); |
| if (zap_datafork) |
| xrep_dinode_zap_dfork(ri, dip, mode); |
| xrep_dinode_ensure_forkoff(ri, dip, mode); |
| |
| /* |
| * Zero di_nblocks if we don't have any extents at all to satisfy the |
| * buffer verifier. |
| */ |
| data_extents = xfs_dfork_data_extents(dip); |
| attr_extents = xfs_dfork_attr_extents(dip); |
| if (data_extents + attr_extents == 0) |
| dip->di_nblocks = 0; |
| } |
| |
| /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ |
| STATIC int |
| xrep_dinode_core( |
| struct xrep_inode *ri) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| struct xfs_buf *bp; |
| struct xfs_dinode *dip; |
| xfs_ino_t ino = sc->sm->sm_ino; |
| int error; |
| int iget_error; |
| |
| /* Figure out what this inode had mapped in both forks. */ |
| error = xrep_dinode_count_rmaps(ri); |
| if (error) |
| return error; |
| |
| /* Read the inode cluster buffer. */ |
| error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, |
| ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, |
| NULL); |
| if (error) |
| return error; |
| |
| /* Make sure we can pass the inode buffer verifier. */ |
| xrep_dinode_buf(sc, bp); |
| bp->b_ops = &xfs_inode_buf_ops; |
| |
| /* Fix everything the verifier will complain about. */ |
| dip = xfs_buf_offset(bp, ri->imap.im_boffset); |
| xrep_dinode_header(sc, dip); |
| iget_error = xrep_dinode_mode(ri, dip); |
| if (iget_error) |
| goto write; |
| xrep_dinode_nlinks(dip); |
| xrep_dinode_flags(sc, dip, ri->rt_extents > 0); |
| xrep_dinode_size(ri, dip); |
| xrep_dinode_extsize_hints(sc, dip); |
| xrep_dinode_zap_forks(ri, dip); |
| |
| write: |
| /* Write out the inode. */ |
| trace_xrep_dinode_fixed(sc, dip); |
| xfs_dinode_calc_crc(sc->mp, dip); |
| xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); |
| xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, |
| ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); |
| |
| /* |
| * In theory, we've fixed the ondisk inode record enough that we should |
| * be able to load the inode into the cache. Try to iget that inode |
| * now while we hold the AGI and the inode cluster buffer and take the |
| * IOLOCK so that we can continue with repairs without anyone else |
| * accessing the inode. If iget fails, we still need to commit the |
| * changes. |
| */ |
| if (!iget_error) |
| iget_error = xchk_iget(sc, ino, &sc->ip); |
| if (!iget_error) |
| xchk_ilock(sc, XFS_IOLOCK_EXCL); |
| |
| /* |
| * Commit the inode cluster buffer updates and drop the AGI buffer that |
| * we've been holding since scrub setup. From here on out, repairs |
| * deal only with the cached inode. |
| */ |
| error = xrep_trans_commit(sc); |
| if (error) |
| return error; |
| |
| if (iget_error) |
| return iget_error; |
| |
| error = xchk_trans_alloc(sc, 0); |
| if (error) |
| return error; |
| |
| error = xrep_ino_dqattach(sc); |
| if (error) |
| return error; |
| |
| xchk_ilock(sc, XFS_ILOCK_EXCL); |
| if (ri->ino_sick_mask) |
| xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); |
| return 0; |
| } |
| |
| /* Fix everything xfs_dinode_verify cares about. */ |
| STATIC int |
| xrep_dinode_problems( |
| struct xrep_inode *ri) |
| { |
| struct xfs_scrub *sc = ri->sc; |
| int error; |
| |
| error = xrep_dinode_core(ri); |
| if (error) |
| return error; |
| |
| /* We had to fix a totally busted inode, schedule quotacheck. */ |
| if (XFS_IS_UQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_USER); |
| if (XFS_IS_GQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); |
| if (XFS_IS_PQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); |
| |
| return 0; |
| } |
| |
| /* |
| * Fix problems that the verifiers don't care about. In general these are |
| * errors that don't cause problems elsewhere in the kernel that we can easily |
| * detect, so we don't check them all that rigorously. |
| */ |
| |
| /* Make sure block and extent counts are ok. */ |
| STATIC int |
| xrep_inode_blockcounts( |
| struct xfs_scrub *sc) |
| { |
| struct xfs_ifork *ifp; |
| xfs_filblks_t count; |
| xfs_filblks_t acount; |
| xfs_extnum_t nextents; |
| int error; |
| |
| trace_xrep_inode_blockcounts(sc); |
| |
| /* Set data fork counters from the data fork mappings. */ |
| error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, |
| &nextents, &count); |
| if (error) |
| return error; |
| if (xfs_is_reflink_inode(sc->ip)) { |
| /* |
| * data fork blockcount can exceed physical storage if a user |
| * reflinks the same block over and over again. |
| */ |
| ; |
| } else if (XFS_IS_REALTIME_INODE(sc->ip)) { |
| if (count >= sc->mp->m_sb.sb_rblocks) |
| return -EFSCORRUPTED; |
| } else { |
| if (count >= sc->mp->m_sb.sb_dblocks) |
| return -EFSCORRUPTED; |
| } |
| error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); |
| if (error) |
| return error; |
| sc->ip->i_df.if_nextents = nextents; |
| |
| /* Set attr fork counters from the attr fork mappings. */ |
| ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); |
| if (ifp) { |
| error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, |
| &nextents, &acount); |
| if (error) |
| return error; |
| if (count >= sc->mp->m_sb.sb_dblocks) |
| return -EFSCORRUPTED; |
| error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, |
| nextents); |
| if (error) |
| return error; |
| ifp->if_nextents = nextents; |
| } else { |
| acount = 0; |
| } |
| |
| sc->ip->i_nblocks = count + acount; |
| return 0; |
| } |
| |
| /* Check for invalid uid/gid/prid. */ |
| STATIC void |
| xrep_inode_ids( |
| struct xfs_scrub *sc) |
| { |
| bool dirty = false; |
| |
| trace_xrep_inode_ids(sc); |
| |
| if (!uid_valid(VFS_I(sc->ip)->i_uid)) { |
| i_uid_write(VFS_I(sc->ip), 0); |
| dirty = true; |
| if (XFS_IS_UQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_USER); |
| } |
| |
| if (!gid_valid(VFS_I(sc->ip)->i_gid)) { |
| i_gid_write(VFS_I(sc->ip), 0); |
| dirty = true; |
| if (XFS_IS_GQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); |
| } |
| |
| if (sc->ip->i_projid == -1U) { |
| sc->ip->i_projid = 0; |
| dirty = true; |
| if (XFS_IS_PQUOTA_ON(sc->mp)) |
| xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); |
| } |
| |
| /* strip setuid/setgid if we touched any of the ids */ |
| if (dirty) |
| VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); |
| } |
| |
| static inline void |
| xrep_clamp_timestamp( |
| struct xfs_inode *ip, |
| struct timespec64 *ts) |
| { |
| ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); |
| *ts = timestamp_truncate(*ts, VFS_I(ip)); |
| } |
| |
| /* Nanosecond counters can't have more than 1 billion. */ |
| STATIC void |
| xrep_inode_timestamps( |
| struct xfs_inode *ip) |
| { |
| struct timespec64 tstamp; |
| struct inode *inode = VFS_I(ip); |
| |
| tstamp = inode_get_atime(inode); |
| xrep_clamp_timestamp(ip, &tstamp); |
| inode_set_atime_to_ts(inode, tstamp); |
| |
| tstamp = inode_get_mtime(inode); |
| xrep_clamp_timestamp(ip, &tstamp); |
| inode_set_mtime_to_ts(inode, tstamp); |
| |
| tstamp = inode_get_ctime(inode); |
| xrep_clamp_timestamp(ip, &tstamp); |
| inode_set_ctime_to_ts(inode, tstamp); |
| |
| xrep_clamp_timestamp(ip, &ip->i_crtime); |
| } |
| |
| /* Fix inode flags that don't make sense together. */ |
| STATIC void |
| xrep_inode_flags( |
| struct xfs_scrub *sc) |
| { |
| uint16_t mode; |
| |
| trace_xrep_inode_flags(sc); |
| |
| mode = VFS_I(sc->ip)->i_mode; |
| |
| /* Clear junk flags */ |
| if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) |
| sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; |
| |
| /* NEWRTBM only applies to realtime bitmaps */ |
| if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) |
| sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; |
| else |
| sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; |
| |
| /* These only make sense for directories. */ |
| if (!S_ISDIR(mode)) |
| sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | |
| XFS_DIFLAG_EXTSZINHERIT | |
| XFS_DIFLAG_PROJINHERIT | |
| XFS_DIFLAG_NOSYMLINKS); |
| |
| /* These only make sense for files. */ |
| if (!S_ISREG(mode)) |
| sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | |
| XFS_DIFLAG_EXTSIZE); |
| |
| /* These only make sense for non-rt files. */ |
| if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) |
| sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; |
| |
| /* Immutable and append only? Drop the append. */ |
| if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && |
| (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) |
| sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; |
| |
| /* Clear junk flags. */ |
| if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) |
| sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; |
| |
| /* No reflink flag unless we support it and it's a file. */ |
| if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) |
| sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; |
| |
| /* DAX only applies to files and dirs. */ |
| if (!(S_ISREG(mode) || S_ISDIR(mode))) |
| sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; |
| |
| /* No reflink files on the realtime device. */ |
| if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) |
| sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; |
| } |
| |
| /* |
| * Fix size problems with block/node format directories. If we fail to find |
| * the extent list, just bail out and let the bmapbtd repair functions clean |
| * up that mess. |
| */ |
| STATIC void |
| xrep_inode_blockdir_size( |
| struct xfs_scrub *sc) |
| { |
| struct xfs_iext_cursor icur; |
| struct xfs_bmbt_irec got; |
| struct xfs_ifork *ifp; |
| xfs_fileoff_t off; |
| int error; |
| |
| trace_xrep_inode_blockdir_size(sc); |
| |
| error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); |
| if (error) |
| return; |
| |
| /* Find the last block before 32G; this is the dir size. */ |
| ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); |
| off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); |
| if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { |
| /* zero-extents directory? */ |
| return; |
| } |
| |
| off = got.br_startoff + got.br_blockcount; |
| sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, |
| XFS_FSB_TO_B(sc->mp, off)); |
| } |
| |
| /* Fix size problems with short format directories. */ |
| STATIC void |
| xrep_inode_sfdir_size( |
| struct xfs_scrub *sc) |
| { |
| struct xfs_ifork *ifp; |
| |
| trace_xrep_inode_sfdir_size(sc); |
| |
| ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); |
| sc->ip->i_disk_size = ifp->if_bytes; |
| } |
| |
| /* |
| * Fix any irregularities in a directory inode's size now that we can iterate |
| * extent maps and access other regular inode data. |
| */ |
| STATIC void |
| xrep_inode_dir_size( |
| struct xfs_scrub *sc) |
| { |
| trace_xrep_inode_dir_size(sc); |
| |
| switch (sc->ip->i_df.if_format) { |
| case XFS_DINODE_FMT_EXTENTS: |
| case XFS_DINODE_FMT_BTREE: |
| xrep_inode_blockdir_size(sc); |
| break; |
| case XFS_DINODE_FMT_LOCAL: |
| xrep_inode_sfdir_size(sc); |
| break; |
| } |
| } |
| |
| /* Fix extent size hint problems. */ |
| STATIC void |
| xrep_inode_extsize( |
| struct xfs_scrub *sc) |
| { |
| /* Fix misaligned extent size hints on a directory. */ |
| if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && |
| (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && |
| xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { |
| sc->ip->i_extsize = 0; |
| sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; |
| } |
| } |
| |
| /* Ensure this file has an attr fork if it needs to hold a parent pointer. */ |
| STATIC int |
| xrep_inode_pptr( |
| struct xfs_scrub *sc) |
| { |
| struct xfs_mount *mp = sc->mp; |
| struct xfs_inode *ip = sc->ip; |
| struct inode *inode = VFS_I(ip); |
| |
| if (!xfs_has_parent(mp)) |
| return 0; |
| |
| /* |
| * Unlinked inodes that cannot be added to the directory tree will not |
| * have a parent pointer. |
| */ |
| if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) |
| return 0; |
| |
| /* The root directory doesn't have a parent pointer. */ |
| if (ip == mp->m_rootip) |
| return 0; |
| |
| /* |
| * Metadata inodes are rooted in the superblock and do not have any |
| * parents. |
| */ |
| if (xfs_is_metadata_inode(ip)) |
| return 0; |
| |
| /* Inode already has an attr fork; no further work possible here. */ |
| if (xfs_inode_has_attr_fork(ip)) |
| return 0; |
| |
| return xfs_bmap_add_attrfork(sc->tp, ip, |
| sizeof(struct xfs_attr_sf_hdr), true); |
| } |
| |
| /* Fix any irregularities in an inode that the verifiers don't catch. */ |
| STATIC int |
| xrep_inode_problems( |
| struct xfs_scrub *sc) |
| { |
| int error; |
| |
| error = xrep_inode_blockcounts(sc); |
| if (error) |
| return error; |
| error = xrep_inode_pptr(sc); |
| if (error) |
| return error; |
| xrep_inode_timestamps(sc->ip); |
| xrep_inode_flags(sc); |
| xrep_inode_ids(sc); |
| /* |
| * We can now do a better job fixing the size of a directory now that |
| * we can scan the data fork extents than we could in xrep_dinode_size. |
| */ |
| if (S_ISDIR(VFS_I(sc->ip)->i_mode)) |
| xrep_inode_dir_size(sc); |
| xrep_inode_extsize(sc); |
| |
| trace_xrep_inode_fixed(sc); |
| xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); |
| return xrep_roll_trans(sc); |
| } |
| |
| /* |
| * Make sure this inode's unlinked list pointers are consistent with its |
| * link count. |
| */ |
| STATIC int |
| xrep_inode_unlinked( |
| struct xfs_scrub *sc) |
| { |
| unsigned int nlink = VFS_I(sc->ip)->i_nlink; |
| int error; |
| |
| /* |
| * If this inode is linked from the directory tree and on the unlinked |
| * list, remove it from the unlinked list. |
| */ |
| if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { |
| struct xfs_perag *pag; |
| int error; |
| |
| pag = xfs_perag_get(sc->mp, |
| XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); |
| error = xfs_iunlink_remove(sc->tp, pag, sc->ip); |
| xfs_perag_put(pag); |
| if (error) |
| return error; |
| } |
| |
| /* |
| * If this inode is not linked from the directory tree yet not on the |
| * unlinked list, put it on the unlinked list. |
| */ |
| if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { |
| error = xfs_iunlink(sc->tp, sc->ip); |
| if (error) |
| return error; |
| } |
| |
| return 0; |
| } |
| |
| /* Repair an inode's fields. */ |
| int |
| xrep_inode( |
| struct xfs_scrub *sc) |
| { |
| int error = 0; |
| |
| /* |
| * No inode? That means we failed the _iget verifiers. Repair all |
| * the things that the inode verifiers care about, then retry _iget. |
| */ |
| if (!sc->ip) { |
| struct xrep_inode *ri = sc->buf; |
| |
| ASSERT(ri != NULL); |
| |
| error = xrep_dinode_problems(ri); |
| if (error == -EBUSY) { |
| /* |
| * Directory scan to recover inode mode encountered a |
| * busy inode, so we did not continue repairing things. |
| */ |
| return 0; |
| } |
| if (error) |
| return error; |
| |
| /* By this point we had better have a working incore inode. */ |
| if (!sc->ip) |
| return -EFSCORRUPTED; |
| } |
| |
| xfs_trans_ijoin(sc->tp, sc->ip, 0); |
| |
| /* If we found corruption of any kind, try to fix it. */ |
| if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || |
| (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { |
| error = xrep_inode_problems(sc); |
| if (error) |
| return error; |
| } |
| |
| /* See if we can clear the reflink flag. */ |
| if (xfs_is_reflink_inode(sc->ip)) { |
| error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); |
| if (error) |
| return error; |
| } |
| |
| /* Reconnect incore unlinked list */ |
| error = xrep_inode_unlinked(sc); |
| if (error) |
| return error; |
| |
| return xrep_defer_finish(sc); |
| } |