// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Inode Btree Repair
 * ==================
 *
 * A quick refresher of inode btrees on a v5 filesystem:
 *
 * - Inode records are read into memory in units of 'inode clusters'.  The
 *   number of inodes that fit in a cluster buffer is the smallest number of
 *   inodes that can be allocated or freed at once.  Clusters are never
 *   smaller than one fs block, though they can span multiple blocks.  The
 *   size (in fs blocks) is computed with xfs_icluster_size_fsb().  The fs
 *   block alignment of a cluster is computed with
 *   xfs_ialloc_cluster_alignment().
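 *   (For example, with a hypothetical geometry of 512-byte inodes,
 *   4096-byte fs blocks, and an 8192-byte cluster buffer, each cluster
 *   holds 16 inodes and spans 2 fs blocks.)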
 *
 * - Each inode btree record can describe a single 'inode chunk'.  The chunk
 *   size is defined to be 64 inodes.  If sparse inodes are enabled, every
 *   inobt record must be aligned to the chunk size; if not, every record
 *   must be aligned to the start of a cluster.  It is possible to construct
 *   an XFS geometry where one inobt record maps to multiple inode clusters;
 *   it is also possible to construct a geometry where multiple inobt
 *   records map to different parts of one inode cluster.
 *
 * - If sparse inodes are not enabled, the smallest unit of allocation for
 *   inode records is enough to contain one inode chunk's worth of inodes.
 *
 * - If sparse inodes are enabled, the holemask field will be active.  Each
 *   bit of the holemask represents 4 potential inodes; if set, the
 *   corresponding space does *not* contain inodes and must be left alone.
 *   Clusters cannot be smaller than 4 inodes.  The smallest unit of
 *   allocation of inode records is one inode cluster.  (A worked holemask
 *   example follows this list.)
 *
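 * For example, consider a hypothetical sparse chunk (with 16-inode
 * clusters) in which only the middle two clusters are physically
 * allocated, i.e. inodes 16-47 of the 64-inode chunk exist.  Holemask
 * bits 0-3 (inodes 0-15) and 12-15 (inodes 48-63) would be set, so
 * ir_holemask would be 0xF00F and ir_count would be 32.
 *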
 * So what's the rebuild algorithm?
 *
 * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
 * records.  The OWN_INOBT records are the old inode btree blocks and will be
 * cleared out after we've rebuilt the tree.  Each possible inode cluster
 * within an OWN_INODES record will be read in; for each possible inobt
 * record associated with that cluster, compute the freemask from the i_mode
 * data in the inode chunk.  For sparse inodes the holemask will be
 * calculated by creating the properly aligned inobt record and punching out
 * any chunk that's missing.  Inode allocations and frees grab the AGI first,
 * so repair protects itself from concurrent access by locking the AGI.
 *
 * Once we've reconstructed all the inode records, we can create new inode
 * btree roots and reload the btrees.  We rebuild both inode trees at the
 * same time because they have the same rmap owner and it would be more
 * complex to figure out whether the other tree needs rebuilding and which
 * OWN_INOBT blocks it owns.  We have all the data we need to build both, so
 * dump everything and start over.
 *
 * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
 */

struct xrep_ibt {
	/* Record under construction. */
	struct xfs_inobt_rec_incore rie;

	/* new inobt information */
	struct xrep_newbt new_inobt;

	/* new finobt information */
	struct xrep_newbt new_finobt;

	/* Old inode btree blocks we found in the rmap. */
	struct xagb_bitmap old_iallocbt_blocks;

	/* Reconstructed inode records. */
	struct xfarray *inode_records;

	struct xfs_scrub *sc;

	/* Number of inodes assigned disk space. */
	unsigned int icount;

	/* Number of inodes in use. */
	unsigned int iused;

	/* Number of finobt records needed. */
	unsigned int finobt_recs;

	/* get_records()'s position in the inode record array. */
	xfarray_idx_t array_cur;
};

/*
 * Is this inode in use?  If the inode is in memory we can tell from i_mode,
 * otherwise we have to check di_mode in the on-disk buffer.  We only care
 * that the high (i.e. non-permission) bits of _mode are zero.  This should
 * be safe because repair keeps all AG headers locked until the end, and any
 * process trying to perform an inode allocation or free must lock the AGI.
 *
 * @cluster_ag_base is the inode offset of the cluster within the AG.
 * @cluster_bp is the cluster buffer.
 * @cluster_index is the inode offset within the inode cluster.
 */
STATIC int
xrep_ibt_check_ifree(
	struct xrep_ibt *ri,
	xfs_agino_t cluster_ag_base,
	struct xfs_buf *cluster_bp,
	unsigned int cluster_index,
	bool *inuse)
{
	struct xfs_scrub *sc = ri->sc;
	struct xfs_mount *mp = sc->mp;
	struct xfs_dinode *dip;
	xfs_ino_t fsino;
	xfs_agino_t agino;
	xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
	unsigned int cluster_buf_base;
	unsigned int offset;
	int error;

	agino = cluster_ag_base + cluster_index;
	fsino = XFS_AGINO_TO_INO(mp, agno, agino);

	/* Inode uncached or half assembled, read disk buffer. */
	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
	if (offset >= BBTOB(cluster_bp->b_length))
		return -EFSCORRUPTED;
	dip = xfs_buf_offset(cluster_bp, offset);
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
		return -EFSCORRUPTED;

	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
		return -EFSCORRUPTED;

	/* Will the in-core inode tell us if it's in use? */
	error = xchk_inode_is_allocated(sc, agino, inuse);
	if (!error)
		return 0;

	*inuse = dip->di_mode != 0;
	return 0;
}

/* Stash the accumulated inobt record for rebuilding. */
STATIC int
xrep_ibt_stash(
	struct xrep_ibt *ri)
{
	int error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
		return -EFSCORRUPTED;

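	/* Records with free inodes will also be needed for the finobt. */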
	if (ri->rie.ir_freecount > 0)
		ri->finobt_recs++;

	trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);

	error = xfarray_append(ri->inode_records, &ri->rie);
	if (error)
		return error;

	ri->rie.ir_startino = NULLAGINO;
	return 0;
}

/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster.  If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 */
STATIC int
xrep_ibt_cluster_record(
	struct xrep_ibt *ri,
	xfs_agino_t cluster_ir_startino,
	struct xfs_buf *cluster_bp,
	unsigned int nr_inodes)
{
	struct xfs_scrub *sc = ri->sc;
	struct xfs_mount *mp = sc->mp;
	xfs_agino_t ir_startino;
	unsigned int cluster_base;
	unsigned int cluster_index;
	int error = 0;

	ir_startino = cluster_ir_startino;
	if (xfs_has_sparseinodes(mp))
		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
	cluster_base = cluster_ir_startino - ir_startino;

	/*
	 * If the accumulated inobt record doesn't map this cluster, add it to
	 * the list and reset it.
	 */
	if (ri->rie.ir_startino != NULLAGINO &&
	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
		error = xrep_ibt_stash(ri);
		if (error)
			return error;
	}

	if (ri->rie.ir_startino == NULLAGINO) {
		ri->rie.ir_startino = ir_startino;
		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
		ri->rie.ir_holemask = 0xFFFF;
		ri->rie.ir_count = 0;
	}

	/* Record the whole cluster. */
	ri->icount += nr_inodes;
	ri->rie.ir_count += nr_inodes;
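	/*
	 * Clear the holemask bits covering this cluster's inodes.  Each bit
	 * covers XFS_INODES_PER_HOLEMASK_BIT inodes; a clear bit means that
	 * span is physically present on disk.
	 */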
	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
			cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
			nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);

	/* Which inodes within this cluster are free? */
	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
		bool inuse = false;

		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
				cluster_bp, cluster_index, &inuse);
		if (error)
			return error;
		if (!inuse)
			continue;
		ri->iused++;
		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
						   cluster_index);
	}
	return 0;
}

/*
 * For each inode cluster covering the physical extent recorded by the
 * rmapbt, we must calculate the properly aligned startino of that cluster,
 * then iterate each cluster to fill in used and filled masks appropriately.
 * We then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 */
STATIC int
xrep_ibt_process_cluster(
	struct xrep_ibt *ri,
	xfs_agblock_t cluster_bno)
{
	struct xfs_imap imap;
	struct xfs_buf *cluster_bp;
	struct xfs_scrub *sc = ri->sc;
	struct xfs_mount *mp = sc->mp;
	struct xfs_ino_geometry *igeo = M_IGEO(mp);
	xfs_agino_t cluster_ag_base;
	xfs_agino_t irec_index;
	unsigned int nr_inodes;
	int error;

	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
			XFS_INODES_PER_CHUNK);
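	/*
	 * An inobt record covers at most XFS_INODES_PER_CHUNK inodes, so a
	 * cluster that holds more than one chunk's worth of inodes is
	 * processed one chunk-sized piece at a time in the loop below.
	 */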

	/*
	 * Grab the inode cluster buffer.  This is safe to do with a broken
	 * inobt because imap_to_bp directly maps the buffer without touching
	 * either inode btree.
	 */
	imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
	imap.im_boffset = 0;
	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
	if (error)
		return error;

	/*
	 * Record the contents of each possible inobt record mapping this
	 * cluster.
	 */
	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
	for (irec_index = 0;
	     irec_index < igeo->inodes_per_cluster;
	     irec_index += XFS_INODES_PER_CHUNK) {
		error = xrep_ibt_cluster_record(ri,
				cluster_ag_base + irec_index, cluster_bp,
				nr_inodes);
		if (error)
			break;
	}

	xfs_trans_brelse(sc->tp, cluster_bp);
	return error;
}

/* Check for any obvious conflicts in the inode chunk extent. */
STATIC int
xrep_ibt_check_inode_ext(
	struct xfs_scrub *sc,
	xfs_agblock_t agbno,
	xfs_extlen_t len)
{
	struct xfs_mount *mp = sc->mp;
	struct xfs_ino_geometry *igeo = M_IGEO(mp);
	xfs_agino_t agino;
	enum xbtree_recpacking outcome;
	int error;

	/* Inode records must be within the AG. */
	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
		return -EFSCORRUPTED;

	/* The entire record must align to the inode cluster size. */
	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
		return -EFSCORRUPTED;

	/*
	 * The entire record must also adhere to the inode cluster alignment
	 * size if sparse inodes are not enabled.
	 */
	if (!xfs_has_sparseinodes(mp) &&
	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
		return -EFSCORRUPTED;

	/*
	 * On a sparse inode fs, this cluster could be part of a sparse
	 * chunk.  Sparse clusters must be aligned to sparse chunk alignment.
	 */
	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
		return -EFSCORRUPTED;

	/* Make sure the entire range of blocks are valid AG inodes. */
	agino = XFS_AGB_TO_AGINO(mp, agbno);
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Found a fragment of the old inode btrees; dispose of them later. */
STATIC int
xrep_ibt_record_old_btree_blocks(
	struct xrep_ibt *ri,
	const struct xfs_rmap_irec *rec)
{
	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
				rec->rm_blockcount))
		return -EFSCORRUPTED;

	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
			rec->rm_blockcount);
}

/* Record extents that belong to inode cluster blocks. */
STATIC int
xrep_ibt_record_inode_blocks(
	struct xrep_ibt *ri,
	const struct xfs_rmap_irec *rec)
{
	struct xfs_mount *mp = ri->sc->mp;
	struct xfs_ino_geometry *igeo = M_IGEO(mp);
	xfs_agblock_t cluster_base;
	int error;

	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
			rec->rm_blockcount);
	if (error)
		return error;

	trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
			rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
			rec->rm_offset, rec->rm_flags);

	/*
	 * Record the free/hole masks for each inode cluster that could be
	 * mapped by this rmap record.
	 */
	for (cluster_base = 0;
	     cluster_base < rec->rm_blockcount;
	     cluster_base += igeo->blocks_per_cluster) {
		error = xrep_ibt_process_cluster(ri,
				rec->rm_startblock + cluster_base);
		if (error)
			return error;
	}

	return 0;
}

STATIC int
xrep_ibt_walk_rmap(
	struct xfs_btree_cur *cur,
	const struct xfs_rmap_irec *rec,
	void *priv)
{
	struct xrep_ibt *ri = priv;
	int error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	switch (rec->rm_owner) {
	case XFS_RMAP_OWN_INOBT:
		return xrep_ibt_record_old_btree_blocks(ri, rec);
	case XFS_RMAP_OWN_INODES:
		return xrep_ibt_record_inode_blocks(ri, rec);
	}
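	/* Other rmap owners are not interesting to inode btree repair. */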
	return 0;
}

/*
 * Iterate all reverse mappings to find the inodes (OWN_INODES) and the
 * inode btrees (OWN_INOBT).  Figure out if we have enough free space to
 * reconstruct the inode btrees.  The caller must clean up the lists if
 * anything goes wrong.
 */
STATIC int
xrep_ibt_find_inodes(
	struct xrep_ibt *ri)
{
	struct xfs_scrub *sc = ri->sc;
	int error;

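	/* NULLAGINO marks that no inobt record is being accumulated. */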
	ri->rie.ir_startino = NULLAGINO;

	/* Collect all reverse mappings for inode blocks. */
	xrep_ag_btcur_init(sc, &sc->sa);
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/* If we have a record ready to go, add it to the array. */
	if (ri->rie.ir_startino != NULLAGINO)
		return xrep_ibt_stash(ri);

	return 0;
}

/* Update the AGI counters. */
STATIC int
xrep_ibt_reset_counters(
	struct xrep_ibt *ri)
{
	struct xfs_scrub *sc = ri->sc;
	struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
	unsigned int freecount = ri->icount - ri->iused;

	/* Trigger inode count recalculation */
	xfs_force_summary_recalc(sc->mp);

	/*
	 * The AGI header contains extra information related to the inode
	 * btrees, so we must update those fields here.
	 */
	agi->agi_count = cpu_to_be32(ri->icount);
	agi->agi_freecount = cpu_to_be32(freecount);
	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
			XFS_AGI_COUNT | XFS_AGI_FREECOUNT);

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagi(sc);
}

/* Retrieve finobt data for bulk load. */
STATIC int
xrep_fibt_get_records(
	struct xfs_btree_cur *cur,
	unsigned int idx,
	struct xfs_btree_block *block,
	unsigned int nr_wanted,
	void *priv)
{
	struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
	struct xrep_ibt *ri = priv;
	union xfs_btree_rec *block_rec;
	unsigned int loaded;
	int error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
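		/*
		 * Skip records with no free inodes; only chunks that still
		 * contain free inodes belong in the finobt.
		 */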
		do {
			error = xfarray_load(ri->inode_records,
					ri->array_cur++, irec);
		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Retrieve inobt data for bulk load. */
STATIC int
xrep_ibt_get_records(
	struct xfs_btree_cur *cur,
	unsigned int idx,
	struct xfs_btree_block *block,
	unsigned int nr_wanted,
	void *priv)
{
	struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
	struct xrep_ibt *ri = priv;
	union xfs_btree_rec *block_rec;
	unsigned int loaded;
	int error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new inobt blocks to the bulk loader. */
STATIC int
xrep_ibt_claim_block(
	struct xfs_btree_cur *cur,
	union xfs_btree_ptr *ptr,
	void *priv)
{
	struct xrep_ibt *ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
}

/* Feed one of the new finobt blocks to the bulk loader. */
STATIC int
xrep_fibt_claim_block(
	struct xfs_btree_cur *cur,
	union xfs_btree_ptr *ptr,
	void *priv)
{
	struct xrep_ibt *ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
}

/* Make sure the records do not overlap in inumber address space. */
STATIC int
xrep_ibt_check_overlap(
	struct xrep_ibt *ri)
{
	struct xfs_inobt_rec_incore irec;
	xfarray_idx_t cur;
	xfs_agino_t next_agino = 0;
	int error = 0;

	foreach_xfarray_idx(ri->inode_records, cur) {
		if (xchk_should_terminate(ri->sc, &error))
			return error;

		error = xfarray_load(ri->inode_records, cur, &irec);
		if (error)
			return error;

		if (irec.ir_startino < next_agino)
			return -EFSCORRUPTED;

		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
	}

	return error;
}

/* Build new inode btrees and dispose of the old one. */
STATIC int
xrep_ibt_build_new_trees(
	struct xrep_ibt *ri)
{
	struct xfs_scrub *sc = ri->sc;
	struct xfs_btree_cur *ino_cur;
	struct xfs_btree_cur *fino_cur = NULL;
	xfs_fsblock_t fsbno;
	bool need_finobt;
	int error;

	need_finobt = xfs_has_finobt(sc->mp);

	/*
	 * Create new btrees for staging all the inobt records we collected
	 * earlier.  The records were collected in order of increasing agino,
	 * so we do not have to sort them.  Ensure there are no overlapping
	 * records.
	 */
	error = xrep_ibt_check_overlap(ri);
	if (error)
		return error;

	/*
	 * The new inode btrees will not be rooted in the AGI until we've
	 * successfully rebuilt the tree.
	 *
	 * Start by setting up the inobt staging cursor.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
			XFS_IBT_BLOCK(sc->mp));
	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
			XFS_AG_RESV_NONE);
	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
	ri->new_inobt.bload.get_records = xrep_ibt_get_records;

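	/*
	 * Compute the new btree geometry before allocating anything so that
	 * we know how many blocks to reserve for the bulk load.
	 */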
	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
			xfarray_length(ri->inode_records));
	if (error)
		goto err_inocur;

	/* Set up finobt staging cursor. */
	if (need_finobt) {
		enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA;

		if (sc->mp->m_finobt_nores)
			resv = XFS_AG_RESV_NONE;

		fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
				XFS_FIBT_BLOCK(sc->mp));
		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
				fsbno, resv);
		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
		ri->new_finobt.bload.get_records = xrep_fibt_get_records;

		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
		error = xfs_btree_bload_compute_geometry(fino_cur,
				&ri->new_finobt.bload, ri->finobt_recs);
		if (error)
			goto err_finocur;
	}

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_finocur;

	/* Reserve all the space we need to build the new btrees. */
	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
			ri->new_inobt.bload.nr_blocks);
	if (error)
		goto err_finocur;

	if (need_finobt) {
		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
				ri->new_finobt.bload.nr_blocks);
		if (error)
			goto err_finocur;
	}

	/* Add all inobt records. */
	ri->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
	if (error)
		goto err_finocur;

	/* Add all finobt records. */
	if (need_finobt) {
		ri->array_cur = XFARRAY_CURSOR_INIT;
		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
		if (error)
			goto err_finocur;
	}

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
	xfs_btree_del_cursor(ino_cur, 0);

	if (fino_cur) {
		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
		xfs_btree_del_cursor(fino_cur, 0);
	}

	/* Reset the AGI counters now that we've changed the inode roots. */
	error = xrep_ibt_reset_counters(ri);
	if (error)
		goto err_finobt;

	/* Free unused blocks and bitmap. */
	if (need_finobt) {
		error = xrep_newbt_commit(&ri->new_finobt);
		if (error)
			goto err_inobt;
	}
	error = xrep_newbt_commit(&ri->new_inobt);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_finocur:
	if (need_finobt)
		xfs_btree_del_cursor(fino_cur, error);
err_inocur:
	xfs_btree_del_cursor(ino_cur, error);
err_finobt:
	if (need_finobt)
		xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
	xrep_newbt_cancel(&ri->new_inobt);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_ibt_remove_old_trees(
	struct xrep_ibt *ri)
{
	struct xfs_scrub *sc = ri->sc;
	int error;

	/*
	 * Free the old inode btree blocks if they're not in use.  It's ok to
	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
	 * reservation because we reset the reservation before releasing the
	 * AGI and AGF header buffer locks.
	 */
	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
	if (error)
		return error;

	/*
	 * If the finobt is enabled and has a per-AG reservation, make sure
	 * we reinitialize the per-AG reservations.
	 */
	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
		sc->flags |= XREP_RESET_PERAG_RESV;

	return 0;
}

/* Repair both inode btrees. */
int
xrep_iallocbt(
	struct xfs_scrub *sc)
{
	struct xrep_ibt *ri;
	struct xfs_mount *mp = sc->mp;
	char *descr;
	xfs_agino_t first_agino, last_agino;
	int error = 0;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
	if (!ri)
		return -ENOMEM;
	ri->sc = sc;

	/* We rebuild both inode btrees. */
	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;

	/* Set up enough storage to handle an AG with nothing but inodes. */
	xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
	last_agino /= XFS_INODES_PER_CHUNK;
	descr = xchk_xfile_ag_descr(sc, "inode index records");
	error = xfarray_create(descr, last_agino,
			sizeof(struct xfs_inobt_rec_incore),
			&ri->inode_records);
	kfree(descr);
	if (error)
		goto out_ri;

	/* Collect the inode data and find the old btree blocks. */
	xagb_bitmap_init(&ri->old_iallocbt_blocks);
	error = xrep_ibt_find_inodes(ri);
	if (error)
		goto out_bitmap;

	/* Rebuild the inode indexes. */
	error = xrep_ibt_build_new_trees(ri);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_ibt_remove_old_trees(ri);

out_bitmap:
	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
	xfarray_destroy(ri->inode_records);
out_ri:
	kfree(ri);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_iallocbt(
	struct xfs_scrub *sc)
{
	__u32 old_type = sc->sm->sm_type;
	int error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
	error = xchk_iallocbt(sc);
	if (error)
		goto out;

	if (xfs_has_finobt(sc->mp)) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
		error = xchk_iallocbt(sc);
	}

out:
	sc->sm->sm_type = old_type;
	return error;
}