| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2019 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <darrick.wong@oracle.com> |
| */ |
| #include "xfs.h" |
| #include "xfs_fs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_inode.h" |
| #include "xfs_btree.h" |
| #include "xfs_ialloc.h" |
| #include "xfs_ialloc_btree.h" |
| #include "xfs_iwalk.h" |
| #include "xfs_itable.h" |
| #include "xfs_error.h" |
| #include "xfs_trace.h" |
| #include "xfs_icache.h" |
| #include "xfs_health.h" |
| #include "xfs_trans.h" |
| |
| /* |
| * Walking Inodes in the Filesystem |
| * ================================ |
| * |
| * This iterator function walks a subset of filesystem inodes in increasing |
| * order from @startino until there are no more inodes. For each allocated |
| * inode it finds, it calls a walk function with the relevant inode number and |
| * a pointer to caller-provided data. The walk function can return the usual |
| * negative error code to stop the iteration; 0 to continue the iteration; or |
| * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the |
| * caller. |
| * |
| * Internally, we allow the walk function to do anything, which means that we |
| * cannot maintain the inobt cursor or our lock on the AGI buffer. We |
| * therefore cache the inobt records in kernel memory and only call the walk |
| * function when our memory buffer is full. @nr_recs is the number of records |
| * that we've cached, and @sz_recs is the size of our cache. |
| * |
| * It is the responsibility of the walk function to ensure it accesses |
| * allocated inodes, as the inobt records may be stale by the time they are |
| * acted upon. |
| */ |
| |
| struct xfs_iwalk_ag { |
| struct xfs_mount *mp; |
| struct xfs_trans *tp; |
| |
| /* Where do we start the traversal? */ |
| xfs_ino_t startino; |
| |
| /* Array of inobt records we cache. */ |
| struct xfs_inobt_rec_incore *recs; |
| |
| /* Number of entries allocated for the @recs array. */ |
| unsigned int sz_recs; |
| |
| /* Number of entries in the @recs array that are in use. */ |
| unsigned int nr_recs; |
| |
| /* Inode walk function and data pointer. */ |
| xfs_iwalk_fn iwalk_fn; |
| void *data; |
| }; |
| |
| /* Allocate memory for a walk. */ |
| STATIC int |
| xfs_iwalk_alloc( |
| struct xfs_iwalk_ag *iwag) |
| { |
| size_t size; |
| |
| ASSERT(iwag->recs == NULL); |
| iwag->nr_recs = 0; |
| |
| /* Allocate a prefetch buffer for inobt records. */ |
| size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); |
| iwag->recs = kmem_alloc(size, KM_MAYFAIL); |
| if (iwag->recs == NULL) |
| return -ENOMEM; |
| |
| return 0; |
| } |
| |
| /* Free memory we allocated for a walk. */ |
| STATIC void |
| xfs_iwalk_free( |
| struct xfs_iwalk_ag *iwag) |
| { |
| kmem_free(iwag->recs); |
| iwag->recs = NULL; |
| } |
| |
| /* For each inuse inode in each cached inobt record, call our function. */ |
| STATIC int |
| xfs_iwalk_ag_recs( |
| struct xfs_iwalk_ag *iwag) |
| { |
| struct xfs_mount *mp = iwag->mp; |
| struct xfs_trans *tp = iwag->tp; |
| xfs_ino_t ino; |
| unsigned int i, j; |
| xfs_agnumber_t agno; |
| int error; |
| |
| agno = XFS_INO_TO_AGNO(mp, iwag->startino); |
| for (i = 0; i < iwag->nr_recs; i++) { |
| struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; |
| |
| trace_xfs_iwalk_ag_rec(mp, agno, irec); |
| |
| for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { |
| /* Skip if this inode is free */ |
| if (XFS_INOBT_MASK(j) & irec->ir_free) |
| continue; |
| |
| /* Otherwise call our function. */ |
| ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); |
| error = iwag->iwalk_fn(mp, tp, ino, iwag->data); |
| if (error) |
| return error; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* Delete cursor and let go of AGI. */ |
| static inline void |
| xfs_iwalk_del_inobt( |
| struct xfs_trans *tp, |
| struct xfs_btree_cur **curpp, |
| struct xfs_buf **agi_bpp, |
| int error) |
| { |
| if (*curpp) { |
| xfs_btree_del_cursor(*curpp, error); |
| *curpp = NULL; |
| } |
| if (*agi_bpp) { |
| xfs_trans_brelse(tp, *agi_bpp); |
| *agi_bpp = NULL; |
| } |
| } |
| |
| /* |
| * Set ourselves up for walking inobt records starting from a given point in |
| * the filesystem. |
| * |
| * If caller passed in a nonzero start inode number, load the record from the |
| * inobt and make the record look like all the inodes before agino are free so |
| * that we skip them, and then move the cursor to the next inobt record. This |
| * is how we support starting an iwalk in the middle of an inode chunk. |
| * |
| * If the caller passed in a start number of zero, move the cursor to the first |
| * inobt record. |
| * |
| * The caller is responsible for cleaning up the cursor and buffer pointer |
| * regardless of the error status. |
| */ |
| STATIC int |
| xfs_iwalk_ag_start( |
| struct xfs_iwalk_ag *iwag, |
| xfs_agnumber_t agno, |
| xfs_agino_t agino, |
| struct xfs_btree_cur **curpp, |
| struct xfs_buf **agi_bpp, |
| int *has_more) |
| { |
| struct xfs_mount *mp = iwag->mp; |
| struct xfs_trans *tp = iwag->tp; |
| int icount; |
| int error; |
| |
| /* Set up a fresh cursor and empty the inobt cache. */ |
| iwag->nr_recs = 0; |
| error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); |
| if (error) |
| return error; |
| |
| /* Starting at the beginning of the AG? That's easy! */ |
| if (agino == 0) |
| return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); |
| |
| /* |
| * Otherwise, we have to grab the inobt record where we left off, stuff |
| * the record into our cache, and then see if there are more records. |
| * We require a lookup cache of at least two elements so that we don't |
| * have to deal with tearing down the cursor to walk the records. |
| */ |
| error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount, |
| &iwag->recs[iwag->nr_recs]); |
| if (error) |
| return error; |
| if (icount) |
| iwag->nr_recs++; |
| |
| /* |
| * The prefetch calculation is supposed to give us a large enough inobt |
| * record cache that grab_ichunk can stage a partial first record and |
| * the loop body can cache a record without having to check for cache |
| * space until after it reads an inobt record. |
| */ |
| ASSERT(iwag->nr_recs < iwag->sz_recs); |
| |
| return xfs_btree_increment(*curpp, 0, has_more); |
| } |
| |
| /* |
| * The inobt record cache is full, so preserve the inobt cursor state and |
| * run callbacks on the cached inobt records. When we're done, restore the |
| * cursor state to wherever the cursor would have been had the cache not been |
| * full (and therefore we could've just incremented the cursor) if *@has_more |
| * is true. On exit, *@has_more will indicate whether or not the caller should |
| * try for more inode records. |
| */ |
| STATIC int |
| xfs_iwalk_run_callbacks( |
| struct xfs_iwalk_ag *iwag, |
| xfs_agnumber_t agno, |
| struct xfs_btree_cur **curpp, |
| struct xfs_buf **agi_bpp, |
| int *has_more) |
| { |
| struct xfs_mount *mp = iwag->mp; |
| struct xfs_trans *tp = iwag->tp; |
| struct xfs_inobt_rec_incore *irec; |
| xfs_agino_t restart; |
| int error; |
| |
| ASSERT(iwag->nr_recs > 0); |
| |
| /* Delete cursor but remember the last record we cached... */ |
| xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); |
| irec = &iwag->recs[iwag->nr_recs - 1]; |
| restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; |
| |
| error = xfs_iwalk_ag_recs(iwag); |
| if (error) |
| return error; |
| |
| /* ...empty the cache... */ |
| iwag->nr_recs = 0; |
| |
| if (!has_more) |
| return 0; |
| |
| /* ...and recreate the cursor just past where we left off. */ |
| error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); |
| if (error) |
| return error; |
| |
| return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); |
| } |
| |
| /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ |
| STATIC int |
| xfs_iwalk_ag( |
| struct xfs_iwalk_ag *iwag) |
| { |
| struct xfs_mount *mp = iwag->mp; |
| struct xfs_trans *tp = iwag->tp; |
| struct xfs_buf *agi_bp = NULL; |
| struct xfs_btree_cur *cur = NULL; |
| xfs_agnumber_t agno; |
| xfs_agino_t agino; |
| int has_more; |
| int error = 0; |
| |
| /* Set up our cursor at the right place in the inode btree. */ |
| agno = XFS_INO_TO_AGNO(mp, iwag->startino); |
| agino = XFS_INO_TO_AGINO(mp, iwag->startino); |
| error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); |
| |
| while (!error && has_more) { |
| struct xfs_inobt_rec_incore *irec; |
| |
| cond_resched(); |
| |
| /* Fetch the inobt record. */ |
| irec = &iwag->recs[iwag->nr_recs]; |
| error = xfs_inobt_get_rec(cur, irec, &has_more); |
| if (error || !has_more) |
| break; |
| |
| /* No allocated inodes in this chunk; skip it. */ |
| if (irec->ir_freecount == irec->ir_count) { |
| error = xfs_btree_increment(cur, 0, &has_more); |
| if (error) |
| break; |
| continue; |
| } |
| |
| /* |
| * Start readahead for this inode chunk in anticipation of |
| * walking the inodes. |
| */ |
| xfs_bulkstat_ichunk_ra(mp, agno, irec); |
| |
| /* |
| * If there's space in the buffer for more records, increment |
| * the btree cursor and grab more. |
| */ |
| if (++iwag->nr_recs < iwag->sz_recs) { |
| error = xfs_btree_increment(cur, 0, &has_more); |
| if (error || !has_more) |
| break; |
| continue; |
| } |
| |
| /* |
| * Otherwise, we need to save cursor state and run the callback |
| * function on the cached records. The run_callbacks function |
| * is supposed to return a cursor pointing to the record where |
| * we would be if we had been able to increment like above. |
| */ |
| ASSERT(has_more); |
| error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, |
| &has_more); |
| } |
| |
| if (iwag->nr_recs == 0 || error) |
| goto out; |
| |
| /* Walk the unprocessed records in the cache. */ |
| error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); |
| |
| out: |
| xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); |
| return error; |
| } |
| |
| /* |
| * Given the number of inodes to prefetch, set the number of inobt records that |
| * we cache in memory, which controls the number of inodes we try to read |
| * ahead. |
| */ |
| static inline unsigned int |
| xfs_iwalk_prefetch( |
| unsigned int inode_records) |
| { |
| return PAGE_SIZE * 4 / sizeof(struct xfs_inobt_rec_incore); |
| } |
| |
| /* |
| * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn |
| * will be called for each allocated inode, being passed the inode's number and |
| * @data. @max_prefetch controls how many inobt records' worth of inodes we |
| * try to readahead. |
| */ |
| int |
| xfs_iwalk( |
| struct xfs_mount *mp, |
| struct xfs_trans *tp, |
| xfs_ino_t startino, |
| xfs_iwalk_fn iwalk_fn, |
| unsigned int inode_records, |
| void *data) |
| { |
| struct xfs_iwalk_ag iwag = { |
| .mp = mp, |
| .tp = tp, |
| .iwalk_fn = iwalk_fn, |
| .data = data, |
| .startino = startino, |
| .sz_recs = xfs_iwalk_prefetch(inode_records), |
| }; |
| xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); |
| int error; |
| |
| ASSERT(agno < mp->m_sb.sb_agcount); |
| |
| error = xfs_iwalk_alloc(&iwag); |
| if (error) |
| return error; |
| |
| for (; agno < mp->m_sb.sb_agcount; agno++) { |
| error = xfs_iwalk_ag(&iwag); |
| if (error) |
| break; |
| iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); |
| } |
| |
| xfs_iwalk_free(&iwag); |
| return error; |
| } |