| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Copyright (c) 2022 Fujitsu. All Rights Reserved. |
| */ |
| |
| #include "xfs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_alloc.h" |
| #include "xfs_bit.h" |
| #include "xfs_btree.h" |
| #include "xfs_inode.h" |
| #include "xfs_icache.h" |
| #include "xfs_rmap.h" |
| #include "xfs_rmap_btree.h" |
| #include "xfs_rtalloc.h" |
| #include "xfs_trans.h" |
| #include "xfs_ag.h" |
| |
| #include <linux/mm.h> |
| #include <linux/dax.h> |
| #include <linux/fs.h> |
| |
| struct xfs_failure_info { |
| xfs_agblock_t startblock; |
| xfs_extlen_t blockcount; |
| int mf_flags; |
| bool want_shutdown; |
| }; |
| |
| static pgoff_t |
| xfs_failure_pgoff( |
| struct xfs_mount *mp, |
| const struct xfs_rmap_irec *rec, |
| const struct xfs_failure_info *notify) |
| { |
| loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); |
| |
| if (notify->startblock > rec->rm_startblock) |
| pos += XFS_FSB_TO_B(mp, |
| notify->startblock - rec->rm_startblock); |
| return pos >> PAGE_SHIFT; |
| } |
| |
| static unsigned long |
| xfs_failure_pgcnt( |
| struct xfs_mount *mp, |
| const struct xfs_rmap_irec *rec, |
| const struct xfs_failure_info *notify) |
| { |
| xfs_agblock_t end_rec; |
| xfs_agblock_t end_notify; |
| xfs_agblock_t start_cross; |
| xfs_agblock_t end_cross; |
| |
| start_cross = max(rec->rm_startblock, notify->startblock); |
| |
| end_rec = rec->rm_startblock + rec->rm_blockcount; |
| end_notify = notify->startblock + notify->blockcount; |
| end_cross = min(end_rec, end_notify); |
| |
| return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; |
| } |
| |
| static int |
| xfs_dax_failure_fn( |
| struct xfs_btree_cur *cur, |
| const struct xfs_rmap_irec *rec, |
| void *data) |
| { |
| struct xfs_mount *mp = cur->bc_mp; |
| struct xfs_inode *ip; |
| struct xfs_failure_info *notify = data; |
| struct address_space *mapping; |
| pgoff_t pgoff; |
| unsigned long pgcnt; |
| int error = 0; |
| |
| if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || |
| (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { |
| /* Continue the query because this isn't a failure. */ |
| if (notify->mf_flags & MF_MEM_PRE_REMOVE) |
| return 0; |
| notify->want_shutdown = true; |
| return 0; |
| } |
| |
| /* Get files that incore, filter out others that are not in use. */ |
| error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, |
| 0, &ip); |
| /* Continue the rmap query if the inode isn't incore */ |
| if (error == -ENODATA) |
| return 0; |
| if (error) { |
| notify->want_shutdown = true; |
| return 0; |
| } |
| |
| mapping = VFS_I(ip)->i_mapping; |
| pgoff = xfs_failure_pgoff(mp, rec, notify); |
| pgcnt = xfs_failure_pgcnt(mp, rec, notify); |
| |
| /* Continue the rmap query if the inode isn't a dax file. */ |
| if (dax_mapping(mapping)) |
| error = mf_dax_kill_procs(mapping, pgoff, pgcnt, |
| notify->mf_flags); |
| |
| /* Invalidate the cache in dax pages. */ |
| if (notify->mf_flags & MF_MEM_PRE_REMOVE) |
| invalidate_inode_pages2_range(mapping, pgoff, |
| pgoff + pgcnt - 1); |
| |
| xfs_irele(ip); |
| return error; |
| } |
| |
| static int |
| xfs_dax_notify_failure_freeze( |
| struct xfs_mount *mp) |
| { |
| struct super_block *sb = mp->m_super; |
| int error; |
| |
| error = freeze_super(sb, FREEZE_HOLDER_KERNEL); |
| if (error) |
| xfs_emerg(mp, "already frozen by kernel, err=%d", error); |
| |
| return error; |
| } |
| |
| static void |
| xfs_dax_notify_failure_thaw( |
| struct xfs_mount *mp, |
| bool kernel_frozen) |
| { |
| struct super_block *sb = mp->m_super; |
| int error; |
| |
| if (kernel_frozen) { |
| error = thaw_super(sb, FREEZE_HOLDER_KERNEL); |
| if (error) |
| xfs_emerg(mp, "still frozen after notify failure, err=%d", |
| error); |
| } |
| |
| /* |
| * Also thaw userspace call anyway because the device is about to be |
| * removed immediately. |
| */ |
| thaw_super(sb, FREEZE_HOLDER_USERSPACE); |
| } |
| |
| static int |
| xfs_dax_notify_ddev_failure( |
| struct xfs_mount *mp, |
| xfs_daddr_t daddr, |
| xfs_daddr_t bblen, |
| int mf_flags) |
| { |
| struct xfs_failure_info notify = { .mf_flags = mf_flags }; |
| struct xfs_trans *tp = NULL; |
| struct xfs_btree_cur *cur = NULL; |
| struct xfs_buf *agf_bp = NULL; |
| int error = 0; |
| bool kernel_frozen = false; |
| xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); |
| xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); |
| xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, |
| daddr + bblen - 1); |
| xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); |
| |
| if (mf_flags & MF_MEM_PRE_REMOVE) { |
| xfs_info(mp, "Device is about to be removed!"); |
| /* |
| * Freeze fs to prevent new mappings from being created. |
| * - Keep going on if others already hold the kernel forzen. |
| * - Keep going on if other errors too because this device is |
| * starting to fail. |
| * - If kernel frozen state is hold successfully here, thaw it |
| * here as well at the end. |
| */ |
| kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; |
| } |
| |
| error = xfs_trans_alloc_empty(mp, &tp); |
| if (error) |
| goto out; |
| |
| for (; agno <= end_agno; agno++) { |
| struct xfs_rmap_irec ri_low = { }; |
| struct xfs_rmap_irec ri_high; |
| struct xfs_agf *agf; |
| struct xfs_perag *pag; |
| xfs_agblock_t range_agend; |
| |
| pag = xfs_perag_get(mp, agno); |
| error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); |
| if (error) { |
| xfs_perag_put(pag); |
| break; |
| } |
| |
| cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); |
| |
| /* |
| * Set the rmap range from ri_low to ri_high, which represents |
| * a [start, end] where we looking for the files or metadata. |
| */ |
| memset(&ri_high, 0xFF, sizeof(ri_high)); |
| ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); |
| if (agno == end_agno) |
| ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); |
| |
| agf = agf_bp->b_addr; |
| range_agend = min(be32_to_cpu(agf->agf_length) - 1, |
| ri_high.rm_startblock); |
| notify.startblock = ri_low.rm_startblock; |
| notify.blockcount = range_agend + 1 - ri_low.rm_startblock; |
| |
| error = xfs_rmap_query_range(cur, &ri_low, &ri_high, |
| xfs_dax_failure_fn, ¬ify); |
| xfs_btree_del_cursor(cur, error); |
| xfs_trans_brelse(tp, agf_bp); |
| xfs_perag_put(pag); |
| if (error) |
| break; |
| |
| fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); |
| } |
| |
| xfs_trans_cancel(tp); |
| |
| /* |
| * Shutdown fs from a force umount in pre-remove case which won't fail, |
| * so errors can be ignored. Otherwise, shutdown the filesystem with |
| * CORRUPT flag if error occured or notify.want_shutdown was set during |
| * RMAP querying. |
| */ |
| if (mf_flags & MF_MEM_PRE_REMOVE) |
| xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); |
| else if (error || notify.want_shutdown) { |
| xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); |
| if (!error) |
| error = -EFSCORRUPTED; |
| } |
| |
| out: |
| /* Thaw the fs if it has been frozen before. */ |
| if (mf_flags & MF_MEM_PRE_REMOVE) |
| xfs_dax_notify_failure_thaw(mp, kernel_frozen); |
| |
| return error; |
| } |
| |
| static int |
| xfs_dax_notify_failure( |
| struct dax_device *dax_dev, |
| u64 offset, |
| u64 len, |
| int mf_flags) |
| { |
| struct xfs_mount *mp = dax_holder(dax_dev); |
| u64 ddev_start; |
| u64 ddev_end; |
| |
| if (!(mp->m_super->s_flags & SB_BORN)) { |
| xfs_warn(mp, "filesystem is not ready for notify_failure()!"); |
| return -EIO; |
| } |
| |
| if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { |
| xfs_debug(mp, |
| "notify_failure() not supported on realtime device!"); |
| return -EOPNOTSUPP; |
| } |
| |
| if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && |
| mp->m_logdev_targp != mp->m_ddev_targp) { |
| /* |
| * In the pre-remove case the failure notification is attempting |
| * to trigger a force unmount. The expectation is that the |
| * device is still present, but its removal is in progress and |
| * can not be cancelled, proceed with accessing the log device. |
| */ |
| if (mf_flags & MF_MEM_PRE_REMOVE) |
| return 0; |
| xfs_err(mp, "ondisk log corrupt, shutting down fs!"); |
| xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); |
| return -EFSCORRUPTED; |
| } |
| |
| if (!xfs_has_rmapbt(mp)) { |
| xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); |
| return -EOPNOTSUPP; |
| } |
| |
| ddev_start = mp->m_ddev_targp->bt_dax_part_off; |
| ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; |
| |
| /* Notify failure on the whole device. */ |
| if (offset == 0 && len == U64_MAX) { |
| offset = ddev_start; |
| len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); |
| } |
| |
| /* Ignore the range out of filesystem area */ |
| if (offset + len - 1 < ddev_start) |
| return -ENXIO; |
| if (offset > ddev_end) |
| return -ENXIO; |
| |
| /* Calculate the real range when it touches the boundary */ |
| if (offset > ddev_start) |
| offset -= ddev_start; |
| else { |
| len -= ddev_start - offset; |
| offset = 0; |
| } |
| if (offset + len - 1 > ddev_end) |
| len = ddev_end - offset + 1; |
| |
| return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), |
| mf_flags); |
| } |
| |
| const struct dax_holder_operations xfs_dax_holder_operations = { |
| .notify_failure = xfs_dax_notify_failure, |
| }; |