| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2018-2023 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <djwong@kernel.org> |
| */ |
| #include "xfs.h" |
| #include "xfs_fs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_format.h" |
| #include "scrub/xfile.h" |
| #include "scrub/xfarray.h" |
| #include "scrub/scrub.h" |
| #include "scrub/trace.h" |
| #include <linux/shmem_fs.h> |
| |
| /* |
| * Swappable Temporary Memory |
| * ========================== |
| * |
| * Online checking sometimes needs to be able to stage a large amount of data |
| * in memory. This information might not fit in the available memory and it |
| * doesn't all need to be accessible at all times. In other words, we want an |
| * indexed data buffer to store data that can be paged out. |
| * |
| * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those |
| * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to |
| * store our staging data. This file is not installed in the file descriptor |
| * table so that user programs cannot access the data, which means that the |
| * xfile must be freed with xfile_destroy. |
| * |
| * xfiles assume that the caller will handle all required concurrency |
| * management; standard vfs locks (freezer and inode) are not taken. Reads |
| * and writes are satisfied directly from the page cache. |
| * |
| * NOTE: The current shmemfs implementation has a quirk that in-kernel reads |
| * of a hole cause a page to be mapped into the file. If you are going to |
| * create a sparse xfile, please be careful about reading from uninitialized |
| * parts of the file. These pages are !Uptodate and will eventually be |
| * reclaimed if not written, but in the short term this boosts memory |
| * consumption. |
| */ |
| |
| /* |
| * xfiles must not be exposed to userspace and require upper layers to |
| * coordinate access to the one handle returned by the constructor, so |
| * establish a separate lock class for xfiles to avoid confusing lockdep. |
| */ |
| static struct lock_class_key xfile_i_mutex_key; |
| |
| /* |
| * Create an xfile of the given size. The description will be used in the |
| * trace output. |
| */ |
| int |
| xfile_create( |
| const char *description, |
| loff_t isize, |
| struct xfile **xfilep) |
| { |
| struct inode *inode; |
| struct xfile *xf; |
| int error = -ENOMEM; |
| |
| xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); |
| if (!xf) |
| return -ENOMEM; |
| |
| xf->file = shmem_file_setup(description, isize, 0); |
| if (!xf->file) |
| goto out_xfile; |
| if (IS_ERR(xf->file)) { |
| error = PTR_ERR(xf->file); |
| goto out_xfile; |
| } |
| |
| /* |
| * We want a large sparse file that we can pread, pwrite, and seek. |
| * xfile users are responsible for keeping the xfile hidden away from |
| * all other callers, so we skip timestamp updates and security checks. |
| * Make the inode only accessible by root, just in case the xfile ever |
| * escapes. |
| */ |
| xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | |
| FMODE_LSEEK; |
| xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; |
| inode = file_inode(xf->file); |
| inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; |
| inode->i_mode &= ~0177; |
| inode->i_uid = GLOBAL_ROOT_UID; |
| inode->i_gid = GLOBAL_ROOT_GID; |
| |
| lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); |
| |
| trace_xfile_create(xf); |
| |
| *xfilep = xf; |
| return 0; |
| out_xfile: |
| kfree(xf); |
| return error; |
| } |
| |
| /* Close the file and release all resources. */ |
| void |
| xfile_destroy( |
| struct xfile *xf) |
| { |
| struct inode *inode = file_inode(xf->file); |
| |
| trace_xfile_destroy(xf); |
| |
| lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); |
| fput(xf->file); |
| kfree(xf); |
| } |
| |
| /* |
| * Read a memory object directly from the xfile's page cache. Unlike regular |
| * pread, we return -E2BIG and -EFBIG for reads that are too large or at too |
| * high an offset, instead of truncating the read. Otherwise, we return |
| * bytes read or an error code, like regular pread. |
| */ |
| ssize_t |
| xfile_pread( |
| struct xfile *xf, |
| void *buf, |
| size_t count, |
| loff_t pos) |
| { |
| struct inode *inode = file_inode(xf->file); |
| struct address_space *mapping = inode->i_mapping; |
| struct page *page = NULL; |
| ssize_t read = 0; |
| unsigned int pflags; |
| int error = 0; |
| |
| if (count > MAX_RW_COUNT) |
| return -E2BIG; |
| if (inode->i_sb->s_maxbytes - pos < count) |
| return -EFBIG; |
| |
| trace_xfile_pread(xf, pos, count); |
| |
| pflags = memalloc_nofs_save(); |
| while (count > 0) { |
| void *p, *kaddr; |
| unsigned int len; |
| |
| len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); |
| |
| /* |
| * In-kernel reads of a shmem file cause it to allocate a page |
| * if the mapping shows a hole. Therefore, if we hit ENOMEM |
| * we can continue by zeroing the caller's buffer. |
| */ |
| page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, |
| __GFP_NOWARN); |
| if (IS_ERR(page)) { |
| error = PTR_ERR(page); |
| if (error != -ENOMEM) |
| break; |
| |
| memset(buf, 0, len); |
| goto advance; |
| } |
| |
| if (PageUptodate(page)) { |
| /* |
| * xfile pages must never be mapped into userspace, so |
| * we skip the dcache flush. |
| */ |
| kaddr = kmap_local_page(page); |
| p = kaddr + offset_in_page(pos); |
| memcpy(buf, p, len); |
| kunmap_local(kaddr); |
| } else { |
| memset(buf, 0, len); |
| } |
| put_page(page); |
| |
| advance: |
| count -= len; |
| pos += len; |
| buf += len; |
| read += len; |
| } |
| memalloc_nofs_restore(pflags); |
| |
| if (read > 0) |
| return read; |
| return error; |
| } |
| |
| /* |
| * Write a memory object directly to the xfile's page cache. Unlike regular |
| * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too |
| * high an offset, instead of truncating the write. Otherwise, we return |
| * bytes written or an error code, like regular pwrite. |
| */ |
| ssize_t |
| xfile_pwrite( |
| struct xfile *xf, |
| const void *buf, |
| size_t count, |
| loff_t pos) |
| { |
| struct inode *inode = file_inode(xf->file); |
| struct address_space *mapping = inode->i_mapping; |
| const struct address_space_operations *aops = mapping->a_ops; |
| struct page *page = NULL; |
| ssize_t written = 0; |
| unsigned int pflags; |
| int error = 0; |
| |
| if (count > MAX_RW_COUNT) |
| return -E2BIG; |
| if (inode->i_sb->s_maxbytes - pos < count) |
| return -EFBIG; |
| |
| trace_xfile_pwrite(xf, pos, count); |
| |
| pflags = memalloc_nofs_save(); |
| while (count > 0) { |
| void *fsdata = NULL; |
| void *p, *kaddr; |
| unsigned int len; |
| int ret; |
| |
| len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); |
| |
| /* |
| * We call write_begin directly here to avoid all the freezer |
| * protection lock-taking that happens in the normal path. |
| * shmem doesn't support fs freeze, but lockdep doesn't know |
| * that and will trip over that. |
| */ |
| error = aops->write_begin(NULL, mapping, pos, len, &page, |
| &fsdata); |
| if (error) |
| break; |
| |
| /* |
| * xfile pages must never be mapped into userspace, so we skip |
| * the dcache flush. If the page is not uptodate, zero it |
| * before writing data. |
| */ |
| kaddr = kmap_local_page(page); |
| if (!PageUptodate(page)) { |
| memset(kaddr, 0, PAGE_SIZE); |
| SetPageUptodate(page); |
| } |
| p = kaddr + offset_in_page(pos); |
| memcpy(p, buf, len); |
| kunmap_local(kaddr); |
| |
| ret = aops->write_end(NULL, mapping, pos, len, len, page, |
| fsdata); |
| if (ret < 0) { |
| error = ret; |
| break; |
| } |
| |
| written += ret; |
| if (ret != len) |
| break; |
| |
| count -= ret; |
| pos += ret; |
| buf += ret; |
| } |
| memalloc_nofs_restore(pflags); |
| |
| if (written > 0) |
| return written; |
| return error; |
| } |
| |
| /* Find the next written area in the xfile data for a given offset. */ |
| loff_t |
| xfile_seek_data( |
| struct xfile *xf, |
| loff_t pos) |
| { |
| loff_t ret; |
| |
| ret = vfs_llseek(xf->file, pos, SEEK_DATA); |
| trace_xfile_seek_data(xf, pos, ret); |
| return ret; |
| } |
| |
| /* Query stat information for an xfile. */ |
| int |
| xfile_stat( |
| struct xfile *xf, |
| struct xfile_stat *statbuf) |
| { |
| struct kstat ks; |
| int error; |
| |
| error = vfs_getattr_nosec(&xf->file->f_path, &ks, |
| STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); |
| if (error) |
| return error; |
| |
| statbuf->size = ks.size; |
| statbuf->bytes = ks.blocks << SECTOR_SHIFT; |
| return 0; |
| } |
| |
| /* |
| * Grab the (locked) page for a memory object. The object cannot span a page |
| * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we |
| * cannot grab the page, or the usual negative errno. |
| */ |
| int |
| xfile_get_page( |
| struct xfile *xf, |
| loff_t pos, |
| unsigned int len, |
| struct xfile_page *xfpage) |
| { |
| struct inode *inode = file_inode(xf->file); |
| struct address_space *mapping = inode->i_mapping; |
| const struct address_space_operations *aops = mapping->a_ops; |
| struct page *page = NULL; |
| void *fsdata = NULL; |
| loff_t key = round_down(pos, PAGE_SIZE); |
| unsigned int pflags; |
| int error; |
| |
| if (inode->i_sb->s_maxbytes - pos < len) |
| return -ENOMEM; |
| if (len > PAGE_SIZE - offset_in_page(pos)) |
| return -ENOTBLK; |
| |
| trace_xfile_get_page(xf, pos, len); |
| |
| pflags = memalloc_nofs_save(); |
| |
| /* |
| * We call write_begin directly here to avoid all the freezer |
| * protection lock-taking that happens in the normal path. shmem |
| * doesn't support fs freeze, but lockdep doesn't know that and will |
| * trip over that. |
| */ |
| error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, |
| &fsdata); |
| if (error) |
| goto out_pflags; |
| |
| /* We got the page, so make sure we push out EOF. */ |
| if (i_size_read(inode) < pos + len) |
| i_size_write(inode, pos + len); |
| |
| /* |
| * If the page isn't up to date, fill it with zeroes before we hand it |
| * to the caller and make sure the backing store will hold on to them. |
| */ |
| if (!PageUptodate(page)) { |
| void *kaddr; |
| |
| kaddr = kmap_local_page(page); |
| memset(kaddr, 0, PAGE_SIZE); |
| kunmap_local(kaddr); |
| SetPageUptodate(page); |
| } |
| |
| /* |
| * Mark each page dirty so that the contents are written to some |
| * backing store when we drop this buffer, and take an extra reference |
| * to prevent the xfile page from being swapped or removed from the |
| * page cache by reclaim if the caller unlocks the page. |
| */ |
| set_page_dirty(page); |
| get_page(page); |
| |
| xfpage->page = page; |
| xfpage->fsdata = fsdata; |
| xfpage->pos = key; |
| out_pflags: |
| memalloc_nofs_restore(pflags); |
| return error; |
| } |
| |
| /* |
| * Release the (locked) page for a memory object. Returns 0 or a negative |
| * errno. |
| */ |
| int |
| xfile_put_page( |
| struct xfile *xf, |
| struct xfile_page *xfpage) |
| { |
| struct inode *inode = file_inode(xf->file); |
| struct address_space *mapping = inode->i_mapping; |
| const struct address_space_operations *aops = mapping->a_ops; |
| unsigned int pflags; |
| int ret; |
| |
| trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); |
| |
| /* Give back the reference that we took in xfile_get_page. */ |
| put_page(xfpage->page); |
| |
| pflags = memalloc_nofs_save(); |
| ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, |
| xfpage->page, xfpage->fsdata); |
| memalloc_nofs_restore(pflags); |
| memset(xfpage, 0, sizeof(struct xfile_page)); |
| |
| if (ret < 0) |
| return ret; |
| if (ret != PAGE_SIZE) |
| return -EIO; |
| return 0; |
| } |