blob: d98e8e77c684fa2bfca3232a60c115f905d8a6da [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_format.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>
/*
* Swappable Temporary Memory
* ==========================
*
* Online checking sometimes needs to be able to stage a large amount of data
* in memory. This information might not fit in the available memory and it
* doesn't all need to be accessible at all times. In other words, we want an
* indexed data buffer to store data that can be paged out.
*
* When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
* requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
* store our staging data. This file is not installed in the file descriptor
* table so that user programs cannot access the data, which means that the
* xfile must be freed with xfile_destroy.
*
* xfiles assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
*
* NOTE: The current shmemfs implementation has a quirk that in-kernel reads
* of a hole cause a page to be mapped into the file. If you are going to
* create a sparse xfile, please be careful about reading from uninitialized
* parts of the file. These pages are !Uptodate and will eventually be
* reclaimed if not written, but in the short term this boosts memory
* consumption.
*/
/*
* xfiles must not be exposed to userspace and require upper layers to
* coordinate access to the one handle returned by the constructor, so
* establish a separate lock class for xfiles to avoid confusing lockdep.
*/
static struct lock_class_key xfile_i_mutex_key;
/*
* Create an xfile of the given size. The description will be used in the
* trace output.
*/
int
xfile_create(
const char *description,
loff_t isize,
struct xfile **xfilep)
{
struct inode *inode;
struct xfile *xf;
int error = -ENOMEM;
xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
if (!xf)
return -ENOMEM;
xf->file = shmem_file_setup(description, isize, 0);
if (!xf->file)
goto out_xfile;
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
}
/*
* We want a large sparse file that we can pread, pwrite, and seek.
* xfile users are responsible for keeping the xfile hidden away from
* all other callers, so we skip timestamp updates and security checks.
* Make the inode only accessible by root, just in case the xfile ever
* escapes.
*/
xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
FMODE_LSEEK;
xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
inode = file_inode(xf->file);
inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
inode->i_mode &= ~0177;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
trace_xfile_create(xf);
*xfilep = xf;
return 0;
out_xfile:
kfree(xf);
return error;
}
/* Close the file and release all resources. */
void
xfile_destroy(
struct xfile *xf)
{
struct inode *inode = file_inode(xf->file);
trace_xfile_destroy(xf);
lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
fput(xf->file);
kfree(xf);
}
/*
* Read a memory object directly from the xfile's page cache. Unlike regular
* pread, we return -E2BIG and -EFBIG for reads that are too large or at too
* high an offset, instead of truncating the read. Otherwise, we return
* bytes read or an error code, like regular pread.
*/
ssize_t
xfile_pread(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
struct address_space *mapping = inode->i_mapping;
struct page *page = NULL;
ssize_t read = 0;
unsigned int pflags;
int error = 0;
if (count > MAX_RW_COUNT)
return -E2BIG;
if (inode->i_sb->s_maxbytes - pos < count)
return -EFBIG;
trace_xfile_pread(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
void *p, *kaddr;
unsigned int len;
len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
/*
* In-kernel reads of a shmem file cause it to allocate a page
* if the mapping shows a hole. Therefore, if we hit ENOMEM
* we can continue by zeroing the caller's buffer.
*/
page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
__GFP_NOWARN);
if (IS_ERR(page)) {
error = PTR_ERR(page);
if (error != -ENOMEM)
break;
memset(buf, 0, len);
goto advance;
}
if (PageUptodate(page)) {
/*
* xfile pages must never be mapped into userspace, so
* we skip the dcache flush.
*/
kaddr = kmap_local_page(page);
p = kaddr + offset_in_page(pos);
memcpy(buf, p, len);
kunmap_local(kaddr);
} else {
memset(buf, 0, len);
}
put_page(page);
advance:
count -= len;
pos += len;
buf += len;
read += len;
}
memalloc_nofs_restore(pflags);
if (read > 0)
return read;
return error;
}
/*
* Write a memory object directly to the xfile's page cache. Unlike regular
* pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
* high an offset, instead of truncating the write. Otherwise, we return
* bytes written or an error code, like regular pwrite.
*/
ssize_t
xfile_pwrite(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
struct inode *inode = file_inode(xf->file);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
struct page *page = NULL;
ssize_t written = 0;
unsigned int pflags;
int error = 0;
if (count > MAX_RW_COUNT)
return -E2BIG;
if (inode->i_sb->s_maxbytes - pos < count)
return -EFBIG;
trace_xfile_pwrite(xf, pos, count);
pflags = memalloc_nofs_save();
while (count > 0) {
void *fsdata = NULL;
void *p, *kaddr;
unsigned int len;
int ret;
len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
/*
* We call write_begin directly here to avoid all the freezer
* protection lock-taking that happens in the normal path.
* shmem doesn't support fs freeze, but lockdep doesn't know
* that and will trip over that.
*/
error = aops->write_begin(NULL, mapping, pos, len, &page,
&fsdata);
if (error)
break;
/*
* xfile pages must never be mapped into userspace, so we skip
* the dcache flush. If the page is not uptodate, zero it
* before writing data.
*/
kaddr = kmap_local_page(page);
if (!PageUptodate(page)) {
memset(kaddr, 0, PAGE_SIZE);
SetPageUptodate(page);
}
p = kaddr + offset_in_page(pos);
memcpy(p, buf, len);
kunmap_local(kaddr);
ret = aops->write_end(NULL, mapping, pos, len, len, page,
fsdata);
if (ret < 0) {
error = ret;
break;
}
written += ret;
if (ret != len)
break;
count -= ret;
pos += ret;
buf += ret;
}
memalloc_nofs_restore(pflags);
if (written > 0)
return written;
return error;
}
/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
struct xfile *xf,
loff_t pos)
{
loff_t ret;
ret = vfs_llseek(xf->file, pos, SEEK_DATA);
trace_xfile_seek_data(xf, pos, ret);
return ret;
}
/* Query stat information for an xfile. */
int
xfile_stat(
struct xfile *xf,
struct xfile_stat *statbuf)
{
struct kstat ks;
int error;
error = vfs_getattr_nosec(&xf->file->f_path, &ks,
STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
if (error)
return error;
statbuf->size = ks.size;
statbuf->bytes = ks.blocks << SECTOR_SHIFT;
return 0;
}
/*
* Grab the (locked) page for a memory object. The object cannot span a page
* boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
* cannot grab the page, or the usual negative errno.
*/
int
xfile_get_page(
struct xfile *xf,
loff_t pos,
unsigned int len,
struct xfile_page *xfpage)
{
struct inode *inode = file_inode(xf->file);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
struct page *page = NULL;
void *fsdata = NULL;
loff_t key = round_down(pos, PAGE_SIZE);
unsigned int pflags;
int error;
if (inode->i_sb->s_maxbytes - pos < len)
return -ENOMEM;
if (len > PAGE_SIZE - offset_in_page(pos))
return -ENOTBLK;
trace_xfile_get_page(xf, pos, len);
pflags = memalloc_nofs_save();
/*
* We call write_begin directly here to avoid all the freezer
* protection lock-taking that happens in the normal path. shmem
* doesn't support fs freeze, but lockdep doesn't know that and will
* trip over that.
*/
error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
&fsdata);
if (error)
goto out_pflags;
/* We got the page, so make sure we push out EOF. */
if (i_size_read(inode) < pos + len)
i_size_write(inode, pos + len);
/*
* If the page isn't up to date, fill it with zeroes before we hand it
* to the caller and make sure the backing store will hold on to them.
*/
if (!PageUptodate(page)) {
void *kaddr;
kaddr = kmap_local_page(page);
memset(kaddr, 0, PAGE_SIZE);
kunmap_local(kaddr);
SetPageUptodate(page);
}
/*
* Mark each page dirty so that the contents are written to some
* backing store when we drop this buffer, and take an extra reference
* to prevent the xfile page from being swapped or removed from the
* page cache by reclaim if the caller unlocks the page.
*/
set_page_dirty(page);
get_page(page);
xfpage->page = page;
xfpage->fsdata = fsdata;
xfpage->pos = key;
out_pflags:
memalloc_nofs_restore(pflags);
return error;
}
/*
* Release the (locked) page for a memory object. Returns 0 or a negative
* errno.
*/
int
xfile_put_page(
struct xfile *xf,
struct xfile_page *xfpage)
{
struct inode *inode = file_inode(xf->file);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
unsigned int pflags;
int ret;
trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
/* Give back the reference that we took in xfile_get_page. */
put_page(xfpage->page);
pflags = memalloc_nofs_save();
ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
xfpage->page, xfpage->fsdata);
memalloc_nofs_restore(pflags);
memset(xfpage, 0, sizeof(struct xfile_page));
if (ret < 0)
return ret;
if (ret != PAGE_SIZE)
return -EIO;
return 0;
}