mm/memfd.c - linux - Git at Google

 /*
  * memfd_create system call and file sealing support
  *
  * Code was originally included in shmem.c, and broken out to facilitate
  * use by hugetlbfs as well as tmpfs.
  *
  * This file is released under the GPL.
  */

 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/khugepaged.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
 #include <linux/shmem_fs.h>
 #include <linux/memfd.h>
 #include <linux/pid_namespace.h>
 #include <uapi/linux/memfd.h>

 /*
  * MEMFD_TAG_PINNED is the xarray mark that memfd_tag_pins() sets on page
  * cache entries it considers pinned, and that memfd_wait_for_pins() clears
  * again.
  *
  * We need a tag: a new tag would expand every xa_node by 8 bytes,
  * so reuse a tag which we firmly believe is never set or cleared on tmpfs
  * or hugetlbfs because they are memory only filesystems.
  *
  * LAST_SCAN is the number of the final rescan pass in memfd_wait_for_pins().
  * Passes 1..LAST_SCAN each sleep for (HZ << scan) / 200 jiffies first, which
  * is roughly 10 + 20 + 40 + 80 ms.
  */
 #define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
 #define LAST_SCAN               4       /* about 150ms max */

 /*
  * Walk every entry reachable from @xas and set MEMFD_TAG_PINNED on each page
  * whose reference count is not fully accounted for by the page cache plus
  * its mappings.  Value entries are never marked.
  *
  * The xarray lock is held with interrupts disabled during the walk.  It is
  * dropped around a cond_resched() each time at least XA_CHECK_SCHED page
  * slots have been covered since the previous break.
  */
 static void memfd_tag_pins(struct xa_state *xas)
 {
 	struct page *entry;
 	int batched = 0;

 	lru_add_drain();

 	xas_lock_irq(xas);
 	xas_for_each(xas, entry, ULONG_MAX) {
 		/* References the page cache itself is expected to hold. */
 		int nr_cache_refs = 1;

 		if (!xa_is_value(entry)) {
 			if (PageTransHuge(entry) && !PageHuge(entry))
 				nr_cache_refs = HPAGE_PMD_NR;

 			if (page_count(entry) - total_mapcount(entry) !=
 			    nr_cache_refs)
 				xas_set_mark(xas, MEMFD_TAG_PINNED);
 		}

 		/* Move the cursor past the rest of a transparent huge page. */
 		if (nr_cache_refs != 1)
 			xas_set(xas, entry->index + nr_cache_refs);

 		batched += nr_cache_refs;
 		if (batched >= XA_CHECK_SCHED) {
 			batched = 0;

 			xas_pause(xas);
 			xas_unlock_irq(xas);
 			cond_resched();
 			xas_lock_irq(xas);
 		}
 	}
 	xas_unlock_irq(xas);
 }

 /*
  * Before SEAL_WRITE can be set we have to be sure that nobody still has a
  * write in flight.  Drivers may hold pages through get_user_pages() (e.g.
  * direct-IO, AIO) without any user-space mapping being active, so every page
  * is inspected for an elevated reference count.  Such pages are tagged and we
  * then wait for the extra references to go away.
  *
  * The caller must guarantee that no new user will acquire writable references
  * to those pages while this runs, otherwise the check is racy.
  *
  * Returns 0 once no tagged page is left pinned, or -EBUSY if a page is still
  * pinned on the final pass.  All tags are removed again on the final pass.
  */
 static int memfd_wait_for_pins(struct address_space *mapping)
 {
 	XA_STATE(xas, &mapping->i_pages, 0);
 	struct page *entry;
 	int ret = 0;
 	int pass;

 	memfd_tag_pins(&xas);

 	for (pass = 0; pass <= LAST_SCAN; pass++) {
 		int batched = 0;

 		/* Nothing tagged (any more): we are done. */
 		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
 			break;

 		/*
 		 * The first pass only drains the LRU caches; later passes
 		 * sleep with a doubling timeout.  A non-zero return from the
 		 * sleep makes this the final pass.
 		 */
 		if (pass == 0)
 			lru_add_drain_all();
 		else if (schedule_timeout_killable((HZ << pass) / 200))
 			pass = LAST_SCAN;

 		xas_set(&xas, 0);
 		xas_lock_irq(&xas);
 		xas_for_each_marked(&xas, entry, ULONG_MAX, MEMFD_TAG_PINNED) {
 			int nr_cache_refs = 1;
 			bool still_pinned = false;

 			if (!xa_is_value(entry)) {
 				if (PageTransHuge(entry) && !PageHuge(entry))
 					nr_cache_refs = HPAGE_PMD_NR;

 				still_pinned = nr_cache_refs !=
 					page_count(entry) - total_mapcount(entry);
 			}

 			/*
 			 * On the last scan, we clean up all those tags
 			 * we inserted; but make a note that we still
 			 * found pages pinned.
 			 */
 			if (still_pinned && pass == LAST_SCAN)
 				ret = -EBUSY;
 			if (!still_pinned || pass == LAST_SCAN)
 				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

 			batched += nr_cache_refs;
 			if (batched >= XA_CHECK_SCHED) {
 				batched = 0;

 				xas_pause(&xas);
 				xas_unlock_irq(&xas);
 				cond_resched();
 				xas_lock_irq(&xas);
 			}
 		}
 		xas_unlock_irq(&xas);
 	}

 	return ret;
 }

 /*
  * Return a pointer to the seal bitmask kept in @file's inode, or NULL when
  * the file is neither a shmem file nor (with CONFIG_HUGETLBFS) a hugetlbfs
  * file.
  */
 static unsigned int *memfd_file_seals_ptr(struct file *file)
 {
 	struct inode *inode = file_inode(file);

 	if (shmem_file(file))
 		return &SHMEM_I(inode)->seals;

 #ifdef CONFIG_HUGETLBFS
 	if (is_file_hugepages(file))
 		return &HUGETLBFS_I(inode)->seals;
 #endif

 	return NULL;
 }

 /* Every seal bit that memfd_add_seals() accepts. */
 #define F_ALL_SEALS (F_SEAL_SEAL | \
 		     F_SEAL_EXEC | \
 		     F_SEAL_SHRINK | \
 		     F_SEAL_GROW | \
 		     F_SEAL_WRITE | \
 		     F_SEAL_FUTURE_WRITE)

 /*
  * memfd_add_seals - add @seals to the seals already set on @file
  *
  * SEALING
  * Sealing lets several parties share a tmpfs or hugetlbfs file while
  * restricting access to a specific subset of file operations.  Seals can
  * only be added, never removed, so mutually untrusted parties can share
  * memory regions under a well-defined policy and a malicious peer can never
  * perform unwanted operations on the shared object.
  *
  * Seals are only supported on special tmpfs or hugetlbfs files and always
  * apply to the whole underlying inode.  Once set, a seal may prevent some
  * kinds of access to the file.  The seals currently defined are:
  *   SEAL_SEAL: Prevent further seals from being set on this file
  *   SEAL_SHRINK: Prevent the file from shrinking
  *   SEAL_GROW: Prevent the file from growing
  *   SEAL_WRITE: Prevent write access to the file
  *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
  *
  * Because no trust relationship between the parties is required, seals must
  * never be removable.  Sealing therefore only adds the given set of seals and
  * never touches existing ones.  The "setting seals" operation can itself be
  * sealed, which prevents any further seal from being added.
  *
  * Sealing semantics are only defined on volatile files.  Only anonymous
  * tmpfs and hugetlbfs files support sealing, and seals are never written to
  * disk, so there is no plan to support other file types.
  *
  * Returns 0 on success; -EPERM if @file was not opened for writing or is
  * already sealed with F_SEAL_SEAL; -EINVAL for unknown seal bits or a file
  * that carries no seals; otherwise the error from mapping_deny_writable() or
  * memfd_wait_for_pins().
  */
 static int memfd_add_seals(struct file *file, unsigned int seals)
 {
 	struct inode *inode = file_inode(file);
 	unsigned int *cur_seals;
 	int ret;

 	if (!(file->f_mode & FMODE_WRITE))
 		return -EPERM;
 	if (seals & ~(unsigned int)F_ALL_SEALS)
 		return -EINVAL;

 	inode_lock(inode);

 	cur_seals = memfd_file_seals_ptr(file);
 	if (!cur_seals) {
 		ret = -EINVAL;
 		goto out_unlock;
 	}

 	if (*cur_seals & F_SEAL_SEAL) {
 		ret = -EPERM;
 		goto out_unlock;
 	}

 	/* Only a newly requested write seal needs the deny + wait step. */
 	if ((seals & F_SEAL_WRITE) && !(*cur_seals & F_SEAL_WRITE)) {
 		ret = mapping_deny_writable(file->f_mapping);
 		if (ret)
 			goto out_unlock;

 		ret = memfd_wait_for_pins(file->f_mapping);
 		if (ret) {
 			/* Undo the deny so the mapping is left as we found it. */
 			mapping_allow_writable(file->f_mapping);
 			goto out_unlock;
 		}
 	}

 	/*
 	 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
 	 *
 	 * NOTE(review): the write seals implied here are added after the
 	 * deny-writable/wait-for-pins step above, so that step does not run
 	 * for them -- confirm this is intended.
 	 */
 	if ((seals & F_SEAL_EXEC) && (inode->i_mode & 0111))
 		seals |= F_SEAL_SHRINK | F_SEAL_GROW |
 			 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

 	*cur_seals |= seals;
 	ret = 0;

 out_unlock:
 	inode_unlock(inode);
 	return ret;
 }

 /*
  * Return the seal bitmask currently set on @file, or -EINVAL when the file
  * carries no seals (memfd_file_seals_ptr() returned NULL).
  */
 static int memfd_get_seals(struct file *file)
 {
 	unsigned int *cur_seals = memfd_file_seals_ptr(file);

 	if (!cur_seals)
 		return -EINVAL;

 	return *cur_seals;
 }

 /*
  * Dispatch the sealing fcntl commands for @file.
  *
  * F_ADD_SEALS adds the seals given in @arg, F_GET_SEALS returns the current
  * seals (@arg is ignored).  Any other @cmd yields -EINVAL.
  */
 long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 {
 	switch (cmd) {
 	case F_ADD_SEALS:
 		return memfd_add_seals(file, arg);
 	case F_GET_SEALS:
 		return memfd_get_seals(file);
 	default:
 		return -EINVAL;
 	}
 }

 /* Every memfd name is stored with this prefix in front of the user's name. */
 #define MFD_NAME_PREFIX "memfd:"
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 /* Longest user-supplied name (without terminator) that still fits NAME_MAX. */
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

 #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

 /*
  * memfd_create() - create an anonymous tmpfs or hugetlbfs file and install
  * it on a new file descriptor.
  *
  * @uname: NUL-terminated name in user memory.  It is stored with the
  *         "memfd:" prefix and may be at most MFD_NAME_MAX_LEN bytes long,
  *         not counting the terminator.
  * @flags: MFD_* flags.  With MFD_HUGETLB, bits under
  *         MFD_HUGE_MASK << MFD_HUGE_SHIFT are accepted as well and passed on
  *         to hugetlb_file_setup().
  *
  * If the caller passes neither MFD_EXEC nor MFD_NOEXEC_SEAL, one of them is
  * chosen here: from the memfd_noexec_scope of the caller's active pid
  * namespace when CONFIG_SYSCTL is set, MFD_EXEC otherwise.
  *
  * Return: the new file descriptor, or a negative errno:
  *   -EINVAL  unknown flag bits, MFD_EXEC together with MFD_NOEXEC_SEAL, a
  *            memfd_noexec_scope value that selects neither flag, or a name
  *            that is too long;
  *   -EFAULT  the name could not be read, or changed while being copied;
  *   -ENOMEM  the name buffer could not be allocated;
  *   otherwise the error reported by get_unused_fd_flags() or by the file
  *   setup helper.
  */
 SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
 		unsigned int, flags)
 {
 	char comm[TASK_COMM_LEN];
 	unsigned int *file_seals;
 	struct file *file;
 	int fd, error;
 	char *name;
 	long len;

 	if (!(flags & MFD_HUGETLB)) {
 		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
 			return -EINVAL;
 	} else {
 		/* Allow huge page size encoding in flags. */
 		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
 				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
 			return -EINVAL;
 	}

 	/* Invalid if both EXEC and NOEXEC_SEAL are set.*/
 	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
 		return -EINVAL;

 	/* Caller expressed no preference: pick the exec policy for it. */
 	if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
 #ifdef CONFIG_SYSCTL
 		int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
 		struct pid_namespace *ns;

 		ns = task_active_pid_ns(current);
 		if (ns)
 			sysctl = ns->memfd_noexec_scope;

 		switch (sysctl) {
 		case MEMFD_NOEXEC_SCOPE_EXEC:
 			flags |= MFD_EXEC;
 			break;
 		case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
 			flags |= MFD_NOEXEC_SEAL;
 			break;
 		default:
 			pr_warn_once(
 				"memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
 				task_pid_nr(current), get_task_comm(comm, current));
 			return -EINVAL;
 		}
 #else
 		flags |= MFD_EXEC;
 #endif
 		pr_warn_once(
 			"memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
 			task_pid_nr(current), get_task_comm(comm, current));
 	}

 	/* length includes terminating zero */
 	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
 	if (len <= 0)
 		return -EFAULT;
 	if (len > MFD_NAME_MAX_LEN + 1)
 		return -EINVAL;

 	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
 	if (!name)
 		return -ENOMEM;

 	strcpy(name, MFD_NAME_PREFIX);
 	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
 		error = -EFAULT;
 		goto err_name;
 	}

 	/* terminating-zero may have changed after strnlen_user() returned */
 	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
 		error = -EFAULT;
 		goto err_name;
 	}

 	/* Reserve the descriptor first; it is only published by fd_install(). */
 	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
 	if (fd < 0) {
 		error = fd;
 		goto err_name;
 	}

 	if (flags & MFD_HUGETLB) {
 		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
 					HUGETLB_ANONHUGE_INODE,
 					(flags >> MFD_HUGE_SHIFT) &
 					MFD_HUGE_MASK);
 	} else
 		file = shmem_file_setup(name, 0, VM_NORESERVE);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_fd;
 	}
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 	file->f_flags |= O_LARGEFILE;

 	/*
 	 * memfd_file_seals_ptr() returns NULL for a file that is neither shmem
 	 * nor hugetlbfs, so the seal words are only touched when they exist.
 	 * NOTE(review): presumably reachable when shmem_file_setup() hands
 	 * back a non-shmem file (e.g. CONFIG_SHMEM=n) -- confirm.
 	 */
 	if (flags & MFD_NOEXEC_SEAL) {
 		struct inode *inode = file_inode(file);

 		inode->i_mode &= ~0111;
 		file_seals = memfd_file_seals_ptr(file);
 		if (file_seals) {
 			/* Allow further sealing and pin the exec bits. */
 			*file_seals &= ~F_SEAL_SEAL;
 			*file_seals |= F_SEAL_EXEC;
 		}
 	} else if (flags & MFD_ALLOW_SEALING) {
 		/* MFD_EXEC and MFD_ALLOW_SEALING are set */
 		file_seals = memfd_file_seals_ptr(file);
 		if (file_seals)
 			*file_seals &= ~F_SEAL_SEAL;
 	}

 	fd_install(fd, file);
 	kfree(name);
 	return fd;

 err_fd:
 	put_unused_fd(fd);
 err_name:
 	kfree(name);
 	return error;
 }
	/*
	* memfd_create system call and file sealing support
	*
	* Code was originally included in shmem.c, and broken out to facilitate
	* use by hugetlbfs as well as tmpfs.
	*
	* This file is released under the GPL.
	*/

	#include <linux/fs.h>
	#include <linux/vfs.h>
	#include <linux/pagemap.h>
	#include <linux/file.h>
	#include <linux/mm.h>
	#include <linux/sched/signal.h>
	#include <linux/khugepaged.h>
	#include <linux/syscalls.h>
	#include <linux/hugetlb.h>
	#include <linux/shmem_fs.h>
	#include <linux/memfd.h>
	#include <linux/pid_namespace.h>
	#include <uapi/linux/memfd.h>

	/*
	 * MEMFD_TAG_PINNED is the xarray mark that memfd_tag_pins() sets on page
	 * cache entries it considers pinned, and that memfd_wait_for_pins() clears
	 * again.
	 *
	 * We need a tag: a new tag would expand every xa_node by 8 bytes,
	 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
	 * or hugetlbfs because they are memory only filesystems.
	 *
	 * LAST_SCAN is the number of the final rescan pass in memfd_wait_for_pins().
	 * Passes 1..LAST_SCAN each sleep for (HZ << scan) / 200 jiffies first, which
	 * is roughly 10 + 20 + 40 + 80 ms.
	 */
	#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
	#define LAST_SCAN 4 /* about 150ms max */

	/*
	 * Walk every entry reachable from @xas and set MEMFD_TAG_PINNED on each page
	 * whose reference count is not fully accounted for by the page cache plus
	 * its mappings.  Value entries are never marked.
	 *
	 * The xarray lock is held with interrupts disabled during the walk.  It is
	 * dropped around a cond_resched() each time at least XA_CHECK_SCHED page
	 * slots have been covered since the previous break.
	 */
	static void memfd_tag_pins(struct xa_state *xas)
	{
		struct page *entry;
		int batched = 0;

		lru_add_drain();

		xas_lock_irq(xas);
		xas_for_each(xas, entry, ULONG_MAX) {
			/* References the page cache itself is expected to hold. */
			int nr_cache_refs = 1;

			if (!xa_is_value(entry)) {
				if (PageTransHuge(entry) && !PageHuge(entry))
					nr_cache_refs = HPAGE_PMD_NR;

				if (page_count(entry) - total_mapcount(entry) !=
				    nr_cache_refs)
					xas_set_mark(xas, MEMFD_TAG_PINNED);
			}

			/* Move the cursor past the rest of a transparent huge page. */
			if (nr_cache_refs != 1)
				xas_set(xas, entry->index + nr_cache_refs);

			batched += nr_cache_refs;
			if (batched >= XA_CHECK_SCHED) {
				batched = 0;

				xas_pause(xas);
				xas_unlock_irq(xas);
				cond_resched();
				xas_lock_irq(xas);
			}
		}
		xas_unlock_irq(xas);
	}

	/*
	 * Before SEAL_WRITE can be set we have to be sure that nobody still has a
	 * write in flight.  Drivers may hold pages through get_user_pages() (e.g.
	 * direct-IO, AIO) without any user-space mapping being active, so every page
	 * is inspected for an elevated reference count.  Such pages are tagged and we
	 * then wait for the extra references to go away.
	 *
	 * The caller must guarantee that no new user will acquire writable references
	 * to those pages while this runs, otherwise the check is racy.
	 *
	 * Returns 0 once no tagged page is left pinned, or -EBUSY if a page is still
	 * pinned on the final pass.  All tags are removed again on the final pass.
	 */
	static int memfd_wait_for_pins(struct address_space *mapping)
	{
		XA_STATE(xas, &mapping->i_pages, 0);
		struct page *entry;
		int ret = 0;
		int pass;

		memfd_tag_pins(&xas);

		for (pass = 0; pass <= LAST_SCAN; pass++) {
			int batched = 0;

			/* Nothing tagged (any more): we are done. */
			if (!xas_marked(&xas, MEMFD_TAG_PINNED))
				break;

			/*
			 * The first pass only drains the LRU caches; later passes
			 * sleep with a doubling timeout.  A non-zero return from the
			 * sleep makes this the final pass.
			 */
			if (pass == 0)
				lru_add_drain_all();
			else if (schedule_timeout_killable((HZ << pass) / 200))
				pass = LAST_SCAN;

			xas_set(&xas, 0);
			xas_lock_irq(&xas);
			xas_for_each_marked(&xas, entry, ULONG_MAX, MEMFD_TAG_PINNED) {
				int nr_cache_refs = 1;
				bool still_pinned = false;

				if (!xa_is_value(entry)) {
					if (PageTransHuge(entry) && !PageHuge(entry))
						nr_cache_refs = HPAGE_PMD_NR;

					still_pinned = nr_cache_refs !=
						page_count(entry) - total_mapcount(entry);
				}

				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found pages pinned.
				 */
				if (still_pinned && pass == LAST_SCAN)
					ret = -EBUSY;
				if (!still_pinned || pass == LAST_SCAN)
					xas_clear_mark(&xas, MEMFD_TAG_PINNED);

				batched += nr_cache_refs;
				if (batched >= XA_CHECK_SCHED) {
					batched = 0;

					xas_pause(&xas);
					xas_unlock_irq(&xas);
					cond_resched();
					xas_lock_irq(&xas);
				}
			}
			xas_unlock_irq(&xas);
		}

		return ret;
	}

	static unsigned int memfd_file_seals_ptr(struct file file)
	{
	if (shmem_file(file))
	return &SHMEM_I(file_inode(file))->seals;

	#ifdef CONFIG_HUGETLBFS
	if (is_file_hugepages(file))
	return &HUGETLBFS_I(file_inode(file))->seals;
	#endif

	return NULL;
	}

	/* Every seal bit that memfd_add_seals() accepts. */
	#define F_ALL_SEALS (F_SEAL_SEAL | \
			     F_SEAL_EXEC | \
			     F_SEAL_SHRINK | \
			     F_SEAL_GROW | \
			     F_SEAL_WRITE | \
			     F_SEAL_FUTURE_WRITE)

	/*
	 * memfd_add_seals - add @seals to the seals already set on @file
	 *
	 * SEALING
	 * Sealing lets several parties share a tmpfs or hugetlbfs file while
	 * restricting access to a specific subset of file operations.  Seals can
	 * only be added, never removed, so mutually untrusted parties can share
	 * memory regions under a well-defined policy and a malicious peer can never
	 * perform unwanted operations on the shared object.
	 *
	 * Seals are only supported on special tmpfs or hugetlbfs files and always
	 * apply to the whole underlying inode.  Once set, a seal may prevent some
	 * kinds of access to the file.  The seals currently defined are:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
	 *
	 * Because no trust relationship between the parties is required, seals must
	 * never be removable.  Sealing therefore only adds the given set of seals and
	 * never touches existing ones.  The "setting seals" operation can itself be
	 * sealed, which prevents any further seal from being added.
	 *
	 * Sealing semantics are only defined on volatile files.  Only anonymous
	 * tmpfs and hugetlbfs files support sealing, and seals are never written to
	 * disk, so there is no plan to support other file types.
	 *
	 * Returns 0 on success; -EPERM if @file was not opened for writing or is
	 * already sealed with F_SEAL_SEAL; -EINVAL for unknown seal bits or a file
	 * that carries no seals; otherwise the error from mapping_deny_writable() or
	 * memfd_wait_for_pins().
	 */
	static int memfd_add_seals(struct file *file, unsigned int seals)
	{
		struct inode *inode = file_inode(file);
		unsigned int *file_seals;
		int error;

		if (!(file->f_mode & FMODE_WRITE))
			return -EPERM;
		if (seals & ~(unsigned int)F_ALL_SEALS)
			return -EINVAL;

		inode_lock(inode);

		file_seals = memfd_file_seals_ptr(file);
		if (!file_seals) {
			error = -EINVAL;
			goto unlock;
		}

		if (*file_seals & F_SEAL_SEAL) {
			error = -EPERM;
			goto unlock;
		}

		/* Only a newly requested write seal needs the deny + wait step. */
		if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
			error = mapping_deny_writable(file->f_mapping);
			if (error)
				goto unlock;

			error = memfd_wait_for_pins(file->f_mapping);
			if (error) {
				/* Undo the deny so the mapping is left as we found it. */
				mapping_allow_writable(file->f_mapping);
				goto unlock;
			}
		}

		/*
		 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
		 *
		 * NOTE(review): the write seals implied here are added after the
		 * deny-writable/wait-for-pins step above, so that step does not run
		 * for them -- confirm this is intended.
		 */
		if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
			seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

		*file_seals |= seals;
		error = 0;

	unlock:
		inode_unlock(inode);
		return error;
	}

	/*
	 * Return the seal bitmask currently set on @file, or -EINVAL when the file
	 * carries no seals (memfd_file_seals_ptr() returned NULL).
	 */
	static int memfd_get_seals(struct file *file)
	{
		unsigned int *cur_seals = memfd_file_seals_ptr(file);

		if (!cur_seals)
			return -EINVAL;

		return *cur_seals;
	}

	/*
	 * Dispatch the sealing fcntl commands for @file.
	 *
	 * F_ADD_SEALS adds the seals given in @arg, F_GET_SEALS returns the current
	 * seals (@arg is ignored).  Any other @cmd yields -EINVAL.
	 */
	long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
	{
		switch (cmd) {
		case F_ADD_SEALS:
			return memfd_add_seals(file, arg);
		case F_GET_SEALS:
			return memfd_get_seals(file);
		default:
			return -EINVAL;
		}
	}

	/* Every memfd name is stored with this prefix in front of the user's name. */
	#define MFD_NAME_PREFIX "memfd:"
	#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
	/* Longest user-supplied name (without terminator) that still fits NAME_MAX. */
	#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

	#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

	/*
	 * memfd_create() - create an anonymous tmpfs or hugetlbfs file and install
	 * it on a new file descriptor.
	 *
	 * @uname: NUL-terminated name in user memory.  It is stored with the
	 *         "memfd:" prefix and may be at most MFD_NAME_MAX_LEN bytes long,
	 *         not counting the terminator.
	 * @flags: MFD_* flags.  With MFD_HUGETLB, bits under
	 *         MFD_HUGE_MASK << MFD_HUGE_SHIFT are accepted as well and passed on
	 *         to hugetlb_file_setup().
	 *
	 * If the caller passes neither MFD_EXEC nor MFD_NOEXEC_SEAL, one of them is
	 * chosen here: from the memfd_noexec_scope of the caller's active pid
	 * namespace when CONFIG_SYSCTL is set, MFD_EXEC otherwise.
	 *
	 * Return: the new file descriptor, or a negative errno:
	 *   -EINVAL  unknown flag bits, MFD_EXEC together with MFD_NOEXEC_SEAL, a
	 *            memfd_noexec_scope value that selects neither flag, or a name
	 *            that is too long;
	 *   -EFAULT  the name could not be read, or changed while being copied;
	 *   -ENOMEM  the name buffer could not be allocated;
	 *   otherwise the error reported by get_unused_fd_flags() or by the file
	 *   setup helper.
	 */
	SYSCALL_DEFINE2(memfd_create,
			const char __user *, uname,
			unsigned int, flags)
	{
		char comm[TASK_COMM_LEN];
		unsigned int *file_seals;
		struct file *file;
		int fd, error;
		char *name;
		long len;

		if (!(flags & MFD_HUGETLB)) {
			if (flags & ~(unsigned int)MFD_ALL_FLAGS)
				return -EINVAL;
		} else {
			/* Allow huge page size encoding in flags. */
			if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
					(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
				return -EINVAL;
		}

		/* Invalid if both EXEC and NOEXEC_SEAL are set.*/
		if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
			return -EINVAL;

		/* Caller expressed no preference: pick the exec policy for it. */
		if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
	#ifdef CONFIG_SYSCTL
			int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
			struct pid_namespace *ns;

			ns = task_active_pid_ns(current);
			if (ns)
				sysctl = ns->memfd_noexec_scope;

			switch (sysctl) {
			case MEMFD_NOEXEC_SCOPE_EXEC:
				flags |= MFD_EXEC;
				break;
			case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
				flags |= MFD_NOEXEC_SEAL;
				break;
			default:
				pr_warn_once(
					"memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
					task_pid_nr(current), get_task_comm(comm, current));
				return -EINVAL;
			}
	#else
			flags |= MFD_EXEC;
	#endif
			pr_warn_once(
				"memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
				task_pid_nr(current), get_task_comm(comm, current));
		}

		/* length includes terminating zero */
		len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
		if (len <= 0)
			return -EFAULT;
		if (len > MFD_NAME_MAX_LEN + 1)
			return -EINVAL;

		name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
		if (!name)
			return -ENOMEM;

		strcpy(name, MFD_NAME_PREFIX);
		if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
			error = -EFAULT;
			goto err_name;
		}

		/* terminating-zero may have changed after strnlen_user() returned */
		if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
			error = -EFAULT;
			goto err_name;
		}

		/* Reserve the descriptor first; it is only published by fd_install(). */
		fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
		if (fd < 0) {
			error = fd;
			goto err_name;
		}

		if (flags & MFD_HUGETLB) {
			file = hugetlb_file_setup(name, 0, VM_NORESERVE,
						HUGETLB_ANONHUGE_INODE,
						(flags >> MFD_HUGE_SHIFT) &
						MFD_HUGE_MASK);
		} else
			file = shmem_file_setup(name, 0, VM_NORESERVE);
		if (IS_ERR(file)) {
			error = PTR_ERR(file);
			goto err_fd;
		}
		file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
		file->f_flags |= O_LARGEFILE;

		/*
		 * memfd_file_seals_ptr() returns NULL for a file that is neither shmem
		 * nor hugetlbfs, so the seal words are only touched when they exist.
		 * NOTE(review): presumably reachable when shmem_file_setup() hands
		 * back a non-shmem file (e.g. CONFIG_SHMEM=n) -- confirm.
		 */
		if (flags & MFD_NOEXEC_SEAL) {
			struct inode *inode = file_inode(file);

			inode->i_mode &= ~0111;
			file_seals = memfd_file_seals_ptr(file);
			if (file_seals) {
				/* Allow further sealing and pin the exec bits. */
				*file_seals &= ~F_SEAL_SEAL;
				*file_seals |= F_SEAL_EXEC;
			}
		} else if (flags & MFD_ALLOW_SEALING) {
			/* MFD_EXEC and MFD_ALLOW_SEALING are set */
			file_seals = memfd_file_seals_ptr(file);
			if (file_seals)
				*file_seals &= ~F_SEAL_SEAL;
		}

		fd_install(fd, file);
		kfree(name);
		return fd;

	err_fd:
		put_unused_fd(fd);
	err_name:
		kfree(name);
		return error;
	}