Blame - kernel/cpuset.c - linux

blob: 961d74044deb0b08f8ce237e4f86c700da5e768a [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* kernel/cpuset.c
				3	*
				4	* Processor and Memory placement constraints for sets of tasks.
				5	*
				6	* Copyright (C) 2003 BULL SA.
				7	* Copyright (C) 2004 Silicon Graphics, Inc.
				8	*
				9	* Portions derived from Patrick Mochel's sysfs code.
				10	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				11	* Portions Copyright (c) 2004 Silicon Graphics, Inc.
				12	*
				13	* 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
				14	* 2003-10-22 Updates by Stephen Hemminger.
				15	* 2004 May-July Rework by Paul Jackson <pj@sgi.com>
				16	*
				17	* This file is subject to the terms and conditions of the GNU General Public
				18	* License. See the file COPYING in the main directory of the Linux
				19	* distribution for more details.
				20	*/
				21
				22	#include <linux/config.h>
				23	#include <linux/cpu.h>
				24	#include <linux/cpumask.h>
				25	#include <linux/cpuset.h>
				26	#include <linux/err.h>
				27	#include <linux/errno.h>
				28	#include <linux/file.h>
				29	#include <linux/fs.h>
				30	#include <linux/init.h>
				31	#include <linux/interrupt.h>
				32	#include <linux/kernel.h>
				33	#include <linux/kmod.h>
				34	#include <linux/list.h>
				35	#include <linux/mm.h>
				36	#include <linux/module.h>
				37	#include <linux/mount.h>
				38	#include <linux/namei.h>
				39	#include <linux/pagemap.h>
				40	#include <linux/proc_fs.h>
				41	#include <linux/sched.h>
				42	#include <linux/seq_file.h>
				43	#include <linux/slab.h>
				44	#include <linux/smp_lock.h>
				45	#include <linux/spinlock.h>
				46	#include <linux/stat.h>
				47	#include <linux/string.h>
				48	#include <linux/time.h>
				49	#include <linux/backing-dev.h>
				50	#include <linux/sort.h>
				51
				52	#include <asm/uaccess.h>
				53	#include <asm/atomic.h>
				54	#include <asm/semaphore.h>
				55
				56	#define CPUSET_SUPER_MAGIC 0x27e0eb
				57
				58	struct cpuset {
				59	unsigned long flags; /* "unsigned long" so bitops work */
				60	cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
				61	nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
				62
				63	atomic_t count; /* count tasks using this cpuset */
				64
				65	/*
				66	* We link our 'sibling' struct into our parents 'children'.
				67	* Our children link their 'sibling' into our 'children'.
				68	*/
				69	struct list_head sibling; /* my parents children */
				70	struct list_head children; /* my children */
				71
				72	struct cpuset parent; / my parent */
				73	struct dentry dentry; / cpuset fs entry */
				74
				75	/*
				76	* Copy of global cpuset_mems_generation as of the most
				77	* recent time this cpuset changed its mems_allowed.
				78	*/
				79	int mems_generation;
				80	};
				81
				82	/* bits in struct cpuset flags field */
				83	typedef enum {
				84	CS_CPU_EXCLUSIVE,
				85	CS_MEM_EXCLUSIVE,
				86	CS_REMOVED,
				87	CS_NOTIFY_ON_RELEASE
				88	} cpuset_flagbits_t;
				89
				90	/* convenient tests for these bits */
				91	static inline int is_cpu_exclusive(const struct cpuset *cs)
				92	{
				93	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
				94	}
				95
				96	static inline int is_mem_exclusive(const struct cpuset *cs)
				97	{
				98	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
				99	}
				100
				101	static inline int is_removed(const struct cpuset *cs)
				102	{
				103	return !!test_bit(CS_REMOVED, &cs->flags);
				104	}
				105
				106	static inline int notify_on_release(const struct cpuset *cs)
				107	{
				108	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
				109	}
				110
				111	/*
				112	* Increment this atomic integer every time any cpuset changes its
				113	* mems_allowed value. Users of cpusets can track this generation
				114	* number, and avoid having to lock and reload mems_allowed unless
				115	* the cpuset they're using changes generation.
				116	*
				117	* A single, global generation is needed because attach_task() could
				118	* reattach a task to a different cpuset, which must not have its
				119	* generation numbers aliased with those of that task's previous cpuset.
				120	*
				121	* Generations are needed for mems_allowed because one task cannot
				122	* modify another's memory placement. So we must enable every task,
				123	* on every visit to __alloc_pages(), to efficiently check whether
				124	* its current->cpuset->mems_allowed has changed, requiring an update
				125	* of its current->mems_allowed.
				126	*/
				127	static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
				128
				129	static struct cpuset top_cpuset = {
				130	.flags = ((1 << CS_CPU_EXCLUSIVE) \| (1 << CS_MEM_EXCLUSIVE)),
				131	.cpus_allowed = CPU_MASK_ALL,
				132	.mems_allowed = NODE_MASK_ALL,
				133	.count = ATOMIC_INIT(0),
				134	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
				135	.children = LIST_HEAD_INIT(top_cpuset.children),
				136	.parent = NULL,
				137	.dentry = NULL,
				138	.mems_generation = 0,
				139	};
				140
				141	static struct vfsmount *cpuset_mount;
				142	static struct super_block *cpuset_sb = NULL;
				143
				144	/*
				145	* cpuset_sem should be held by anyone who is depending on the children
				146	* or sibling lists of any cpuset, or performing non-atomic operations
				147	* on the flags or *_allowed values of a cpuset, such as raising the
				148	* CS_REMOVED flag bit iff it is not already raised, or reading and
				149	* conditionally modifying the *_allowed values. One kernel global
				150	* cpuset semaphore should be sufficient - these things don't change
				151	* that much.
				152	*
				153	* The code that modifies cpusets holds cpuset_sem across the entire
				154	* operation, from cpuset_common_file_write() down, single threading
				155	* all cpuset modifications (except for counter manipulations from
				156	* fork and exit) across the system. This presumes that cpuset
				157	* modifications are rare - better kept simple and safe, even if slow.
				158	*
				159	* The code that reads cpusets, such as in cpuset_common_file_read()
				160	* and below, only holds cpuset_sem across small pieces of code, such
				161	* as when reading out possibly multi-word cpumasks and nodemasks, as
				162	* the risks are less, and the desire for performance a little greater.
				163	* The proc_cpuset_show() routine needs to hold cpuset_sem to ensure
				164	* that no cs->dentry is NULL, as it walks up the cpuset tree to root.
				165	*
				166	* The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
				167	* (usually) grab cpuset_sem. These are the two most performance
				168	* critical pieces of code here. The exception occurs on exit(),
				169	* if the last task using a cpuset exits, and the cpuset was marked
				170	* notify_on_release. In that case, the cpuset_sem is taken, the
				171	* path to the released cpuset calculated, and a usermode call made
				172	* to /sbin/cpuset_release_agent with the name of the cpuset (path
				173	* relative to the root of cpuset file system) as the argument.
				174	*
				175	* A cpuset can only be deleted if both its 'count' of using tasks is
				176	* zero, and its list of 'children' cpusets is empty. Since all tasks
				177	* in the system use _some_ cpuset, and since there is always at least
				178	* one task in the system (init, pid == 1), therefore, top_cpuset
				179	* always has either children cpusets and/or using tasks. So no need
				180	* for any special hack to ensure that top_cpuset cannot be deleted.
				181	*/
				182
				183	static DECLARE_MUTEX(cpuset_sem);
				184
				185	/*
				186	* A couple of forward declarations required, due to cyclic reference loop:
				187	* cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
				188	* -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
				189	*/
				190
				191	static int cpuset_mkdir(struct inode dir, struct dentry dentry, int mode);
				192	static int cpuset_rmdir(struct inode unused_dir, struct dentry dentry);
				193
				194	static struct backing_dev_info cpuset_backing_dev_info = {
				195	.ra_pages = 0, /* No readahead */
				196	.capabilities = BDI_CAP_NO_ACCT_DIRTY \| BDI_CAP_NO_WRITEBACK,
				197	};
				198
				199	static struct inode *cpuset_new_inode(mode_t mode)
				200	{
				201	struct inode *inode = new_inode(cpuset_sb);
				202
				203	if (inode) {
				204	inode->i_mode = mode;
				205	inode->i_uid = current->fsuid;
				206	inode->i_gid = current->fsgid;
				207	inode->i_blksize = PAGE_CACHE_SIZE;
				208	inode->i_blocks = 0;
				209	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				210	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
				211	}
				212	return inode;
				213	}
				214
				215	static void cpuset_diput(struct dentry dentry, struct inode inode)
				216	{
				217	/* is dentry a directory ? if so, kfree() associated cpuset */
				218	if (S_ISDIR(inode->i_mode)) {
				219	struct cpuset *cs = dentry->d_fsdata;
				220	BUG_ON(!(is_removed(cs)));
				221	kfree(cs);
				222	}
				223	iput(inode);
				224	}
				225
				226	static struct dentry_operations cpuset_dops = {
				227	.d_iput = cpuset_diput,
				228	};
				229
				230	static struct dentry cpuset_get_dentry(struct dentry parent, const char *name)
				231	{
				232	struct qstr qstr;
				233	struct dentry *d;
				234
				235	qstr.name = name;
				236	qstr.len = strlen(name);
				237	qstr.hash = full_name_hash(name, qstr.len);
				238	d = lookup_hash(&qstr, parent);
				239	if (!IS_ERR(d))
				240	d->d_op = &cpuset_dops;
				241	return d;
				242	}
				243
				244	static void remove_dir(struct dentry *d)
				245	{
				246	struct dentry *parent = dget(d->d_parent);
				247
				248	d_delete(d);
				249	simple_rmdir(parent->d_inode, d);
				250	dput(parent);
				251	}
				252
				253	/*
				254	* NOTE : the dentry must have been dget()'ed
				255	*/
				256	static void cpuset_d_remove_dir(struct dentry *dentry)
				257	{
				258	struct list_head *node;
				259
				260	spin_lock(&dcache_lock);
				261	node = dentry->d_subdirs.next;
				262	while (node != &dentry->d_subdirs) {
				263	struct dentry *d = list_entry(node, struct dentry, d_child);
				264	list_del_init(node);
				265	if (d->d_inode) {
				266	d = dget_locked(d);
				267	spin_unlock(&dcache_lock);
				268	d_delete(d);
				269	simple_unlink(dentry->d_inode, d);
				270	dput(d);
				271	spin_lock(&dcache_lock);
				272	}
				273	node = dentry->d_subdirs.next;
				274	}
				275	list_del_init(&dentry->d_child);
				276	spin_unlock(&dcache_lock);
				277	remove_dir(dentry);
				278	}
				279
				280	static struct super_operations cpuset_ops = {
				281	.statfs = simple_statfs,
				282	.drop_inode = generic_delete_inode,
				283	};
				284
				285	static int cpuset_fill_super(struct super_block sb, void unused_data,
				286	int unused_silent)
				287	{
				288	struct inode *inode;
				289	struct dentry *root;
				290
				291	sb->s_blocksize = PAGE_CACHE_SIZE;
				292	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
				293	sb->s_magic = CPUSET_SUPER_MAGIC;
				294	sb->s_op = &cpuset_ops;
				295	cpuset_sb = sb;
				296
				297	inode = cpuset_new_inode(S_IFDIR \| S_IRUGO \| S_IXUGO \| S_IWUSR);
				298	if (inode) {
				299	inode->i_op = &simple_dir_inode_operations;
				300	inode->i_fop = &simple_dir_operations;
				301	/* directories start off with i_nlink == 2 (for "." entry) */
				302	inode->i_nlink++;
				303	} else {
				304	return -ENOMEM;
				305	}
				306
				307	root = d_alloc_root(inode);
				308	if (!root) {
				309	iput(inode);
				310	return -ENOMEM;
				311	}
				312	sb->s_root = root;
				313	return 0;
				314	}
				315
				316	static struct super_block cpuset_get_sb(struct file_system_type fs_type,
				317	int flags, const char *unused_dev_name,
				318	void *data)
				319	{
				320	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
				321	}
				322
				323	static struct file_system_type cpuset_fs_type = {
				324	.name = "cpuset",
				325	.get_sb = cpuset_get_sb,
				326	.kill_sb = kill_litter_super,
				327	};
				328
				329	/* struct cftype:
				330	*
				331	* The files in the cpuset filesystem mostly have a very simple read/write
				332	* handling, some common function will take care of it. Nevertheless some cases
				333	* (read tasks) are special and therefore I define this structure for every
				334	* kind of file.
				335	*
				336	*
				337	* When reading/writing to a file:
				338	* - the cpuset to use in file->f_dentry->d_parent->d_fsdata
				339	* - the 'cftype' of the file is file->f_dentry->d_fsdata
				340	*/
				341
				342	struct cftype {
				343	char *name;
				344	int private;
				345	int (open) (struct inode inode, struct file *file);
				346	ssize_t (read) (struct file file, char __user *buf, size_t nbytes,
				347	loff_t *ppos);
				348	int (write) (struct file file, const char __user *buf, size_t nbytes,
				349	loff_t *ppos);
				350	int (release) (struct inode inode, struct file *file);
				351	};
				352
				353	static inline struct cpuset __d_cs(struct dentry dentry)
				354	{
				355	return dentry->d_fsdata;
				356	}
				357
				358	static inline struct cftype __d_cft(struct dentry dentry)
				359	{
				360	return dentry->d_fsdata;
				361	}
				362
				363	/*
				364	* Call with cpuset_sem held. Writes path of cpuset into buf.
				365	* Returns 0 on success, -errno on error.
				366	*/
				367
				368	static int cpuset_path(const struct cpuset cs, char buf, int buflen)
				369	{
				370	char *start;
				371
				372	start = buf + buflen;
				373
				374	*--start = '\0';
				375	for (;;) {
				376	int len = cs->dentry->d_name.len;
				377	if ((start -= len) < buf)
				378	return -ENAMETOOLONG;
				379	memcpy(start, cs->dentry->d_name.name, len);
				380	cs = cs->parent;
				381	if (!cs)
				382	break;
				383	if (!cs->parent)
				384	continue;
				385	if (--start < buf)
				386	return -ENAMETOOLONG;
				387	*start = '/';
				388	}
				389	memmove(buf, start, buf + buflen - start);
				390	return 0;
				391	}
				392
				393	/*
				394	* Notify userspace when a cpuset is released, by running
				395	* /sbin/cpuset_release_agent with the name of the cpuset (path
				396	* relative to the root of cpuset file system) as the argument.
				397	*
				398	* Most likely, this user command will try to rmdir this cpuset.
				399	*
				400	* This races with the possibility that some other task will be
				401	* attached to this cpuset before it is removed, or that some other
				402	* user task will 'mkdir' a child cpuset of this cpuset. That's ok.
				403	* The presumed 'rmdir' will fail quietly if this cpuset is no longer
				404	* unused, and this cpuset will be reprieved from its death sentence,
				405	* to continue to serve a useful existence. Next time it's released,
				406	* we will get notified again, if it still has 'notify_on_release' set.
				407	*
				408	* Note final arg to call_usermodehelper() is 0 - that means
				409	* don't wait. Since we are holding the global cpuset_sem here,
				410	* and we are asking another thread (started from keventd) to rmdir a
				411	* cpuset, we can't wait - or we'd deadlock with the removing thread
				412	* on cpuset_sem.
				413	*/
				414
				415	static int cpuset_release_agent(char *cpuset_str)
				416	{
				417	char argv[3], envp[3];
				418	int i;
				419
				420	i = 0;
				421	argv[i++] = "/sbin/cpuset_release_agent";
				422	argv[i++] = cpuset_str;
				423	argv[i] = NULL;
				424
				425	i = 0;
				426	/* minimal command environment */
				427	envp[i++] = "HOME=/";
				428	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				429	envp[i] = NULL;
				430
				431	return call_usermodehelper(argv[0], argv, envp, 0);
				432	}
				433
				434	/*
				435	* Either cs->count of using tasks transitioned to zero, or the
				436	* cs->children list of child cpusets just became empty. If this
				437	* cs is notify_on_release() and now both the user count is zero and
				438	* the list of children is empty, send notice to user land.
				439	*/
				440
				441	static void check_for_release(struct cpuset *cs)
				442	{
				443	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
				444	list_empty(&cs->children)) {
				445	char *buf;
				446
				447	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				448	if (!buf)
				449	return;
				450	if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
				451	goto out;
				452	cpuset_release_agent(buf);
				453	out:
				454	kfree(buf);
				455	}
				456	}
				457
				458	/*
				459	* Return in *pmask the portion of a cpusets's cpus_allowed that
				460	* are online. If none are online, walk up the cpuset hierarchy
				461	* until we find one that does have some online cpus. If we get
				462	* all the way to the top and still haven't found any online cpus,
				463	* return cpu_online_map. Or if passed a NULL cs from an exit'ing
				464	* task, return cpu_online_map.
				465	*
				466	* One way or another, we guarantee to return some non-empty subset
				467	* of cpu_online_map.
				468	*
				469	* Call with cpuset_sem held.
				470	*/
				471
				472	static void guarantee_online_cpus(const struct cpuset cs, cpumask_t pmask)
				473	{
				474	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
				475	cs = cs->parent;
				476	if (cs)
				477	cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
				478	else
				479	*pmask = cpu_online_map;
				480	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
				481	}
				482
				483	/*
				484	* Return in *pmask the portion of a cpusets's mems_allowed that
				485	* are online. If none are online, walk up the cpuset hierarchy
				486	* until we find one that does have some online mems. If we get
				487	* all the way to the top and still haven't found any online mems,
				488	* return node_online_map.
				489	*
				490	* One way or another, we guarantee to return some non-empty subset
				491	* of node_online_map.
				492	*
				493	* Call with cpuset_sem held.
				494	*/
				495
				496	static void guarantee_online_mems(const struct cpuset cs, nodemask_t pmask)
				497	{
				498	while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
				499	cs = cs->parent;
				500	if (cs)
				501	nodes_and(*pmask, cs->mems_allowed, node_online_map);
				502	else
				503	*pmask = node_online_map;
				504	BUG_ON(!nodes_intersects(*pmask, node_online_map));
				505	}
				506
				507	/*
				508	* Refresh current tasks mems_allowed and mems_generation from
				509	* current tasks cpuset. Call with cpuset_sem held.
				510	*
				511	* Be sure to call refresh_mems() on any cpuset operation which
				512	* (1) holds cpuset_sem, and (2) might possibly alloc memory.
				513	* Call after obtaining cpuset_sem lock, before any possible
				514	* allocation. Otherwise one risks trying to allocate memory
				515	* while the task cpuset_mems_generation is not the same as
				516	* the mems_generation in its cpuset, which would deadlock on
				517	* cpuset_sem in cpuset_update_current_mems_allowed().
				518	*
				519	* Since we hold cpuset_sem, once refresh_mems() is called, the
				520	* test (current->cpuset_mems_generation != cs->mems_generation)
				521	* in cpuset_update_current_mems_allowed() will remain false,
				522	* until we drop cpuset_sem. Anyone else who would change our
				523	* cpusets mems_generation needs to lock cpuset_sem first.
				524	*/
				525
				526	static void refresh_mems(void)
				527	{
				528	struct cpuset *cs = current->cpuset;
				529
				530	if (current->cpuset_mems_generation != cs->mems_generation) {
				531	guarantee_online_mems(cs, &current->mems_allowed);
				532	current->cpuset_mems_generation = cs->mems_generation;
				533	}
				534	}
				535
				536	/*
				537	* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
				538	*
				539	* One cpuset is a subset of another if all its allowed CPUs and
				540	* Memory Nodes are a subset of the other, and its exclusive flags
				541	* are only set if the other's are set.
				542	*/
				543
				544	static int is_cpuset_subset(const struct cpuset p, const struct cpuset q)
				545	{
				546	return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
				547	nodes_subset(p->mems_allowed, q->mems_allowed) &&
				548	is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
				549	is_mem_exclusive(p) <= is_mem_exclusive(q);
				550	}
				551
				552	/*
				553	* validate_change() - Used to validate that any proposed cpuset change
				554	* follows the structural rules for cpusets.
				555	*
				556	* If we replaced the flag and mask values of the current cpuset
				557	* (cur) with those values in the trial cpuset (trial), would
				558	* our various subset and exclusive rules still be valid? Presumes
				559	* cpuset_sem held.
				560	*
				561	* 'cur' is the address of an actual, in-use cpuset. Operations
				562	* such as list traversal that depend on the actual address of the
				563	* cpuset in the list must use cur below, not trial.
				564	*
				565	* 'trial' is the address of bulk structure copy of cur, with
				566	* perhaps one or more of the fields cpus_allowed, mems_allowed,
				567	* or flags changed to new, trial values.
				568	*
				569	* Return 0 if valid, -errno if not.
				570	*/
				571
				572	static int validate_change(const struct cpuset cur, const struct cpuset trial)
				573	{
				574	struct cpuset c, par;
				575
				576	/* Each of our child cpusets must be a subset of us */
				577	list_for_each_entry(c, &cur->children, sibling) {
				578	if (!is_cpuset_subset(c, trial))
				579	return -EBUSY;
				580	}
				581
				582	/* Remaining checks don't apply to root cpuset */
				583	if ((par = cur->parent) == NULL)
				584	return 0;
				585
				586	/* We must be a subset of our parent cpuset */
				587	if (!is_cpuset_subset(trial, par))
				588	return -EACCES;
				589
				590	/* If either I or some sibling (!= me) is exclusive, we can't overlap */
				591	list_for_each_entry(c, &par->children, sibling) {
				592	if ((is_cpu_exclusive(trial) \|\| is_cpu_exclusive(c)) &&
				593	c != cur &&
				594	cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
				595	return -EINVAL;
				596	if ((is_mem_exclusive(trial) \|\| is_mem_exclusive(c)) &&
				597	c != cur &&
				598	nodes_intersects(trial->mems_allowed, c->mems_allowed))
				599	return -EINVAL;
				600	}
				601
				602	return 0;
				603	}
				604
				605	static int update_cpumask(struct cpuset cs, char buf)
				606	{
				607	struct cpuset trialcs;
				608	int retval;
				609
				610	trialcs = *cs;
				611	retval = cpulist_parse(buf, trialcs.cpus_allowed);
				612	if (retval < 0)
				613	return retval;
				614	cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
				615	if (cpus_empty(trialcs.cpus_allowed))
				616	return -ENOSPC;
				617	retval = validate_change(cs, &trialcs);
				618	if (retval == 0)
				619	cs->cpus_allowed = trialcs.cpus_allowed;
				620	return retval;
				621	}
				622
				623	static int update_nodemask(struct cpuset cs, char buf)
				624	{
				625	struct cpuset trialcs;
				626	int retval;
				627
				628	trialcs = *cs;
				629	retval = nodelist_parse(buf, trialcs.mems_allowed);
				630	if (retval < 0)
				631	return retval;
				632	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
				633	if (nodes_empty(trialcs.mems_allowed))
				634	return -ENOSPC;
				635	retval = validate_change(cs, &trialcs);
				636	if (retval == 0) {
				637	cs->mems_allowed = trialcs.mems_allowed;
				638	atomic_inc(&cpuset_mems_generation);
				639	cs->mems_generation = atomic_read(&cpuset_mems_generation);
				640	}
				641	return retval;
				642	}
				643
				644	/*
				645	* update_flag - read a 0 or a 1 in a file and update associated flag
				646	* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
				647	* CS_NOTIFY_ON_RELEASE)
				648	* cs: the cpuset to update
				649	* buf: the buffer where we read the 0 or 1
				650	*/
				651
				652	static int update_flag(cpuset_flagbits_t bit, struct cpuset cs, char buf)
				653	{
				654	int turning_on;
				655	struct cpuset trialcs;
				656	int err;
				657
				658	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
				659
				660	trialcs = *cs;
				661	if (turning_on)
				662	set_bit(bit, &trialcs.flags);
				663	else
				664	clear_bit(bit, &trialcs.flags);
				665
				666	err = validate_change(cs, &trialcs);
				667	if (err == 0) {
				668	if (turning_on)
				669	set_bit(bit, &cs->flags);
				670	else
				671	clear_bit(bit, &cs->flags);
				672	}
				673	return err;
				674	}
				675
				676	static int attach_task(struct cpuset cs, char buf)
				677	{
				678	pid_t pid;
				679	struct task_struct *tsk;
				680	struct cpuset *oldcs;
				681	cpumask_t cpus;
				682
				683	if (sscanf(buf, "%d", &pid) != 1)
				684	return -EIO;
				685	if (cpus_empty(cs->cpus_allowed) \|\| nodes_empty(cs->mems_allowed))
				686	return -ENOSPC;
				687
				688	if (pid) {
				689	read_lock(&tasklist_lock);
				690
				691	tsk = find_task_by_pid(pid);
				692	if (!tsk) {
				693	read_unlock(&tasklist_lock);
				694	return -ESRCH;
				695	}
				696
				697	get_task_struct(tsk);
				698	read_unlock(&tasklist_lock);
				699
				700	if ((current->euid) && (current->euid != tsk->uid)
				701	&& (current->euid != tsk->suid)) {
				702	put_task_struct(tsk);
				703	return -EACCES;
				704	}
				705	} else {
				706	tsk = current;
				707	get_task_struct(tsk);
				708	}
				709
				710	task_lock(tsk);
				711	oldcs = tsk->cpuset;
				712	if (!oldcs) {
				713	task_unlock(tsk);
				714	put_task_struct(tsk);
				715	return -ESRCH;
				716	}
				717	atomic_inc(&cs->count);
				718	tsk->cpuset = cs;
				719	task_unlock(tsk);
				720
				721	guarantee_online_cpus(cs, &cpus);
				722	set_cpus_allowed(tsk, cpus);
				723
				724	put_task_struct(tsk);
				725	if (atomic_dec_and_test(&oldcs->count))
				726	check_for_release(oldcs);
				727	return 0;
				728	}
				729
				730	/* The various types of files and directories in a cpuset file system */
				731
				732	typedef enum {
				733	FILE_ROOT,
				734	FILE_DIR,
				735	FILE_CPULIST,
				736	FILE_MEMLIST,
				737	FILE_CPU_EXCLUSIVE,
				738	FILE_MEM_EXCLUSIVE,
				739	FILE_NOTIFY_ON_RELEASE,
				740	FILE_TASKLIST,
				741	} cpuset_filetype_t;
				742
				743	static ssize_t cpuset_common_file_write(struct file file, const char __user userbuf,
				744	size_t nbytes, loff_t *unused_ppos)
				745	{
				746	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				747	struct cftype *cft = __d_cft(file->f_dentry);
				748	cpuset_filetype_t type = cft->private;
				749	char *buffer;
				750	int retval = 0;
				751
				752	/* Crude upper limit on largest legitimate cpulist user might write. */
				753	if (nbytes > 100 + 6 * NR_CPUS)
				754	return -E2BIG;
				755
				756	/* +1 for nul-terminator */
				757	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
				758	return -ENOMEM;
				759
				760	if (copy_from_user(buffer, userbuf, nbytes)) {
				761	retval = -EFAULT;
				762	goto out1;
				763	}
				764	buffer[nbytes] = 0; /* nul-terminate */
				765
				766	down(&cpuset_sem);
				767
				768	if (is_removed(cs)) {
				769	retval = -ENODEV;
				770	goto out2;
				771	}
				772
				773	switch (type) {
				774	case FILE_CPULIST:
				775	retval = update_cpumask(cs, buffer);
				776	break;
				777	case FILE_MEMLIST:
				778	retval = update_nodemask(cs, buffer);
				779	break;
				780	case FILE_CPU_EXCLUSIVE:
				781	retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
				782	break;
				783	case FILE_MEM_EXCLUSIVE:
				784	retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
				785	break;
				786	case FILE_NOTIFY_ON_RELEASE:
				787	retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
				788	break;
				789	case FILE_TASKLIST:
				790	retval = attach_task(cs, buffer);
				791	break;
				792	default:
				793	retval = -EINVAL;
				794	goto out2;
				795	}
				796
				797	if (retval == 0)
				798	retval = nbytes;
				799	out2:
				800	up(&cpuset_sem);
				801	out1:
				802	kfree(buffer);
				803	return retval;
				804	}
				805
				806	static ssize_t cpuset_file_write(struct file file, const char __user buf,
				807	size_t nbytes, loff_t *ppos)
				808	{
				809	ssize_t retval = 0;
				810	struct cftype *cft = __d_cft(file->f_dentry);
				811	if (!cft)
				812	return -ENODEV;
				813
				814	/* special function ? */
				815	if (cft->write)
				816	retval = cft->write(file, buf, nbytes, ppos);
				817	else
				818	retval = cpuset_common_file_write(file, buf, nbytes, ppos);
				819
				820	return retval;
				821	}
				822
				823	/*
				824	* These ascii lists should be read in a single call, by using a user
				825	* buffer large enough to hold the entire map. If read in smaller
				826	* chunks, there is no guarantee of atomicity. Since the display format
				827	* used, list of ranges of sequential numbers, is variable length,
				828	* and since these maps can change value dynamically, one could read
				829	* gibberish by doing partial reads while a list was changing.
				830	* A single large read to a buffer that crosses a page boundary is
				831	* ok, because the result being copied to user land is not recomputed
				832	* across a page fault.
				833	*/
				834
				835	static int cpuset_sprintf_cpulist(char page, struct cpuset cs)
				836	{
				837	cpumask_t mask;
				838
				839	down(&cpuset_sem);
				840	mask = cs->cpus_allowed;
				841	up(&cpuset_sem);
				842
				843	return cpulist_scnprintf(page, PAGE_SIZE, mask);
				844	}
				845
				846	static int cpuset_sprintf_memlist(char page, struct cpuset cs)
				847	{
				848	nodemask_t mask;
				849
				850	down(&cpuset_sem);
				851	mask = cs->mems_allowed;
				852	up(&cpuset_sem);
				853
				854	return nodelist_scnprintf(page, PAGE_SIZE, mask);
				855	}
				856
				857	static ssize_t cpuset_common_file_read(struct file file, char __user buf,
				858	size_t nbytes, loff_t *ppos)
				859	{
				860	struct cftype *cft = __d_cft(file->f_dentry);
				861	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				862	cpuset_filetype_t type = cft->private;
				863	char *page;
				864	ssize_t retval = 0;
				865	char *s;
				866	char *start;
				867	size_t n;
				868
				869	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
				870	return -ENOMEM;
				871
				872	s = page;
				873
				874	switch (type) {
				875	case FILE_CPULIST:
				876	s += cpuset_sprintf_cpulist(s, cs);
				877	break;
				878	case FILE_MEMLIST:
				879	s += cpuset_sprintf_memlist(s, cs);
				880	break;
				881	case FILE_CPU_EXCLUSIVE:
				882	*s++ = is_cpu_exclusive(cs) ? '1' : '0';
				883	break;
				884	case FILE_MEM_EXCLUSIVE:
				885	*s++ = is_mem_exclusive(cs) ? '1' : '0';
				886	break;
				887	case FILE_NOTIFY_ON_RELEASE:
				888	*s++ = notify_on_release(cs) ? '1' : '0';
				889	break;
				890	default:
				891	retval = -EINVAL;
				892	goto out;
				893	}
				894	*s++ = '\n';
				895	*s = '\0';
				896
				897	start = page + *ppos;
				898	n = s - start;
				899	retval = n - copy_to_user(buf, start, min(n, nbytes));
				900	*ppos += retval;
				901	out:
				902	free_page((unsigned long)page);
				903	return retval;
				904	}
				905
				906	static ssize_t cpuset_file_read(struct file file, char __user buf, size_t nbytes,
				907	loff_t *ppos)
				908	{
				909	ssize_t retval = 0;
				910	struct cftype *cft = __d_cft(file->f_dentry);
				911	if (!cft)
				912	return -ENODEV;
				913
				914	/* special function ? */
				915	if (cft->read)
				916	retval = cft->read(file, buf, nbytes, ppos);
				917	else
				918	retval = cpuset_common_file_read(file, buf, nbytes, ppos);
				919
				920	return retval;
				921	}
				922
				923	static int cpuset_file_open(struct inode inode, struct file file)
				924	{
				925	int err;
				926	struct cftype *cft;
				927
				928	err = generic_file_open(inode, file);
				929	if (err)
				930	return err;
				931
				932	cft = __d_cft(file->f_dentry);
				933	if (!cft)
				934	return -ENODEV;
				935	if (cft->open)
				936	err = cft->open(inode, file);
				937	else
				938	err = 0;
				939
				940	return err;
				941	}
				942
				943	static int cpuset_file_release(struct inode inode, struct file file)
				944	{
				945	struct cftype *cft = __d_cft(file->f_dentry);
				946	if (cft->release)
				947	return cft->release(inode, file);
				948	return 0;
				949	}
				950
				951	static struct file_operations cpuset_file_operations = {
				952	.read = cpuset_file_read,
				953	.write = cpuset_file_write,
				954	.llseek = generic_file_llseek,
				955	.open = cpuset_file_open,
				956	.release = cpuset_file_release,
				957	};
				958
				959	static struct inode_operations cpuset_dir_inode_operations = {
				960	.lookup = simple_lookup,
				961	.mkdir = cpuset_mkdir,
				962	.rmdir = cpuset_rmdir,
				963	};
				964
				965	static int cpuset_create_file(struct dentry *dentry, int mode)
				966	{
				967	struct inode *inode;
				968
				969	if (!dentry)
				970	return -ENOENT;
				971	if (dentry->d_inode)
				972	return -EEXIST;
				973
				974	inode = cpuset_new_inode(mode);
				975	if (!inode)
				976	return -ENOMEM;
				977
				978	if (S_ISDIR(mode)) {
				979	inode->i_op = &cpuset_dir_inode_operations;
				980	inode->i_fop = &simple_dir_operations;
				981
				982	/* start off with i_nlink == 2 (for "." entry) */
				983	inode->i_nlink++;
				984	} else if (S_ISREG(mode)) {
				985	inode->i_size = 0;
				986	inode->i_fop = &cpuset_file_operations;
				987	}
				988
				989	d_instantiate(dentry, inode);
				990	dget(dentry); /* Extra count - pin the dentry in core */
				991	return 0;
				992	}
				993
				994	/*
				995	* cpuset_create_dir - create a directory for an object.
				996	* cs: the cpuset we create the directory for.
				997	* It must have a valid ->parent field
				998	* And we are going to fill its ->dentry field.
				999	* name: The name to give to the cpuset directory. Will be copied.
				1000	* mode: mode to set on new directory.
				1001	*/
				1002
				1003	static int cpuset_create_dir(struct cpuset cs, const char name, int mode)
				1004	{
				1005	struct dentry *dentry = NULL;
				1006	struct dentry *parent;
				1007	int error = 0;
				1008
				1009	parent = cs->parent->dentry;
				1010	dentry = cpuset_get_dentry(parent, name);
				1011	if (IS_ERR(dentry))
				1012	return PTR_ERR(dentry);
				1013	error = cpuset_create_file(dentry, S_IFDIR \| mode);
				1014	if (!error) {
				1015	dentry->d_fsdata = cs;
				1016	parent->d_inode->i_nlink++;
				1017	cs->dentry = dentry;
				1018	}
				1019	dput(dentry);
				1020
				1021	return error;
				1022	}
				1023
				1024	static int cpuset_add_file(struct dentry dir, const struct cftype cft)
				1025	{
				1026	struct dentry *dentry;
				1027	int error;
				1028
				1029	down(&dir->d_inode->i_sem);
				1030	dentry = cpuset_get_dentry(dir, cft->name);
				1031	if (!IS_ERR(dentry)) {
				1032	error = cpuset_create_file(dentry, 0644 \| S_IFREG);
				1033	if (!error)
				1034	dentry->d_fsdata = (void *)cft;
				1035	dput(dentry);
				1036	} else
				1037	error = PTR_ERR(dentry);
				1038	up(&dir->d_inode->i_sem);
				1039	return error;
				1040	}
				1041
				1042	/*
				1043	* Stuff for reading the 'tasks' file.
				1044	*
				1045	* Reading this file can return large amounts of data if a cpuset has
				1046	* lots of attached tasks. So it may need several calls to read(),
				1047	* but we cannot guarantee that the information we produce is correct
				1048	* unless we produce it entirely atomically.
				1049	*
				1050	* Upon tasks file open(), a struct ctr_struct is allocated, that
				1051	* will have a pointer to an array (also allocated here). The struct
				1052	* ctr_struct * is stored in file->private_data. Its resources will
				1053	* be freed by release() when the file is closed. The array is used
				1054	* to sprintf the PIDs and then used by read().
				1055	*/
				1056
				1057	/* cpusets_tasks_read array */
				1058
				1059	struct ctr_struct {
				1060	char *buf;
				1061	int bufsz;
				1062	};
				1063
				1064	/*
				1065	* Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
				1066	* Return actual number of pids loaded.
				1067	*/
				1068	static inline int pid_array_load(pid_t pidarray, int npids, struct cpuset cs)
				1069	{
				1070	int n = 0;
				1071	struct task_struct g, p;
				1072
				1073	read_lock(&tasklist_lock);
				1074
				1075	do_each_thread(g, p) {
				1076	if (p->cpuset == cs) {
				1077	pidarray[n++] = p->pid;
				1078	if (unlikely(n == npids))
				1079	goto array_full;
				1080	}
				1081	} while_each_thread(g, p);
				1082
				1083	array_full:
				1084	read_unlock(&tasklist_lock);
				1085	return n;
				1086	}
				1087
				1088	static int cmppid(const void a, const void b)
				1089	{
				1090	return (pid_t )a - (pid_t )b;
				1091	}
				1092
				1093	/*
				1094	* Convert array 'a' of 'npids' pid_t's to a string of newline separated
				1095	* decimal pids in 'buf'. Don't write more than 'sz' chars, but return
				1096	* count 'cnt' of how many chars would be written if buf were large enough.
				1097	*/
				1098	static int pid_array_to_buf(char buf, int sz, pid_t a, int npids)
				1099	{
				1100	int cnt = 0;
				1101	int i;
				1102
				1103	for (i = 0; i < npids; i++)
				1104	cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
				1105	return cnt;
				1106	}
				1107
				1108	static int cpuset_tasks_open(struct inode unused, struct file file)
				1109	{
				1110	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
				1111	struct ctr_struct *ctr;
				1112	pid_t *pidarray;
				1113	int npids;
				1114	char c;
				1115
				1116	if (!(file->f_mode & FMODE_READ))
				1117	return 0;
				1118
				1119	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
				1120	if (!ctr)
				1121	goto err0;
				1122
				1123	/*
				1124	* If cpuset gets more users after we read count, we won't have
				1125	* enough space - tough. This race is indistinguishable to the
				1126	* caller from the case that the additional cpuset users didn't
				1127	* show up until sometime later on.
				1128	*/
				1129	npids = atomic_read(&cs->count);
				1130	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
				1131	if (!pidarray)
				1132	goto err1;
				1133
				1134	npids = pid_array_load(pidarray, npids, cs);
				1135	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
				1136
				1137	/* Call pid_array_to_buf() twice, first just to get bufsz */
				1138	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
				1139	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
				1140	if (!ctr->buf)
				1141	goto err2;
				1142	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
				1143
				1144	kfree(pidarray);
				1145	file->private_data = ctr;
				1146	return 0;
				1147
				1148	err2:
				1149	kfree(pidarray);
				1150	err1:
				1151	kfree(ctr);
				1152	err0:
				1153	return -ENOMEM;
				1154	}
				1155
				1156	static ssize_t cpuset_tasks_read(struct file file, char __user buf,
				1157	size_t nbytes, loff_t *ppos)
				1158	{
				1159	struct ctr_struct *ctr = file->private_data;
				1160
				1161	if (*ppos + nbytes > ctr->bufsz)
				1162	nbytes = ctr->bufsz - *ppos;
				1163	if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
				1164	return -EFAULT;
				1165	*ppos += nbytes;
				1166	return nbytes;
				1167	}
				1168
				1169	static int cpuset_tasks_release(struct inode unused_inode, struct file file)
				1170	{
				1171	struct ctr_struct *ctr;
				1172
				1173	if (file->f_mode & FMODE_READ) {
				1174	ctr = file->private_data;
				1175	kfree(ctr->buf);
				1176	kfree(ctr);
				1177	}
				1178	return 0;
				1179	}
				1180
				1181	/*
				1182	* for the common functions, 'private' gives the type of file
				1183	*/
				1184
				1185	static struct cftype cft_tasks = {
				1186	.name = "tasks",
				1187	.open = cpuset_tasks_open,
				1188	.read = cpuset_tasks_read,
				1189	.release = cpuset_tasks_release,
				1190	.private = FILE_TASKLIST,
				1191	};
				1192
				1193	static struct cftype cft_cpus = {
				1194	.name = "cpus",
				1195	.private = FILE_CPULIST,
				1196	};
				1197
				1198	static struct cftype cft_mems = {
				1199	.name = "mems",
				1200	.private = FILE_MEMLIST,
				1201	};
				1202
				1203	static struct cftype cft_cpu_exclusive = {
				1204	.name = "cpu_exclusive",
				1205	.private = FILE_CPU_EXCLUSIVE,
				1206	};
				1207
				1208	static struct cftype cft_mem_exclusive = {
				1209	.name = "mem_exclusive",
				1210	.private = FILE_MEM_EXCLUSIVE,
				1211	};
				1212
				1213	static struct cftype cft_notify_on_release = {
				1214	.name = "notify_on_release",
				1215	.private = FILE_NOTIFY_ON_RELEASE,
				1216	};
				1217
				1218	static int cpuset_populate_dir(struct dentry *cs_dentry)
				1219	{
				1220	int err;
				1221
				1222	if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
				1223	return err;
				1224	if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
				1225	return err;
				1226	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
				1227	return err;
				1228	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
				1229	return err;
				1230	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
				1231	return err;
				1232	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
				1233	return err;
				1234	return 0;
				1235	}
				1236
				1237	/*
				1238	* cpuset_create - create a cpuset
				1239	* parent: cpuset that will be parent of the new cpuset.
				1240	* name: name of the new cpuset. Will be strcpy'ed.
				1241	* mode: mode to set on new inode
				1242	*
				1243	* Must be called with the semaphore on the parent inode held
				1244	*/
				1245
				1246	static long cpuset_create(struct cpuset parent, const char name, int mode)
				1247	{
				1248	struct cpuset *cs;
				1249	int err;
				1250
				1251	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
				1252	if (!cs)
				1253	return -ENOMEM;
				1254
				1255	down(&cpuset_sem);
				1256	refresh_mems();
				1257	cs->flags = 0;
				1258	if (notify_on_release(parent))
				1259	set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
				1260	cs->cpus_allowed = CPU_MASK_NONE;
				1261	cs->mems_allowed = NODE_MASK_NONE;
				1262	atomic_set(&cs->count, 0);
				1263	INIT_LIST_HEAD(&cs->sibling);
				1264	INIT_LIST_HEAD(&cs->children);
				1265	atomic_inc(&cpuset_mems_generation);
				1266	cs->mems_generation = atomic_read(&cpuset_mems_generation);
				1267
				1268	cs->parent = parent;
				1269
				1270	list_add(&cs->sibling, &cs->parent->children);
				1271
				1272	err = cpuset_create_dir(cs, name, mode);
				1273	if (err < 0)
				1274	goto err;
				1275
				1276	/*
				1277	* Release cpuset_sem before cpuset_populate_dir() because it
				1278	* will down() this new directory's i_sem and if we race with
				1279	* another mkdir, we might deadlock.
				1280	*/
				1281	up(&cpuset_sem);
				1282
				1283	err = cpuset_populate_dir(cs->dentry);
				1284	/* If err < 0, we have a half-filled directory - oh well ;) */
				1285	return 0;
				1286	err:
				1287	list_del(&cs->sibling);
				1288	up(&cpuset_sem);
				1289	kfree(cs);
				1290	return err;
				1291	}
				1292
				1293	static int cpuset_mkdir(struct inode dir, struct dentry dentry, int mode)
				1294	{
				1295	struct cpuset *c_parent = dentry->d_parent->d_fsdata;
				1296
				1297	/* the vfs holds inode->i_sem already */
				1298	return cpuset_create(c_parent, dentry->d_name.name, mode \| S_IFDIR);
				1299	}
				1300
				1301	static int cpuset_rmdir(struct inode unused_dir, struct dentry dentry)
				1302	{
				1303	struct cpuset *cs = dentry->d_fsdata;
				1304	struct dentry *d;
				1305	struct cpuset *parent;
				1306
				1307	/* the vfs holds both inode->i_sem already */
				1308
				1309	down(&cpuset_sem);
				1310	refresh_mems();
				1311	if (atomic_read(&cs->count) > 0) {
				1312	up(&cpuset_sem);
				1313	return -EBUSY;
				1314	}
				1315	if (!list_empty(&cs->children)) {
				1316	up(&cpuset_sem);
				1317	return -EBUSY;
				1318	}
				1319	spin_lock(&cs->dentry->d_lock);
				1320	parent = cs->parent;
				1321	set_bit(CS_REMOVED, &cs->flags);
				1322	list_del(&cs->sibling); /* delete my sibling from parent->children */
				1323	if (list_empty(&parent->children))
				1324	check_for_release(parent);
				1325	d = dget(cs->dentry);
				1326	cs->dentry = NULL;
				1327	spin_unlock(&d->d_lock);
				1328	cpuset_d_remove_dir(d);
				1329	dput(d);
				1330	up(&cpuset_sem);
				1331	return 0;
				1332	}
				1333
				1334	/**
				1335	* cpuset_init - initialize cpusets at system boot
				1336	*
				1337	* Description: Initialize top_cpuset and the cpuset internal file system,
				1338	**/
				1339
				1340	int __init cpuset_init(void)
				1341	{
				1342	struct dentry *root;
				1343	int err;
				1344
				1345	top_cpuset.cpus_allowed = CPU_MASK_ALL;
				1346	top_cpuset.mems_allowed = NODE_MASK_ALL;
				1347
				1348	atomic_inc(&cpuset_mems_generation);
				1349	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
				1350
				1351	init_task.cpuset = &top_cpuset;
				1352
				1353	err = register_filesystem(&cpuset_fs_type);
				1354	if (err < 0)
				1355	goto out;
				1356	cpuset_mount = kern_mount(&cpuset_fs_type);
				1357	if (IS_ERR(cpuset_mount)) {
				1358	printk(KERN_ERR "cpuset: could not mount!\n");
				1359	err = PTR_ERR(cpuset_mount);
				1360	cpuset_mount = NULL;
				1361	goto out;
				1362	}
				1363	root = cpuset_mount->mnt_sb->s_root;
				1364	root->d_fsdata = &top_cpuset;
				1365	root->d_inode->i_nlink++;
				1366	top_cpuset.dentry = root;
				1367	root->d_inode->i_op = &cpuset_dir_inode_operations;
				1368	err = cpuset_populate_dir(root);
				1369	out:
				1370	return err;
				1371	}
				1372
				1373	/**
				1374	* cpuset_init_smp - initialize cpus_allowed
				1375	*
				1376	* Description: Finish top cpuset after cpu, node maps are initialized
				1377	**/
				1378
				1379	void __init cpuset_init_smp(void)
				1380	{
				1381	top_cpuset.cpus_allowed = cpu_online_map;
				1382	top_cpuset.mems_allowed = node_online_map;
				1383	}
				1384
				1385	/**
				1386	* cpuset_fork - attach newly forked task to its parents cpuset.
				1387	* @p: pointer to task_struct of forking parent process.
				1388	*
				1389	* Description: By default, on fork, a task inherits its
				1390	* parents cpuset. The pointer to the shared cpuset is
				1391	* automatically copied in fork.c by dup_task_struct().
				1392	* This cpuset_fork() routine need only increment the usage
				1393	* counter in that cpuset.
				1394	**/
				1395
				1396	void cpuset_fork(struct task_struct *tsk)
				1397	{
				1398	atomic_inc(&tsk->cpuset->count);
				1399	}
				1400
				1401	/**
				1402	* cpuset_exit - detach cpuset from exiting task
				1403	* @tsk: pointer to task_struct of exiting process
				1404	*
				1405	* Description: Detach cpuset from @tsk and release it.
				1406	*
				1407	**/
				1408
				1409	void cpuset_exit(struct task_struct *tsk)
				1410	{
				1411	struct cpuset *cs;
				1412
				1413	task_lock(tsk);
				1414	cs = tsk->cpuset;
				1415	tsk->cpuset = NULL;
				1416	task_unlock(tsk);
				1417
				1418	if (atomic_dec_and_test(&cs->count)) {
				1419	down(&cpuset_sem);
				1420	check_for_release(cs);
				1421	up(&cpuset_sem);
				1422	}
				1423	}
				1424
				1425	/**
				1426	* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
				1427	* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
				1428	*
				1429	* Description: Returns the cpumask_t cpus_allowed of the cpuset
				1430	* attached to the specified @tsk. Guaranteed to return some non-empty
				1431	* subset of cpu_online_map, even if this means going outside the
				1432	* tasks cpuset.
				1433	**/
				1434
Benoit Boissinot	9a84889	2005-04-16 15:25:59 -0700	[diff] [blame]	1435	cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1436	{
				1437	cpumask_t mask;
				1438
				1439	down(&cpuset_sem);
				1440	task_lock((struct task_struct *)tsk);
				1441	guarantee_online_cpus(tsk->cpuset, &mask);
				1442	task_unlock((struct task_struct *)tsk);
				1443	up(&cpuset_sem);
				1444
				1445	return mask;
				1446	}
				1447
				1448	void cpuset_init_current_mems_allowed(void)
				1449	{
				1450	current->mems_allowed = NODE_MASK_ALL;
				1451	}
				1452
				1453	/*
				1454	* If the current tasks cpusets mems_allowed changed behind our backs,
				1455	* update current->mems_allowed and mems_generation to the new value.
				1456	* Do not call this routine if in_interrupt().
				1457	*/
				1458
				1459	void cpuset_update_current_mems_allowed(void)
				1460	{
				1461	struct cpuset *cs = current->cpuset;
				1462
				1463	if (!cs)
				1464	return; /* task is exiting */
				1465	if (current->cpuset_mems_generation != cs->mems_generation) {
				1466	down(&cpuset_sem);
				1467	refresh_mems();
				1468	up(&cpuset_sem);
				1469	}
				1470	}
				1471
				1472	void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
				1473	{
				1474	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
				1475	MAX_NUMNODES);
				1476	}
				1477
				1478	/*
				1479	* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
				1480	*/
				1481	int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
				1482	{
				1483	int i;
				1484
				1485	for (i = 0; zl->zones[i]; i++) {
				1486	int nid = zl->zones[i]->zone_pgdat->node_id;
				1487
				1488	if (node_isset(nid, current->mems_allowed))
				1489	return 1;
				1490	}
				1491	return 0;
				1492	}
				1493
				1494	/*
				1495	* Is 'current' valid, and is zone z allowed in current->mems_allowed?
				1496	*/
				1497	int cpuset_zone_allowed(struct zone *z)
				1498	{
				1499	return in_interrupt() \|\|
				1500	node_isset(z->zone_pgdat->node_id, current->mems_allowed);
				1501	}
				1502
				1503	/*
				1504	* proc_cpuset_show()
				1505	* - Print tasks cpuset path into seq_file.
				1506	* - Used for /proc/<pid>/cpuset.
				1507	*/
				1508
				1509	static int proc_cpuset_show(struct seq_file m, void v)
				1510	{
				1511	struct cpuset *cs;
				1512	struct task_struct *tsk;
				1513	char *buf;
				1514	int retval = 0;
				1515
				1516	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				1517	if (!buf)
				1518	return -ENOMEM;
				1519
				1520	tsk = m->private;
				1521	down(&cpuset_sem);
				1522	task_lock(tsk);
				1523	cs = tsk->cpuset;
				1524	task_unlock(tsk);
				1525	if (!cs) {
				1526	retval = -EINVAL;
				1527	goto out;
				1528	}
				1529
				1530	retval = cpuset_path(cs, buf, PAGE_SIZE);
				1531	if (retval < 0)
				1532	goto out;
				1533	seq_puts(m, buf);
				1534	seq_putc(m, '\n');
				1535	out:
				1536	up(&cpuset_sem);
				1537	kfree(buf);
				1538	return retval;
				1539	}
				1540
				1541	static int cpuset_open(struct inode inode, struct file file)
				1542	{
				1543	struct task_struct *tsk = PROC_I(inode)->task;
				1544	return single_open(file, proc_cpuset_show, tsk);
				1545	}
				1546
				1547	struct file_operations proc_cpuset_operations = {
				1548	.open = cpuset_open,
				1549	.read = seq_read,
				1550	.llseek = seq_lseek,
				1551	.release = single_release,
				1552	};
				1553
				1554	/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
				1555	char cpuset_task_status_allowed(struct task_struct task, char *buffer)
				1556	{
				1557	buffer += sprintf(buffer, "Cpus_allowed:\t");
				1558	buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
				1559	buffer += sprintf(buffer, "\n");
				1560	buffer += sprintf(buffer, "Mems_allowed:\t");
				1561	buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
				1562	buffer += sprintf(buffer, "\n");
				1563	return buffer;
				1564	}