blob: 83369058ec133b380a482ca86792087134b944f6 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
Christoph Lameter8bccd852005-10-29 18:16:59 -07005 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
Christoph Lameter8bccd852005-10-29 18:16:59 -070021 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
Christoph Lameter8bccd852005-10-29 18:16:59 -070024 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070028 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
Christoph Lameter8bccd852005-10-29 18:16:59 -070033 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070034 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
Linus Torvalds1da177e2005-04-16 15:20:36 -070066*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
Pavel Emelyanovb4888932007-10-18 23:40:14 -070080#include <linux/nsproxy.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070081#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -080084#include <linux/swap.h>
Christoph Lameter1a75a6c2006-01-08 01:01:02 -080085#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
Christoph Lameterb20a3502006-03-22 00:09:12 -080087#include <linux/migrate.h>
Christoph Lameter95a402c2006-06-23 02:03:53 -070088#include <linux/rmap.h>
David Quigley86c3a762006-06-23 02:04:02 -070089#include <linux/security.h>
Adrian Bunkdbcb0f12007-10-16 01:26:26 -070090#include <linux/syscalls.h>
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -070091#include <linux/ctype.h>
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -080092
Linus Torvalds1da177e2005-04-16 15:20:36 -070093#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
Christoph Lameter38e35862006-01-08 01:01:01 -080096/* Internal flags */
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -080097#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
Christoph Lameter38e35862006-01-08 01:01:01 -080098#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
Christoph Lameter1a75a6c2006-01-08 01:01:02 -080099#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800100
Pekka Enbergfcc234f2006-03-22 00:08:13 -0800101static struct kmem_cache *policy_cache;
102static struct kmem_cache *sn_cache;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103
Linus Torvalds1da177e2005-04-16 15:20:36 -0700104/* Highest zone. An specific allocation for a zone below that is not
105 policied. */
Christoph Lameter62672762007-02-10 01:43:07 -0800106enum zone_type policy_zone = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700108/*
109 * run-time system-wide default policy => local allocation
110 */
Andi Kleend42c6992005-07-06 19:56:03 +0200111struct mempolicy default_policy = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112 .refcnt = ATOMIC_INIT(1), /* never free it */
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700113 .mode = MPOL_PREFERRED,
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700114 .flags = MPOL_F_LOCAL,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115};
116
David Rientjes37012942008-04-28 02:12:33 -0700117static const struct mempolicy_operations {
118 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
119 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
120} mpol_ops[MPOL_MAX];
121
Mel Gorman19770b32008-04-28 02:12:18 -0700122/* Check that the nodemask contains at least one populated zone */
David Rientjes37012942008-04-28 02:12:33 -0700123static int is_valid_nodemask(const nodemask_t *nodemask)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700124{
Mel Gorman19770b32008-04-28 02:12:18 -0700125 int nd, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126
Mel Gorman19770b32008-04-28 02:12:18 -0700127 /* Check that there is something useful in this mask */
128 k = policy_zone;
129
130 for_each_node_mask(nd, *nodemask) {
131 struct zone *z;
132
133 for (k = 0; k <= policy_zone; k++) {
134 z = &NODE_DATA(nd)->node_zones[k];
135 if (z->present_pages > 0)
136 return 1;
Andi Kleendd942ae2006-02-17 01:39:16 +0100137 }
138 }
Mel Gorman19770b32008-04-28 02:12:18 -0700139
140 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141}
142
David Rientjesf5b087b2008-04-28 02:12:27 -0700143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
144{
David Rientjes4c50bc02008-04-28 02:12:30 -0700145 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
146}
147
148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
149 const nodemask_t *rel)
150{
151 nodemask_t tmp;
152 nodes_fold(tmp, *orig, nodes_weight(*rel));
153 nodes_onto(*ret, tmp, *rel);
David Rientjesf5b087b2008-04-28 02:12:27 -0700154}
155
David Rientjes37012942008-04-28 02:12:33 -0700156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
157{
158 if (nodes_empty(*nodes))
159 return -EINVAL;
160 pol->v.nodes = *nodes;
161 return 0;
162}
163
164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
165{
166 if (!nodes)
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700167 pol->flags |= MPOL_F_LOCAL; /* local allocation */
David Rientjes37012942008-04-28 02:12:33 -0700168 else if (nodes_empty(*nodes))
169 return -EINVAL; /* no allowed nodes */
170 else
171 pol->v.preferred_node = first_node(*nodes);
172 return 0;
173}
174
175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
176{
177 if (!is_valid_nodemask(nodes))
178 return -EINVAL;
179 pol->v.nodes = *nodes;
180 return 0;
181}
182
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183/* Create a new policy */
David Rientjes028fec42008-04-28 02:12:25 -0700184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
185 nodemask_t *nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186{
187 struct mempolicy *policy;
David Rientjesf5b087b2008-04-28 02:12:27 -0700188 nodemask_t cpuset_context_nmask;
David Rientjes37012942008-04-28 02:12:33 -0700189 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700190
David Rientjes028fec42008-04-28 02:12:25 -0700191 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
192 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
Paul Mundt140d5a42007-07-15 23:38:16 -0700193
David Rientjes3e1f06452008-04-28 02:12:34 -0700194 if (mode == MPOL_DEFAULT) {
195 if (nodes && !nodes_empty(*nodes))
David Rientjes37012942008-04-28 02:12:33 -0700196 return ERR_PTR(-EINVAL);
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700197 return NULL; /* simply delete any existing policy */
David Rientjes37012942008-04-28 02:12:33 -0700198 }
David Rientjes3e1f06452008-04-28 02:12:34 -0700199 VM_BUG_ON(!nodes);
200
201 /*
202 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
203 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
204 * All other modes require a valid pointer to a non-empty nodemask.
205 */
206 if (mode == MPOL_PREFERRED) {
207 if (nodes_empty(*nodes)) {
208 if (((flags & MPOL_F_STATIC_NODES) ||
209 (flags & MPOL_F_RELATIVE_NODES)))
210 return ERR_PTR(-EINVAL);
211 nodes = NULL; /* flag local alloc */
212 }
213 } else if (nodes_empty(*nodes))
214 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
Lee Schermerhorn45c47452008-04-28 02:13:12 -0700219 policy->mode = mode;
David Rientjes3e1f06452008-04-28 02:12:34 -0700220 policy->flags = flags;
David Rientjesf5b087b2008-04-28 02:12:27 -0700221
David Rientjes3e1f06452008-04-28 02:12:34 -0700222 if (nodes) {
223 /*
224 * cpuset related setup doesn't apply to local allocation
225 */
David Rientjes37012942008-04-28 02:12:33 -0700226 cpuset_update_task_memory_state();
227 if (flags & MPOL_F_RELATIVE_NODES)
228 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
229 &cpuset_current_mems_allowed);
230 else
231 nodes_and(cpuset_context_nmask, *nodes,
232 cpuset_current_mems_allowed);
233 if (mpol_store_user_nodemask(policy))
234 policy->w.user_nodemask = *nodes;
235 else
236 policy->w.cpuset_mems_allowed =
237 cpuset_mems_allowed(current);
238 }
239
240 ret = mpol_ops[mode].create(policy,
David Rientjes3e1f06452008-04-28 02:12:34 -0700241 nodes ? &cpuset_context_nmask : NULL);
David Rientjes37012942008-04-28 02:12:33 -0700242 if (ret < 0) {
243 kmem_cache_free(policy_cache, policy);
244 return ERR_PTR(ret);
245 }
246 return policy;
247}
248
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -0700249/* Slow path of a mpol destructor. */
250void __mpol_put(struct mempolicy *p)
251{
252 if (!atomic_dec_and_test(&p->refcnt))
253 return;
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -0700254 kmem_cache_free(policy_cache, p);
255}
256
David Rientjes37012942008-04-28 02:12:33 -0700257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
258{
259}
260
261static void mpol_rebind_nodemask(struct mempolicy *pol,
262 const nodemask_t *nodes)
263{
264 nodemask_t tmp;
265
266 if (pol->flags & MPOL_F_STATIC_NODES)
267 nodes_and(tmp, pol->w.user_nodemask, *nodes);
268 else if (pol->flags & MPOL_F_RELATIVE_NODES)
269 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
270 else {
271 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
272 *nodes);
273 pol->w.cpuset_mems_allowed = *nodes;
274 }
275
276 pol->v.nodes = tmp;
277 if (!node_isset(current->il_next, tmp)) {
278 current->il_next = next_node(current->il_next, tmp);
279 if (current->il_next >= MAX_NUMNODES)
280 current->il_next = first_node(tmp);
281 if (current->il_next >= MAX_NUMNODES)
282 current->il_next = numa_node_id();
283 }
284}
285
286static void mpol_rebind_preferred(struct mempolicy *pol,
287 const nodemask_t *nodes)
288{
289 nodemask_t tmp;
290
David Rientjes37012942008-04-28 02:12:33 -0700291 if (pol->flags & MPOL_F_STATIC_NODES) {
292 int node = first_node(pol->w.user_nodemask);
293
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700294 if (node_isset(node, *nodes)) {
David Rientjes37012942008-04-28 02:12:33 -0700295 pol->v.preferred_node = node;
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700296 pol->flags &= ~MPOL_F_LOCAL;
297 } else
298 pol->flags |= MPOL_F_LOCAL;
David Rientjes37012942008-04-28 02:12:33 -0700299 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
300 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
301 pol->v.preferred_node = first_node(tmp);
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700302 } else if (!(pol->flags & MPOL_F_LOCAL)) {
David Rientjes37012942008-04-28 02:12:33 -0700303 pol->v.preferred_node = node_remap(pol->v.preferred_node,
304 pol->w.cpuset_mems_allowed,
305 *nodes);
306 pol->w.cpuset_mems_allowed = *nodes;
307 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700308}
309
David Rientjes1d0d2682008-04-28 02:12:32 -0700310/* Migrate a policy to a different set of nodes */
311static void mpol_rebind_policy(struct mempolicy *pol,
312 const nodemask_t *newmask)
313{
David Rientjes1d0d2682008-04-28 02:12:32 -0700314 if (!pol)
315 return;
David Rientjes1d0d2682008-04-28 02:12:32 -0700316 if (!mpol_store_user_nodemask(pol) &&
317 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
318 return;
Lee Schermerhorn45c47452008-04-28 02:13:12 -0700319 mpol_ops[pol->mode].rebind(pol, newmask);
David Rientjes1d0d2682008-04-28 02:12:32 -0700320}
321
322/*
323 * Wrapper for mpol_rebind_policy() that just requires task
324 * pointer, and updates task mempolicy.
325 */
326
327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
328{
329 mpol_rebind_policy(tsk->mempolicy, new);
330}
331
332/*
333 * Rebind each vma in mm to new nodemask.
334 *
335 * Call holding a reference to mm. Takes mm->mmap_sem during call.
336 */
337
338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
339{
340 struct vm_area_struct *vma;
341
342 down_write(&mm->mmap_sem);
343 for (vma = mm->mmap; vma; vma = vma->vm_next)
344 mpol_rebind_policy(vma->vm_policy, new);
345 up_write(&mm->mmap_sem);
346}
347
David Rientjes37012942008-04-28 02:12:33 -0700348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
349 [MPOL_DEFAULT] = {
350 .rebind = mpol_rebind_default,
351 },
352 [MPOL_INTERLEAVE] = {
353 .create = mpol_new_interleave,
354 .rebind = mpol_rebind_nodemask,
355 },
356 [MPOL_PREFERRED] = {
357 .create = mpol_new_preferred,
358 .rebind = mpol_rebind_preferred,
359 },
360 [MPOL_BIND] = {
361 .create = mpol_new_bind,
362 .rebind = mpol_rebind_nodemask,
363 },
364};
365
/* Forward declarations for the per-page callbacks used by check_pte_range() */
static void gather_stats(struct page *page, void *private, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -0800369
Christoph Lameter38e35862006-01-08 01:01:01 -0800370/* Scan through pages checking if pages follow certain conditions. */
Nick Pigginb5810032005-10-29 18:16:12 -0700371static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800372 unsigned long addr, unsigned long end,
373 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800374 void *private)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700375{
Hugh Dickins91612e02005-06-21 17:15:07 -0700376 pte_t *orig_pte;
377 pte_t *pte;
Hugh Dickins705e87c2005-10-29 18:16:27 -0700378 spinlock_t *ptl;
Hugh Dickins941150a2005-06-21 17:15:06 -0700379
Hugh Dickins705e87c2005-10-29 18:16:27 -0700380 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
Hugh Dickins91612e02005-06-21 17:15:07 -0700381 do {
Linus Torvalds6aab3412005-11-28 14:34:23 -0800382 struct page *page;
Andy Whitcroft25ba77c2006-12-06 20:33:03 -0800383 int nid;
Hugh Dickins91612e02005-06-21 17:15:07 -0700384
385 if (!pte_present(*pte))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 continue;
Linus Torvalds6aab3412005-11-28 14:34:23 -0800387 page = vm_normal_page(vma, addr, *pte);
388 if (!page)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 continue;
Nick Piggin053837f2006-01-18 17:42:27 -0800390 /*
391 * The check for PageReserved here is important to avoid
392 * handling zero pages and other pages that may have been
393 * marked special by the system.
394 *
395 * If the PageReserved would not be checked here then f.e.
396 * the location of the zero page could have an influence
397 * on MPOL_MF_STRICT, zero pages would be counted for
398 * the per node stats, and there would be useless attempts
399 * to put zero pages on the migration list.
400 */
Christoph Lameterf4598c82006-01-12 01:05:20 -0800401 if (PageReserved(page))
402 continue;
Linus Torvalds6aab3412005-11-28 14:34:23 -0800403 nid = page_to_nid(page);
Christoph Lameter38e35862006-01-08 01:01:01 -0800404 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
405 continue;
406
Christoph Lameter1a75a6c2006-01-08 01:01:02 -0800407 if (flags & MPOL_MF_STATS)
Christoph Lameter397874d2006-03-06 15:42:53 -0800408 gather_stats(page, private, pte_dirty(*pte));
Nick Piggin053837f2006-01-18 17:42:27 -0800409 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
Christoph Lameterfc301282006-01-18 17:42:29 -0800410 migrate_page_add(page, private, flags);
Christoph Lameter38e35862006-01-08 01:01:01 -0800411 else
412 break;
Hugh Dickins91612e02005-06-21 17:15:07 -0700413 } while (pte++, addr += PAGE_SIZE, addr != end);
Hugh Dickins705e87c2005-10-29 18:16:27 -0700414 pte_unmap_unlock(orig_pte, ptl);
Hugh Dickins91612e02005-06-21 17:15:07 -0700415 return addr != end;
416}
417
Nick Pigginb5810032005-10-29 18:16:12 -0700418static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800419 unsigned long addr, unsigned long end,
420 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800421 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700422{
423 pmd_t *pmd;
424 unsigned long next;
425
426 pmd = pmd_offset(pud, addr);
427 do {
428 next = pmd_addr_end(addr, end);
429 if (pmd_none_or_clear_bad(pmd))
430 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800431 if (check_pte_range(vma, pmd, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800432 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700433 return -EIO;
434 } while (pmd++, addr = next, addr != end);
435 return 0;
436}
437
Nick Pigginb5810032005-10-29 18:16:12 -0700438static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800439 unsigned long addr, unsigned long end,
440 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800441 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700442{
443 pud_t *pud;
444 unsigned long next;
445
446 pud = pud_offset(pgd, addr);
447 do {
448 next = pud_addr_end(addr, end);
449 if (pud_none_or_clear_bad(pud))
450 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800451 if (check_pmd_range(vma, pud, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800452 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700453 return -EIO;
454 } while (pud++, addr = next, addr != end);
455 return 0;
456}
457
Nick Pigginb5810032005-10-29 18:16:12 -0700458static inline int check_pgd_range(struct vm_area_struct *vma,
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800459 unsigned long addr, unsigned long end,
460 const nodemask_t *nodes, unsigned long flags,
Christoph Lameter38e35862006-01-08 01:01:01 -0800461 void *private)
Hugh Dickins91612e02005-06-21 17:15:07 -0700462{
463 pgd_t *pgd;
464 unsigned long next;
465
Nick Pigginb5810032005-10-29 18:16:12 -0700466 pgd = pgd_offset(vma->vm_mm, addr);
Hugh Dickins91612e02005-06-21 17:15:07 -0700467 do {
468 next = pgd_addr_end(addr, end);
469 if (pgd_none_or_clear_bad(pgd))
470 continue;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800471 if (check_pud_range(vma, pgd, addr, next, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800472 flags, private))
Hugh Dickins91612e02005-06-21 17:15:07 -0700473 return -EIO;
474 } while (pgd++, addr = next, addr != end);
475 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700476}
477
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800478/*
479 * Check if all pages in a range are on a set of nodes.
480 * If pagelist != NULL then isolate pages from the LRU and
481 * put them on the pagelist.
482 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483static struct vm_area_struct *
484check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
Christoph Lameter38e35862006-01-08 01:01:01 -0800485 const nodemask_t *nodes, unsigned long flags, void *private)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486{
487 int err;
488 struct vm_area_struct *first, *vma, *prev;
489
Christoph Lameter90036ee2006-03-16 23:03:59 -0800490 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
Christoph Lameter90036ee2006-03-16 23:03:59 -0800491
Christoph Lameterb20a3502006-03-22 00:09:12 -0800492 err = migrate_prep();
493 if (err)
494 return ERR_PTR(err);
Christoph Lameter90036ee2006-03-16 23:03:59 -0800495 }
Nick Piggin053837f2006-01-18 17:42:27 -0800496
Linus Torvalds1da177e2005-04-16 15:20:36 -0700497 first = find_vma(mm, start);
498 if (!first)
499 return ERR_PTR(-EFAULT);
500 prev = NULL;
501 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800502 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
503 if (!vma->vm_next && vma->vm_end < end)
504 return ERR_PTR(-EFAULT);
505 if (prev && prev->vm_end < vma->vm_start)
506 return ERR_PTR(-EFAULT);
507 }
508 if (!is_vm_hugetlb_page(vma) &&
509 ((flags & MPOL_MF_STRICT) ||
510 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
511 vma_migratable(vma)))) {
Andi Kleen5b952b32005-09-13 01:25:08 -0700512 unsigned long endvma = vma->vm_end;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800513
Andi Kleen5b952b32005-09-13 01:25:08 -0700514 if (endvma > end)
515 endvma = end;
516 if (vma->vm_start > start)
517 start = vma->vm_start;
Christoph Lameterdc9aa5b2006-01-08 01:00:50 -0800518 err = check_pgd_range(vma, start, endvma, nodes,
Christoph Lameter38e35862006-01-08 01:01:01 -0800519 flags, private);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700520 if (err) {
521 first = ERR_PTR(err);
522 break;
523 }
524 }
525 prev = vma;
526 }
527 return first;
528}
529
530/* Apply policy to a single VMA */
531static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
532{
533 int err = 0;
534 struct mempolicy *old = vma->vm_policy;
535
Paul Mundt140d5a42007-07-15 23:38:16 -0700536 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700537 vma->vm_start, vma->vm_end, vma->vm_pgoff,
538 vma->vm_ops, vma->vm_file,
539 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
540
541 if (vma->vm_ops && vma->vm_ops->set_policy)
542 err = vma->vm_ops->set_policy(vma, new);
543 if (!err) {
544 mpol_get(new);
545 vma->vm_policy = new;
Lee Schermerhornf0be3d32008-04-28 02:13:08 -0700546 mpol_put(old);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700547 }
548 return err;
549}
550
551/* Step 2: apply policy to a range and do splits. */
552static int mbind_range(struct vm_area_struct *vma, unsigned long start,
553 unsigned long end, struct mempolicy *new)
554{
555 struct vm_area_struct *next;
556 int err;
557
558 err = 0;
559 for (; vma && vma->vm_start < end; vma = next) {
560 next = vma->vm_next;
561 if (vma->vm_start < start)
562 err = split_vma(vma->vm_mm, vma, start, 1);
563 if (!err && vma->vm_end > end)
564 err = split_vma(vma->vm_mm, vma, end, 0);
565 if (!err)
566 err = policy_vma(vma, new);
567 if (err)
568 break;
569 }
570 return err;
571}
572
Paul Jacksonc61afb12006-03-24 03:16:08 -0800573/*
574 * Update task->flags PF_MEMPOLICY bit: set iff non-default
575 * mempolicy. Allows more rapid checking of this (combined perhaps
576 * with other PF_* flag bits) on memory allocation hot code paths.
577 *
578 * If called from outside this file, the task 'p' should -only- be
579 * a newly forked child not yet visible on the task list, because
580 * manipulating the task flags of a visible task is not safe.
581 *
582 * The above limitation is why this routine has the funny name
583 * mpol_fix_fork_child_flag().
584 *
585 * It is also safe to call this with a task pointer of current,
586 * which the static wrapper mpol_set_task_struct_flag() does,
587 * for use within this file.
588 */
589
590void mpol_fix_fork_child_flag(struct task_struct *p)
591{
592 if (p->mempolicy)
593 p->flags |= PF_MEMPOLICY;
594 else
595 p->flags &= ~PF_MEMPOLICY;
596}
597
598static void mpol_set_task_struct_flag(void)
599{
600 mpol_fix_fork_child_flag(current);
601}
602
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603/* Set the process memory policy */
David Rientjes028fec42008-04-28 02:12:25 -0700604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
605 nodemask_t *nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700606{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607 struct mempolicy *new;
Lee Schermerhornf4e53d92008-04-28 02:13:10 -0700608 struct mm_struct *mm = current->mm;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609
David Rientjes028fec42008-04-28 02:12:25 -0700610 new = mpol_new(mode, flags, nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611 if (IS_ERR(new))
612 return PTR_ERR(new);
Lee Schermerhornf4e53d92008-04-28 02:13:10 -0700613
614 /*
615 * prevent changing our mempolicy while show_numa_maps()
616 * is using it.
617 * Note: do_set_mempolicy() can be called at init time
618 * with no 'mm'.
619 */
620 if (mm)
621 down_write(&mm->mmap_sem);
Lee Schermerhornf0be3d32008-04-28 02:13:08 -0700622 mpol_put(current->mempolicy);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 current->mempolicy = new;
Paul Jacksonc61afb12006-03-24 03:16:08 -0800624 mpol_set_task_struct_flag();
Lee Schermerhorn45c47452008-04-28 02:13:12 -0700625 if (new && new->mode == MPOL_INTERLEAVE &&
David Rientjesf5b087b2008-04-28 02:12:27 -0700626 nodes_weight(new->v.nodes))
Andi Kleendfcd3c0d2005-10-29 18:15:48 -0700627 current->il_next = first_node(new->v.nodes);
Lee Schermerhornf4e53d92008-04-28 02:13:10 -0700628 if (mm)
629 up_write(&mm->mmap_sem);
630
Linus Torvalds1da177e2005-04-16 15:20:36 -0700631 return 0;
632}
633
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700634/*
635 * Return nodemask for policy for get_mempolicy() query
636 */
637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638{
Andi Kleendfcd3c0d2005-10-29 18:15:48 -0700639 nodes_clear(*nodes);
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700640 if (p == &default_policy)
641 return;
642
Lee Schermerhorn45c47452008-04-28 02:13:12 -0700643 switch (p->mode) {
Mel Gorman19770b32008-04-28 02:12:18 -0700644 case MPOL_BIND:
645 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700646 case MPOL_INTERLEAVE:
Andi Kleendfcd3c0d2005-10-29 18:15:48 -0700647 *nodes = p->v.nodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648 break;
649 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -0700650 if (!(p->flags & MPOL_F_LOCAL))
Andi Kleendfcd3c0d2005-10-29 18:15:48 -0700651 node_set(p->v.preferred_node, *nodes);
Lee Schermerhorn53f25562008-04-28 02:13:20 -0700652 /* else return empty node mask for local allocation */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653 break;
654 default:
655 BUG();
656 }
657}
658
659static int lookup_node(struct mm_struct *mm, unsigned long addr)
660{
661 struct page *p;
662 int err;
663
664 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
665 if (err >= 0) {
666 err = page_to_nid(p);
667 put_page(p);
668 }
669 return err;
670}
671
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672/* Retrieve NUMA policy */
Adrian Bunkdbcb0f12007-10-16 01:26:26 -0700673static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 unsigned long addr, unsigned long flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675{
Christoph Lameter8bccd852005-10-29 18:16:59 -0700676 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700677 struct mm_struct *mm = current->mm;
678 struct vm_area_struct *vma = NULL;
679 struct mempolicy *pol = current->mempolicy;
680
Paul Jacksoncf2a473c2006-01-08 01:01:54 -0800681 cpuset_update_task_memory_state();
Lee Schermerhorn754af6f2007-10-16 01:24:51 -0700682 if (flags &
683 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700684 return -EINVAL;
Lee Schermerhorn754af6f2007-10-16 01:24:51 -0700685
686 if (flags & MPOL_F_MEMS_ALLOWED) {
687 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
688 return -EINVAL;
689 *policy = 0; /* just so it's initialized */
690 *nmask = cpuset_current_mems_allowed;
691 return 0;
692 }
693
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694 if (flags & MPOL_F_ADDR) {
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700695 /*
696 * Do NOT fall back to task policy if the
697 * vma/shared policy at addr is NULL. We
698 * want to return MPOL_DEFAULT in this case.
699 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700700 down_read(&mm->mmap_sem);
701 vma = find_vma_intersection(mm, addr, addr+1);
702 if (!vma) {
703 up_read(&mm->mmap_sem);
704 return -EFAULT;
705 }
706 if (vma->vm_ops && vma->vm_ops->get_policy)
707 pol = vma->vm_ops->get_policy(vma, addr);
708 else
709 pol = vma->vm_policy;
710 } else if (addr)
711 return -EINVAL;
712
713 if (!pol)
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700714 pol = &default_policy; /* indicates default behavior */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715
716 if (flags & MPOL_F_NODE) {
717 if (flags & MPOL_F_ADDR) {
718 err = lookup_node(mm, addr);
719 if (err < 0)
720 goto out;
Christoph Lameter8bccd852005-10-29 18:16:59 -0700721 *policy = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722 } else if (pol == current->mempolicy &&
Lee Schermerhorn45c47452008-04-28 02:13:12 -0700723 pol->mode == MPOL_INTERLEAVE) {
Christoph Lameter8bccd852005-10-29 18:16:59 -0700724 *policy = current->il_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700725 } else {
726 err = -EINVAL;
727 goto out;
728 }
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700729 } else {
730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode;
David Rientjesd79df632008-07-04 12:24:13 -0700732 /*
733 * Internal mempolicy flags must be masked off before exposing
734 * the policy to userspace.
735 */
736 *policy |= (pol->flags & MPOL_MODE_FLAGS);
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700737 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700738
739 if (vma) {
740 up_read(&current->mm->mmap_sem);
741 vma = NULL;
742 }
743
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744 err = 0;
Christoph Lameter8bccd852005-10-29 18:16:59 -0700745 if (nmask)
Lee Schermerhornbea904d2008-04-28 02:13:18 -0700746 get_policy_nodemask(pol, nmask);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747
748 out:
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -0700749 mpol_cond_put(pol);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700750 if (vma)
751 up_read(&current->mm->mmap_sem);
752 return err;
753}
754
Christoph Lameterb20a3502006-03-22 00:09:12 -0800755#ifdef CONFIG_MIGRATION
Christoph Lameter8bccd852005-10-29 18:16:59 -0700756/*
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800757 * page migration
758 */
Christoph Lameterfc301282006-01-18 17:42:29 -0800759static void migrate_page_add(struct page *page, struct list_head *pagelist,
760 unsigned long flags)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800761{
762 /*
Christoph Lameterfc301282006-01-18 17:42:29 -0800763 * Avoid migrating a page that is shared with others.
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800764 */
Christoph Lameterb20a3502006-03-22 00:09:12 -0800765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
766 isolate_lru_page(page, pagelist);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800767}
768
Christoph Lameter742755a2006-06-23 02:03:55 -0700769static struct page *new_node_page(struct page *page, unsigned long node, int **x)
Christoph Lameter95a402c2006-06-23 02:03:53 -0700770{
Mel Gorman769848c2007-07-17 04:03:05 -0700771 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
Christoph Lameter95a402c2006-06-23 02:03:53 -0700772}
773
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800774/*
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800775 * Migrate pages from one node to a target node.
776 * Returns error or the number of pages not migrated.
777 */
Adrian Bunkdbcb0f12007-10-16 01:26:26 -0700778static int migrate_to_node(struct mm_struct *mm, int source, int dest,
779 int flags)
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800780{
781 nodemask_t nmask;
782 LIST_HEAD(pagelist);
783 int err = 0;
784
785 nodes_clear(nmask);
786 node_set(source, nmask);
787
788 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
789 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
790
Christoph Lameteraaa994b2006-06-23 02:03:52 -0700791 if (!list_empty(&pagelist))
Christoph Lameter95a402c2006-06-23 02:03:53 -0700792 err = migrate_pages(&pagelist, new_node_page, dest);
793
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800794 return err;
795}
796
797/*
798 * Move pages between the two nodesets so as to preserve the physical
799 * layout as much as possible.
Christoph Lameter39743882006-01-08 01:00:51 -0800800 *
801 * Returns the number of page that could not be moved.
802 */
803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800806 int busy = 0;
807 int err = 0;
808 nodemask_t tmp;
Christoph Lameter39743882006-01-08 01:00:51 -0800809
Lee Schermerhorn53f25562008-04-28 02:13:20 -0700810 down_read(&mm->mmap_sem);
Christoph Lameter39743882006-01-08 01:00:51 -0800811
Christoph Lameter7b2259b2006-06-25 05:46:48 -0700812 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
813 if (err)
814 goto out;
815
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800816/*
817 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
818 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
819 * bit in 'tmp', and return that <source, dest> pair for migration.
820 * The pair of nodemasks 'to' and 'from' define the map.
821 *
822 * If no pair of bits is found that way, fallback to picking some
823 * pair of 'source' and 'dest' bits that are not the same. If the
824 * 'source' and 'dest' bits are the same, this represents a node
825 * that will be migrating to itself, so no pages need move.
826 *
827 * If no bits are left in 'tmp', or if all remaining bits left
828 * in 'tmp' correspond to the same bit in 'to', return false
829 * (nothing left to migrate).
830 *
831 * This lets us pick a pair of nodes to migrate between, such that
832 * if possible the dest node is not already occupied by some other
833 * source node, minimizing the risk of overloading the memory on a
834 * node that would happen if we migrated incoming memory to a node
835 * before migrating outgoing memory source that same node.
836 *
837 * A single scan of tmp is sufficient. As we go, we remember the
838 * most recent <s, d> pair that moved (s != d). If we find a pair
839 * that not only moved, but what's better, moved to an empty slot
840 * (d is not set in tmp), then we break out then, with that pair.
841 * Otherwise when we finish scannng from_tmp, we at least have the
842 * most recent <s, d> pair that moved. If we get all the way through
843 * the scan of tmp without finding any node that moved, much less
844 * moved to an empty node, then there is nothing left worth migrating.
845 */
Christoph Lameterd4984712006-01-08 01:00:55 -0800846
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800847 tmp = *from_nodes;
848 while (!nodes_empty(tmp)) {
849 int s,d;
850 int source = -1;
851 int dest = 0;
852
853 for_each_node_mask(s, tmp) {
854 d = node_remap(s, *from_nodes, *to_nodes);
855 if (s == d)
856 continue;
857
858 source = s; /* Node moved. Memorize */
859 dest = d;
860
861 /* dest not in remaining from nodes? */
862 if (!node_isset(dest, tmp))
863 break;
864 }
865 if (source == -1)
866 break;
867
868 node_clear(source, tmp);
869 err = migrate_to_node(mm, source, dest, flags);
870 if (err > 0)
871 busy += err;
872 if (err < 0)
873 break;
Christoph Lameter39743882006-01-08 01:00:51 -0800874 }
Christoph Lameter7b2259b2006-06-25 05:46:48 -0700875out:
Christoph Lameter39743882006-01-08 01:00:51 -0800876 up_read(&mm->mmap_sem);
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800877 if (err < 0)
878 return err;
879 return busy;
Christoph Lameterb20a3502006-03-22 00:09:12 -0800880
Christoph Lameter39743882006-01-08 01:00:51 -0800881}
882
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -0800883/*
884 * Allocate a new page for page migration based on vma policy.
885 * Start assuming that page is mapped by vma pointed to by @private.
886 * Search forward from there, if not. N.B., this assumes that the
887 * list of pages handed to migrate_pages()--which is how we get here--
888 * is in virtual address order.
889 */
Christoph Lameter742755a2006-06-23 02:03:55 -0700890static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
Christoph Lameter95a402c2006-06-23 02:03:53 -0700891{
892 struct vm_area_struct *vma = (struct vm_area_struct *)private;
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -0800893 unsigned long uninitialized_var(address);
Christoph Lameter95a402c2006-06-23 02:03:53 -0700894
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -0800895 while (vma) {
896 address = page_address_in_vma(page, vma);
897 if (address != -EFAULT)
898 break;
899 vma = vma->vm_next;
900 }
901
902 /*
903 * if !vma, alloc_page_vma() will use task or system default policy
904 */
905 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
Christoph Lameter95a402c2006-06-23 02:03:53 -0700906}
Christoph Lameterb20a3502006-03-22 00:09:12 -0800907#else
908
/* Without CONFIG_MIGRATION there is nothing to queue: do nothing. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}
913
914int do_migrate_pages(struct mm_struct *mm,
915 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
916{
917 return -ENOSYS;
918}
Christoph Lameter95a402c2006-06-23 02:03:53 -0700919
Keith Owens69939742006-10-11 01:21:28 -0700920static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
Christoph Lameter95a402c2006-06-23 02:03:53 -0700921{
922 return NULL;
923}
Christoph Lameterb20a3502006-03-22 00:09:12 -0800924#endif
925
Adrian Bunkdbcb0f12007-10-16 01:26:26 -0700926static long do_mbind(unsigned long start, unsigned long len,
David Rientjes028fec42008-04-28 02:12:25 -0700927 unsigned short mode, unsigned short mode_flags,
928 nodemask_t *nmask, unsigned long flags)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800929{
930 struct vm_area_struct *vma;
931 struct mm_struct *mm = current->mm;
932 struct mempolicy *new;
933 unsigned long end;
934 int err;
935 LIST_HEAD(pagelist);
936
David Rientjesa3b51e02008-04-28 02:12:23 -0700937 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
938 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800939 return -EINVAL;
Christoph Lameter74c00242006-03-14 19:50:21 -0800940 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800941 return -EPERM;
942
943 if (start & ~PAGE_MASK)
944 return -EINVAL;
945
946 if (mode == MPOL_DEFAULT)
947 flags &= ~MPOL_MF_STRICT;
948
949 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
950 end = start + len;
951
952 if (end < start)
953 return -EINVAL;
954 if (end == start)
955 return 0;
956
David Rientjes028fec42008-04-28 02:12:25 -0700957 new = mpol_new(mode, mode_flags, nmask);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800958 if (IS_ERR(new))
959 return PTR_ERR(new);
960
961 /*
962 * If we are using the default policy then operation
963 * on discontinuous address spaces is okay after all
964 */
965 if (!new)
966 flags |= MPOL_MF_DISCONTIG_OK;
967
David Rientjes028fec42008-04-28 02:12:25 -0700968 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
969 start, start + len, mode, mode_flags,
970 nmask ? nodes_addr(*nmask)[0] : -1);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800971
972 down_write(&mm->mmap_sem);
973 vma = check_range(mm, start, end, nmask,
974 flags | MPOL_MF_INVERT, &pagelist);
975
976 err = PTR_ERR(vma);
977 if (!IS_ERR(vma)) {
978 int nr_failed = 0;
979
980 err = mbind_range(vma, start, end, new);
Christoph Lameter7e2ab152006-02-01 03:05:40 -0800981
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800982 if (!list_empty(&pagelist))
Christoph Lameter95a402c2006-06-23 02:03:53 -0700983 nr_failed = migrate_pages(&pagelist, new_vma_page,
984 (unsigned long)vma);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800985
986 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
987 err = -EIO;
988 }
Christoph Lameterb20a3502006-03-22 00:09:12 -0800989
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800990 up_write(&mm->mmap_sem);
Lee Schermerhornf0be3d32008-04-28 02:13:08 -0700991 mpol_put(new);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -0800992 return err;
993}
994
Christoph Lameter39743882006-01-08 01:00:51 -0800995/*
Christoph Lameter8bccd852005-10-29 18:16:59 -0700996 * User space interface with variable sized bitmaps for nodelists.
997 */
998
999/* Copy a node mask from user space. */
Christoph Lameter39743882006-01-08 01:00:51 -08001000static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
Christoph Lameter8bccd852005-10-29 18:16:59 -07001001 unsigned long maxnode)
1002{
1003 unsigned long k;
1004 unsigned long nlongs;
1005 unsigned long endmask;
1006
1007 --maxnode;
1008 nodes_clear(*nodes);
1009 if (maxnode == 0 || !nmask)
1010 return 0;
Andi Kleena9c930b2006-02-20 18:27:59 -08001011 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
Chris Wright636f13c2006-02-17 13:59:36 -08001012 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001013
1014 nlongs = BITS_TO_LONGS(maxnode);
1015 if ((maxnode % BITS_PER_LONG) == 0)
1016 endmask = ~0UL;
1017 else
1018 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1019
1020 /* When the user specified more nodes than supported just check
1021 if the non supported part is all zero. */
1022 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1023 if (nlongs > PAGE_SIZE/sizeof(long))
1024 return -EINVAL;
1025 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1026 unsigned long t;
1027 if (get_user(t, nmask + k))
1028 return -EFAULT;
1029 if (k == nlongs - 1) {
1030 if (t & endmask)
1031 return -EINVAL;
1032 } else if (t)
1033 return -EINVAL;
1034 }
1035 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1036 endmask = ~0UL;
1037 }
1038
1039 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1040 return -EFAULT;
1041 nodes_addr(*nodes)[nlongs-1] &= endmask;
1042 return 0;
1043}
1044
1045/* Copy a kernel node mask to user space */
1046static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1047 nodemask_t *nodes)
1048{
1049 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1050 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1051
1052 if (copy > nbytes) {
1053 if (copy > PAGE_SIZE)
1054 return -EINVAL;
1055 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1056 return -EFAULT;
1057 copy = nbytes;
1058 }
1059 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1060}
1061
1062asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1063 unsigned long mode,
1064 unsigned long __user *nmask, unsigned long maxnode,
1065 unsigned flags)
1066{
1067 nodemask_t nodes;
1068 int err;
David Rientjes028fec42008-04-28 02:12:25 -07001069 unsigned short mode_flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001070
David Rientjes028fec42008-04-28 02:12:25 -07001071 mode_flags = mode & MPOL_MODE_FLAGS;
1072 mode &= ~MPOL_MODE_FLAGS;
David Rientjesa3b51e02008-04-28 02:12:23 -07001073 if (mode >= MPOL_MAX)
1074 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001075 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1076 (mode_flags & MPOL_F_RELATIVE_NODES))
1077 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001078 err = get_nodes(&nodes, nmask, maxnode);
1079 if (err)
1080 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001081 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001082}
1083
1084/* Set the process memory policy */
1085asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1086 unsigned long maxnode)
1087{
1088 int err;
1089 nodemask_t nodes;
David Rientjes028fec42008-04-28 02:12:25 -07001090 unsigned short flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001091
David Rientjes028fec42008-04-28 02:12:25 -07001092 flags = mode & MPOL_MODE_FLAGS;
1093 mode &= ~MPOL_MODE_FLAGS;
1094 if ((unsigned int)mode >= MPOL_MAX)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001095 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001096 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1097 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001098 err = get_nodes(&nodes, nmask, maxnode);
1099 if (err)
1100 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001101 return do_set_mempolicy(mode, flags, &nodes);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001102}
1103
Christoph Lameter39743882006-01-08 01:00:51 -08001104asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1105 const unsigned long __user *old_nodes,
1106 const unsigned long __user *new_nodes)
1107{
1108 struct mm_struct *mm;
1109 struct task_struct *task;
1110 nodemask_t old;
1111 nodemask_t new;
1112 nodemask_t task_nodes;
1113 int err;
1114
1115 err = get_nodes(&old, old_nodes, maxnode);
1116 if (err)
1117 return err;
1118
1119 err = get_nodes(&new, new_nodes, maxnode);
1120 if (err)
1121 return err;
1122
1123 /* Find the mm_struct */
1124 read_lock(&tasklist_lock);
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07001125 task = pid ? find_task_by_vpid(pid) : current;
Christoph Lameter39743882006-01-08 01:00:51 -08001126 if (!task) {
1127 read_unlock(&tasklist_lock);
1128 return -ESRCH;
1129 }
1130 mm = get_task_mm(task);
1131 read_unlock(&tasklist_lock);
1132
1133 if (!mm)
1134 return -EINVAL;
1135
1136 /*
1137 * Check if this process has the right to modify the specified
1138 * process. The right exists if the process has administrative
Alexey Dobriyan7f927fc2006-03-28 01:56:53 -08001139 * capabilities, superuser privileges or the same
Christoph Lameter39743882006-01-08 01:00:51 -08001140 * userid as the target process.
1141 */
1142 if ((current->euid != task->suid) && (current->euid != task->uid) &&
1143 (current->uid != task->suid) && (current->uid != task->uid) &&
Christoph Lameter74c00242006-03-14 19:50:21 -08001144 !capable(CAP_SYS_NICE)) {
Christoph Lameter39743882006-01-08 01:00:51 -08001145 err = -EPERM;
1146 goto out;
1147 }
1148
1149 task_nodes = cpuset_mems_allowed(task);
1150 /* Is the user allowed to access the target nodes? */
Christoph Lameter74c00242006-03-14 19:50:21 -08001151 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
Christoph Lameter39743882006-01-08 01:00:51 -08001152 err = -EPERM;
1153 goto out;
1154 }
1155
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07001156 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
Christoph Lameter3b42d282007-08-31 00:12:08 -07001157 err = -EINVAL;
1158 goto out;
1159 }
1160
David Quigley86c3a762006-06-23 02:04:02 -07001161 err = security_task_movememory(task);
1162 if (err)
1163 goto out;
1164
Christoph Lameter511030b2006-02-28 16:58:57 -08001165 err = do_migrate_pages(mm, &old, &new,
Christoph Lameter74c00242006-03-14 19:50:21 -08001166 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
Christoph Lameter39743882006-01-08 01:00:51 -08001167out:
1168 mmput(mm);
1169 return err;
1170}
1171
1172
Christoph Lameter8bccd852005-10-29 18:16:59 -07001173/* Retrieve NUMA policy */
1174asmlinkage long sys_get_mempolicy(int __user *policy,
1175 unsigned long __user *nmask,
1176 unsigned long maxnode,
1177 unsigned long addr, unsigned long flags)
1178{
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001179 int err;
1180 int uninitialized_var(pval);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001181 nodemask_t nodes;
1182
1183 if (nmask != NULL && maxnode < MAX_NUMNODES)
1184 return -EINVAL;
1185
1186 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1187
1188 if (err)
1189 return err;
1190
1191 if (policy && put_user(pval, policy))
1192 return -EFAULT;
1193
1194 if (nmask)
1195 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1196
1197 return err;
1198}
1199
Linus Torvalds1da177e2005-04-16 15:20:36 -07001200#ifdef CONFIG_COMPAT
1201
1202asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1203 compat_ulong_t __user *nmask,
1204 compat_ulong_t maxnode,
1205 compat_ulong_t addr, compat_ulong_t flags)
1206{
1207 long err;
1208 unsigned long __user *nm = NULL;
1209 unsigned long nr_bits, alloc_size;
1210 DECLARE_BITMAP(bm, MAX_NUMNODES);
1211
1212 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1213 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1214
1215 if (nmask)
1216 nm = compat_alloc_user_space(alloc_size);
1217
1218 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1219
1220 if (!err && nmask) {
1221 err = copy_from_user(bm, nm, alloc_size);
1222 /* ensure entire bitmap is zeroed */
1223 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1224 err |= compat_put_bitmap(nmask, bm, nr_bits);
1225 }
1226
1227 return err;
1228}
1229
1230asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1231 compat_ulong_t maxnode)
1232{
1233 long err = 0;
1234 unsigned long __user *nm = NULL;
1235 unsigned long nr_bits, alloc_size;
1236 DECLARE_BITMAP(bm, MAX_NUMNODES);
1237
1238 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1239 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1240
1241 if (nmask) {
1242 err = compat_get_bitmap(bm, nmask, nr_bits);
1243 nm = compat_alloc_user_space(alloc_size);
1244 err |= copy_to_user(nm, bm, alloc_size);
1245 }
1246
1247 if (err)
1248 return -EFAULT;
1249
1250 return sys_set_mempolicy(mode, nm, nr_bits+1);
1251}
1252
1253asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1254 compat_ulong_t mode, compat_ulong_t __user *nmask,
1255 compat_ulong_t maxnode, compat_ulong_t flags)
1256{
1257 long err = 0;
1258 unsigned long __user *nm = NULL;
1259 unsigned long nr_bits, alloc_size;
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001260 nodemask_t bm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001261
1262 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1263 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1264
1265 if (nmask) {
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001266 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001267 nm = compat_alloc_user_space(alloc_size);
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001268 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001269 }
1270
1271 if (err)
1272 return -EFAULT;
1273
1274 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1275}
1276
1277#endif
1278
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001279/*
1280 * get_vma_policy(@task, @vma, @addr)
1281 * @task - task for fallback if vma policy == default
1282 * @vma - virtual memory area whose policy is sought
1283 * @addr - address in @vma for shared policy lookup
1284 *
1285 * Returns effective policy for a VMA at specified address.
1286 * Falls back to @task or system default policy, as necessary.
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001287 * Current or other task's task mempolicy and non-shared vma policies
1288 * are protected by the task's mmap_sem, which must be held for read by
1289 * the caller.
1290 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1291 * count--added by the get_policy() vm_op, as appropriate--to protect against
1292 * freeing by another task. It is the caller's responsibility to free the
1293 * extra reference for shared policies.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001294 */
Lee Schermerhornae4d8c12008-04-28 02:13:11 -07001295static struct mempolicy *get_vma_policy(struct task_struct *task,
Christoph Lameter48fce342006-01-08 01:01:03 -08001296 struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001297{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -07001298 struct mempolicy *pol = task->mempolicy;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299
1300 if (vma) {
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001301 if (vma->vm_ops && vma->vm_ops->get_policy) {
Lee Schermerhornae4d8c12008-04-28 02:13:11 -07001302 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1303 addr);
1304 if (vpol)
1305 pol = vpol;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001306 } else if (vma->vm_policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001307 pol = vma->vm_policy;
1308 }
1309 if (!pol)
1310 pol = &default_policy;
1311 return pol;
1312}
1313
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001314/*
1315 * Return a nodemask representing a mempolicy for filtering nodes for
1316 * page allocation
1317 */
1318static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
Mel Gorman19770b32008-04-28 02:12:18 -07001319{
1320 /* Lower zones don't get a nodemask applied for MPOL_BIND */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001321 if (unlikely(policy->mode == MPOL_BIND) &&
Mel Gorman19770b32008-04-28 02:12:18 -07001322 gfp_zone(gfp) >= policy_zone &&
1323 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1324 return &policy->v.nodes;
1325
1326 return NULL;
1327}
1328
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001329/* Return a zonelist indicated by gfp for node representing a mempolicy */
1330static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001331{
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001332 int nd = numa_node_id();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001334 switch (policy->mode) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001335 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001336 if (!(policy->flags & MPOL_F_LOCAL))
1337 nd = policy->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338 break;
1339 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07001340 /*
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001341 * Normally, MPOL_BIND allocations are node-local within the
1342 * allowed nodemask. However, if __GFP_THISNODE is set and the
1343 * current node is part of the mask, we use the zonelist for
1344 * the first node in the mask instead.
Mel Gorman19770b32008-04-28 02:12:18 -07001345 */
Mel Gorman19770b32008-04-28 02:12:18 -07001346 if (unlikely(gfp & __GFP_THISNODE) &&
1347 unlikely(!node_isset(nd, policy->v.nodes)))
1348 nd = first_node(policy->v.nodes);
1349 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350 case MPOL_INTERLEAVE: /* should not happen */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001351 break;
1352 default:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001353 BUG();
1354 }
Mel Gorman0e884602008-04-28 02:12:14 -07001355 return node_zonelist(nd, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356}
1357
1358/* Do dynamic interleaving for a process */
1359static unsigned interleave_nodes(struct mempolicy *policy)
1360{
1361 unsigned nid, next;
1362 struct task_struct *me = current;
1363
1364 nid = me->il_next;
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001365 next = next_node(nid, policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001366 if (next >= MAX_NUMNODES)
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001367 next = first_node(policy->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001368 if (next < MAX_NUMNODES)
1369 me->il_next = next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370 return nid;
1371}
1372
Christoph Lameterdc85da12006-01-18 17:42:36 -08001373/*
1374 * Depending on the memory policy provide a node from which to allocate the
1375 * next slab entry.
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001376 * @policy must be protected by freeing by the caller. If @policy is
1377 * the current task's mempolicy, this protection is implicit, as only the
1378 * task can change it's policy. The system default policy requires no
1379 * such protection.
Christoph Lameterdc85da12006-01-18 17:42:36 -08001380 */
1381unsigned slab_node(struct mempolicy *policy)
1382{
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001383 if (!policy || policy->flags & MPOL_F_LOCAL)
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001384 return numa_node_id();
Christoph Lameter765c4502006-09-27 01:50:08 -07001385
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001386 switch (policy->mode) {
1387 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001388 /*
1389 * handled MPOL_F_LOCAL above
1390 */
1391 return policy->v.preferred_node;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001392
Christoph Lameterdc85da12006-01-18 17:42:36 -08001393 case MPOL_INTERLEAVE:
1394 return interleave_nodes(policy);
1395
Mel Gormandd1a2392008-04-28 02:12:17 -07001396 case MPOL_BIND: {
Christoph Lameterdc85da12006-01-18 17:42:36 -08001397 /*
1398 * Follow bind policy behavior and start allocation at the
1399 * first node.
1400 */
Mel Gorman19770b32008-04-28 02:12:18 -07001401 struct zonelist *zonelist;
1402 struct zone *zone;
1403 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1404 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1405 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1406 &policy->v.nodes,
1407 &zone);
1408 return zone->node;
Mel Gormandd1a2392008-04-28 02:12:17 -07001409 }
Christoph Lameterdc85da12006-01-18 17:42:36 -08001410
Christoph Lameterdc85da12006-01-18 17:42:36 -08001411 default:
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001412 BUG();
Christoph Lameterdc85da12006-01-18 17:42:36 -08001413 }
1414}
1415
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416/* Do static interleaving for a VMA with known offset. */
1417static unsigned offset_il_node(struct mempolicy *pol,
1418 struct vm_area_struct *vma, unsigned long off)
1419{
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001420 unsigned nnodes = nodes_weight(pol->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001421 unsigned target;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422 int c;
1423 int nid = -1;
1424
David Rientjesf5b087b2008-04-28 02:12:27 -07001425 if (!nnodes)
1426 return numa_node_id();
1427 target = (unsigned int)off % nnodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428 c = 0;
1429 do {
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001430 nid = next_node(nid, pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001431 c++;
1432 } while (c <= target);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433 return nid;
1434}
1435
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001436/* Determine a node number for interleave */
1437static inline unsigned interleave_nid(struct mempolicy *pol,
1438 struct vm_area_struct *vma, unsigned long addr, int shift)
1439{
1440 if (vma) {
1441 unsigned long off;
1442
Nishanth Aravamudan3b98b082006-08-31 21:27:53 -07001443 /*
1444 * for small pages, there is no difference between
1445 * shift and PAGE_SHIFT, so the bit-shift is safe.
1446 * for huge pages, since vm_pgoff is in units of small
1447 * pages, we need to shift off the always 0 bits to get
1448 * a useful offset.
1449 */
1450 BUG_ON(shift < PAGE_SHIFT);
1451 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001452 off += (addr - vma->vm_start) >> shift;
1453 return offset_il_node(pol, vma, off);
1454 } else
1455 return interleave_nodes(pol);
1456}
1457
Chen, Kenneth W00ac59ad2006-02-03 21:51:14 +01001458#ifdef CONFIG_HUGETLBFS
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001459/*
1460 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1461 * @vma = virtual memory area whose policy is sought
1462 * @addr = address in @vma for shared policy lookup and interleave policy
1463 * @gfp_flags = for requested zone
Mel Gorman19770b32008-04-28 02:12:18 -07001464 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1465 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001466 *
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001467 * Returns a zonelist suitable for a huge page allocation and a pointer
1468 * to the struct mempolicy for conditional unref after allocation.
1469 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1470 * @nodemask for filtering the zonelist.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001471 */
Mel Gorman396faf02007-07-17 04:03:13 -07001472struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
Mel Gorman19770b32008-04-28 02:12:18 -07001473 gfp_t gfp_flags, struct mempolicy **mpol,
1474 nodemask_t **nodemask)
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001475{
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001476 struct zonelist *zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001477
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001478 *mpol = get_vma_policy(current, vma, addr);
Mel Gorman19770b32008-04-28 02:12:18 -07001479 *nodemask = NULL; /* assume !MPOL_BIND */
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001480
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001481 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1482 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
Andi Kleena5516432008-07-23 21:27:41 -07001483 huge_page_shift(hstate_vma(vma))), gfp_flags);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001484 } else {
1485 zl = policy_zonelist(gfp_flags, *mpol);
1486 if ((*mpol)->mode == MPOL_BIND)
1487 *nodemask = &(*mpol)->v.nodes;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001488 }
1489 return zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001490}
Chen, Kenneth W00ac59ad2006-02-03 21:51:14 +01001491#endif
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001492
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493/* Allocate a page in interleaved policy.
1494 Own path because it needs to do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07001495static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1496 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497{
1498 struct zonelist *zl;
1499 struct page *page;
1500
Mel Gorman0e884602008-04-28 02:12:14 -07001501 zl = node_zonelist(nid, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 page = __alloc_pages(gfp, order, zl);
Mel Gormandd1a2392008-04-28 02:12:17 -07001503 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
Christoph Lameterca889e62006-06-30 01:55:44 -07001504 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505 return page;
1506}
1507
1508/**
1509 * alloc_page_vma - Allocate a page for a VMA.
1510 *
1511 * @gfp:
1512 * %GFP_USER user allocation.
1513 * %GFP_KERNEL kernel allocations,
1514 * %GFP_HIGHMEM highmem/user allocations,
1515 * %GFP_FS allocation should not call back into a file system.
1516 * %GFP_ATOMIC don't sleep.
1517 *
1518 * @vma: Pointer to VMA or NULL if not available.
1519 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1520 *
1521 * This function allocates a page from the kernel page pool and applies
1522 * a NUMA policy associated with the VMA or the current process.
1523 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1524 * mm_struct of the VMA to prevent it from going away. Should be used for
1525 * all allocations for pages that will be mapped into
1526 * user space. Returns NULL when no page can be allocated.
1527 *
1528 * Should be called with the mm_sem of the vma hold.
1529 */
1530struct page *
Al Virodd0fc662005-10-07 07:46:04 +01001531alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532{
Christoph Lameter6e21c8f2005-09-03 15:54:45 -07001533 struct mempolicy *pol = get_vma_policy(current, vma, addr);
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001534 struct zonelist *zl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001536 cpuset_update_task_memory_state();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001538 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001540
1541 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001542 mpol_cond_put(pol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 return alloc_page_interleave(gfp, 0, nid);
1544 }
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001545 zl = policy_zonelist(gfp, pol);
1546 if (unlikely(mpol_needs_cond_ref(pol))) {
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001547 /*
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001548 * slow path: ref counted shared policy
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001549 */
Mel Gorman19770b32008-04-28 02:12:18 -07001550 struct page *page = __alloc_pages_nodemask(gfp, 0,
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001551 zl, policy_nodemask(gfp, pol));
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001552 __mpol_put(pol);
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001553 return page;
1554 }
1555 /*
1556 * fast path: default or task policy
1557 */
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001558 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559}
1560
1561/**
1562 * alloc_pages_current - Allocate pages.
1563 *
1564 * @gfp:
1565 * %GFP_USER user allocation,
1566 * %GFP_KERNEL kernel allocation,
1567 * %GFP_HIGHMEM highmem allocation,
1568 * %GFP_FS don't call back into a file system.
1569 * %GFP_ATOMIC don't sleep.
1570 * @order: Power of two of allocation size in pages. 0 is a single page.
1571 *
1572 * Allocate a page from the kernel page pool. When not in
1573 * interrupt context and apply the current process NUMA policy.
1574 * Returns NULL when no page can be allocated.
1575 *
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001576 * Don't call cpuset_update_task_memory_state() unless
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 * 1) it's ok to take cpuset_sem (can WAIT), and
1578 * 2) allocating for current task (not interrupt).
1579 */
Al Virodd0fc662005-10-07 07:46:04 +01001580struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581{
1582 struct mempolicy *pol = current->mempolicy;
1583
1584 if ((gfp & __GFP_WAIT) && !in_interrupt())
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08001585 cpuset_update_task_memory_state();
Christoph Lameter9b819d22006-09-25 23:31:40 -07001586 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 pol = &default_policy;
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001588
1589 /*
1590 * No reference counting needed for current->mempolicy
1591 * nor system default_policy
1592 */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001593 if (pol->mode == MPOL_INTERLEAVE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
Mel Gorman19770b32008-04-28 02:12:18 -07001595 return __alloc_pages_nodemask(gfp, order,
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001596 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001597}
1598EXPORT_SYMBOL(alloc_pages_current);
1599
Paul Jackson42253992006-01-08 01:01:59 -08001600/*
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07001601 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
Paul Jackson42253992006-01-08 01:01:59 -08001602 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1603 * with the mems_allowed returned by cpuset_mems_allowed(). This
1604 * keeps mempolicies cpuset relative after its cpuset moves. See
1605 * further kernel/cpuset.c update_nodemask().
1606 */
Paul Jackson42253992006-01-08 01:01:59 -08001607
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07001608/* Slow path of a mempolicy duplicate */
1609struct mempolicy *__mpol_dup(struct mempolicy *old)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610{
1611 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1612
1613 if (!new)
1614 return ERR_PTR(-ENOMEM);
Paul Jackson42253992006-01-08 01:01:59 -08001615 if (current_cpuset_is_being_rebound()) {
1616 nodemask_t mems = cpuset_mems_allowed(current);
1617 mpol_rebind_policy(old, &mems);
1618 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001619 *new = *old;
1620 atomic_set(&new->refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001621 return new;
1622}
1623
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001624/*
1625 * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1626 * eliminate the * MPOL_F_* flags that require conditional ref and
1627 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
1628 * after return. Use the returned value.
1629 *
1630 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1631 * policy lookup, even if the policy needs/has extra ref on lookup.
1632 * shmem_readahead needs this.
1633 */
1634struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1635 struct mempolicy *frompol)
1636{
1637 if (!mpol_needs_cond_ref(frompol))
1638 return frompol;
1639
1640 *tompol = *frompol;
1641 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
1642 __mpol_put(frompol);
1643 return tompol;
1644}
1645
David Rientjesf5b087b2008-04-28 02:12:27 -07001646static int mpol_match_intent(const struct mempolicy *a,
1647 const struct mempolicy *b)
1648{
1649 if (a->flags != b->flags)
1650 return 0;
1651 if (!mpol_store_user_nodemask(a))
1652 return 1;
1653 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1654}
1655
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656/* Slow path of a mempolicy comparison */
1657int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1658{
1659 if (!a || !b)
1660 return 0;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001661 if (a->mode != b->mode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001662 return 0;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001663 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
David Rientjesf5b087b2008-04-28 02:12:27 -07001664 return 0;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001665 switch (a->mode) {
Mel Gorman19770b32008-04-28 02:12:18 -07001666 case MPOL_BIND:
1667 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668 case MPOL_INTERLEAVE:
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001669 return nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001671 return a->v.preferred_node == b->v.preferred_node &&
1672 a->flags == b->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 default:
1674 BUG();
1675 return 0;
1676 }
1677}
1678
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680 * Shared memory backing store policy support.
1681 *
1682 * Remember policies even when nobody has shared memory mapped.
1683 * The policies are kept in Red-Black tree linked from the inode.
1684 * They are protected by the sp->lock spinlock, which should be held
1685 * for any accesses to the tree.
1686 */
1687
1688/* lookup first element intersecting start-end */
1689/* Caller holds sp->lock */
1690static struct sp_node *
1691sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1692{
1693 struct rb_node *n = sp->root.rb_node;
1694
1695 while (n) {
1696 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1697
1698 if (start >= p->end)
1699 n = n->rb_right;
1700 else if (end <= p->start)
1701 n = n->rb_left;
1702 else
1703 break;
1704 }
1705 if (!n)
1706 return NULL;
1707 for (;;) {
1708 struct sp_node *w = NULL;
1709 struct rb_node *prev = rb_prev(n);
1710 if (!prev)
1711 break;
1712 w = rb_entry(prev, struct sp_node, nd);
1713 if (w->end <= start)
1714 break;
1715 n = prev;
1716 }
1717 return rb_entry(n, struct sp_node, nd);
1718}
1719
1720/* Insert a new shared policy into the list. */
1721/* Caller holds sp->lock */
1722static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1723{
1724 struct rb_node **p = &sp->root.rb_node;
1725 struct rb_node *parent = NULL;
1726 struct sp_node *nd;
1727
1728 while (*p) {
1729 parent = *p;
1730 nd = rb_entry(parent, struct sp_node, nd);
1731 if (new->start < nd->start)
1732 p = &(*p)->rb_left;
1733 else if (new->end > nd->end)
1734 p = &(*p)->rb_right;
1735 else
1736 BUG();
1737 }
1738 rb_link_node(&new->nd, parent, p);
1739 rb_insert_color(&new->nd, &sp->root);
Paul Mundt140d5a42007-07-15 23:38:16 -07001740 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001741 new->policy ? new->policy->mode : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742}
1743
1744/* Find shared policy intersecting idx */
1745struct mempolicy *
1746mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1747{
1748 struct mempolicy *pol = NULL;
1749 struct sp_node *sn;
1750
1751 if (!sp->root.rb_node)
1752 return NULL;
1753 spin_lock(&sp->lock);
1754 sn = sp_lookup(sp, idx, idx+1);
1755 if (sn) {
1756 mpol_get(sn->policy);
1757 pol = sn->policy;
1758 }
1759 spin_unlock(&sp->lock);
1760 return pol;
1761}
1762
1763static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1764{
Paul Mundt140d5a42007-07-15 23:38:16 -07001765 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 rb_erase(&n->nd, &sp->root);
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001767 mpol_put(n->policy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001768 kmem_cache_free(sn_cache, n);
1769}
1770
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001771static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1772 struct mempolicy *pol)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001773{
1774 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1775
1776 if (!n)
1777 return NULL;
1778 n->start = start;
1779 n->end = end;
1780 mpol_get(pol);
Lee Schermerhornaab0b102008-04-28 02:13:13 -07001781 pol->flags |= MPOL_F_SHARED; /* for unref */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001782 n->policy = pol;
1783 return n;
1784}
1785
1786/* Replace a policy range. */
1787static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1788 unsigned long end, struct sp_node *new)
1789{
1790 struct sp_node *n, *new2 = NULL;
1791
1792restart:
1793 spin_lock(&sp->lock);
1794 n = sp_lookup(sp, start, end);
1795 /* Take care of old policies in the same range. */
1796 while (n && n->start < end) {
1797 struct rb_node *next = rb_next(&n->nd);
1798 if (n->start >= start) {
1799 if (n->end <= end)
1800 sp_delete(sp, n);
1801 else
1802 n->start = end;
1803 } else {
1804 /* Old policy spanning whole new range. */
1805 if (n->end > end) {
1806 if (!new2) {
1807 spin_unlock(&sp->lock);
1808 new2 = sp_alloc(end, n->end, n->policy);
1809 if (!new2)
1810 return -ENOMEM;
1811 goto restart;
1812 }
1813 n->end = start;
1814 sp_insert(sp, new2);
1815 new2 = NULL;
1816 break;
1817 } else
1818 n->end = start;
1819 }
1820 if (!next)
1821 break;
1822 n = rb_entry(next, struct sp_node, nd);
1823 }
1824 if (new)
1825 sp_insert(sp, new);
1826 spin_unlock(&sp->lock);
1827 if (new2) {
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001828 mpol_put(new2->policy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001829 kmem_cache_free(sn_cache, new2);
1830 }
1831 return 0;
1832}
1833
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001834/**
1835 * mpol_shared_policy_init - initialize shared policy for inode
1836 * @sp: pointer to inode shared policy
1837 * @mpol: struct mempolicy to install
1838 *
1839 * Install non-NULL @mpol in inode's shared policy rb-tree.
1840 * On entry, the current task has a reference on a non-NULL @mpol.
1841 * This must be released on exit.
1842 */
1843void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
Robin Holt7339ff82006-01-14 13:20:48 -08001844{
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001845 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1846 spin_lock_init(&sp->lock);
Robin Holt7339ff82006-01-14 13:20:48 -08001847
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001848 if (mpol) {
1849 struct vm_area_struct pvma;
1850 struct mempolicy *new;
Robin Holt7339ff82006-01-14 13:20:48 -08001851
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001852 /* contextualize the tmpfs mount point mempolicy */
1853 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1854 mpol_put(mpol); /* drop our ref on sb mpol */
1855 if (IS_ERR(new))
1856 return; /* no valid nodemask intersection */
Robin Holt7339ff82006-01-14 13:20:48 -08001857
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001858 /* Create pseudo-vma that contains just the policy */
1859 memset(&pvma, 0, sizeof(struct vm_area_struct));
1860 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1861 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1862 mpol_put(new); /* drop initial ref */
Robin Holt7339ff82006-01-14 13:20:48 -08001863 }
1864}
1865
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866int mpol_set_shared_policy(struct shared_policy *info,
1867 struct vm_area_struct *vma, struct mempolicy *npol)
1868{
1869 int err;
1870 struct sp_node *new = NULL;
1871 unsigned long sz = vma_pages(vma);
1872
David Rientjes028fec42008-04-28 02:12:25 -07001873 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001874 vma->vm_pgoff,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001875 sz, npol ? npol->mode : -1,
David Rientjes028fec42008-04-28 02:12:25 -07001876 npol ? npol->flags : -1,
Paul Mundt140d5a42007-07-15 23:38:16 -07001877 npol ? nodes_addr(npol->v.nodes)[0] : -1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001878
1879 if (npol) {
1880 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1881 if (!new)
1882 return -ENOMEM;
1883 }
1884 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1885 if (err && new)
1886 kmem_cache_free(sn_cache, new);
1887 return err;
1888}
1889
1890/* Free a backing policy store on inode delete. */
1891void mpol_free_shared_policy(struct shared_policy *p)
1892{
1893 struct sp_node *n;
1894 struct rb_node *next;
1895
1896 if (!p->root.rb_node)
1897 return;
1898 spin_lock(&p->lock);
1899 next = rb_first(&p->root);
1900 while (next) {
1901 n = rb_entry(next, struct sp_node, nd);
1902 next = rb_next(&n->nd);
Andi Kleen90c50292005-07-27 11:43:50 -07001903 rb_erase(&n->nd, &p->root);
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001904 mpol_put(n->policy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905 kmem_cache_free(sn_cache, n);
1906 }
1907 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908}
1909
1910/* assumes fs == KERNEL_DS */
1911void __init numa_policy_init(void)
1912{
Paul Mundtb71636e22007-07-15 23:38:15 -07001913 nodemask_t interleave_nodes;
1914 unsigned long largest = 0;
1915 int nid, prefer = 0;
1916
Linus Torvalds1da177e2005-04-16 15:20:36 -07001917 policy_cache = kmem_cache_create("numa_policy",
1918 sizeof(struct mempolicy),
Paul Mundt20c2df82007-07-20 10:11:58 +09001919 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920
1921 sn_cache = kmem_cache_create("shared_policy_node",
1922 sizeof(struct sp_node),
Paul Mundt20c2df82007-07-20 10:11:58 +09001923 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924
Paul Mundtb71636e22007-07-15 23:38:15 -07001925 /*
1926 * Set interleaving policy for system init. Interleaving is only
1927 * enabled across suitably sized nodes (default is >= 16MB), or
1928 * fall back to the largest node if they're all smaller.
1929 */
1930 nodes_clear(interleave_nodes);
Christoph Lameter56bbd652007-10-16 01:25:35 -07001931 for_each_node_state(nid, N_HIGH_MEMORY) {
Paul Mundtb71636e22007-07-15 23:38:15 -07001932 unsigned long total_pages = node_present_pages(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933
Paul Mundtb71636e22007-07-15 23:38:15 -07001934 /* Preserve the largest node */
1935 if (largest < total_pages) {
1936 largest = total_pages;
1937 prefer = nid;
1938 }
1939
1940 /* Interleave this node? */
1941 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1942 node_set(nid, interleave_nodes);
1943 }
1944
1945 /* All too small, use the largest */
1946 if (unlikely(nodes_empty(interleave_nodes)))
1947 node_set(prefer, interleave_nodes);
1948
David Rientjes028fec42008-04-28 02:12:25 -07001949 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950 printk("numa_policy_init: interleaving failed\n");
1951}
1952
Christoph Lameter8bccd852005-10-29 18:16:59 -07001953/* Reset policy of current process to default */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954void numa_default_policy(void)
1955{
David Rientjes028fec42008-04-28 02:12:25 -07001956 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001957}
Paul Jackson68860ec2005-10-30 15:02:36 -08001958
Paul Jackson42253992006-01-08 01:01:59 -08001959/*
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001960 * Parse and format mempolicy from/to strings
1961 */
1962
1963/*
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001964 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07001965 * Used only for mpol_parse_str() and mpol_to_str()
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001966 */
Lee Schermerhorn53f25562008-04-28 02:13:20 -07001967#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
Helge Deller15ad7cd2006-12-06 20:40:36 -08001968static const char * const policy_types[] =
Lee Schermerhorn53f25562008-04-28 02:13:20 -07001969 { "default", "prefer", "bind", "interleave", "local" };
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08001970
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001971
1972#ifdef CONFIG_TMPFS
1973/**
1974 * mpol_parse_str - parse string to mempolicy
1975 * @str: string containing mempolicy to parse
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001976 * @mpol: pointer to struct mempolicy pointer, returned on success.
1977 * @no_context: flag whether to "contextualize" the mempolicy
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001978 *
1979 * Format of input:
1980 * <mode>[=<flags>][:<nodelist>]
1981 *
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001982 * if @no_context is true, save the input nodemask in w.user_nodemask in
1983 * the returned mempolicy. This will be used to "clone" the mempolicy in
1984 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
1985 * mount option. Note that if 'static' or 'relative' mode flags were
1986 * specified, the input nodemask will already have been saved. Saving
1987 * it again is redundant, but safe.
1988 *
1989 * On success, returns 0, else 1
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001990 */
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001991int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001992{
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07001993 struct mempolicy *new = NULL;
1994 unsigned short uninitialized_var(mode);
1995 unsigned short uninitialized_var(mode_flags);
1996 nodemask_t nodes;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07001997 char *nodelist = strchr(str, ':');
1998 char *flags = strchr(str, '=');
1999 int i;
2000 int err = 1;
2001
2002 if (nodelist) {
2003 /* NUL-terminate mode or flags string */
2004 *nodelist++ = '\0';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002005 if (nodelist_parse(nodelist, nodes))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002006 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002007 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002008 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002009 } else
2010 nodes_clear(nodes);
2011
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002012 if (flags)
2013 *flags++ = '\0'; /* terminate mode string */
2014
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002015 for (i = 0; i <= MPOL_LOCAL; i++) {
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002016 if (!strcmp(str, policy_types[i])) {
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002017 mode = i;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002018 break;
2019 }
2020 }
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002021 if (i > MPOL_LOCAL)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002022 goto out;
2023
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002024 switch (mode) {
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002025 case MPOL_PREFERRED:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002026 /*
2027 * Insist on a nodelist of one node only
2028 */
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002029 if (nodelist) {
2030 char *rest = nodelist;
2031 while (isdigit(*rest))
2032 rest++;
2033 if (!*rest)
2034 err = 0;
2035 }
2036 break;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002037 case MPOL_INTERLEAVE:
2038 /*
2039 * Default to online nodes with memory if no nodelist
2040 */
2041 if (!nodelist)
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002042 nodes = node_states[N_HIGH_MEMORY];
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002043 err = 0;
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002044 break;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002045 case MPOL_LOCAL:
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002046 /*
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002047 * Don't allow a nodelist; mpol_new() checks flags
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002048 */
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002049 if (nodelist)
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002050 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002051 mode = MPOL_PREFERRED;
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002052 break;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002053
2054 /*
2055 * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
2056 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2057 */
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002058 }
2059
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002060 mode_flags = 0;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002061 if (flags) {
2062 /*
2063 * Currently, we only support two mutually exclusive
2064 * mode flags.
2065 */
2066 if (!strcmp(flags, "static"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002067 mode_flags |= MPOL_F_STATIC_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002068 else if (!strcmp(flags, "relative"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002069 mode_flags |= MPOL_F_RELATIVE_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002070 else
2071 err = 1;
2072 }
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002073
2074 new = mpol_new(mode, mode_flags, &nodes);
2075 if (IS_ERR(new))
2076 err = 1;
2077 else if (no_context)
2078 new->w.user_nodemask = nodes; /* save for contextualization */
2079
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002080out:
2081 /* Restore string for error message */
2082 if (nodelist)
2083 *--nodelist = ':';
2084 if (flags)
2085 *--flags = '=';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002086 if (!err)
2087 *mpol = new;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002088 return err;
2089}
2090#endif /* CONFIG_TMPFS */
2091
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002092/**
2093 * mpol_to_str - format a mempolicy structure for printing
2094 * @buffer: to contain formatted mempolicy string
2095 * @maxlen: length of @buffer
2096 * @pol: pointer to mempolicy to be formatted
2097 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2098 *
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002099 * Convert a mempolicy into a string.
2100 * Returns the number of characters in buffer (if positive)
2101 * or an error (negative)
2102 */
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002103int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002104{
2105 char *p = buffer;
2106 int l;
2107 nodemask_t nodes;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002108 unsigned short mode;
David Rientjesf5b087b2008-04-28 02:12:27 -07002109 unsigned short flags = pol ? pol->flags : 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002110
Lee Schermerhorn22919902008-04-28 02:13:22 -07002111 /*
2112 * Sanity check: room for longest mode, flag and some nodes
2113 */
2114 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2115
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002116 if (!pol || pol == &default_policy)
2117 mode = MPOL_DEFAULT;
2118 else
2119 mode = pol->mode;
2120
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002121 switch (mode) {
2122 case MPOL_DEFAULT:
2123 nodes_clear(nodes);
2124 break;
2125
2126 case MPOL_PREFERRED:
2127 nodes_clear(nodes);
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002128 if (flags & MPOL_F_LOCAL)
Lee Schermerhorn53f25562008-04-28 02:13:20 -07002129 mode = MPOL_LOCAL; /* pseudo-policy */
2130 else
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002131 node_set(pol->v.preferred_node, nodes);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002132 break;
2133
2134 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07002135 /* Fall through */
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002136 case MPOL_INTERLEAVE:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002137 if (no_context)
2138 nodes = pol->w.user_nodemask;
2139 else
2140 nodes = pol->v.nodes;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002141 break;
2142
2143 default:
2144 BUG();
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002145 }
2146
2147 l = strlen(policy_types[mode]);
Lee Schermerhorn53f25562008-04-28 02:13:20 -07002148 if (buffer + maxlen < p + l + 1)
2149 return -ENOSPC;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002150
2151 strcpy(p, policy_types[mode]);
2152 p += l;
2153
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002154 if (flags & MPOL_MODE_FLAGS) {
David Rientjesf5b087b2008-04-28 02:12:27 -07002155 if (buffer + maxlen < p + 2)
2156 return -ENOSPC;
2157 *p++ = '=';
2158
Lee Schermerhorn22919902008-04-28 02:13:22 -07002159 /*
2160 * Currently, the only defined flags are mutually exclusive
2161 */
David Rientjesf5b087b2008-04-28 02:12:27 -07002162 if (flags & MPOL_F_STATIC_NODES)
Lee Schermerhorn22919902008-04-28 02:13:22 -07002163 p += snprintf(p, buffer + maxlen - p, "static");
2164 else if (flags & MPOL_F_RELATIVE_NODES)
2165 p += snprintf(p, buffer + maxlen - p, "relative");
David Rientjesf5b087b2008-04-28 02:12:27 -07002166 }
2167
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002168 if (!nodes_empty(nodes)) {
2169 if (buffer + maxlen < p + 2)
2170 return -ENOSPC;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002171 *p++ = ':';
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002172 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2173 }
2174 return p - buffer;
2175}
2176
2177struct numa_maps {
2178 unsigned long pages;
2179 unsigned long anon;
Christoph Lameter397874d2006-03-06 15:42:53 -08002180 unsigned long active;
2181 unsigned long writeback;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002182 unsigned long mapcount_max;
Christoph Lameter397874d2006-03-06 15:42:53 -08002183 unsigned long dirty;
2184 unsigned long swapcache;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002185 unsigned long node[MAX_NUMNODES];
2186};
2187
Christoph Lameter397874d2006-03-06 15:42:53 -08002188static void gather_stats(struct page *page, void *private, int pte_dirty)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002189{
2190 struct numa_maps *md = private;
2191 int count = page_mapcount(page);
2192
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002193 md->pages++;
Christoph Lameter397874d2006-03-06 15:42:53 -08002194 if (pte_dirty || PageDirty(page))
2195 md->dirty++;
2196
2197 if (PageSwapCache(page))
2198 md->swapcache++;
2199
2200 if (PageActive(page))
2201 md->active++;
2202
2203 if (PageWriteback(page))
2204 md->writeback++;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002205
2206 if (PageAnon(page))
2207 md->anon++;
2208
Christoph Lameter397874d2006-03-06 15:42:53 -08002209 if (count > md->mapcount_max)
2210 md->mapcount_max = count;
2211
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002212 md->node[page_to_nid(page)]++;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002213}
2214
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk the huge pages mapped between @start and @end of @vma, one huge
 * page at a time, and account every populated entry in @md.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);

	for (addr = start; addr < end; addr += sz) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm,
						addr & huge_page_mask(h));
		pte_t pte;

		if (!ptep)
			continue;

		/* Work on one snapshot of the entry from here on. */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		/*
		 * Take the dirty bit from the same snapshot the page came
		 * from, not from a second read of *ptep.
		 */
		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to walk. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
Christoph Lameter397874d2006-03-06 15:42:53 -08002251
Lee Schermerhorn53f25562008-04-28 02:13:20 -07002252/*
2253 * Display pages allocated per node and memory policy via /proc.
2254 */
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002255int show_numa_map(struct seq_file *m, void *v)
2256{
Eric W. Biederman99f89552006-06-26 00:25:55 -07002257 struct proc_maps_private *priv = m->private;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002258 struct vm_area_struct *vma = v;
2259 struct numa_maps *md;
Christoph Lameter397874d2006-03-06 15:42:53 -08002260 struct file *file = vma->vm_file;
2261 struct mm_struct *mm = vma->vm_mm;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07002262 struct mempolicy *pol;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002263 int n;
2264 char buffer[50];
2265
Christoph Lameter397874d2006-03-06 15:42:53 -08002266 if (!mm)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002267 return 0;
2268
2269 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2270 if (!md)
2271 return 0;
2272
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07002273 pol = get_vma_policy(priv->task, vma, vma->vm_start);
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002274 mpol_to_str(buffer, sizeof(buffer), pol, 0);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002275 mpol_cond_put(pol);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002276
Christoph Lameter397874d2006-03-06 15:42:53 -08002277 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002278
Christoph Lameter397874d2006-03-06 15:42:53 -08002279 if (file) {
2280 seq_printf(m, " file=");
Jan Blunckc32c2f62008-02-14 19:38:43 -08002281 seq_path(m, &file->f_path, "\n\t= ");
Christoph Lameter397874d2006-03-06 15:42:53 -08002282 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2283 seq_printf(m, " heap");
2284 } else if (vma->vm_start <= mm->start_stack &&
2285 vma->vm_end >= mm->start_stack) {
2286 seq_printf(m, " stack");
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002287 }
Christoph Lameter397874d2006-03-06 15:42:53 -08002288
2289 if (is_vm_hugetlb_page(vma)) {
2290 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2291 seq_printf(m, " huge");
2292 } else {
2293 check_pgd_range(vma, vma->vm_start, vma->vm_end,
Christoph Lameter56bbd652007-10-16 01:25:35 -07002294 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
Christoph Lameter397874d2006-03-06 15:42:53 -08002295 }
2296
2297 if (!md->pages)
2298 goto out;
2299
2300 if (md->anon)
2301 seq_printf(m," anon=%lu",md->anon);
2302
2303 if (md->dirty)
2304 seq_printf(m," dirty=%lu",md->dirty);
2305
2306 if (md->pages != md->anon && md->pages != md->dirty)
2307 seq_printf(m, " mapped=%lu", md->pages);
2308
2309 if (md->mapcount_max > 1)
2310 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2311
2312 if (md->swapcache)
2313 seq_printf(m," swapcache=%lu", md->swapcache);
2314
2315 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2316 seq_printf(m," active=%lu", md->active);
2317
2318 if (md->writeback)
2319 seq_printf(m," writeback=%lu", md->writeback);
2320
Christoph Lameter56bbd652007-10-16 01:25:35 -07002321 for_each_node_state(n, N_HIGH_MEMORY)
Christoph Lameter397874d2006-03-06 15:42:53 -08002322 if (md->node[n])
2323 seq_printf(m, " N%d=%lu", n, md->node[n]);
2324out:
2325 seq_putc(m, '\n');
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002326 kfree(md);
2327
2328 if (m->count < m->size)
Eric W. Biederman99f89552006-06-26 00:25:55 -07002329 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002330 return 0;
2331}