/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to the allowed memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
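
/*
 * Illustrative sketch, not part of the original file: roughly how userspace
 * is expected to request these policies through the set_mempolicy()/mbind()
 * wrappers in libnuma's <numaif.h>.  The node numbers and sizes below are
 * assumptions for the example only, and the program would be linked with
 * -lnuma (or issue the raw syscalls directly).
 *
 *	#include <numaif.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		unsigned long interleave_nodes = 0x3;	// assume nodes 0 and 1
 *		unsigned long bind_nodes = 0x1;		// assume node 0
 *		size_t len = 1 << 20;
 *		void *buf = aligned_alloc(4096, len);	// page-aligned buffer
 *
 *		// process policy: interleave future allocations over nodes 0-1
 *		set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
 *			      8 * sizeof(interleave_nodes));
 *
 *		// VMA policy: restrict this mapping to node 0, no fallback
 *		mbind(buf, len, MPOL_BIND, &bind_nodes,
 *		      8 * sizeof(bind_nodes), 0);
 *
 *		// drop back to the default process policy
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		return 0;
 *	}
 */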

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel does not always handle that gracefully.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

static struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;

	if (!pol) {
		int node = numa_node_id();

		if (node != NUMA_NO_NODE) {
			pol = &preferred_node_policy[node];
			/*
			 * preferred_node_policy is not initialised early in
			 * boot
			 */
			if (!pol->mode)
				pol = NULL;
		}
	}

	return pol;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task will rebind task->mempolicy in two steps. The first
	 * step sets all the newly allowed nodes, and the second step clears
	 * all the disallowed nodes. This way a reader can never find itself
	 * with no node left to allocate a page from.
	 * If we have a lock to protect task->mempolicy on the read side, we
	 * rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do the rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
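
/*
 * A worked example of the two-step rebind described above (the node numbers
 * are assumptions for illustration, not taken from this file): suppose a
 * task's cpuset changes from mems {0,1} to mems {2,3}.  MPOL_REBIND_STEP1
 * first ORs the remapped nodes into the policy, so a lockless reader sees
 * {0,1,2,3}; MPOL_REBIND_STEP2 then drops the now-disallowed nodes, leaving
 * {2,3}.  A reader therefore never observes an empty nodemask, which is the
 * point of splitting the rebind in two.
 */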

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
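
/*
 * A worked example of the relative-nodes mapping above (node numbers are
 * assumptions for illustration only): with MPOL_F_RELATIVE_NODES, a user
 * mask of {0,2} applied against an allowed mask of {4,5,6} is first folded
 * modulo the weight of the allowed mask (3), which leaves {0,2}, and then
 * mapped onto the allowed mask bit-for-bit: relative bit 0 -> node 4,
 * relative bit 2 -> node 6, so the effective nodemask is {4,6}.
 */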

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 *	MPOL_REBIND_ONCE  - do the rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task will rebind task->mempolicy in two steps. The first
 * step sets all the newly allowed nodes, and the second step clears all
 * the disallowed nodes. This way a reader can never find itself with no
 * node left to allocate a page from.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly.
 *
 * step:
 *	MPOL_REBIND_ONCE  - do the rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/*
 * Scan through the pages, checking whether they match the given conditions,
 * and move them to the pagelist if they do.
 */
static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
		void *private)
{
#ifdef CONFIG_HUGETLB_PAGE
	int nid;
	struct page *page;
	spinlock_t *ptl;

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	page = pte_page(huge_ptep_get((pte_t *)pmd));
	nid = page_to_nid(page);
	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
		goto unlock;
	/* With MPOL_MF_MOVE, we only migrate unshared hugepages. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, private);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
}

static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_present(*pmd))
			continue;
		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
						flags, private);
			continue;
		}
		split_huge_page_pmd(vma, addr, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on the set of nodes determined by
 * @nodes and @flags, they are isolated and queued on the pagelist passed
 * via @private.
 */
static struct vm_area_struct *
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		unsigned long endvma = vma->vm_end;

		if (endvma > end)
			endvma = end;
		if (vma->vm_start > start)
			start = vma->vm_start;

		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}

		if (flags & MPOL_MF_LAZY) {
			change_prot_numa(vma, start, endvma);
			goto next;
		}

		if ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma))) {

			err = queue_pages_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
next:
		prev = vma;
	}
	return first;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff,
				  new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}
	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else
		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from, to, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory sourced from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}
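
/*
 * A small worked trace of the pair selection above (the nodemasks are
 * assumptions for illustration, not taken from this file): with from = {0,1}
 * and to = {1,2}, tmp starts as {0,1}.  The scan first finds 0 -> 1, but
 * node 1 is itself still a pending source (it is set in tmp), so the scan
 * keeps going and prefers 1 -> 2, whose destination is not waiting to be
 * drained.  After migrating 1 -> 2 and clearing 1 from tmp, the next pass
 * migrates 0 -> 1.  Ordering the moves this way avoids piling incoming pages
 * onto a node whose own pages have not been moved off yet.
 */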
1152
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -08001153/*
1154 * Allocate a new page for page migration based on vma policy.
1155 * Start assuming that page is mapped by vma pointed to by @private.
1156 * Search forward from there, if not. N.B., this assumes that the
1157 * list of pages handed to migrate_pages()--which is how we get here--
1158 * is in virtual address order.
1159 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		BUG_ON(!vma);
		return alloc_huge_page_noerr(vma, address, 1);
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operating
	 * on discontinuous address spaces is okay after all.
	 */
1241 if (!new)
1242 flags |= MPOL_MF_DISCONTIG_OK;
1243
David Rientjes028fec42008-04-28 02:12:25 -07001244 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1245 start, start + len, mode, mode_flags,
David Rientjes00ef2d22013-02-22 16:35:36 -08001246 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001247
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001248 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1249
1250 err = migrate_prep();
1251 if (err)
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001252 goto mpol_out;
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001253 }
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07001254 {
1255 NODEMASK_SCRATCH(scratch);
1256 if (scratch) {
1257 down_write(&mm->mmap_sem);
1258 task_lock(current);
1259 err = mpol_set_nodemask(new, nmask, scratch);
1260 task_unlock(current);
1261 if (err)
1262 up_write(&mm->mmap_sem);
1263 } else
1264 err = -ENOMEM;
1265 NODEMASK_SCRATCH_FREE(scratch);
1266 }
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001267 if (err)
1268 goto mpol_out;
1269
Naoya Horiguchi98094942013-09-11 14:22:14 -07001270 vma = queue_pages_range(mm, start, end, nmask,
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001271 flags | MPOL_MF_INVERT, &pagelist);
1272
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001273 err = PTR_ERR(vma); /* maybe ... */
Mel Gormana7200942012-11-16 09:37:58 +00001274 if (!IS_ERR(vma))
KOSAKI Motohiro9d8cebd2010-03-05 13:41:57 -08001275 err = mbind_range(mm, start, end, new);
Christoph Lameter7e2ab152006-02-01 03:05:40 -08001276
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001277 if (!err) {
1278 int nr_failed = 0;
1279
Minchan Kimcf608ac2010-10-26 14:21:29 -07001280 if (!list_empty(&pagelist)) {
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001281 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
Christoph Lameter95a402c2006-06-23 02:03:53 -07001282 nr_failed = migrate_pages(&pagelist, new_vma_page,
David Rientjes68711a72014-06-04 16:08:25 -07001283 NULL, (unsigned long)vma,
Hugh Dickins9c620e22013-02-22 16:35:14 -08001284 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001285 if (nr_failed)
Naoya Horiguchi74060e42013-09-11 14:22:06 -07001286 putback_movable_pages(&pagelist);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001287 }
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001288
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001289 if (nr_failed && (flags & MPOL_MF_STRICT))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001290 err = -EIO;
KOSAKI Motohiroab8a3e12009-10-26 16:49:58 -07001291 } else
Joonsoo Kimb0e5fd72013-12-18 17:08:51 -08001292 putback_movable_pages(&pagelist);
Christoph Lameterb20a3502006-03-22 00:09:12 -08001293
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001294 up_write(&mm->mmap_sem);
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001295 mpol_out:
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001296 mpol_put(new);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001297 return err;
1298}
1299
Christoph Lameter39743882006-01-08 01:00:51 -08001300/*
Christoph Lameter8bccd852005-10-29 18:16:59 -07001301 * User space interface with variable sized bitmaps for nodelists.
1302 */
1303
1304/* Copy a node mask from user space. */
Christoph Lameter39743882006-01-08 01:00:51 -08001305static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
Christoph Lameter8bccd852005-10-29 18:16:59 -07001306 unsigned long maxnode)
1307{
1308 unsigned long k;
1309 unsigned long nlongs;
1310 unsigned long endmask;
1311
1312 --maxnode;
1313 nodes_clear(*nodes);
1314 if (maxnode == 0 || !nmask)
1315 return 0;
Andi Kleena9c930b2006-02-20 18:27:59 -08001316 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
Chris Wright636f13c2006-02-17 13:59:36 -08001317 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001318
1319 nlongs = BITS_TO_LONGS(maxnode);
1320 if ((maxnode % BITS_PER_LONG) == 0)
1321 endmask = ~0UL;
1322 else
1323 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1324
1325	/* When the user specifies more nodes than supported just check
1326	   that the unsupported part is all zero. */
1327 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1328 if (nlongs > PAGE_SIZE/sizeof(long))
1329 return -EINVAL;
1330 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1331 unsigned long t;
1332 if (get_user(t, nmask + k))
1333 return -EFAULT;
1334 if (k == nlongs - 1) {
1335 if (t & endmask)
1336 return -EINVAL;
1337 } else if (t)
1338 return -EINVAL;
1339 }
1340 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1341 endmask = ~0UL;
1342 }
1343
1344 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1345 return -EFAULT;
1346 nodes_addr(*nodes)[nlongs-1] &= endmask;
1347 return 0;
1348}
1349
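/*
 * Illustrative user-space sketch (not part of this file) of the layout
 * get_nodes() parses: node n is bit n of an array of unsigned longs, and
 * maxnode tells the kernel how many bits the caller is passing in.  The
 * helper name below is hypothetical.
 *
 *	#include <limits.h>
 *
 *	// Set bit 'node' in a caller-supplied bitmap of 'maxnode' bits.
 *	static void example_node_set(unsigned long *mask, unsigned long maxnode,
 *				     int node)
 *	{
 *		const unsigned int bits_per_long = sizeof(long) * CHAR_BIT;
 *
 *		if (node >= 0 && (unsigned long)node < maxnode)
 *			mask[node / bits_per_long] |= 1UL << (node % bits_per_long);
 *	}
 *
 * A (mask, maxnode) pair built this way is what mbind() and set_mempolicy()
 * pass down to get_nodes(); any bits set at or above the kernel's
 * MAX_NUMNODES make the call fail with -EINVAL.
 */
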
1350/* Copy a kernel node mask to user space */
1351static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1352 nodemask_t *nodes)
1353{
1354 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1355 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1356
1357 if (copy > nbytes) {
1358 if (copy > PAGE_SIZE)
1359 return -EINVAL;
1360 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1361 return -EFAULT;
1362 copy = nbytes;
1363 }
1364 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1365}
1366
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001367SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
Rasmus Villemoesf7f28ca2014-06-04 16:07:57 -07001368 unsigned long, mode, const unsigned long __user *, nmask,
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001369 unsigned long, maxnode, unsigned, flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001370{
1371 nodemask_t nodes;
1372 int err;
David Rientjes028fec42008-04-28 02:12:25 -07001373 unsigned short mode_flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001374
David Rientjes028fec42008-04-28 02:12:25 -07001375 mode_flags = mode & MPOL_MODE_FLAGS;
1376 mode &= ~MPOL_MODE_FLAGS;
David Rientjesa3b51e02008-04-28 02:12:23 -07001377 if (mode >= MPOL_MAX)
1378 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001379 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1380 (mode_flags & MPOL_F_RELATIVE_NODES))
1381 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001382 err = get_nodes(&nodes, nmask, maxnode);
1383 if (err)
1384 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001385 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001386}
1387
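/*
 * Minimal user-space sketch (illustrative only) of driving the mbind()
 * entry point above: bind one anonymous mapping to node 0, with strict
 * enforcement and migration of already-faulted pages.  Assumes libnuma's
 * <numaif.h> wrapper, which takes the same arguments as the syscall.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 4UL << 20;			// 4MB, page aligned
 *		unsigned long nodemask = 1UL << 0;	// node 0 only
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		// start must be page aligned; len is rounded up by do_mbind().
 *		if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
 *			  MPOL_MF_STRICT | MPOL_MF_MOVE)) {
 *			perror("mbind");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */
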
1388/* Set the process memory policy */
Rasmus Villemoes23c89022014-06-04 16:07:58 -07001389SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001390 unsigned long, maxnode)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001391{
1392 int err;
1393 nodemask_t nodes;
David Rientjes028fec42008-04-28 02:12:25 -07001394 unsigned short flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001395
David Rientjes028fec42008-04-28 02:12:25 -07001396 flags = mode & MPOL_MODE_FLAGS;
1397 mode &= ~MPOL_MODE_FLAGS;
1398 if ((unsigned int)mode >= MPOL_MAX)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001399 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001400 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1401 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001402 err = get_nodes(&nodes, nmask, maxnode);
1403 if (err)
1404 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001405 return do_set_mempolicy(mode, flags, &nodes);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001406}
1407
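/*
 * Illustrative user-space sketch for the set_mempolicy() entry point above:
 * interleave all further allocations of the calling task over nodes 0 and 1.
 * Assumes libnuma's <numaif.h>; a real caller would first check which nodes
 * actually exist.  The helper name is hypothetical.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	static int example_interleave_nodes_0_and_1(void)
 *	{
 *		unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *		// Optional mode flags such as MPOL_F_STATIC_NODES are or-ed
 *		// into 'mode' and separated out again by the syscall above.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *				  8 * sizeof(nodemask))) {
 *			perror("set_mempolicy");
 *			return -1;
 *		}
 *		return 0;
 *	}
 */
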
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001408SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1409 const unsigned long __user *, old_nodes,
1410 const unsigned long __user *, new_nodes)
Christoph Lameter39743882006-01-08 01:00:51 -08001411{
David Howellsc69e8d92008-11-14 10:39:19 +11001412 const struct cred *cred = current_cred(), *tcred;
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001413 struct mm_struct *mm = NULL;
Christoph Lameter39743882006-01-08 01:00:51 -08001414 struct task_struct *task;
Christoph Lameter39743882006-01-08 01:00:51 -08001415 nodemask_t task_nodes;
1416 int err;
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001417 nodemask_t *old;
1418 nodemask_t *new;
1419 NODEMASK_SCRATCH(scratch);
Christoph Lameter39743882006-01-08 01:00:51 -08001420
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001421 if (!scratch)
1422 return -ENOMEM;
Christoph Lameter39743882006-01-08 01:00:51 -08001423
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001424 old = &scratch->mask1;
1425 new = &scratch->mask2;
1426
1427 err = get_nodes(old, old_nodes, maxnode);
Christoph Lameter39743882006-01-08 01:00:51 -08001428 if (err)
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001429 goto out;
1430
1431 err = get_nodes(new, new_nodes, maxnode);
1432 if (err)
1433 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001434
1435 /* Find the mm_struct */
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001436 rcu_read_lock();
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07001437 task = pid ? find_task_by_vpid(pid) : current;
Christoph Lameter39743882006-01-08 01:00:51 -08001438 if (!task) {
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001439 rcu_read_unlock();
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001440 err = -ESRCH;
1441 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001442 }
Christoph Lameter3268c632012-03-21 16:34:06 -07001443 get_task_struct(task);
Christoph Lameter39743882006-01-08 01:00:51 -08001444
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001445 err = -EINVAL;
Christoph Lameter39743882006-01-08 01:00:51 -08001446
1447 /*
1448 * Check if this process has the right to modify the specified
1449 * process. The right exists if the process has administrative
Alexey Dobriyan7f927fc2006-03-28 01:56:53 -08001450 * capabilities, superuser privileges or the same
Christoph Lameter39743882006-01-08 01:00:51 -08001451 * userid as the target process.
1452 */
David Howellsc69e8d92008-11-14 10:39:19 +11001453 tcred = __task_cred(task);
Eric W. Biedermanb38a86e2012-03-12 15:48:24 -07001454 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1455 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
Christoph Lameter74c00242006-03-14 19:50:21 -08001456 !capable(CAP_SYS_NICE)) {
David Howellsc69e8d92008-11-14 10:39:19 +11001457 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001458 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001459 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001460 }
David Howellsc69e8d92008-11-14 10:39:19 +11001461 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001462
1463 task_nodes = cpuset_mems_allowed(task);
1464 /* Is the user allowed to access the target nodes? */
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001465 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
Christoph Lameter39743882006-01-08 01:00:51 -08001466 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001467 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001468 }
1469
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08001470 if (!nodes_subset(*new, node_states[N_MEMORY])) {
Christoph Lameter3b42d282007-08-31 00:12:08 -07001471 err = -EINVAL;
Christoph Lameter3268c632012-03-21 16:34:06 -07001472 goto out_put;
Christoph Lameter3b42d282007-08-31 00:12:08 -07001473 }
1474
David Quigley86c3a762006-06-23 02:04:02 -07001475 err = security_task_movememory(task);
1476 if (err)
Christoph Lameter3268c632012-03-21 16:34:06 -07001477 goto out_put;
David Quigley86c3a762006-06-23 02:04:02 -07001478
Christoph Lameter3268c632012-03-21 16:34:06 -07001479 mm = get_task_mm(task);
1480 put_task_struct(task);
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001481
1482 if (!mm) {
Christoph Lameter3268c632012-03-21 16:34:06 -07001483 err = -EINVAL;
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001484 goto out;
1485 }
1486
1487 err = do_migrate_pages(mm, old, new,
1488 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
Christoph Lameter3268c632012-03-21 16:34:06 -07001489
1490 mmput(mm);
1491out:
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001492 NODEMASK_SCRATCH_FREE(scratch);
1493
Christoph Lameter39743882006-01-08 01:00:51 -08001494 return err;
Christoph Lameter3268c632012-03-21 16:34:06 -07001495
1496out_put:
1497 put_task_struct(task);
1498 goto out;
1499
Christoph Lameter39743882006-01-08 01:00:51 -08001500}
1501
1502
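/*
 * Illustrative user-space sketch for the migrate_pages() entry point above:
 * move a target process's pages from node 0 to node 1.  The permission
 * checks above must pass (same uid or CAP_SYS_NICE).  Assumes libnuma's
 * <numaif.h>; the helper name is hypothetical.
 *
 *	#include <numaif.h>
 *	#include <sys/types.h>
 *	#include <stdio.h>
 *
 *	static long example_migrate_0_to_1(pid_t pid)
 *	{
 *		unsigned long old_nodes = 1UL << 0;	// from node 0
 *		unsigned long new_nodes = 1UL << 1;	// to node 1
 *		long ret;
 *
 *		ret = migrate_pages(pid, 8 * sizeof(unsigned long),
 *				    &old_nodes, &new_nodes);
 *		// On success the return value is the number of pages that
 *		// could not be moved; on error it is -1 with errno set.
 *		if (ret < 0)
 *			perror("migrate_pages");
 *		return ret;
 *	}
 */
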
Christoph Lameter8bccd852005-10-29 18:16:59 -07001503/* Retrieve NUMA policy */
Heiko Carstens938bb9f2009-01-14 14:14:30 +01001504SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1505 unsigned long __user *, nmask, unsigned long, maxnode,
1506 unsigned long, addr, unsigned long, flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001507{
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001508 int err;
1509 int uninitialized_var(pval);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001510 nodemask_t nodes;
1511
1512 if (nmask != NULL && maxnode < MAX_NUMNODES)
1513 return -EINVAL;
1514
1515 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1516
1517 if (err)
1518 return err;
1519
1520 if (policy && put_user(pval, policy))
1521 return -EFAULT;
1522
1523 if (nmask)
1524 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1525
1526 return err;
1527}
1528
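/*
 * Illustrative user-space sketch for the get_mempolicy() entry point above:
 * query which policy covers a given address.  Assumes libnuma's <numaif.h>;
 * note that when a nodemask is requested, maxnode must be at least the
 * kernel's MAX_NUMNODES (see the check at the top of the syscall).
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	static void example_query_addr(void *addr)
 *	{
 *		int mode;
 *		unsigned long nodemask[16] = { 0 };	// room for 1024 nodes
 *
 *		// MPOL_F_ADDR: report the VMA policy covering 'addr' instead
 *		// of the calling task's policy.
 *		if (get_mempolicy(&mode, nodemask, sizeof(nodemask) * 8,
 *				  addr, MPOL_F_ADDR) == 0)
 *			printf("mode %d, first word of nodemask %#lx\n",
 *			       mode, nodemask[0]);
 *	}
 */
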
Linus Torvalds1da177e2005-04-16 15:20:36 -07001529#ifdef CONFIG_COMPAT
1530
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001531COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1532 compat_ulong_t __user *, nmask,
1533 compat_ulong_t, maxnode,
1534 compat_ulong_t, addr, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535{
1536 long err;
1537 unsigned long __user *nm = NULL;
1538 unsigned long nr_bits, alloc_size;
1539 DECLARE_BITMAP(bm, MAX_NUMNODES);
1540
1541 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1542 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1543
1544 if (nmask)
1545 nm = compat_alloc_user_space(alloc_size);
1546
1547 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1548
1549 if (!err && nmask) {
KAMEZAWA Hiroyuki2bbff6c2011-09-14 16:21:02 -07001550 unsigned long copy_size;
1551 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1552 err = copy_from_user(bm, nm, copy_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553 /* ensure entire bitmap is zeroed */
1554 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1555 err |= compat_put_bitmap(nmask, bm, nr_bits);
1556 }
1557
1558 return err;
1559}
1560
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001561COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1562 compat_ulong_t, maxnode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563{
1564 long err = 0;
1565 unsigned long __user *nm = NULL;
1566 unsigned long nr_bits, alloc_size;
1567 DECLARE_BITMAP(bm, MAX_NUMNODES);
1568
1569 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1570 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1571
1572 if (nmask) {
1573 err = compat_get_bitmap(bm, nmask, nr_bits);
1574 nm = compat_alloc_user_space(alloc_size);
1575 err |= copy_to_user(nm, bm, alloc_size);
1576 }
1577
1578 if (err)
1579 return -EFAULT;
1580
1581 return sys_set_mempolicy(mode, nm, nr_bits+1);
1582}
1583
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001584COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1585 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1586 compat_ulong_t, maxnode, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587{
1588 long err = 0;
1589 unsigned long __user *nm = NULL;
1590 unsigned long nr_bits, alloc_size;
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001591 nodemask_t bm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592
1593 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1594 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1595
1596 if (nmask) {
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001597 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598 nm = compat_alloc_user_space(alloc_size);
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001599 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600 }
1601
1602 if (err)
1603 return -EFAULT;
1604
1605 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1606}
1607
1608#endif
1609
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001610/*
1611 * get_vma_policy(@task, @vma, @addr)
Fabian Frederickb46e14a2014-06-04 16:08:18 -07001612 * @task: task for fallback if vma policy == default
1613 * @vma: virtual memory area whose policy is sought
1614 * @addr: address in @vma for shared policy lookup
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001615 *
1616 * Returns effective policy for a VMA at specified address.
1617 * Falls back to @task or system default policy, as necessary.
David Rientjes32f85162012-10-16 17:31:23 -07001618 * Current or other task's task mempolicy and non-shared vma policies must be
1619 * protected by task_lock(task) by the caller.
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001620 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1621 * count--added by the get_policy() vm_op, as appropriate--to protect against
1622 * freeing by another task. It is the caller's responsibility to free the
1623 * extra reference for shared policies.
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001624 */
Stephen Wilsond98f6cb2011-05-24 17:12:41 -07001625struct mempolicy *get_vma_policy(struct task_struct *task,
Christoph Lameter48fce342006-01-08 01:01:03 -08001626 struct vm_area_struct *vma, unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627{
Mel Gorman5606e382012-11-02 18:19:13 +00001628 struct mempolicy *pol = get_task_policy(task);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629
1630 if (vma) {
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001631 if (vma->vm_ops && vma->vm_ops->get_policy) {
Lee Schermerhornae4d8c12008-04-28 02:13:11 -07001632 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1633 addr);
1634 if (vpol)
1635 pol = vpol;
Mel Gorman00442ad2012-10-08 16:29:20 -07001636 } else if (vma->vm_policy) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 pol = vma->vm_policy;
Mel Gorman00442ad2012-10-08 16:29:20 -07001638
1639 /*
1640 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1641 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1642 * count on these policies which will be dropped by
1643 * mpol_cond_put() later
1644 */
1645 if (mpol_needs_cond_ref(pol))
1646 mpol_get(pol);
1647 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001648 }
1649 if (!pol)
1650 pol = &default_policy;
1651 return pol;
1652}
1653
Mel Gormanfc3147242013-10-07 11:29:09 +01001654bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1655{
1656 struct mempolicy *pol = get_task_policy(task);
1657 if (vma) {
1658 if (vma->vm_ops && vma->vm_ops->get_policy) {
1659 bool ret = false;
1660
1661 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1662 if (pol && (pol->flags & MPOL_F_MOF))
1663 ret = true;
1664 mpol_cond_put(pol);
1665
1666 return ret;
1667 } else if (vma->vm_policy) {
1668 pol = vma->vm_policy;
1669 }
1670 }
1671
1672 if (!pol)
1673 return default_policy.flags & MPOL_F_MOF;
1674
1675 return pol->flags & MPOL_F_MOF;
1676}
1677
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001678static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1679{
1680 enum zone_type dynamic_policy_zone = policy_zone;
1681
1682 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1683
1684 /*
1685	 * If policy->v.nodes has movable memory only,
1686	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1687	 *
1688	 * policy->v.nodes is intersected with node_states[N_MEMORY],
1689	 * so if the following test fails, it implies that
1690	 * policy->v.nodes has movable memory only.
1691 */
1692 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1693 dynamic_policy_zone = ZONE_MOVABLE;
1694
1695 return zone >= dynamic_policy_zone;
1696}
1697
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001698/*
1699 * Return a nodemask representing a mempolicy for filtering nodes for
1700 * page allocation
1701 */
1702static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
Mel Gorman19770b32008-04-28 02:12:18 -07001703{
1704 /* Lower zones don't get a nodemask applied for MPOL_BIND */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001705 if (unlikely(policy->mode == MPOL_BIND) &&
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001706 apply_policy_zone(policy, gfp_zone(gfp)) &&
Mel Gorman19770b32008-04-28 02:12:18 -07001707 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1708 return &policy->v.nodes;
1709
1710 return NULL;
1711}
1712
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001713/* Return a zonelist indicated by gfp for node representing a mempolicy */
Andi Kleen2f5f9482011-03-04 17:36:29 -08001714static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1715 int nd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716{
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001717 switch (policy->mode) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001719 if (!(policy->flags & MPOL_F_LOCAL))
1720 nd = policy->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001721 break;
1722 case MPOL_BIND:
Mel Gorman19770b32008-04-28 02:12:18 -07001723 /*
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001724 * Normally, MPOL_BIND allocations are node-local within the
1725 * allowed nodemask. However, if __GFP_THISNODE is set and the
Bob Liu6eb27e12010-05-24 14:32:00 -07001726 * current node isn't part of the mask, we use the zonelist for
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001727 * the first node in the mask instead.
Mel Gorman19770b32008-04-28 02:12:18 -07001728 */
Mel Gorman19770b32008-04-28 02:12:18 -07001729 if (unlikely(gfp & __GFP_THISNODE) &&
1730 unlikely(!node_isset(nd, policy->v.nodes)))
1731 nd = first_node(policy->v.nodes);
1732 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733 default:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 BUG();
1735 }
Mel Gorman0e884602008-04-28 02:12:14 -07001736 return node_zonelist(nd, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737}
1738
1739/* Do dynamic interleaving for a process */
1740static unsigned interleave_nodes(struct mempolicy *policy)
1741{
1742 unsigned nid, next;
1743 struct task_struct *me = current;
1744
1745 nid = me->il_next;
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001746 next = next_node(nid, policy->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747 if (next >= MAX_NUMNODES)
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001748 next = first_node(policy->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001749 if (next < MAX_NUMNODES)
1750 me->il_next = next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 return nid;
1752}
1753
Christoph Lameterdc85da12006-01-18 17:42:36 -08001754/*
1755 * Depending on the memory policy provide a node from which to allocate the
1756 * next slab entry.
1757 */
David Rientjes2a389612014-04-07 15:37:29 -07001758unsigned int mempolicy_slab_node(void)
Christoph Lameterdc85da12006-01-18 17:42:36 -08001759{
Andi Kleene7b691b2012-06-09 02:40:03 -07001760 struct mempolicy *policy;
David Rientjes2a389612014-04-07 15:37:29 -07001761 int node = numa_mem_id();
Andi Kleene7b691b2012-06-09 02:40:03 -07001762
1763 if (in_interrupt())
David Rientjes2a389612014-04-07 15:37:29 -07001764 return node;
Andi Kleene7b691b2012-06-09 02:40:03 -07001765
1766 policy = current->mempolicy;
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001767 if (!policy || policy->flags & MPOL_F_LOCAL)
David Rientjes2a389612014-04-07 15:37:29 -07001768 return node;
Christoph Lameter765c4502006-09-27 01:50:08 -07001769
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001770 switch (policy->mode) {
1771 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001772 /*
1773 * handled MPOL_F_LOCAL above
1774 */
1775 return policy->v.preferred_node;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001776
Christoph Lameterdc85da12006-01-18 17:42:36 -08001777 case MPOL_INTERLEAVE:
1778 return interleave_nodes(policy);
1779
Mel Gormandd1a2392008-04-28 02:12:17 -07001780 case MPOL_BIND: {
Christoph Lameterdc85da12006-01-18 17:42:36 -08001781 /*
1782 * Follow bind policy behavior and start allocation at the
1783 * first node.
1784 */
Mel Gorman19770b32008-04-28 02:12:18 -07001785 struct zonelist *zonelist;
1786 struct zone *zone;
1787 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
David Rientjes2a389612014-04-07 15:37:29 -07001788 zonelist = &NODE_DATA(node)->node_zonelists[0];
Mel Gorman19770b32008-04-28 02:12:18 -07001789 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1790 &policy->v.nodes,
1791 &zone);
David Rientjes2a389612014-04-07 15:37:29 -07001792 return zone ? zone->node : node;
Mel Gormandd1a2392008-04-28 02:12:17 -07001793 }
Christoph Lameterdc85da12006-01-18 17:42:36 -08001794
Christoph Lameterdc85da12006-01-18 17:42:36 -08001795 default:
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001796 BUG();
Christoph Lameterdc85da12006-01-18 17:42:36 -08001797 }
1798}
1799
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800/* Do static interleaving for a VMA with known offset. */
1801static unsigned offset_il_node(struct mempolicy *pol,
1802 struct vm_area_struct *vma, unsigned long off)
1803{
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001804 unsigned nnodes = nodes_weight(pol->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001805 unsigned target;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 int c;
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001807 int nid = NUMA_NO_NODE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001808
David Rientjesf5b087b2008-04-28 02:12:27 -07001809 if (!nnodes)
1810 return numa_node_id();
1811 target = (unsigned int)off % nnodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 c = 0;
1813 do {
Andi Kleendfcd3c0d2005-10-29 18:15:48 -07001814 nid = next_node(nid, pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 c++;
1816 } while (c <= target);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 return nid;
1818}
1819
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001820/* Determine a node number for interleave */
1821static inline unsigned interleave_nid(struct mempolicy *pol,
1822 struct vm_area_struct *vma, unsigned long addr, int shift)
1823{
1824 if (vma) {
1825 unsigned long off;
1826
Nishanth Aravamudan3b98b082006-08-31 21:27:53 -07001827 /*
1828 * for small pages, there is no difference between
1829 * shift and PAGE_SHIFT, so the bit-shift is safe.
1830 * for huge pages, since vm_pgoff is in units of small
1831 * pages, we need to shift off the always 0 bits to get
1832 * a useful offset.
1833 */
1834 BUG_ON(shift < PAGE_SHIFT);
1835 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001836 off += (addr - vma->vm_start) >> shift;
1837 return offset_il_node(pol, vma, off);
1838 } else
1839 return interleave_nodes(pol);
1840}
1841
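/*
 * Worked example of the shift above, with illustrative numbers: for 4K base
 * pages (PAGE_SHIFT == 12) and a 2M huge page VMA (shift == 21), a mapping
 * with vm_pgoff == 0x400 faulting at vma->vm_start + 0x600000 gives
 *
 *	0x400    >> (21 - 12) = 2	// huge-page-sized units of file offset
 *	0x600000 >> 21        = 3	// offset of the fault within the VMA
 *	off = 2 + 3 = 5
 *
 * so offset_il_node() picks the node at position 5 % nnodes of the
 * interleave nodemask.
 */
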
Michal Hocko778d3b02011-07-26 16:08:30 -07001842/*
1843 * Return the bit number of a random bit set in the nodemask.
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001844 * (returns NUMA_NO_NODE if nodemask is empty)
Michal Hocko778d3b02011-07-26 16:08:30 -07001845 */
1846int node_random(const nodemask_t *maskp)
1847{
Jianguo Wub76ac7e2013-11-12 15:07:39 -08001848 int w, bit = NUMA_NO_NODE;
Michal Hocko778d3b02011-07-26 16:08:30 -07001849
1850 w = nodes_weight(*maskp);
1851 if (w)
1852 bit = bitmap_ord_to_pos(maskp->bits,
1853 get_random_int() % w, MAX_NUMNODES);
1854 return bit;
1855}
1856
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001857#ifdef CONFIG_HUGETLBFS
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001858/*
1859 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
Fabian Frederickb46e14a2014-06-04 16:08:18 -07001860 * @vma: virtual memory area whose policy is sought
1861 * @addr: address in @vma for shared policy lookup and interleave policy
1862 * @gfp_flags: for requested zone
1863 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1864 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001865 *
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001866 * Returns a zonelist suitable for a huge page allocation and a pointer
1867 * to the struct mempolicy for conditional unref after allocation.
1868 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1869 * @nodemask for filtering the zonelist.
Miao Xiec0ff7452010-05-24 14:32:08 -07001870 *
Mel Gormand26914d2014-04-03 14:47:24 -07001871 * Must be protected by read_mems_allowed_begin()
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001872 */
Mel Gorman396faf02007-07-17 04:03:13 -07001873struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
Mel Gorman19770b32008-04-28 02:12:18 -07001874 gfp_t gfp_flags, struct mempolicy **mpol,
1875 nodemask_t **nodemask)
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001876{
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001877 struct zonelist *zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001878
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001879 *mpol = get_vma_policy(current, vma, addr);
Mel Gorman19770b32008-04-28 02:12:18 -07001880 *nodemask = NULL; /* assume !MPOL_BIND */
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001881
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001882 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1883 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
Andi Kleena5516432008-07-23 21:27:41 -07001884 huge_page_shift(hstate_vma(vma))), gfp_flags);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001885 } else {
Andi Kleen2f5f9482011-03-04 17:36:29 -08001886 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001887 if ((*mpol)->mode == MPOL_BIND)
1888 *nodemask = &(*mpol)->v.nodes;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001889 }
1890 return zl;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001891}
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001892
1893/*
1894 * init_nodemask_of_mempolicy
1895 *
1896 * If the current task's mempolicy is "default" [NULL], return 'false'
1897 * to indicate default policy. Otherwise, extract the policy nodemask
1898 * for 'bind' or 'interleave' policy into the argument nodemask, or
1899 * initialize the argument nodemask to contain the single node for
1900 * 'preferred' or 'local' policy and return 'true' to indicate presence
1901 * of non-default mempolicy.
1902 *
1903 * We don't bother with reference counting the mempolicy [mpol_get/put]
1904 * because the current task is examining its own mempolicy and a task's
1905 * mempolicy is only ever changed by the task itself.
1906 *
1907 * N.B., it is the caller's responsibility to free a returned nodemask.
1908 */
1909bool init_nodemask_of_mempolicy(nodemask_t *mask)
1910{
1911 struct mempolicy *mempolicy;
1912 int nid;
1913
1914 if (!(mask && current->mempolicy))
1915 return false;
1916
Miao Xiec0ff7452010-05-24 14:32:08 -07001917 task_lock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001918 mempolicy = current->mempolicy;
1919 switch (mempolicy->mode) {
1920 case MPOL_PREFERRED:
1921 if (mempolicy->flags & MPOL_F_LOCAL)
1922 nid = numa_node_id();
1923 else
1924 nid = mempolicy->v.preferred_node;
1925 init_nodemask_of_node(mask, nid);
1926 break;
1927
1928 case MPOL_BIND:
1929 /* Fall through */
1930 case MPOL_INTERLEAVE:
1931 *mask = mempolicy->v.nodes;
1932 break;
1933
1934 default:
1935 BUG();
1936 }
Miao Xiec0ff7452010-05-24 14:32:08 -07001937 task_unlock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08001938
1939 return true;
1940}
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001941#endif
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001942
David Rientjes6f48d0eb2010-08-09 17:18:52 -07001943/*
1944 * mempolicy_nodemask_intersects
1945 *
1946 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1947 * policy. Otherwise, check for intersection between mask and the policy
1948 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1949 * policy, always return true since it may allocate elsewhere on fallback.
1950 *
1951 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1952 */
1953bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1954 const nodemask_t *mask)
1955{
1956 struct mempolicy *mempolicy;
1957 bool ret = true;
1958
1959 if (!mask)
1960 return ret;
1961 task_lock(tsk);
1962 mempolicy = tsk->mempolicy;
1963 if (!mempolicy)
1964 goto out;
1965
1966 switch (mempolicy->mode) {
1967 case MPOL_PREFERRED:
1968 /*
1969	 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1970	 * allocate from; the task may fall back to other nodes when oom.
1971 * Thus, it's possible for tsk to have allocated memory from
1972 * nodes in mask.
1973 */
1974 break;
1975 case MPOL_BIND:
1976 case MPOL_INTERLEAVE:
1977 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1978 break;
1979 default:
1980 BUG();
1981 }
1982out:
1983 task_unlock(tsk);
1984 return ret;
1985}
1986
Linus Torvalds1da177e2005-04-16 15:20:36 -07001987/* Allocate a page under interleave policy.
1988   Own code path because it needs to do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07001989static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1990 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991{
1992 struct zonelist *zl;
1993 struct page *page;
1994
Mel Gorman0e884602008-04-28 02:12:14 -07001995 zl = node_zonelist(nid, gfp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001996 page = __alloc_pages(gfp, order, zl);
Mel Gormandd1a2392008-04-28 02:12:17 -07001997 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
Christoph Lameterca889e62006-06-30 01:55:44 -07001998 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999 return page;
2000}
2001
2002/**
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002003 * alloc_pages_vma - Allocate a page for a VMA.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 *
2005 * @gfp:
2006 * %GFP_USER user allocation.
2007 * %GFP_KERNEL kernel allocations,
2008 * %GFP_HIGHMEM highmem/user allocations,
2009 * %GFP_FS allocation should not call back into a file system.
2010 * %GFP_ATOMIC don't sleep.
2011 *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002012 * @order:Order of the GFP allocation.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 * @vma: Pointer to VMA or NULL if not available.
2014 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2015 *
2016 * This function allocates a page from the kernel page pool and applies
2017 * a NUMA policy associated with the VMA or the current process.
2018 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2019 * mm_struct of the VMA to prevent it from going away. Should be used for
2020 * all allocations for pages that will be mapped into
2021 * user space. Returns NULL when no page can be allocated.
2022 *
2023 * Should be called with the mmap_sem of the vma's mm held.
2024 */
2025struct page *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002026alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
Andi Kleen2f5f9482011-03-04 17:36:29 -08002027 unsigned long addr, int node)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002028{
Mel Gormancc9a6c82012-03-21 16:34:11 -07002029 struct mempolicy *pol;
Miao Xiec0ff7452010-05-24 14:32:08 -07002030 struct page *page;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002031 unsigned int cpuset_mems_cookie;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032
Mel Gormancc9a6c82012-03-21 16:34:11 -07002033retry_cpuset:
2034 pol = get_vma_policy(current, vma, addr);
Mel Gormand26914d2014-04-03 14:47:24 -07002035 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07002036
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002037 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002038 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08002039
Andi Kleen8eac5632011-02-25 14:44:28 -08002040 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002041 mpol_cond_put(pol);
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002042 page = alloc_page_interleave(gfp, order, nid);
Mel Gormand26914d2014-04-03 14:47:24 -07002043 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002044 goto retry_cpuset;
2045
Miao Xiec0ff7452010-05-24 14:32:08 -07002046 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047 }
David Rientjes212a0a62012-12-11 16:02:51 -08002048 page = __alloc_pages_nodemask(gfp, order,
2049 policy_zonelist(gfp, pol, node),
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002050 policy_nodemask(gfp, pol));
David Rientjes212a0a62012-12-11 16:02:51 -08002051 if (unlikely(mpol_needs_cond_ref(pol)))
2052 __mpol_put(pol);
Mel Gormand26914d2014-04-03 14:47:24 -07002053 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002054 goto retry_cpuset;
Miao Xiec0ff7452010-05-24 14:32:08 -07002055 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002056}
2057
2058/**
2059 * alloc_pages_current - Allocate pages.
2060 *
2061 * @gfp:
2062 * %GFP_USER user allocation,
2063 * %GFP_KERNEL kernel allocation,
2064 * %GFP_HIGHMEM highmem allocation,
2065 * %GFP_FS don't call back into a file system.
2066 * %GFP_ATOMIC don't sleep.
2067 * @order: Power of two of allocation size in pages. 0 is a single page.
2068 *
2069 * Allocate a page from the kernel page pool. When not in
2070 * interrupt context, apply the current process's NUMA policy.
2071 * Returns NULL when no page can be allocated.
2072 *
Paul Jacksoncf2a473c2006-01-08 01:01:54 -08002073 * Don't call cpuset_update_task_memory_state() unless
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074 * 1) it's ok to take cpuset_sem (can WAIT), and
2075 * 2) allocating for current task (not interrupt).
2076 */
Al Virodd0fc662005-10-07 07:46:04 +01002077struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078{
Mel Gorman5606e382012-11-02 18:19:13 +00002079 struct mempolicy *pol = get_task_policy(current);
Miao Xiec0ff7452010-05-24 14:32:08 -07002080 struct page *page;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002081 unsigned int cpuset_mems_cookie;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
Christoph Lameter9b819d22006-09-25 23:31:40 -07002083 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084 pol = &default_policy;
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002085
Mel Gormancc9a6c82012-03-21 16:34:11 -07002086retry_cpuset:
Mel Gormand26914d2014-04-03 14:47:24 -07002087 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07002088
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002089 /*
2090 * No reference counting needed for current->mempolicy
2091 * nor system default_policy
2092 */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002093 if (pol->mode == MPOL_INTERLEAVE)
Miao Xiec0ff7452010-05-24 14:32:08 -07002094 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2095 else
2096 page = __alloc_pages_nodemask(gfp, order,
Andi Kleen5c4b4be2011-03-04 17:36:32 -08002097 policy_zonelist(gfp, pol, numa_node_id()),
2098 policy_nodemask(gfp, pol));
Mel Gormancc9a6c82012-03-21 16:34:11 -07002099
Mel Gormand26914d2014-04-03 14:47:24 -07002100 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07002101 goto retry_cpuset;
2102
Miao Xiec0ff7452010-05-24 14:32:08 -07002103 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104}
2105EXPORT_SYMBOL(alloc_pages_current);
2106
Oleg Nesterovef0855d2013-09-11 14:20:14 -07002107int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2108{
2109 struct mempolicy *pol = mpol_dup(vma_policy(src));
2110
2111 if (IS_ERR(pol))
2112 return PTR_ERR(pol);
2113 dst->vm_policy = pol;
2114 return 0;
2115}
2116
Paul Jackson42253992006-01-08 01:01:59 -08002117/*
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002118 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
Paul Jackson42253992006-01-08 01:01:59 -08002119 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2120 * with the mems_allowed returned by cpuset_mems_allowed(). This
2121 * keeps mempolicies cpuset relative after its cpuset moves. See
2122 * further kernel/cpuset.c update_nodemask().
Miao Xie708c1bb2010-05-24 14:32:07 -07002123 *
2124 * current's mempolicy may be rebound by another task (the task that changes
2125 * the cpuset's mems), so we needn't do rebind work for the current task.
Paul Jackson42253992006-01-08 01:01:59 -08002126 */
Paul Jackson42253992006-01-08 01:01:59 -08002127
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002128/* Slow path of a mempolicy duplicate */
2129struct mempolicy *__mpol_dup(struct mempolicy *old)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130{
2131 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2132
2133 if (!new)
2134 return ERR_PTR(-ENOMEM);
Miao Xie708c1bb2010-05-24 14:32:07 -07002135
2136 /* task's mempolicy is protected by alloc_lock */
2137 if (old == current->mempolicy) {
2138 task_lock(current);
2139 *new = *old;
2140 task_unlock(current);
2141 } else
2142 *new = *old;
2143
Paul E. McKenney99ee4ca2010-03-03 17:50:17 -08002144 rcu_read_lock();
Paul Jackson42253992006-01-08 01:01:59 -08002145 if (current_cpuset_is_being_rebound()) {
2146 nodemask_t mems = cpuset_mems_allowed(current);
Miao Xie708c1bb2010-05-24 14:32:07 -07002147 if (new->flags & MPOL_F_REBINDING)
2148 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2149 else
2150 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
Paul Jackson42253992006-01-08 01:01:59 -08002151 }
Paul E. McKenney99ee4ca2010-03-03 17:50:17 -08002152 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153 atomic_set(&new->refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 return new;
2155}
2156
2157/* Slow path of a mempolicy comparison */
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002158bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002159{
2160 if (!a || !b)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002161 return false;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002162 if (a->mode != b->mode)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002163 return false;
Bob Liu19800502010-05-24 14:32:01 -07002164 if (a->flags != b->flags)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002165 return false;
Bob Liu19800502010-05-24 14:32:01 -07002166 if (mpol_store_user_nodemask(a))
2167 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002168 return false;
Bob Liu19800502010-05-24 14:32:01 -07002169
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002170 switch (a->mode) {
Mel Gorman19770b32008-04-28 02:12:18 -07002171 case MPOL_BIND:
2172 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 case MPOL_INTERLEAVE:
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002174 return !!nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002175 case MPOL_PREFERRED:
Namhyung Kim75719662011-03-22 16:33:02 -07002176 return a->v.preferred_node == b->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 default:
2178 BUG();
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002179 return false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180 }
2181}
2182
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184 * Shared memory backing store policy support.
2185 *
2186 * Remember policies even when nobody has shared memory mapped.
2187 * The policies are kept in Red-Black tree linked from the inode.
2188 * They are protected by the sp->lock spinlock, which should be held
2189 * for any accesses to the tree.
2190 */
2191
2192/* lookup first element intersecting start-end */
Mel Gorman42288fe2012-12-21 23:10:25 +00002193/* Caller holds sp->lock */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194static struct sp_node *
2195sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2196{
2197 struct rb_node *n = sp->root.rb_node;
2198
2199 while (n) {
2200 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2201
2202 if (start >= p->end)
2203 n = n->rb_right;
2204 else if (end <= p->start)
2205 n = n->rb_left;
2206 else
2207 break;
2208 }
2209 if (!n)
2210 return NULL;
2211 for (;;) {
2212 struct sp_node *w = NULL;
2213 struct rb_node *prev = rb_prev(n);
2214 if (!prev)
2215 break;
2216 w = rb_entry(prev, struct sp_node, nd);
2217 if (w->end <= start)
2218 break;
2219 n = prev;
2220 }
2221 return rb_entry(n, struct sp_node, nd);
2222}
2223
2224/* Insert a new shared policy into the list. */
2225/* Caller holds sp->lock */
2226static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2227{
2228 struct rb_node **p = &sp->root.rb_node;
2229 struct rb_node *parent = NULL;
2230 struct sp_node *nd;
2231
2232 while (*p) {
2233 parent = *p;
2234 nd = rb_entry(parent, struct sp_node, nd);
2235 if (new->start < nd->start)
2236 p = &(*p)->rb_left;
2237 else if (new->end > nd->end)
2238 p = &(*p)->rb_right;
2239 else
2240 BUG();
2241 }
2242 rb_link_node(&new->nd, parent, p);
2243 rb_insert_color(&new->nd, &sp->root);
Paul Mundt140d5a42007-07-15 23:38:16 -07002244 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002245 new->policy ? new->policy->mode : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246}
2247
2248/* Find shared policy intersecting idx */
2249struct mempolicy *
2250mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2251{
2252 struct mempolicy *pol = NULL;
2253 struct sp_node *sn;
2254
2255 if (!sp->root.rb_node)
2256 return NULL;
Mel Gorman42288fe2012-12-21 23:10:25 +00002257 spin_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258 sn = sp_lookup(sp, idx, idx+1);
2259 if (sn) {
2260 mpol_get(sn->policy);
2261 pol = sn->policy;
2262 }
Mel Gorman42288fe2012-12-21 23:10:25 +00002263 spin_unlock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 return pol;
2265}
2266
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002267static void sp_free(struct sp_node *n)
2268{
2269 mpol_put(n->policy);
2270 kmem_cache_free(sn_cache, n);
2271}
2272
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002273/**
2274 * mpol_misplaced - check whether current page node is valid in policy
2275 *
Fabian Frederickb46e14a2014-06-04 16:08:18 -07002276 * @page: page to be checked
2277 * @vma: vm area where page mapped
2278 * @addr: virtual address where page mapped
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002279 *
2280 * Lookup current policy node id for vma,addr and "compare to" page's
2281 * node id.
2282 *
2283 * Returns:
2284 * -1 - not misplaced, page is in the right node
2285 * node - node id where the page should be
2286 *
2287 * Policy determination "mimics" alloc_page_vma().
2288 * Called from fault path where we know the vma and faulting address.
2289 */
2290int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2291{
2292 struct mempolicy *pol;
2293 struct zone *zone;
2294 int curnid = page_to_nid(page);
2295 unsigned long pgoff;
Peter Zijlstra90572892013-10-07 11:29:20 +01002296 int thiscpu = raw_smp_processor_id();
2297 int thisnid = cpu_to_node(thiscpu);
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002298 int polnid = -1;
2299 int ret = -1;
2300
2301 BUG_ON(!vma);
2302
2303 pol = get_vma_policy(current, vma, addr);
2304 if (!(pol->flags & MPOL_F_MOF))
2305 goto out;
2306
2307 switch (pol->mode) {
2308 case MPOL_INTERLEAVE:
2309 BUG_ON(addr >= vma->vm_end);
2310 BUG_ON(addr < vma->vm_start);
2311
2312 pgoff = vma->vm_pgoff;
2313 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2314 polnid = offset_il_node(pol, vma, pgoff);
2315 break;
2316
2317 case MPOL_PREFERRED:
2318 if (pol->flags & MPOL_F_LOCAL)
2319 polnid = numa_node_id();
2320 else
2321 polnid = pol->v.preferred_node;
2322 break;
2323
2324 case MPOL_BIND:
2325 /*
2326 * allows binding to multiple nodes.
2327 * use current page if in policy nodemask,
2328 * else select nearest allowed node, if any.
2329 * If no allowed nodes, use current [!misplaced].
2330 */
2331 if (node_isset(curnid, pol->v.nodes))
2332 goto out;
2333 (void)first_zones_zonelist(
2334 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2335 gfp_zone(GFP_HIGHUSER),
2336 &pol->v.nodes, &zone);
2337 polnid = zone->node;
2338 break;
2339
2340 default:
2341 BUG();
2342 }
Mel Gorman5606e382012-11-02 18:19:13 +00002343
2344 /* Migrate the page towards the node whose CPU is referencing it */
Mel Gormane42c8ff2012-11-12 09:17:07 +00002345 if (pol->flags & MPOL_F_MORON) {
Peter Zijlstra90572892013-10-07 11:29:20 +01002346 polnid = thisnid;
Mel Gorman5606e382012-11-02 18:19:13 +00002347
Rik van Riel10f39042014-01-27 17:03:44 -05002348 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
Rik van Rielde1c9ce62013-10-07 11:29:39 +01002349 goto out;
Mel Gormane42c8ff2012-11-12 09:17:07 +00002350 }
2351
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002352 if (curnid != polnid)
2353 ret = polnid;
2354out:
2355 mpol_cond_put(pol);
2356
2357 return ret;
2358}
2359
Linus Torvalds1da177e2005-04-16 15:20:36 -07002360static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2361{
Paul Mundt140d5a42007-07-15 23:38:16 -07002362	pr_debug("deleting %lx-%lx\n", n->start, n->end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363 rb_erase(&n->nd, &sp->root);
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002364 sp_free(n);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365}
2366
Mel Gorman42288fe2012-12-21 23:10:25 +00002367static void sp_node_init(struct sp_node *node, unsigned long start,
2368 unsigned long end, struct mempolicy *pol)
2369{
2370 node->start = start;
2371 node->end = end;
2372 node->policy = pol;
2373}
2374
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07002375static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2376 struct mempolicy *pol)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377{
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002378 struct sp_node *n;
2379 struct mempolicy *newpol;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002380
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002381 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382 if (!n)
2383 return NULL;
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002384
2385 newpol = mpol_dup(pol);
2386 if (IS_ERR(newpol)) {
2387 kmem_cache_free(sn_cache, n);
2388 return NULL;
2389 }
2390 newpol->flags |= MPOL_F_SHARED;
Mel Gorman42288fe2012-12-21 23:10:25 +00002391 sp_node_init(n, start, end, newpol);
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002392
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393 return n;
2394}
2395
2396/* Replace a policy range. */
2397static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2398 unsigned long end, struct sp_node *new)
2399{
Mel Gormanb22d1272012-10-08 16:29:17 -07002400 struct sp_node *n;
Mel Gorman42288fe2012-12-21 23:10:25 +00002401 struct sp_node *n_new = NULL;
2402 struct mempolicy *mpol_new = NULL;
Mel Gormanb22d1272012-10-08 16:29:17 -07002403 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404
Mel Gorman42288fe2012-12-21 23:10:25 +00002405restart:
2406 spin_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002407 n = sp_lookup(sp, start, end);
2408 /* Take care of old policies in the same range. */
2409 while (n && n->start < end) {
2410 struct rb_node *next = rb_next(&n->nd);
2411 if (n->start >= start) {
2412 if (n->end <= end)
2413 sp_delete(sp, n);
2414 else
2415 n->start = end;
2416 } else {
2417 /* Old policy spanning whole new range. */
2418 if (n->end > end) {
Mel Gorman42288fe2012-12-21 23:10:25 +00002419 if (!n_new)
2420 goto alloc_new;
2421
2422 *mpol_new = *n->policy;
2423 atomic_set(&mpol_new->refcnt, 1);
KOSAKI Motohiro78806392013-03-08 12:43:29 -08002424 sp_node_init(n_new, end, n->end, mpol_new);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 n->end = start;
Hillf Danton5ca39572013-03-08 12:43:28 -08002426 sp_insert(sp, n_new);
Mel Gorman42288fe2012-12-21 23:10:25 +00002427 n_new = NULL;
2428 mpol_new = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429 break;
2430 } else
2431 n->end = start;
2432 }
2433 if (!next)
2434 break;
2435 n = rb_entry(next, struct sp_node, nd);
2436 }
2437 if (new)
2438 sp_insert(sp, new);
Mel Gorman42288fe2012-12-21 23:10:25 +00002439 spin_unlock(&sp->lock);
2440 ret = 0;
2441
2442err_out:
2443 if (mpol_new)
2444 mpol_put(mpol_new);
2445 if (n_new)
2446 kmem_cache_free(sn_cache, n_new);
2447
Mel Gormanb22d1272012-10-08 16:29:17 -07002448 return ret;
Mel Gorman42288fe2012-12-21 23:10:25 +00002449
2450alloc_new:
2451 spin_unlock(&sp->lock);
2452 ret = -ENOMEM;
2453 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2454 if (!n_new)
2455 goto err_out;
2456 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2457 if (!mpol_new)
2458 goto err_out;
2459 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002460}
2461
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002462/**
2463 * mpol_shared_policy_init - initialize shared policy for inode
2464 * @sp: pointer to inode shared policy
2465 * @mpol: struct mempolicy to install
2466 *
2467 * Install non-NULL @mpol in inode's shared policy rb-tree.
2468 * On entry, the current task has a reference on a non-NULL @mpol.
2469 * This must be released on exit.
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002470 * This is called during get_inode() calls, so we can use GFP_KERNEL.
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002471 */
2472void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
Robin Holt7339ff82006-01-14 13:20:48 -08002473{
Miao Xie58568d22009-06-16 15:31:49 -07002474 int ret;
2475
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002476 sp->root = RB_ROOT; /* empty tree == default mempolicy */
Mel Gorman42288fe2012-12-21 23:10:25 +00002477 spin_lock_init(&sp->lock);
Robin Holt7339ff82006-01-14 13:20:48 -08002478
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002479 if (mpol) {
2480 struct vm_area_struct pvma;
2481 struct mempolicy *new;
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002482 NODEMASK_SCRATCH(scratch);
Robin Holt7339ff82006-01-14 13:20:48 -08002483
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002484 if (!scratch)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002485 goto put_mpol;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002486 /* contextualize the tmpfs mount point mempolicy */
2487 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002488 if (IS_ERR(new))
Dan Carpenter0cae3452010-05-25 23:42:58 -07002489 goto free_scratch; /* no valid nodemask intersection */
Miao Xie58568d22009-06-16 15:31:49 -07002490
2491 task_lock(current);
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002492 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
Miao Xie58568d22009-06-16 15:31:49 -07002493 task_unlock(current);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002494 if (ret)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002495 goto put_new;
Robin Holt7339ff82006-01-14 13:20:48 -08002496
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002497 /* Create pseudo-vma that contains just the policy */
2498 memset(&pvma, 0, sizeof(struct vm_area_struct));
2499 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2500 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002501
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002502put_new:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002503 mpol_put(new); /* drop initial ref */
Dan Carpenter0cae3452010-05-25 23:42:58 -07002504free_scratch:
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002505 NODEMASK_SCRATCH_FREE(scratch);
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002506put_mpol:
2507 mpol_put(mpol); /* drop our incoming ref on sb mpol */
Robin Holt7339ff82006-01-14 13:20:48 -08002508 }
2509}
2510
Linus Torvalds1da177e2005-04-16 15:20:36 -07002511int mpol_set_shared_policy(struct shared_policy *info,
2512 struct vm_area_struct *vma, struct mempolicy *npol)
2513{
2514 int err;
2515 struct sp_node *new = NULL;
2516 unsigned long sz = vma_pages(vma);
2517
David Rientjes028fec42008-04-28 02:12:25 -07002518 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002519 vma->vm_pgoff,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002520 sz, npol ? npol->mode : -1,
David Rientjes028fec42008-04-28 02:12:25 -07002521 npol ? npol->flags : -1,
David Rientjes00ef2d22013-02-22 16:35:36 -08002522 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523
2524 if (npol) {
2525 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2526 if (!new)
2527 return -ENOMEM;
2528 }
2529 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2530 if (err && new)
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002531 sp_free(new);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002532 return err;
2533}
2534
2535/* Free a backing policy store on inode delete. */
2536void mpol_free_shared_policy(struct shared_policy *p)
2537{
2538 struct sp_node *n;
2539 struct rb_node *next;
2540
2541 if (!p->root.rb_node)
2542 return;
Mel Gorman42288fe2012-12-21 23:10:25 +00002543 spin_lock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544 next = rb_first(&p->root);
2545 while (next) {
2546 n = rb_entry(next, struct sp_node, nd);
2547 next = rb_next(&n->nd);
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002548 sp_delete(p, n);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002549 }
Mel Gorman42288fe2012-12-21 23:10:25 +00002550 spin_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551}
2552
Mel Gorman1a687c22012-11-22 11:16:36 +00002553#ifdef CONFIG_NUMA_BALANCING
Mel Gormanc2976632014-01-29 14:05:42 -08002554static int __initdata numabalancing_override;
Mel Gorman1a687c22012-11-22 11:16:36 +00002555
2556static void __init check_numabalancing_enable(void)
2557{
2558 bool numabalancing_default = false;
2559
2560 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2561 numabalancing_default = true;
2562
Mel Gormanc2976632014-01-29 14:05:42 -08002563 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2564 if (numabalancing_override)
2565 set_numabalancing_state(numabalancing_override == 1);
2566
Mel Gorman1a687c22012-11-22 11:16:36 +00002567 if (nr_node_ids > 1 && !numabalancing_override) {
Andrew Morton4a404be2014-01-29 14:05:43 -08002568 pr_info("%s automatic NUMA balancing. "
Mel Gormanc2976632014-01-29 14:05:42 -08002569 "Configure with numa_balancing= or the "
 2570			"kernel.numa_balancing sysctl\n",
2571 numabalancing_default ? "Enabling" : "Disabling");
Mel Gorman1a687c22012-11-22 11:16:36 +00002572 set_numabalancing_state(numabalancing_default);
2573 }
2574}
2575
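/*
 * Command-line parser for "numa_balancing=enable" / "numa_balancing=disable".
 * It only records the request in numabalancing_override (1 enables, -1
 * disables); check_numabalancing_enable(), called from numa_policy_init(),
 * applies it.
 */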
2576static int __init setup_numabalancing(char *str)
2577{
2578 int ret = 0;
2579 if (!str)
2580 goto out;
Mel Gorman1a687c22012-11-22 11:16:36 +00002581
2582 if (!strcmp(str, "enable")) {
Mel Gormanc2976632014-01-29 14:05:42 -08002583 numabalancing_override = 1;
Mel Gorman1a687c22012-11-22 11:16:36 +00002584 ret = 1;
2585 } else if (!strcmp(str, "disable")) {
Mel Gormanc2976632014-01-29 14:05:42 -08002586 numabalancing_override = -1;
Mel Gorman1a687c22012-11-22 11:16:36 +00002587 ret = 1;
2588 }
2589out:
2590 if (!ret)
Andrew Morton4a404be2014-01-29 14:05:43 -08002591 pr_warn("Unable to parse numa_balancing=\n");
Mel Gorman1a687c22012-11-22 11:16:36 +00002592
2593 return ret;
2594}
2595__setup("numa_balancing=", setup_numabalancing);
2596#else
2597static inline void __init check_numabalancing_enable(void)
2598{
2599}
2600#endif /* CONFIG_NUMA_BALANCING */
2601
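/*
 * Boot-time setup: create the slab caches for struct mempolicy and
 * struct sp_node, fill preferred_node_policy[] with one MPOL_PREFERRED
 * policy per node (flagged MPOL_F_MOF | MPOL_F_MORON for migrate-on-fault),
 * and switch the boot process to an interleave policy over all nodes with
 * a reasonable amount of memory, as described in the comment below.
 */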
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602/* assumes fs == KERNEL_DS */
2603void __init numa_policy_init(void)
2604{
Paul Mundtb71636e22007-07-15 23:38:15 -07002605 nodemask_t interleave_nodes;
2606 unsigned long largest = 0;
2607 int nid, prefer = 0;
2608
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609 policy_cache = kmem_cache_create("numa_policy",
2610 sizeof(struct mempolicy),
Paul Mundt20c2df82007-07-20 10:11:58 +09002611 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612
2613 sn_cache = kmem_cache_create("shared_policy_node",
2614 sizeof(struct sp_node),
Paul Mundt20c2df82007-07-20 10:11:58 +09002615 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616
Mel Gorman5606e382012-11-02 18:19:13 +00002617 for_each_node(nid) {
2618 preferred_node_policy[nid] = (struct mempolicy) {
2619 .refcnt = ATOMIC_INIT(1),
2620 .mode = MPOL_PREFERRED,
2621 .flags = MPOL_F_MOF | MPOL_F_MORON,
2622 .v = { .preferred_node = nid, },
2623 };
2624 }
2625
Paul Mundtb71636e22007-07-15 23:38:15 -07002626 /*
2627 * Set interleaving policy for system init. Interleaving is only
 2628	 * enabled across suitably sized nodes (default is >= 16MB), falling
 2629	 * back to the largest node if they're all smaller.
2630 */
2631 nodes_clear(interleave_nodes);
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002632 for_each_node_state(nid, N_MEMORY) {
Paul Mundtb71636e22007-07-15 23:38:15 -07002633 unsigned long total_pages = node_present_pages(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002634
Paul Mundtb71636e22007-07-15 23:38:15 -07002635 /* Preserve the largest node */
2636 if (largest < total_pages) {
2637 largest = total_pages;
2638 prefer = nid;
2639 }
2640
2641 /* Interleave this node? */
2642 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2643 node_set(nid, interleave_nodes);
2644 }
2645
2646 /* All too small, use the largest */
2647 if (unlikely(nodes_empty(interleave_nodes)))
2648 node_set(prefer, interleave_nodes);
2649
David Rientjes028fec42008-04-28 02:12:25 -07002650 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
Mitchel Humpherysb1de0d12014-06-06 14:38:30 -07002651 pr_err("%s: interleaving failed\n", __func__);
Mel Gorman1a687c22012-11-22 11:16:36 +00002652
2653 check_numabalancing_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654}
2655
Christoph Lameter8bccd852005-10-29 18:16:59 -07002656/* Reset policy of current process to default */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002657void numa_default_policy(void)
2658{
David Rientjes028fec42008-04-28 02:12:25 -07002659 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660}
Paul Jackson68860ec2005-10-30 15:02:36 -08002661
Paul Jackson42253992006-01-08 01:01:59 -08002662/*
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002663 * Parse and format mempolicy from/to strings
2664 */
2665
2666/*
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002667 * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002668 */
Lee Schermerhorn345ace92010-05-24 14:32:04 -07002669static const char * const policy_modes[] =
2670{
2671 [MPOL_DEFAULT] = "default",
2672 [MPOL_PREFERRED] = "prefer",
2673 [MPOL_BIND] = "bind",
2674 [MPOL_INTERLEAVE] = "interleave",
Lee Schermerhornd3a71032012-10-25 14:16:29 +02002675 [MPOL_LOCAL] = "local",
Lee Schermerhorn345ace92010-05-24 14:32:04 -07002676};
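/*
 * These strings are the user-visible mode names: mpol_parse_str() below
 * matches them when parsing a tmpfs "mpol=" mount option, and
 * mpol_to_str() uses them when formatting a policy for printing.
 */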
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002677
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002678
2679#ifdef CONFIG_TMPFS
2680/**
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002681 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002682 * @str: string containing mempolicy to parse
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002683 * @mpol: pointer to struct mempolicy pointer, returned on success.
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002684 *
2685 * Format of input:
2686 * <mode>[=<flags>][:<nodelist>]
2687 *
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002688 * On success, returns 0; on failure, returns 1.
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002689 */
Hugh Dickinsa7a88b22013-01-02 02:04:23 -08002690int mpol_parse_str(char *str, struct mempolicy **mpol)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002691{
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002692 struct mempolicy *new = NULL;
Lee Schermerhornb4652e82010-05-24 14:32:03 -07002693 unsigned short mode;
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002694 unsigned short mode_flags;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002695 nodemask_t nodes;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002696 char *nodelist = strchr(str, ':');
2697 char *flags = strchr(str, '=');
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002698 int err = 1;
2699
2700 if (nodelist) {
2701 /* NUL-terminate mode or flags string */
2702 *nodelist++ = '\0';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002703 if (nodelist_parse(nodelist, nodes))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002704 goto out;
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002705 if (!nodes_subset(nodes, node_states[N_MEMORY]))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002706 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002707 } else
2708 nodes_clear(nodes);
2709
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002710 if (flags)
2711 *flags++ = '\0'; /* terminate mode string */
2712
Peter Zijlstra479e2802012-10-25 14:16:28 +02002713 for (mode = 0; mode < MPOL_MAX; mode++) {
Lee Schermerhorn345ace92010-05-24 14:32:04 -07002714 if (!strcmp(str, policy_modes[mode])) {
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002715 break;
2716 }
2717 }
Mel Gormana7200942012-11-16 09:37:58 +00002718 if (mode >= MPOL_MAX)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002719 goto out;
2720
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002721 switch (mode) {
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002722 case MPOL_PREFERRED:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002723 /*
2724 * Insist on a nodelist of one node only
2725 */
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002726 if (nodelist) {
2727 char *rest = nodelist;
2728 while (isdigit(*rest))
2729 rest++;
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002730 if (*rest)
2731 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002732 }
2733 break;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002734 case MPOL_INTERLEAVE:
2735 /*
2736 * Default to online nodes with memory if no nodelist
2737 */
2738 if (!nodelist)
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002739 nodes = node_states[N_MEMORY];
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002740 break;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002741 case MPOL_LOCAL:
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002742 /*
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002743 * Don't allow a nodelist; mpol_new() checks flags
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002744 */
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002745 if (nodelist)
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002746 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002747 mode = MPOL_PREFERRED;
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002748 break;
Ravikiran G Thirumalai413b43d2010-03-23 13:35:28 -07002749 case MPOL_DEFAULT:
2750 /*
 2751		 * Insist on an empty nodelist
2752 */
2753 if (!nodelist)
2754 err = 0;
2755 goto out;
KOSAKI Motohirod69b2e632010-03-23 13:35:30 -07002756 case MPOL_BIND:
2757 /*
2758 * Insist on a nodelist
2759 */
2760 if (!nodelist)
2761 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002762 }
2763
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002764 mode_flags = 0;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002765 if (flags) {
2766 /*
2767 * Currently, we only support two mutually exclusive
2768 * mode flags.
2769 */
2770 if (!strcmp(flags, "static"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002771 mode_flags |= MPOL_F_STATIC_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002772 else if (!strcmp(flags, "relative"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002773 mode_flags |= MPOL_F_RELATIVE_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002774 else
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002775 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002776 }
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002777
2778 new = mpol_new(mode, mode_flags, &nodes);
2779 if (IS_ERR(new))
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002780 goto out;
2781
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002782 /*
2783 * Save nodes for mpol_to_str() to show the tmpfs mount options
2784 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2785 */
2786 if (mode != MPOL_PREFERRED)
2787 new->v.nodes = nodes;
2788 else if (nodelist)
2789 new->v.preferred_node = first_node(nodes);
2790 else
2791 new->flags |= MPOL_F_LOCAL;
2792
2793 /*
2794 * Save nodes for contextualization: this will be used to "clone"
2795 * the mempolicy in a specific context [cpuset] at a later time.
2796 */
2797 new->w.user_nodemask = nodes;
2798
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002799 err = 0;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002800
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002801out:
2802 /* Restore string for error message */
2803 if (nodelist)
2804 *--nodelist = ':';
2805 if (flags)
2806 *--flags = '=';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002807 if (!err)
2808 *mpol = new;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002809 return err;
2810}
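/*
 * Example inputs (illustrative, assuming nodes 0-3 are present with
 * memory): "bind=static:0-3" yields MPOL_BIND with MPOL_F_STATIC_NODES
 * over nodes 0-3; "interleave" with no nodelist interleaves over all
 * nodes with memory; "local" becomes MPOL_PREFERRED with the
 * MPOL_F_LOCAL flag set, i.e. local allocation.
 */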
2811#endif /* CONFIG_TMPFS */
2812
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002813/**
2814 * mpol_to_str - format a mempolicy structure for printing
2815 * @buffer: to contain formatted mempolicy string
2816 * @maxlen: length of @buffer
2817 * @pol: pointer to mempolicy to be formatted
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002818 *
David Rientjes948927e2013-11-12 15:07:28 -08002819 * Convert @pol into a string. If @buffer is too short, truncate the string.
 2820 * A @maxlen of at least 32 is recommended: enough for the longest mode
 2821 * ("interleave"), the longest flag ("relative"), and a few node ids.
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002822 */
David Rientjes948927e2013-11-12 15:07:28 -08002823void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002824{
2825 char *p = buffer;
David Rientjes948927e2013-11-12 15:07:28 -08002826 nodemask_t nodes = NODE_MASK_NONE;
2827 unsigned short mode = MPOL_DEFAULT;
2828 unsigned short flags = 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002829
David Rientjes8790c71a2014-01-30 15:46:08 -08002830 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002831 mode = pol->mode;
David Rientjes948927e2013-11-12 15:07:28 -08002832 flags = pol->flags;
2833 }
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002834
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002835 switch (mode) {
2836 case MPOL_DEFAULT:
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002837 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002838 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002839 if (flags & MPOL_F_LOCAL)
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002840 mode = MPOL_LOCAL;
Lee Schermerhorn53f25562008-04-28 02:13:20 -07002841 else
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002842 node_set(pol->v.preferred_node, nodes);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002843 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002844 case MPOL_BIND:
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002845 case MPOL_INTERLEAVE:
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002846 nodes = pol->v.nodes;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002847 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002848 default:
David Rientjes948927e2013-11-12 15:07:28 -08002849 WARN_ON_ONCE(1);
2850 snprintf(p, maxlen, "unknown");
2851 return;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002852 }
2853
David Rientjesb7a9f422013-11-21 14:32:06 -08002854 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002855
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002856 if (flags & MPOL_MODE_FLAGS) {
David Rientjes948927e2013-11-12 15:07:28 -08002857 p += snprintf(p, buffer + maxlen - p, "=");
David Rientjesf5b087b2008-04-28 02:12:27 -07002858
Lee Schermerhorn22919902008-04-28 02:13:22 -07002859 /*
2860 * Currently, the only defined flags are mutually exclusive
2861 */
David Rientjesf5b087b2008-04-28 02:12:27 -07002862 if (flags & MPOL_F_STATIC_NODES)
Lee Schermerhorn22919902008-04-28 02:13:22 -07002863 p += snprintf(p, buffer + maxlen - p, "static");
2864 else if (flags & MPOL_F_RELATIVE_NODES)
2865 p += snprintf(p, buffer + maxlen - p, "relative");
David Rientjesf5b087b2008-04-28 02:12:27 -07002866 }
2867
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002868 if (!nodes_empty(nodes)) {
David Rientjes948927e2013-11-12 15:07:28 -08002869 p += snprintf(p, buffer + maxlen - p, ":");
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002870 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2871 }
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002872}
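/*
 * Example (illustrative sketch of a caller, not part of this file):
 * formatting a task policy for a debug message might look like
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), current->mempolicy);
 *	pr_debug("mempolicy: %s\n", buf);
 *
 * producing e.g. "interleave:0-3" or "bind=static:0-3", mirroring the
 * mpol_parse_str() input format above.
 */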