Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: make set_mempolicy(MPOL_INTERLEAVE) N_HIGH_MEMORY aware

At first, init_task's mems_allowed is initialized as this.
init_task->mems_allowed == node_state[N_POSSIBLE]

And cpuset's top_cpuset mask is initialized as this
top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY]

Before 2.6.29:
policy's mems_allowed is initialized as this.

1. update tasks->mems_allowed by its cpuset->mems_allowed.
2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask)

Updating task's mems_allowed in reference to top_cpuset's one.
cpuset's mems_allowed is aware of N_HIGH_MEMORY, always.

In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c
("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed
is initialized as this.

1. policy->mems_allowed = nodes_and(task->mems_allowed, user's mask)

Here, if the task is in top_cpuset, task->mems_allowed is not updated from
init's one. Assume the user executes a command such as
#numactl --interleave=all ...

policy->mems_allowed = nodes_and(N_POSSIBLE, ALL_SET_MASK)

Then, policy's mems_allowed can include a possible node which has no pgdat.

MPOL's INTERLEAVE just scans the nodemask of task->mems_allowed and accesses
this directly:

NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL

Then, what we need is to make policy->mems_allowed aware of
N_HIGH_MEMORY. This patch does that. But to do so, an extra nodemask will
be on the stack. Because I know cpumask has a new interface of
CPUMASK_ALLOC(), I added its equivalent for nodes.

This patch restores the old behavior. But I feel this fix itself is just a
Band-Aid. A fundamental fix would have to take care of memory
hotplug, and that takes time. (task->mems_allowed should be N_HIGH_MEMORY, I
think.)

mpol_set_nodemask() should be aware of N_HIGH_MEMORY, and policy's nodemask
should include only online nodes.

In the old behavior, this was guaranteed by frequent references into cpuset's
code. Now, most of those are removed and mempolicy has to check it by
itself.

To do this check, a few nodemask_t's will be used for calculating the nodemask.
But the size of nodemask_t can be big, and it's not good to allocate them on
the stack.

Now, cpumask_t has CPUMASK_ALLOC/FREE, an easy way to get a scratch area.
An equivalent NODEMASK_ALLOC/FREE should exist for nodes.

[akpm@linux-foundation.org: cleanups & tweaks]
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

KAMEZAWA Hiroyuki and committed by
Linus Torvalds
4bfc4495 93274e4d

+86 -26
+28
include/linux/nodemask.h
··· 82 82 * to generate slightly worse code. So use a simple one-line #define 83 83 * for node_isset(), instead of wrapping an inline inside a macro, the 84 84 * way we do the other calls. 85 + * 86 + * NODEMASK_SCRATCH 87 + * When doing above logical AND, OR, XOR, Remap operations the callers tend to 88 + * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, 89 + * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper 90 + * for such situations. See below and CPUMASK_ALLOC also. 85 91 */ 86 92 87 93 #include <linux/kernel.h> ··· 478 472 479 473 #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) 480 474 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) 475 + 476 + /* 477 + * For nodemask scrach area.(See CPUMASK_ALLOC() in cpumask.h) 478 + */ 479 + 480 + #if NODES_SHIFT > 8 /* nodemask_t > 64 bytes */ 481 + #define NODEMASK_ALLOC(x, m) struct x *m = kmalloc(sizeof(*m), GFP_KERNEL) 482 + #define NODEMASK_FREE(m) kfree(m) 483 + #else 484 + #define NODEMASK_ALLOC(x, m) struct x _m, *m = &_m 485 + #define NODEMASK_FREE(m) 486 + #endif 487 + 488 + /* A example struture for using NODEMASK_ALLOC, used in mempolicy. */ 489 + struct nodemask_scratch { 490 + nodemask_t mask1; 491 + nodemask_t mask2; 492 + }; 493 + 494 + #define NODEMASK_SCRATCH(x) NODEMASK_ALLOC(nodemask_scratch, x) 495 + #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) 496 + 481 497 482 498 #endif /* __LINUX_NODEMASK_H */
+58 -26
mm/mempolicy.c
··· 191 191 * Must be called holding task's alloc_lock to protect task's mems_allowed 192 192 * and mempolicy. May also be called holding the mmap_semaphore for write. 193 193 */ 194 - static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 194 + static int mpol_set_nodemask(struct mempolicy *pol, 195 + const nodemask_t *nodes, struct nodemask_scratch *nsc) 195 196 { 196 - nodemask_t cpuset_context_nmask; 197 197 int ret; 198 198 199 199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 200 200 if (pol == NULL) 201 201 return 0; 202 + /* Check N_HIGH_MEMORY */ 203 + nodes_and(nsc->mask1, 204 + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 202 205 203 206 VM_BUG_ON(!nodes); 204 207 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 205 208 nodes = NULL; /* explicit local allocation */ 206 209 else { 207 210 if (pol->flags & MPOL_F_RELATIVE_NODES) 208 - mpol_relative_nodemask(&cpuset_context_nmask, nodes, 209 - &cpuset_current_mems_allowed); 211 + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); 210 212 else 211 - nodes_and(cpuset_context_nmask, *nodes, 212 - cpuset_current_mems_allowed); 213 + nodes_and(nsc->mask2, *nodes, nsc->mask1); 214 + 213 215 if (mpol_store_user_nodemask(pol)) 214 216 pol->w.user_nodemask = *nodes; 215 217 else ··· 219 217 cpuset_current_mems_allowed; 220 218 } 221 219 222 - ret = mpol_ops[pol->mode].create(pol, 223 - nodes ? 
&cpuset_context_nmask : NULL); 220 + if (nodes) 221 + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 222 + else 223 + ret = mpol_ops[pol->mode].create(pol, NULL); 224 224 return ret; 225 225 } 226 226 ··· 624 620 { 625 621 struct mempolicy *new, *old; 626 622 struct mm_struct *mm = current->mm; 623 + NODEMASK_SCRATCH(scratch); 627 624 int ret; 628 625 629 - new = mpol_new(mode, flags, nodes); 630 - if (IS_ERR(new)) 631 - return PTR_ERR(new); 626 + if (!scratch) 627 + return -ENOMEM; 632 628 629 + new = mpol_new(mode, flags, nodes); 630 + if (IS_ERR(new)) { 631 + ret = PTR_ERR(new); 632 + goto out; 633 + } 633 634 /* 634 635 * prevent changing our mempolicy while show_numa_maps() 635 636 * is using it. ··· 644 635 if (mm) 645 636 down_write(&mm->mmap_sem); 646 637 task_lock(current); 647 - ret = mpol_set_nodemask(new, nodes); 638 + ret = mpol_set_nodemask(new, nodes, scratch); 648 639 if (ret) { 649 640 task_unlock(current); 650 641 if (mm) 651 642 up_write(&mm->mmap_sem); 652 643 mpol_put(new); 653 - return ret; 644 + goto out; 654 645 } 655 646 old = current->mempolicy; 656 647 current->mempolicy = new; ··· 663 654 up_write(&mm->mmap_sem); 664 655 665 656 mpol_put(old); 666 - return 0; 657 + ret = 0; 658 + out: 659 + NODEMASK_SCRATCH_FREE(scratch); 660 + return ret; 667 661 } 668 662 669 663 /* ··· 1026 1014 if (err) 1027 1015 return err; 1028 1016 } 1029 - down_write(&mm->mmap_sem); 1030 - task_lock(current); 1031 - err = mpol_set_nodemask(new, nmask); 1032 - task_unlock(current); 1017 + { 1018 + NODEMASK_SCRATCH(scratch); 1019 + if (scratch) { 1020 + down_write(&mm->mmap_sem); 1021 + task_lock(current); 1022 + err = mpol_set_nodemask(new, nmask, scratch); 1023 + task_unlock(current); 1024 + if (err) 1025 + up_write(&mm->mmap_sem); 1026 + } else 1027 + err = -ENOMEM; 1028 + NODEMASK_SCRATCH_FREE(scratch); 1029 + } 1033 1030 if (err) { 1034 - up_write(&mm->mmap_sem); 1035 1031 mpol_put(new); 1036 1032 return err; 1037 1033 } ··· 1911 1891 * Install non-NULL 
@mpol in inode's shared policy rb-tree. 1912 1892 * On entry, the current task has a reference on a non-NULL @mpol. 1913 1893 * This must be released on exit. 1894 + * This is called at get_inode() calls and we can use GFP_KERNEL. 1914 1895 */ 1915 1896 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1916 1897 { ··· 1923 1902 if (mpol) { 1924 1903 struct vm_area_struct pvma; 1925 1904 struct mempolicy *new; 1905 + NODEMASK_SCRATCH(scratch); 1926 1906 1907 + if (!scratch) 1908 + return; 1927 1909 /* contextualize the tmpfs mount point mempolicy */ 1928 1910 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1929 1911 if (IS_ERR(new)) { 1930 1912 mpol_put(mpol); /* drop our ref on sb mpol */ 1913 + NODEMASK_SCRATCH_FREE(scratch); 1931 1914 return; /* no valid nodemask intersection */ 1932 1915 } 1933 1916 1934 1917 task_lock(current); 1935 - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); 1918 + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 1936 1919 task_unlock(current); 1937 1920 mpol_put(mpol); /* drop our ref on sb mpol */ 1938 1921 if (ret) { 1922 + NODEMASK_SCRATCH_FREE(scratch); 1939 1923 mpol_put(new); 1940 1924 return; 1941 1925 } ··· 1950 1924 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 1951 1925 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 1952 1926 mpol_put(new); /* drop initial ref */ 1927 + NODEMASK_SCRATCH_FREE(scratch); 1953 1928 } 1954 1929 } 1955 1930 ··· 2167 2140 err = 1; 2168 2141 else { 2169 2142 int ret; 2170 - 2171 - task_lock(current); 2172 - ret = mpol_set_nodemask(new, &nodes); 2173 - task_unlock(current); 2174 - if (ret) 2143 + NODEMASK_SCRATCH(scratch); 2144 + if (scratch) { 2145 + task_lock(current); 2146 + ret = mpol_set_nodemask(new, &nodes, scratch); 2147 + task_unlock(current); 2148 + } else 2149 + ret = -ENOMEM; 2150 + NODEMASK_SCRATCH_FREE(scratch); 2151 + if (ret) { 2175 2152 err = 1; 2176 - else if (no_context) { 2153 + mpol_put(new); 2154 
+ } else if (no_context) { 2177 2155 /* save for contextualization */ 2178 2156 new->w.user_nodemask = nodes; 2179 2157 }