Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] cpuset memory spread slab cache optimizations

The hooks in the slab cache allocator code path for support of NUMA
mempolicies and cpuset memory spreading are in an important code path. Many
systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the
current task's task_struct flags. For non-NUMA systems, this hook and related
code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using
a non-default NUMA mempolicy. Taking this flag bit along with the
PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset
memory spreading' patch set, one can check for the combination of any of these
special case memory placement mechanisms with a single test of the current
task's task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text
space, and moves some of it out of line. Due to the nested inlines called
from multiple places, we were ending up with three copies of this code, which
once we get off the main code path (for local node allocation) seems a bit
wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Paul Jackson and committed by
Linus Torvalds
c61afb18 101a5001

+67 -13
+5
include/linux/mempolicy.h
··· 147 147 extern void mpol_rebind_task(struct task_struct *tsk, 148 148 const nodemask_t *new); 149 149 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); 150 + extern void mpol_fix_fork_child_flag(struct task_struct *p); 150 151 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x)) 151 152 152 153 #ifdef CONFIG_CPUSET ··· 246 245 } 247 246 248 247 static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 248 + { 249 + } 250 + 251 + static inline void mpol_fix_fork_child_flag(struct task_struct *p) 249 252 { 250 253 } 251 254
+1
include/linux/sched.h
··· 932 932 #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ 933 933 #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ 934 934 #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */ 935 + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 935 936 936 937 /* 937 938 * Only the _current_ task can read/write to tsk->flags, but other
+1
kernel/fork.c
··· 1021 1021 p->mempolicy = NULL; 1022 1022 goto bad_fork_cleanup_cpuset; 1023 1023 } 1024 + mpol_fix_fork_child_flag(p); 1024 1025 #endif 1025 1026 1026 1027 #ifdef CONFIG_DEBUG_MUTEXES
+32
mm/mempolicy.c
··· 422 422 return mpol_check_policy(mode, nodes); 423 423 } 424 424 425 + 426 + /* 427 + * Update task->flags PF_MEMPOLICY bit: set iff non-default 428 + * mempolicy. Allows more rapid checking of this (combined perhaps 429 + * with other PF_* flag bits) on memory allocation hot code paths. 430 + * 431 + * If called from outside this file, the task 'p' should -only- be 432 + * a newly forked child not yet visible on the task list, because 433 + * manipulating the task flags of a visible task is not safe. 434 + * 435 + * The above limitation is why this routine has the funny name 436 + * mpol_fix_fork_child_flag(). 437 + * 438 + * It is also safe to call this with a task pointer of current, 439 + * which the static wrapper mpol_set_task_struct_flag() does, 440 + * for use within this file. 441 + */ 442 + 443 + void mpol_fix_fork_child_flag(struct task_struct *p) 444 + { 445 + if (p->mempolicy) 446 + p->flags |= PF_MEMPOLICY; 447 + else 448 + p->flags &= ~PF_MEMPOLICY; 449 + } 450 + 451 + static void mpol_set_task_struct_flag(void) 452 + { 453 + mpol_fix_fork_child_flag(current); 454 + } 455 + 425 456 /* Set the process memory policy */ 426 457 long do_set_mempolicy(int mode, nodemask_t *nodes) 427 458 { ··· 465 434 return PTR_ERR(new); 466 435 mpol_free(current->mempolicy); 467 436 current->mempolicy = new; 437 + mpol_set_task_struct_flag(); 468 438 if (new && new->policy == MPOL_INTERLEAVE) 469 439 current->il_next = first_node(new->v.nodes); 470 440 return 0;
+28 -13
mm/slab.c
··· 899 899 900 900 #ifdef CONFIG_NUMA 901 901 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 902 + static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 902 903 903 904 static struct array_cache **alloc_alien_cache(int node, int limit) 904 905 { ··· 2809 2808 struct array_cache *ac; 2810 2809 2811 2810 #ifdef CONFIG_NUMA 2812 - if (unlikely(current->mempolicy && !in_interrupt())) { 2813 - int nid = slab_node(current->mempolicy); 2814 - 2815 - if (nid != numa_node_id()) 2816 - return __cache_alloc_node(cachep, flags, nid); 2817 - } 2818 - if (unlikely(cpuset_do_slab_mem_spread() && 2819 - (cachep->flags & SLAB_MEM_SPREAD) && 2820 - !in_interrupt())) { 2821 - int nid = cpuset_mem_spread_node(); 2822 - 2823 - if (nid != numa_node_id()) 2824 - return __cache_alloc_node(cachep, flags, nid); 2811 + if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB | 2812 + PF_MEMPOLICY))) { 2813 + objp = alternate_node_alloc(cachep, flags); 2814 + if (objp != NULL) 2815 + return objp; 2825 2816 } 2826 2817 #endif 2827 2818 ··· 2848 2855 } 2849 2856 2850 2857 #ifdef CONFIG_NUMA 2858 + /* 2859 + * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY. 2860 + * 2861 + * If we are in_interrupt, then process context, including cpusets and 2862 + * mempolicy, may not apply and should not be used for allocation policy. 
2863 + */ 2864 + static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 2865 + { 2866 + int nid_alloc, nid_here; 2867 + 2868 + if (in_interrupt()) 2869 + return NULL; 2870 + nid_alloc = nid_here = numa_node_id(); 2871 + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 2872 + nid_alloc = cpuset_mem_spread_node(); 2873 + else if (current->mempolicy) 2874 + nid_alloc = slab_node(current->mempolicy); 2875 + if (nid_alloc != nid_here) 2876 + return __cache_alloc_node(cachep, flags, nid_alloc); 2877 + return NULL; 2878 + } 2879 + 2851 2880 /* 2852 2881 * A interface to enable slab creation on nodeid 2853 2882 */