Fix NUMA Memory Policy Reference Counting

This patch fixes the reference counting of memory policies in the page
allocation paths and in show_numa_map().  It is extracted from my "Memory
Policy Cleanups and Enhancements" series as a stand-alone patch.

Shared policy lookup [shmem] has always added a reference to the policy,
but this was never unrefed after page allocation or after formatting the
numa map data.
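
For context, the shmem ->get_policy() op ends up in the shared-policy lookup
sketched below.  This is a condensed illustration of mpol_shared_policy_lookup()
from mm/mempolicy.c of that era, not the exact source; the point is only that
the returned policy carries an extra reference the caller must drop:

	struct mempolicy *
	mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
	{
		struct mempolicy *pol = NULL;
		struct sp_node *sn;

		spin_lock(&sp->lock);
		sn = sp_lookup(sp, idx, idx + 1);	/* node covering idx */
		if (sn) {
			mpol_get(sn->policy);	/* extra ref for the caller */
			pol = sn->policy;
		}
		spin_unlock(&sp->lock);
		return pol;	/* before this patch, nobody dropped this ref */
	}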

Default system policy should not require additional ref counting, nor
should the current task's task policy. However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.

This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy. Again, shared policy is already reference counted on lookup. A
matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy. We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.
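
Roughly, every caller of get_vma_policy() now follows the pattern below.
This is a condensed sketch of the contract established by the hunks in this
patch, not a complete function:

	struct mempolicy *pol = get_vma_policy(task, vma, addr);

	/* ... use pol: pick a zonelist, allocate, format the numa map line ... */

	/* drop the ref taken for a shared, vma, or other task's policy */
	if (pol != &default_policy && pol != current->mempolicy)
		__mpol_free(pol);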

Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy. In this case, we should hold the reference until after the
huge page allocation in dequeue_huge_page().  The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.
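
The hugetlb allocation path then takes the shape sketched below, condensed
from the dequeue_huge_page() hunk in this patch:

	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask, &mpol);

	/* ... walk zonelist->zones and dequeue a free huge page ... */

	mpol_free(mpol);	/* drops the 'BIND policy ref; no-op when mpol is NULL */
	return page;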

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:

               w/o patch            w/ refcount patch
              Avg   Std Devn        Avg   Std Devn
Real:      100.59      0.38      100.63      0.43
User:     1209.60      0.37     1209.91      0.31
System:     81.52      0.42       81.64      0.34

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Lee Schermerhorn and committed by Linus Torvalds [480eccf9 28f300d2]

3 files changed, 75 insertions(+), 12 deletions(-)

include/linux/mempolicy.h (+2 -2)

···
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags);
+		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
···
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags)
+		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
 }
mm/hugetlb.c (+3 -1)

···
 {
 	int nid;
 	struct page *page = NULL;
+	struct mempolicy *mpol;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask);
+					htlb_alloc_mask, &mpol);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
···
 			break;
 		}
 	}
+	mpol_free(mpol);	/* unref if mpol !NULL */
 	return page;
 }
 
mm/mempolicy.c (+70 -9)

···
 
 #endif
 
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current].  It is the caller's responsibility to
+ * free the reference in these cases.
+ */
 static struct mempolicy * get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
+	int shared_pol = 0;
 
 	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
 			pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
+			shared_pol = 1;	/* if pol non-NULL, add ref below */
+		} else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
+	else if (!shared_pol && pol != current->mempolicy)
+		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
···
 }
 
 #ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference.  For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-		gfp_t gfp_flags)
+				gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
+	*mpol = NULL;		/* probably no unref needed */
 	if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		__mpol_free(pol);		/* finished with pol */
 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
 	}
-	return zonelist_policy(GFP_HIGHUSER, pol);
+
+	zl = zonelist_policy(GFP_HIGHUSER, pol);
+	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+		if (pol->policy != MPOL_BIND)
+			__mpol_free(pol);	/* finished with pol */
+		else
+			*mpol = pol;	/* unref needed after allocation */
+	}
+	return zl;
 }
 #endif
···
 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
 	cpuset_update_task_memory_state();
 
···
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+	zl = zonelist_policy(gfp, pol);
+	if (pol != &default_policy && pol != current->mempolicy) {
+		/*
+		 * slow path: ref counted policy -- shared or vma
+		 */
+		struct page *page = __alloc_pages(gfp, 0, zl);
+		__mpol_free(pol);
+		return page;
+	}
+	/*
+	 * fast path: default or task policy
+	 */
+	return __alloc_pages(gfp, 0, zl);
 }
 
 /**
···
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mempolicy *pol;
 	int n;
 	char buffer[50];
 
···
 	if (!md)
 		return 0;
 
-	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(priv->task, vma, vma->vm_start));
+	pol = get_vma_policy(priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol);
+	/*
+	 * unref shared or other task's mempolicy
+	 */
+	if (pol != &default_policy && pol != current->mempolicy)
+		__mpol_free(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 