Fix NUMA Memory Policy Reference Counting

This patch fixes the reference counting of memory policies in the page
allocation paths and in show_numa_map().  Extracted from my "Memory
Policy Cleanups and Enhancements" series as a stand-alone patch.

Shared policy lookup [shmem] has always added a reference to the policy,
but this reference was never dropped after page allocation or after
formatting the numa map data.
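For reference, the lookup that takes this reference is the shmem shared
policy path.  The following is a condensed paraphrase of the pre-existing
mm/mempolicy.c code of this era (not part of this patch), showing where
the extra reference originates:

	/*
	 * Existing code, condensed: shared policy lookup returns the
	 * policy with an extra reference already taken under sp->lock.
	 */
	struct mempolicy *
	mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
	{
		struct mempolicy *pol = NULL;
		struct sp_node *sn;

		if (!sp->root.rb_node)
			return NULL;
		spin_lock(&sp->lock);
		sn = sp_lookup(sp, idx, idx + 1);	/* node covering idx */
		if (sn) {
			mpol_get(sn->policy);	/* reference taken here ... */
			pol = sn->policy;	/* ... never dropped by callers */
		}
		spin_unlock(&sp->lock);
		return pol;
	}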

Default system policy should not require additional ref counting, nor
should the current task's task policy. However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.

This patch takes an extra reference on a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy.  Again, shared policies are already reference counted on
lookup.  A matching "unref" [__mpol_free()] is performed in
alloc_page_vma() for shared and vma policies, and in show_numa_map() for
shared and other tasks' mempolicies.  We can call __mpol_free() directly,
saving an admittedly inexpensive inline NULL test, because we know we
have a non-NULL policy.
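For context, the ref counting helpers look roughly like this (condensed
paraphrase of include/linux/mempolicy.h of this era, not part of this
patch); the wrapped NULL test is the one skipped by calling
__mpol_free() directly:

	extern void __mpol_free(struct mempolicy *pol);

	static inline void mpol_free(struct mempolicy *pol)
	{
		if (pol)		/* the inline NULL test noted above */
			__mpol_free(pol);
	}

	static inline void mpol_get(struct mempolicy *pol)
	{
		if (pol)
			atomic_inc(&pol->refcnt);	/* one more holder */
	}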

Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zonelist that might come from a shared or vma
'BIND policy.  In that case, we should hold the reference until after the
huge page allocation in dequeue_huge_page().  The patch modifies
huge_zonelist() to return a pointer to the mempolicy when it needs to be
unref'd after allocation.
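Condensed from the mm/hugetlb.c hunk below, the resulting caller contract
in dequeue_huge_page() is:

	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						  htlb_alloc_mask, &mpol);

	/* ... scan zonelist and dequeue a free huge page ... */

	mpol_free(mpol);	/* unref if mpol !NULL; no-op for NULL */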

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs, times in seconds:

                    w/o patch          w/ refcount patch
                  Avg    Std Devn        Avg    Std Devn
Real:          100.59      0.38        100.63      0.43
User:         1209.60      0.37       1209.91      0.31
System:         81.52      0.42         81.64      0.34

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Lee Schermerhorn, committed by Linus Torvalds (480eccf9 28f300d2)

3 files changed: +75 -12
include/linux/mempolicy.h (+2 -2)

···
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags);
+		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
···
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags)
+		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
 }
mm/hugetlb.c (+3 -1)

···
 {
 	int nid;
 	struct page *page = NULL;
+	struct mempolicy *mpol;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask);
+					htlb_alloc_mask, &mpol);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
···
 			break;
 		}
 	}
+	mpol_free(mpol);	/* unref if mpol !NULL */
 	return page;
 }
mm/mempolicy.c (+70 -9)

···
 
 #endif
 
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current].  It is the caller's responsibility to
+ * free the reference in these cases.
+ */
 static struct mempolicy * get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
+	int shared_pol = 0;
 
 	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
 			pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
+			shared_pol = 1;	/* if pol non-NULL, add ref below */
+		} else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
+	else if (!shared_pol && pol != current->mempolicy)
+		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
···
 }
 
 #ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference.  For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-				gfp_t gfp_flags)
+				gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
+	*mpol = NULL;		/* probably no unref needed */
 	if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		__mpol_free(pol);		/* finished with pol */
 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
 	}
-	return zonelist_policy(GFP_HIGHUSER, pol);
+
+	zl = zonelist_policy(GFP_HIGHUSER, pol);
+	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+		if (pol->policy != MPOL_BIND)
+			__mpol_free(pol);	/* finished with pol */
+		else
+			*mpol = pol;	/* unref needed after allocation */
+	}
+	return zl;
 }
 #endif
···
 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
 	cpuset_update_task_memory_state();
···
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+	zl = zonelist_policy(gfp, pol);
+	if (pol != &default_policy && pol != current->mempolicy) {
+		/*
+		 * slow path: ref counted policy -- shared or vma
+		 */
+		struct page *page = __alloc_pages(gfp, 0, zl);
+		__mpol_free(pol);
+		return page;
+	}
+	/*
+	 * fast path:  default or task policy
+	 */
+	return __alloc_pages(gfp, 0, zl);
 }
 
 /**
···
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mempolicy *pol;
 	int n;
 	char buffer[50];
···
 	if (!md)
 		return 0;
 
-	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(priv->task, vma, vma->vm_start));
+	pol = get_vma_policy(priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol);
+	/*
+	 * unref shared or other task's mempolicy
+	 */
+	if (pol != &default_policy && pol != current->mempolicy)
+		__mpol_free(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 