Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] VM: early zone reclaim

This is the core of the (much simplified) early reclaim. The goal of this
patch is to reclaim some easily-freed pages from a zone before falling back
onto another zone.

One of the major uses of this is NUMA machines. With the default allocator
behavior the allocator would look for memory in another zone, which might be
off-node, before trying to reclaim from the current zone.

This adds a zone tuneable to enable early zone reclaim. It is selected on a
per-zone basis and is turned on/off via syscall.

Adding some extra throttling on the reclaim was also required (patch
4/4). Without it, the machine would grind to a crawl when doing a "make -j"
kernel build. Even with this patch the System Time is higher on
average, but it seems tolerable. Here are some numbers for kernbench
runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

wall user sys %cpu ctx sw. sleeps
---- ---- --- ---- ------ ------
No patch 1009 1384 847 258 298170 504402
w/patch, no reclaim 880 1376 667 288 254064 396745
w/patch & reclaim 1079 1385 926 252 291625 548873

These numbers are the average of 2 runs of 3 "make -j" runs done right
after system boot. Run-to-run variability for "make -j" is huge, so
these numbers aren't terribly useful except to see that with reclaim
the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the
reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages
takes 700 seconds with reclaim turned on and 735 seconds without reclaim
(due to remote memory accesses).

The simple zone_reclaim syscall program is at
http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Martin Hicks and committed by
Linus Torvalds
753ee728 bfbb38fb

+104 -8
+1 -1
arch/i386/kernel/syscall_table.S
··· 251 251 .long sys_io_submit 252 252 .long sys_io_cancel 253 253 .long sys_fadvise64 /* 250 */ 254 - .long sys_ni_syscall 254 + .long sys_set_zone_reclaim 255 255 .long sys_exit_group 256 256 .long sys_lookup_dcookie 257 257 .long sys_epoll_create
+1 -1
arch/ia64/kernel/entry.S
··· 1579 1579 data8 sys_keyctl 1580 1580 data8 sys_ni_syscall 1581 1581 data8 sys_ni_syscall // 1275 1582 - data8 sys_ni_syscall 1582 + data8 sys_set_zone_reclaim 1583 1583 data8 sys_ni_syscall 1584 1584 data8 sys_ni_syscall 1585 1585 data8 sys_ni_syscall
+1 -1
include/asm-i386/unistd.h
··· 256 256 #define __NR_io_submit 248 257 257 #define __NR_io_cancel 249 258 258 #define __NR_fadvise64 250 259 - 259 + #define __NR_set_zone_reclaim 251 260 260 #define __NR_exit_group 252 261 261 #define __NR_lookup_dcookie 253 262 262 #define __NR_epoll_create 254
+1
include/asm-ia64/unistd.h
··· 263 263 #define __NR_add_key 1271 264 264 #define __NR_request_key 1272 265 265 #define __NR_keyctl 1273 266 + #define __NR_set_zone_reclaim 1276 266 267 267 268 #ifdef __KERNEL__ 268 269
+6
include/linux/mmzone.h
··· 145 145 int all_unreclaimable; /* All pages pinned */ 146 146 147 147 /* 148 + * Does the allocator try to reclaim pages from the zone as soon 149 + * as it fails a watermark_ok() in __alloc_pages? 150 + */ 151 + int reclaim_pages; 152 + 153 + /* 148 154 * prev_priority holds the scanning priority for this zone. It is 149 155 * defined as the scanning priority at which we achieved our reclaim 150 156 * target at the previous try_to_free_pages() or balance_pgdat()
+1
include/linux/swap.h
··· 173 173 174 174 /* linux/mm/vmscan.c */ 175 175 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); 176 + extern int zone_reclaim(struct zone *, unsigned int, unsigned int); 176 177 extern int shrink_all_memory(int); 177 178 extern int vm_swappiness; 178 179
+1
kernel/sys_ni.c
··· 77 77 cond_syscall(sys_keyctl); 78 78 cond_syscall(compat_sys_keyctl); 79 79 cond_syscall(compat_sys_socketcall); 80 + cond_syscall(sys_set_zone_reclaim); 80 81 81 82 /* arch-specific weak syscall entries */ 82 83 cond_syscall(sys_pciconfig_read);
+28 -5
mm/page_alloc.c
··· 724 724 return 1; 725 725 } 726 726 727 + static inline int 728 + should_reclaim_zone(struct zone *z, unsigned int gfp_mask) 729 + { 730 + if (!z->reclaim_pages) 731 + return 0; 732 + return 1; 733 + } 734 + 727 735 /* 728 736 * This is the 'heart' of the zoned buddy allocator. 729 737 */ ··· 768 760 769 761 classzone_idx = zone_idx(zones[0]); 770 762 771 - restart: 763 + restart: 772 764 /* Go through the zonelist once, looking for a zone with enough free */ 773 765 for (i = 0; (z = zones[i]) != NULL; i++) { 774 - 775 - if (!zone_watermark_ok(z, order, z->pages_low, 776 - classzone_idx, 0, 0)) 777 - continue; 766 + int do_reclaim = should_reclaim_zone(z, gfp_mask); 778 767 779 768 if (!cpuset_zone_allowed(z)) 780 769 continue; 770 + 771 + /* 772 + * If the zone is to attempt early page reclaim then this loop 773 + * will try to reclaim pages and check the watermark a second 774 + * time before giving up and falling back to the next zone. 775 + */ 776 + zone_reclaim_retry: 777 + if (!zone_watermark_ok(z, order, z->pages_low, 778 + classzone_idx, 0, 0)) { 779 + if (!do_reclaim) 780 + continue; 781 + else { 782 + zone_reclaim(z, gfp_mask, order); 783 + /* Only try reclaim once */ 784 + do_reclaim = 0; 785 + goto zone_reclaim_retry; 786 + } 787 + } 781 788 782 789 page = buffered_rmqueue(z, order, gfp_mask); 783 790 if (page)
+64
mm/vmscan.c
··· 1323 1323 } 1324 1324 1325 1325 module_init(kswapd_init) 1326 + 1327 + 1328 + /* 1329 + * Try to free up some pages from this zone through reclaim. 1330 + */ 1331 + int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) 1332 + { 1333 + struct scan_control sc; 1334 + int nr_pages = 1 << order; 1335 + int total_reclaimed = 0; 1336 + 1337 + /* The reclaim may sleep, so don't do it if sleep isn't allowed */ 1338 + if (!(gfp_mask & __GFP_WAIT)) 1339 + return 0; 1340 + if (zone->all_unreclaimable) 1341 + return 0; 1342 + 1343 + sc.gfp_mask = gfp_mask; 1344 + sc.may_writepage = 0; 1345 + sc.may_swap = 0; 1346 + sc.nr_mapped = read_page_state(nr_mapped); 1347 + sc.nr_scanned = 0; 1348 + sc.nr_reclaimed = 0; 1349 + /* scan at the highest priority */ 1350 + sc.priority = 0; 1351 + 1352 + if (nr_pages > SWAP_CLUSTER_MAX) 1353 + sc.swap_cluster_max = nr_pages; 1354 + else 1355 + sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1356 + 1357 + shrink_zone(zone, &sc); 1358 + total_reclaimed = sc.nr_reclaimed; 1359 + 1360 + return total_reclaimed; 1361 + } 1362 + 1363 + asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, 1364 + unsigned int state) 1365 + { 1366 + struct zone *z; 1367 + int i; 1368 + 1369 + if (node >= MAX_NUMNODES || !node_online(node)) 1370 + return -EINVAL; 1371 + 1372 + /* This will break if we ever add more zones */ 1373 + if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) 1374 + return -EINVAL; 1375 + 1376 + for (i = 0; i < MAX_NR_ZONES; i++) { 1377 + if (!(zone & 1<<i)) 1378 + continue; 1379 + 1380 + z = &NODE_DATA(node)->node_zones[i]; 1381 + 1382 + if (state) 1383 + z->reclaim_pages = 1; 1384 + else 1385 + z->reclaim_pages = 0; 1386 + } 1387 + 1388 + return 0; 1389 + }