Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] VM: early zone reclaim

This is the core of the (much simplified) early reclaim. The goal of this
patch is to reclaim some easily-freed pages from a zone before falling back
onto another zone.

One of the major uses of this is NUMA machines. With the default allocator
behavior the allocator would look for memory in another zone, which might be
off-node, before trying to reclaim from the current zone.

This adds a zone tuneable to enable early zone reclaim. It is selected on a
per-zone basis and is turned on/off via syscall.

Adding some extra throttling on the reclaim was also required (patch
4/4). Without it, the machine would grind to a crawl when doing a "make -j"
kernel build. Even with this patch the System Time is higher on
average, but it seems tolerable. Here are some numbers for kernbench
runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

wall user sys %cpu ctx sw. sleeps
---- ---- --- ---- ------ ------
No patch 1009 1384 847 258 298170 504402
w/patch, no reclaim 880 1376 667 288 254064 396745
w/patch & reclaim 1079 1385 926 252 291625 548873

These numbers are the average of 2 runs of 3 "make -j" runs done right
after system boot. Run-to-run variability for "make -j" is huge, so
these numbers aren't terribly useful except to see that with reclaim
the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the
reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages
takes 700 seconds with reclaim turned on and 735 seconds without reclaim
(due to remote memory accesses).

The simple zone_reclaim syscall program is at
http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Martin Hicks and committed by
Linus Torvalds
753ee728 bfbb38fb

+104 -8
+1 -1
arch/i386/kernel/syscall_table.S
··· 251 251 .long sys_io_submit 252 252 .long sys_io_cancel 253 253 .long sys_fadvise64 /* 250 */ 254 - .long sys_ni_syscall 254 + .long sys_set_zone_reclaim 255 255 .long sys_exit_group 256 256 .long sys_lookup_dcookie 257 257 .long sys_epoll_create
+1 -1
arch/ia64/kernel/entry.S
··· 1579 1579 data8 sys_keyctl 1580 1580 data8 sys_ni_syscall 1581 1581 data8 sys_ni_syscall // 1275 1582 - data8 sys_ni_syscall 1582 + data8 sys_set_zone_reclaim 1583 1583 data8 sys_ni_syscall 1584 1584 data8 sys_ni_syscall 1585 1585 data8 sys_ni_syscall
+1 -1
include/asm-i386/unistd.h
··· 256 256 #define __NR_io_submit 248 257 257 #define __NR_io_cancel 249 258 258 #define __NR_fadvise64 250 259 - 259 + #define __NR_set_zone_reclaim 251 260 260 #define __NR_exit_group 252 261 261 #define __NR_lookup_dcookie 253 262 262 #define __NR_epoll_create 254
+1
include/asm-ia64/unistd.h
··· 263 263 #define __NR_add_key 1271 264 264 #define __NR_request_key 1272 265 265 #define __NR_keyctl 1273 266 + #define __NR_set_zone_reclaim 1276 266 267 267 268 #ifdef __KERNEL__ 268 269
+6
include/linux/mmzone.h
··· 145 145 int all_unreclaimable; /* All pages pinned */ 146 146 147 147 /* 148 + * Does the allocator try to reclaim pages from the zone as soon 149 + * as it fails a watermark_ok() in __alloc_pages? 150 + */ 151 + int reclaim_pages; 152 + 153 + /* 148 154 * prev_priority holds the scanning priority for this zone. It is 149 155 * defined as the scanning priority at which we achieved our reclaim 150 156 * target at the previous try_to_free_pages() or balance_pgdat()
+1
include/linux/swap.h
··· 173 173 174 174 /* linux/mm/vmscan.c */ 175 175 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); 176 + extern int zone_reclaim(struct zone *, unsigned int, unsigned int); 176 177 extern int shrink_all_memory(int); 177 178 extern int vm_swappiness; 178 179
+1
kernel/sys_ni.c
··· 77 77 cond_syscall(sys_keyctl); 78 78 cond_syscall(compat_sys_keyctl); 79 79 cond_syscall(compat_sys_socketcall); 80 + cond_syscall(sys_set_zone_reclaim); 80 81 81 82 /* arch-specific weak syscall entries */ 82 83 cond_syscall(sys_pciconfig_read);
+28 -5
mm/page_alloc.c
··· 724 724 return 1; 725 725 } 726 726 727 + static inline int 728 + should_reclaim_zone(struct zone *z, unsigned int gfp_mask) 729 + { 730 + if (!z->reclaim_pages) 731 + return 0; 732 + return 1; 733 + } 734 + 727 735 /* 728 736 * This is the 'heart' of the zoned buddy allocator. 729 737 */ ··· 768 760 769 761 classzone_idx = zone_idx(zones[0]); 770 762 771 - restart: 763 + restart: 772 764 /* Go through the zonelist once, looking for a zone with enough free */ 773 765 for (i = 0; (z = zones[i]) != NULL; i++) { 774 - 775 - if (!zone_watermark_ok(z, order, z->pages_low, 776 - classzone_idx, 0, 0)) 777 - continue; 766 + int do_reclaim = should_reclaim_zone(z, gfp_mask); 778 767 779 768 if (!cpuset_zone_allowed(z)) 780 769 continue; 770 + 771 + /* 772 + * If the zone is to attempt early page reclaim then this loop 773 + * will try to reclaim pages and check the watermark a second 774 + * time before giving up and falling back to the next zone. 775 + */ 776 + zone_reclaim_retry: 777 + if (!zone_watermark_ok(z, order, z->pages_low, 778 + classzone_idx, 0, 0)) { 779 + if (!do_reclaim) 780 + continue; 781 + else { 782 + zone_reclaim(z, gfp_mask, order); 783 + /* Only try reclaim once */ 784 + do_reclaim = 0; 785 + goto zone_reclaim_retry; 786 + } 787 + } 781 788 782 789 page = buffered_rmqueue(z, order, gfp_mask); 783 790 if (page)
+64
mm/vmscan.c
··· 1323 1323 } 1324 1324 1325 1325 module_init(kswapd_init) 1326 + 1327 + 1328 + /* 1329 + * Try to free up some pages from this zone through reclaim. 1330 + */ 1331 + int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) 1332 + { 1333 + struct scan_control sc; 1334 + int nr_pages = 1 << order; 1335 + int total_reclaimed = 0; 1336 + 1337 + /* The reclaim may sleep, so don't do it if sleep isn't allowed */ 1338 + if (!(gfp_mask & __GFP_WAIT)) 1339 + return 0; 1340 + if (zone->all_unreclaimable) 1341 + return 0; 1342 + 1343 + sc.gfp_mask = gfp_mask; 1344 + sc.may_writepage = 0; 1345 + sc.may_swap = 0; 1346 + sc.nr_mapped = read_page_state(nr_mapped); 1347 + sc.nr_scanned = 0; 1348 + sc.nr_reclaimed = 0; 1349 + /* scan at the highest priority */ 1350 + sc.priority = 0; 1351 + 1352 + if (nr_pages > SWAP_CLUSTER_MAX) 1353 + sc.swap_cluster_max = nr_pages; 1354 + else 1355 + sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1356 + 1357 + shrink_zone(zone, &sc); 1358 + total_reclaimed = sc.nr_reclaimed; 1359 + 1360 + return total_reclaimed; 1361 + } 1362 + 1363 + asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, 1364 + unsigned int state) 1365 + { 1366 + struct zone *z; 1367 + int i; 1368 + 1369 + if (node >= MAX_NUMNODES || !node_online(node)) 1370 + return -EINVAL; 1371 + 1372 + /* This will break if we ever add more zones */ 1373 + if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) 1374 + return -EINVAL; 1375 + 1376 + for (i = 0; i < MAX_NR_ZONES; i++) { 1377 + if (!(zone & 1<<i)) 1378 + continue; 1379 + 1380 + z = &NODE_DATA(node)->node_zones[i]; 1381 + 1382 + if (state) 1383 + z->reclaim_pages = 1; 1384 + else 1385 + z->reclaim_pages = 0; 1386 + } 1387 + 1388 + return 0; 1389 + }