Merge branch 'slub-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/christoph/vm

* 'slub-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/christoph/vm:
SLUB: fix checkpatch warnings
Use non atomic unlock
SLUB: Support for performance statistics
SLUB: Alternate fast paths using cmpxchg_local
SLUB: Use unique end pointer for each slab page.
SLUB: Deal with annoying gcc warning on kfree()

6 files changed, 457 insertions(+), 63 deletions(-)
Documentation/vm/slabinfo.c | +138 -11
···
         int sanity_checks, slab_size, store_user, trace;
         int order, poison, reclaim_account, red_zone;
         unsigned long partial, objects, slabs;
+        unsigned long alloc_fastpath, alloc_slowpath;
+        unsigned long free_fastpath, free_slowpath;
+        unsigned long free_frozen, free_add_partial, free_remove_partial;
+        unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
+        unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
+        unsigned long deactivate_to_head, deactivate_to_tail;
+        unsigned long deactivate_remote_frees;
         int numa[MAX_NODES];
         int numa_partial[MAX_NODES];
 } slabinfo[MAX_SLABS];
···
 int show_single_ref = 0;
 int show_totals = 0;
 int sort_size = 0;
+int sort_active = 0;
 int set_debug = 0;
 int show_ops = 0;
+int show_activity = 0;
 
 /* Debug options */
 int sanity = 0;
···
         printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
                 "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
                 "-a|--aliases Show aliases\n"
+                "-A|--activity Most active slabs first\n"
                 "-d<options>|--debug=<options> Set/Clear Debug options\n"
-                "-e|--empty Show empty slabs\n"
+                "-D|--display-active Switch line format to activity\n"
+                "-e|--empty Show empty slabs\n"
                 "-f|--first-alias Show first alias\n"
                 "-h|--help Show usage information\n"
                 "-i|--inverted Inverted list\n"
···
 
 void first_line(void)
 {
-        printf("Name Objects Objsize Space "
-                "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+        if (show_activity)
+                printf("Name Objects Alloc Free %%Fast\n");
+        else
+                printf("Name Objects Objsize Space "
+                        "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
 }
···
 unsigned long slab_size(struct slabinfo *s)
 {
         return s->slabs * (page_size << s->order);
+}
+
+unsigned long slab_activity(struct slabinfo *s)
+{
+        return s->alloc_fastpath + s->free_fastpath +
+                s->alloc_slowpath + s->free_slowpath;
 }
 
 void slab_numa(struct slabinfo *s, int mode)
···
                 return "Off";
 }
 
+void slab_stats(struct slabinfo *s)
+{
+        unsigned long total_alloc;
+        unsigned long total_free;
+        unsigned long total;
+
+        if (!s->alloc_slab)
+                return;
+
+        total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+        total_free = s->free_fastpath + s->free_slowpath;
+
+        if (!total_alloc)
+                return;
+
+        printf("\n");
+        printf("Slab Perf Counter Alloc Free %%Al %%Fr\n");
+        printf("--------------------------------------------------\n");
+        printf("Fastpath %8lu %8lu %3lu %3lu\n",
+                s->alloc_fastpath, s->free_fastpath,
+                s->alloc_fastpath * 100 / total_alloc,
+                s->free_fastpath * 100 / total_free);
+        printf("Slowpath %8lu %8lu %3lu %3lu\n",
+                total_alloc - s->alloc_fastpath, s->free_slowpath,
+                (total_alloc - s->alloc_fastpath) * 100 / total_alloc,
+                s->free_slowpath * 100 / total_free);
+        printf("Page Alloc %8lu %8lu %3lu %3lu\n",
+                s->alloc_slab, s->free_slab,
+                s->alloc_slab * 100 / total_alloc,
+                s->free_slab * 100 / total_free);
+        printf("Add partial %8lu %8lu %3lu %3lu\n",
+                s->deactivate_to_head + s->deactivate_to_tail,
+                s->free_add_partial,
+                (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
+                s->free_add_partial * 100 / total_free);
+        printf("Remove partial %8lu %8lu %3lu %3lu\n",
+                s->alloc_from_partial, s->free_remove_partial,
+                s->alloc_from_partial * 100 / total_alloc,
+                s->free_remove_partial * 100 / total_free);
+
+        printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
+                s->deactivate_remote_frees, s->free_frozen,
+                s->deactivate_remote_frees * 100 / total_alloc,
+                s->free_frozen * 100 / total_free);
+
+        printf("Total %8lu %8lu\n\n", total_alloc, total_free);
+
+        if (s->cpuslab_flush)
+                printf("Flushes %8lu\n", s->cpuslab_flush);
+
+        if (s->alloc_refill)
+                printf("Refill %8lu\n", s->alloc_refill);
+
+        total = s->deactivate_full + s->deactivate_empty +
+                        s->deactivate_to_head + s->deactivate_to_tail;
+
+        if (total)
+                printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
+                        "ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
+                        s->deactivate_full, (s->deactivate_full * 100) / total,
+                        s->deactivate_empty, (s->deactivate_empty * 100) / total,
+                        s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
+                        s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+}
+
 void report(struct slabinfo *s)
 {
         if (strcmp(s->name, "*") == 0)
···
         ops(s);
         show_tracking(s);
         slab_numa(s, 1);
+        slab_stats(s);
 }
 
 void slabcache(struct slabinfo *s)
···
                 *p++ = 'T';
 
         *p = 0;
-        printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
-                s->name, s->objects, s->object_size, size_str, dist_str,
-                s->objs_per_slab, s->order,
-                s->slabs ? (s->partial * 100) / s->slabs : 100,
-                s->slabs ? (s->objects * s->object_size * 100) /
-                        (s->slabs * (page_size << s->order)) : 100,
-                flags);
+        if (show_activity) {
+                unsigned long total_alloc;
+                unsigned long total_free;
+
+                total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+                total_free = s->free_fastpath + s->free_slowpath;
+
+                printf("%-21s %8ld %8ld %8ld %3ld %3ld \n",
+                        s->name, s->objects,
+                        total_alloc, total_free,
+                        total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
+                        total_free ? (s->free_fastpath * 100 / total_free) : 0);
+        }
+        else
+                printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
+                        s->name, s->objects, s->object_size, size_str, dist_str,
+                        s->objs_per_slab, s->order,
+                        s->slabs ? (s->partial * 100) / s->slabs : 100,
+                        s->slabs ? (s->objects * s->object_size * 100) /
+                                (s->slabs * (page_size << s->order)) : 100,
+                        flags);
 }
 
 /*
···
 
         if (sort_size)
                 result = slab_size(s1) < slab_size(s2);
+        else if (sort_active)
+                result = slab_activity(s1) < slab_activity(s2);
         else
                 result = strcasecmp(s1->name, s2->name);
 
···
         free(t);
         slab->store_user = get_obj("store_user");
         slab->trace = get_obj("trace");
+        slab->alloc_fastpath = get_obj("alloc_fastpath");
+        slab->alloc_slowpath = get_obj("alloc_slowpath");
+        slab->free_fastpath = get_obj("free_fastpath");
+        slab->free_slowpath = get_obj("free_slowpath");
+        slab->free_frozen = get_obj("free_frozen");
+        slab->free_add_partial = get_obj("free_add_partial");
+        slab->free_remove_partial = get_obj("free_remove_partial");
+        slab->alloc_from_partial = get_obj("alloc_from_partial");
+        slab->alloc_slab = get_obj("alloc_slab");
+        slab->alloc_refill = get_obj("alloc_refill");
+        slab->free_slab = get_obj("free_slab");
+        slab->cpuslab_flush = get_obj("cpuslab_flush");
+        slab->deactivate_full = get_obj("deactivate_full");
+        slab->deactivate_empty = get_obj("deactivate_empty");
+        slab->deactivate_to_head = get_obj("deactivate_to_head");
+        slab->deactivate_to_tail = get_obj("deactivate_to_tail");
+        slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
         chdir("..");
         if (slab->name[0] == ':')
                 alias_targets++;
···
 
 struct option opts[] = {
         { "aliases", 0, NULL, 'a' },
+        { "activity", 0, NULL, 'A' },
         { "debug", 2, NULL, 'd' },
+        { "display-activity", 0, NULL, 'D' },
         { "empty", 0, NULL, 'e' },
         { "first-alias", 0, NULL, 'f' },
         { "help", 0, NULL, 'h' },
···
 
         page_size = getpagesize();
 
-        while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
+        while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
                         opts, NULL)) != -1)
                 switch (c) {
                 case '1':
···
                 case 'a':
                         show_alias = 1;
                         break;
+                case 'A':
+                        sort_active = 1;
+                        break;
                 case 'd':
                         set_debug = 1;
                         if (!debug_opt_scan(optarg))
                                 fatal("Invalid debug option '%s'\n", optarg);
+                        break;
+                case 'D':
+                        show_activity = 1;
                         break;
                 case 'e':
                         show_empty = 1;
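slabinfo fills these new fields through get_obj(), which reads one sysfs attribute per counter from the cache's directory. The attributes are created by the mm/slub.c changes further down; each file holds the sum across CPUs, optionally followed by per-cpu "cN=value" entries, so the first numeric field is the total. A minimal standalone reader along the same lines — the /sys/kernel/slab/<cache>/<counter> layout is the existing SLUB sysfs convention, while the kmalloc-64 cache name and the helper below are only illustrative:

#include <stdio.h>
#include <stdlib.h>

/*
 * Read one SLUB statistics attribute, e.g.
 * /sys/kernel/slab/kmalloc-64/alloc_fastpath.
 * The file contains the sum over all CPUs first, so strtoul() on the
 * leading field is enough to get the total.
 */
static unsigned long read_slab_stat(const char *cache, const char *stat)
{
        char path[256];
        char buf[4096];
        FILE *f;
        unsigned long val = 0;

        snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", cache, stat);
        f = fopen(path, "r");
        if (!f)
                return 0;       /* cache absent or CONFIG_SLUB_STATS disabled */
        if (fgets(buf, sizeof(buf), f))
                val = strtoul(buf, NULL, 10);
        fclose(f);
        return val;
}

int main(void)
{
        unsigned long fast = read_slab_stat("kmalloc-64", "alloc_fastpath");
        unsigned long slow = read_slab_stat("kmalloc-64", "alloc_slowpath");
        unsigned long total = fast + slow;

        if (total)
                printf("kmalloc-64: %lu allocations, %lu%% via fastpath\n",
                        total, fast * 100 / total);
        return 0;
}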
arch/x86/Kconfig | +4
···
 config SEMAPHORE_SLEEPERS
         def_bool y
 
+config FAST_CMPXCHG_LOCAL
+        bool
+        default y
+
 config MMU
         def_bool y
 
include/linux/mm_types.h | +4 -1
···
 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
         spinlock_t ptl;
 #endif
-        struct kmem_cache *slab;        /* SLUB: Pointer to slab */
+        struct {
+                struct kmem_cache *slab;        /* SLUB: Pointer to slab */
+                void *end;                      /* SLUB: end marker */
+        };
         struct page *first_page;        /* Compound tail pages */
         };
         union {
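The new end field occupies the union slot that otherwise holds page->mapping, which is why mm/slub.c below both relies on page_mapping() returning NULL when bit 0 of that word is set and clears page->mapping before handing the page back to the page allocator. The marker itself is the address of the slab's first object with PAGE_MAPPING_ANON (bit 0) or'ed in. A small user-space sketch of that tagging scheme (make_end, is_end_marker and untag are illustrative names, not from the patch):

#include <assert.h>
#include <stdint.h>

#define END_TAG 0x1UL   /* stands in for PAGE_MAPPING_ANON (bit 0) */

/* Tag the address of the first object so it can serve as a list terminator. */
static void *make_end(void *first_object)
{
        return (void *)((uintptr_t)first_object | END_TAG);
}

/* Mirrors is_end(): a bit-0-tagged pointer terminates the freelist walk. */
static int is_end_marker(void *p)
{
        return (uintptr_t)p & END_TAG;
}

/* Mirrors slab_address(): strip the tag to recover the slab base address. */
static void *untag(void *end)
{
        return (void *)((uintptr_t)end & ~END_TAG);
}

int main(void)
{
        long slab_page[8];              /* pretend slab page; word aligned, so bit 0 is clear */
        void *end = make_end(slab_page);

        assert(is_end_marker(end));
        assert(!is_end_marker(slab_page));
        assert(untag(end) == (void *)slab_page);
        return 0;
}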
include/linux/slub_def.h | +23
···
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
 
+enum stat_item {
+        ALLOC_FASTPATH,         /* Allocation from cpu slab */
+        ALLOC_SLOWPATH,         /* Allocation by getting a new cpu slab */
+        FREE_FASTPATH,          /* Free to cpu slab */
+        FREE_SLOWPATH,          /* Freeing not to cpu slab */
+        FREE_FROZEN,            /* Freeing to frozen slab */
+        FREE_ADD_PARTIAL,       /* Freeing moves slab to partial list */
+        FREE_REMOVE_PARTIAL,    /* Freeing removes last object */
+        ALLOC_FROM_PARTIAL,     /* Cpu slab acquired from partial list */
+        ALLOC_SLAB,             /* Cpu slab acquired from page allocator */
+        ALLOC_REFILL,           /* Refill cpu slab from slab freelist */
+        FREE_SLAB,              /* Slab freed to the page allocator */
+        CPUSLAB_FLUSH,          /* Abandoning of the cpu slab */
+        DEACTIVATE_FULL,        /* Cpu slab was full when deactivated */
+        DEACTIVATE_EMPTY,       /* Cpu slab was empty when deactivated */
+        DEACTIVATE_TO_HEAD,     /* Cpu slab was moved to the head of partials */
+        DEACTIVATE_TO_TAIL,     /* Cpu slab was moved to the tail of partials */
+        DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
+        NR_SLUB_STAT_ITEMS };
+
 struct kmem_cache_cpu {
         void **freelist;        /* Pointer to first free per cpu object */
         struct page *page;      /* The slab from which we are allocating */
         int node;               /* The node of the page (or -1 for debug) */
         unsigned int offset;    /* Freepointer offset (in word units) */
         unsigned int objsize;   /* Size of an object (from kmem_cache) */
+#ifdef CONFIG_SLUB_STATS
+        unsigned stat[NR_SLUB_STAT_ITEMS];
+#endif
 };
 
 struct kmem_cache_node {
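Each kmem_cache_cpu grows a plain array of counters indexed by stat_item; because the structure is strictly per cpu, an ordinary non-atomic increment suffices, and the whole array compiles away when CONFIG_SLUB_STATS is off. A user-space sketch of that pattern, mirroring the stat() helper added in mm/slub.c further down (all *_demo names are invented for illustration):

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for CONFIG_SLUB_STATS; remove to compile the counters away. */
#define SLUB_STATS_DEMO

enum stat_item_demo { ALLOC_FASTPATH_D, ALLOC_SLOWPATH_D, NR_ITEMS_D };

struct cpu_slab_demo {
        void **freelist;                /* the "real" hot data stays first */
#ifdef SLUB_STATS_DEMO
        unsigned stat[NR_ITEMS_D];      /* per-cpu event counters, no atomics needed */
#endif
};

/* Like SLUB's stat() helper: compiles to nothing when statistics are disabled. */
static inline void stat_demo(struct cpu_slab_demo *c, enum stat_item_demo si)
{
#ifdef SLUB_STATS_DEMO
        c->stat[si]++;
#endif
}

int main(void)
{
        struct cpu_slab_demo c;

        memset(&c, 0, sizeof(c));
        stat_demo(&c, ALLOC_FASTPATH_D);
        stat_demo(&c, ALLOC_FASTPATH_D);
        stat_demo(&c, ALLOC_SLOWPATH_D);
#ifdef SLUB_STATS_DEMO
        printf("fastpath=%u slowpath=%u\n",
                c.stat[ALLOC_FASTPATH_D], c.stat[ALLOC_SLOWPATH_D]);
#endif
        return 0;
}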
lib/Kconfig.debug | +13
···
           off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
           "slub_debug=-".
 
+config SLUB_STATS
+        default n
+        bool "Enable SLUB performance statistics"
+        depends on SLUB
+        help
+          SLUB statistics are useful to debug SLUB's allocation behavior in
+          order to find ways to optimize the allocator. This should never be
+          enabled for production use since keeping statistics slows down
+          the allocator by a few percentage points. The slabinfo command
+          supports the determination of the most active slabs to figure
+          out which slabs are relevant to a particular load.
+          Try running: slabinfo -DA
+
 config DEBUG_PREEMPT
         bool "Debug preemptible kernel"
         depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
mm/slub.c | +275 -51
···
 /* Enable to test recovery from slab corruption on boot */
 #undef SLUB_RESILIENCY_TEST
 
+/*
+ * Currently fastpath is not supported if preemption is enabled.
+ */
+#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
+#define SLUB_FASTPATH
+#endif
+
 #if PAGE_SHIFT <= 12
 
 /*
···
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 static void sysfs_slab_remove(struct kmem_cache *);
+
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
···
 {
         kfree(s);
 }
+
 #endif
+
+static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
+{
+#ifdef CONFIG_SLUB_STATS
+        c->stat[si]++;
+#endif
+}
 
 /********************************************************************
  *                      Core slab cache functions
···
 #endif
 }
 
+/*
+ * The end pointer in a slab is special. It points to the first object in the
+ * slab but has bit 0 set to mark it.
+ *
+ * Note that SLUB relies on page_mapping returning NULL for pages with bit 0
+ * in the mapping set.
+ */
+static inline int is_end(void *addr)
+{
+        return (unsigned long)addr & PAGE_MAPPING_ANON;
+}
+
+void *slab_address(struct page *page)
+{
+        return page->end - PAGE_MAPPING_ANON;
+}
+
 static inline int check_valid_pointer(struct kmem_cache *s,
                                 struct page *page, const void *object)
 {
         void *base;
 
-        if (!object)
+        if (object == page->end)
                 return 1;
 
-        base = page_address(page);
+        base = slab_address(page);
         if (object < base || object >= base + s->objects * s->size ||
                 (object - base) % s->size) {
                 return 0;
···
 
 /* Scan freelist */
 #define for_each_free_object(__p, __s, __free) \
-        for (__p = (__free); __p; __p = get_freepointer((__s), __p))
+        for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\
+                __p))
 
 /* Determine object index from a given position */
 static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
···
 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 {
         unsigned int off;       /* Offset of last byte */
-        u8 *addr = page_address(page);
+        u8 *addr = slab_address(page);
 
         print_tracking(s, p);
···
         if (!(s->flags & SLAB_POISON))
                 return 1;
 
-        start = page_address(page);
+        start = slab_address(page);
         end = start + (PAGE_SIZE << s->order);
         length = s->objects * s->size;
         remainder = end - (start + length);
···
                         endobject, red, s->inuse - s->objsize))
                         return 0;
         } else {
-                if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
-                        check_bytes_and_report(s, page, p, "Alignment padding", endobject,
-                                POISON_INUSE, s->inuse - s->objsize);
+                if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
+                        check_bytes_and_report(s, page, p, "Alignment padding",
+                                endobject, POISON_INUSE, s->inuse - s->objsize);
+                }
         }
 
         if (s->flags & SLAB_POISON) {
···
                  * of the free objects in this slab. May cause
                  * another error because the object count is now wrong.
                  */
-                set_freepointer(s, p, NULL);
+                set_freepointer(s, p, page->end);
                 return 0;
         }
         return 1;
···
         void *fp = page->freelist;
         void *object = NULL;
 
-        while (fp && nr <= s->objects) {
+        while (fp != page->end && nr <= s->objects) {
                 if (fp == search)
                         return 1;
                 if (!check_valid_pointer(s, page, fp)) {
                         if (object) {
                                 object_err(s, page, object,
                                         "Freechain corrupt");
-                                set_freepointer(s, object, NULL);
+                                set_freepointer(s, object, page->end);
                                 break;
                         } else {
                                 slab_err(s, page, "Freepointer corrupt");
-                                page->freelist = NULL;
+                                page->freelist = page->end;
                                 page->inuse = s->objects;
                                 slab_fix(s, "Freelist cleared");
                                 return 0;
···
                  */
                 slab_fix(s, "Marking all objects used");
                 page->inuse = s->objects;
-                page->freelist = NULL;
+                page->freelist = page->end;
         }
         return 0;
 }
···
                 return 0;
 
         if (unlikely(s != page->slab)) {
-                if (!PageSlab(page))
+                if (!PageSlab(page)) {
                         slab_err(s, page, "Attempt to free object(0x%p) "
                                 "outside of slab", object);
-                else
-                if (!page->slab) {
+                } else if (!page->slab) {
                         printk(KERN_ERR
                                 "SLUB <none>: no slab for object 0x%p.\n",
                                 object);
···
         }
 
         /* Special debug activities for freeing objects */
-        if (!SlabFrozen(page) && !page->freelist)
+        if (!SlabFrozen(page) && page->freelist == page->end)
                 remove_full(s, page);
         if (s->flags & SLAB_STORE_USER)
                 set_track(s, object, TRACK_FREE, addr);
···
          */
         if (slub_debug && (!slub_debug_slabs ||
             strncmp(slub_debug_slabs, name,
-                strlen(slub_debug_slabs)) == 0))
+                        strlen(slub_debug_slabs)) == 0))
                         flags |= slub_debug;
 }
 
···
                 SetSlabDebug(page);
 
         start = page_address(page);
+        page->end = start + 1;
 
         if (unlikely(s->flags & SLAB_POISON))
                 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
···
                 last = p;
         }
         setup_object(s, page, last);
-        set_freepointer(s, last, NULL);
+        set_freepointer(s, last, page->end);
 
         page->freelist = start;
         page->inuse = 0;
···
                 void *p;
 
                 slab_pad_check(s, page);
-                for_each_object(p, s, page_address(page))
+                for_each_object(p, s, slab_address(page))
                         check_object(s, page, p, 0);
                 ClearSlabDebug(page);
         }
···
                 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
                 -pages);
 
+        page->mapping = NULL;
         __free_pages(page, s->order);
 }
 
···
 
 static __always_inline void slab_unlock(struct page *page)
 {
-        bit_spin_unlock(PG_locked, &page->flags);
+        __bit_spin_unlock(PG_locked, &page->flags);
 }
 
 static __always_inline int slab_trylock(struct page *page)
···
                         get_cycles() % 1024 > s->remote_node_defrag_ratio)
                 return NULL;
 
-        zonelist = &NODE_DATA(slab_node(current->mempolicy))
-                                        ->node_zonelists[gfp_zone(flags)];
+        zonelist = &NODE_DATA(
+                slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)];
         for (z = zonelist->zones; *z; z++) {
                 struct kmem_cache_node *n;
 
···
 static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
 {
         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+        struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
 
         ClearSlabFrozen(page);
         if (page->inuse) {
 
-                if (page->freelist)
+                if (page->freelist != page->end) {
                         add_partial(n, page, tail);
-                else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
-                        add_full(n, page);
+                        stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+                } else {
+                        stat(c, DEACTIVATE_FULL);
+                        if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
+                                add_full(n, page);
+                }
                 slab_unlock(page);
-
         } else {
+                stat(c, DEACTIVATE_EMPTY);
                 if (n->nr_partial < MIN_PARTIAL) {
                         /*
                          * Adding an empty slab to the partial slabs in order
···
                         slab_unlock(page);
                 } else {
                         slab_unlock(page);
+                        stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
                         discard_slab(s, page);
                 }
         }
···
 {
         struct page *page = c->page;
         int tail = 1;
+
+        if (c->freelist)
+                stat(c, DEACTIVATE_REMOTE_FREES);
         /*
          * Merge cpu freelist into freelist. Typically we get here
          * because both freelists are empty. So this is unlikely
          * to occur.
+         *
+         * We need to use _is_end here because deactivate slab may
+         * be called for a debug slab. Then c->freelist may contain
+         * a dummy pointer.
          */
-        while (unlikely(c->freelist)) {
+        while (unlikely(!is_end(c->freelist))) {
                 void **object;
 
                 tail = 0;       /* Hot objects. Put the slab first */
···
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
+        stat(c, CPUSLAB_FLUSH);
         slab_lock(c->page);
         deactivate_slab(s, c);
 }
···
 {
         void **object;
         struct page *new;
+#ifdef SLUB_FASTPATH
+        unsigned long flags;
 
+        local_irq_save(flags);
+#endif
         if (!c->page)
                 goto new_slab;
 
         slab_lock(c->page);
         if (unlikely(!node_match(c, node)))
                 goto another_slab;
+        stat(c, ALLOC_REFILL);
 load_freelist:
         object = c->page->freelist;
-        if (unlikely(!object))
+        if (unlikely(object == c->page->end))
                 goto another_slab;
         if (unlikely(SlabDebug(c->page)))
                 goto debug;
···
         object = c->page->freelist;
         c->freelist = object[c->offset];
         c->page->inuse = s->objects;
-        c->page->freelist = NULL;
+        c->page->freelist = c->page->end;
         c->node = page_to_nid(c->page);
+unlock_out:
         slab_unlock(c->page);
+        stat(c, ALLOC_SLOWPATH);
+out:
+#ifdef SLUB_FASTPATH
+        local_irq_restore(flags);
+#endif
         return object;
 
 another_slab:
···
         new = get_partial(s, gfpflags, node);
         if (new) {
                 c->page = new;
+                stat(c, ALLOC_FROM_PARTIAL);
                 goto load_freelist;
         }
 
···
 
         if (new) {
                 c = get_cpu_slab(s, smp_processor_id());
+                stat(c, ALLOC_SLAB);
                 if (c->page)
                         flush_slab(s, c);
                 slab_lock(new);
···
                 c->page = new;
                 goto load_freelist;
         }
-        return NULL;
+        object = NULL;
+        goto out;
 debug:
         object = c->page->freelist;
         if (!alloc_debug_processing(s, c->page, object, addr))
···
         c->page->inuse++;
         c->page->freelist = object[c->offset];
         c->node = -1;
-        slab_unlock(c->page);
-        return object;
+        goto unlock_out;
 }
 
 /*
···
                 gfp_t gfpflags, int node, void *addr)
 {
         void **object;
-        unsigned long flags;
         struct kmem_cache_cpu *c;
+
+/*
+ * The SLUB_FASTPATH path is provisional and is currently disabled if the
+ * kernel is compiled with preemption or if the arch does not support
+ * fast cmpxchg operations. There are a couple of coming changes that will
+ * simplify matters and allow preemption. Ultimately we may end up making
+ * SLUB_FASTPATH the default.
+ *
+ * 1. The introduction of the per cpu allocator will avoid array lookups
+ *    through get_cpu_slab(). A special register can be used instead.
+ *
+ * 2. The introduction of per cpu atomic operations (cpu_ops) means that
+ *    we can realize the logic here entirely with per cpu atomics. The
+ *    per cpu atomic ops will take care of the preemption issues.
+ */
+
+#ifdef SLUB_FASTPATH
+        c = get_cpu_slab(s, raw_smp_processor_id());
+        do {
+                object = c->freelist;
+                if (unlikely(is_end(object) || !node_match(c, node))) {
+                        object = __slab_alloc(s, gfpflags, node, addr, c);
+                        break;
+                }
+                stat(c, ALLOC_FASTPATH);
+        } while (cmpxchg_local(&c->freelist, object, object[c->offset])
+                                                                != object);
+#else
+        unsigned long flags;
 
         local_irq_save(flags);
         c = get_cpu_slab(s, smp_processor_id());
-        if (unlikely(!c->freelist || !node_match(c, node)))
+        if (unlikely(is_end(c->freelist) || !node_match(c, node)))
 
                 object = __slab_alloc(s, gfpflags, node, addr, c);
 
         else {
                 object = c->freelist;
                 c->freelist = object[c->offset];
+                stat(c, ALLOC_FASTPATH);
         }
         local_irq_restore(flags);
+#endif
 
         if (unlikely((gfpflags & __GFP_ZERO) && object))
                 memset(object, 0, c->objsize);
···
 {
         void *prior;
         void **object = (void *)x;
+        struct kmem_cache_cpu *c;
 
+#ifdef SLUB_FASTPATH
+        unsigned long flags;
+
+        local_irq_save(flags);
+#endif
+        c = get_cpu_slab(s, raw_smp_processor_id());
+        stat(c, FREE_SLOWPATH);
         slab_lock(page);
 
         if (unlikely(SlabDebug(page)))
···
         page->freelist = object;
         page->inuse--;
 
-        if (unlikely(SlabFrozen(page)))
+        if (unlikely(SlabFrozen(page))) {
+                stat(c, FREE_FROZEN);
                 goto out_unlock;
+        }
 
         if (unlikely(!page->inuse))
                 goto slab_empty;
···
          * was not on the partial list before
          * then add it.
          */
-        if (unlikely(!prior))
+        if (unlikely(prior == page->end)) {
                 add_partial(get_node(s, page_to_nid(page)), page, 1);
+                stat(c, FREE_ADD_PARTIAL);
+        }
 
 out_unlock:
         slab_unlock(page);
+#ifdef SLUB_FASTPATH
+        local_irq_restore(flags);
+#endif
         return;
 
 slab_empty:
-        if (prior)
+        if (prior != page->end) {
                 /*
                  * Slab still on the partial list.
                  */
                 remove_partial(s, page);
-
+                stat(c, FREE_REMOVE_PARTIAL);
+        }
         slab_unlock(page);
+        stat(c, FREE_SLAB);
+#ifdef SLUB_FASTPATH
+        local_irq_restore(flags);
+#endif
         discard_slab(s, page);
         return;
···
                 struct page *page, void *x, void *addr)
 {
         void **object = (void *)x;
-        unsigned long flags;
         struct kmem_cache_cpu *c;
+
+#ifdef SLUB_FASTPATH
+        void **freelist;
+
+        c = get_cpu_slab(s, raw_smp_processor_id());
+        debug_check_no_locks_freed(object, s->objsize);
+        do {
+                freelist = c->freelist;
+                barrier();
+                /*
+                 * If the compiler would reorder the retrieval of c->page to
+                 * come before c->freelist then an interrupt could
+                 * change the cpu slab before we retrieve c->freelist. We
+                 * could be matching on a page no longer active and put the
+                 * object onto the freelist of the wrong slab.
+                 *
+                 * On the other hand: If we already have the freelist pointer
+                 * then any change of cpu_slab will cause the cmpxchg to fail
+                 * since the freelist pointers are unique per slab.
+                 */
+                if (unlikely(page != c->page || c->node < 0)) {
+                        __slab_free(s, page, x, addr, c->offset);
+                        break;
+                }
+                object[c->offset] = freelist;
+                stat(c, FREE_FASTPATH);
+        } while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
+#else
+        unsigned long flags;
 
         local_irq_save(flags);
         debug_check_no_locks_freed(object, s->objsize);
···
         if (likely(page == c->page && c->node >= 0)) {
                 object[c->offset] = c->freelist;
                 c->freelist = object;
+                stat(c, FREE_FASTPATH);
         } else
                 __slab_free(s, page, x, addr, c->offset);
 
         local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
···
                 struct kmem_cache_cpu *c)
 {
         c->page = NULL;
-        c->freelist = NULL;
+        c->freelist = (void *)PAGE_MAPPING_ANON;
         c->node = 0;
         c->offset = s->offset / sizeof(void *);
         c->objsize = s->objsize;
···
                 goto unlock_out;
 
         realsize = kmalloc_caches[index].objsize;
-        text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize),
+        text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
+                                                        (unsigned int)realsize);
         s = kmalloc(kmem_size, flags & ~SLUB_DMA);
 
         if (!s || !text || !kmem_cache_open(s, flags, text,
···
 void kfree(const void *x)
 {
         struct page *page;
+        void *object = (void *)x;
 
         if (unlikely(ZERO_OR_NULL_PTR(x)))
                 return;
···
                 put_page(page);
                 return;
         }
-        slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
+        slab_free(page->slab, page, object, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kfree);
···
 #endif
 
 
-        printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
+        printk(KERN_INFO
+                "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
                 " CPUs=%d, Nodes=%d\n",
                 caches, cache_line_size(),
                 slub_min_order, slub_max_order, slub_min_objects,
···
 }
 
 static struct notifier_block __cpuinitdata slab_notifier = {
-        &slab_cpuup_callback, NULL, 0
+        .notifier_call = slab_cpuup_callback
 };
 
 #endif
···
                                 unsigned long *map)
 {
         void *p;
-        void *addr = page_address(page);
+        void *addr = slab_address(page);
 
         if (!check_slab(s, page) ||
                         !on_freelist(s, page, NULL))
···
         p = kzalloc(32, GFP_KERNEL);
         p[32 + sizeof(void *)] = 0x34;
         printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
-                " 0x34 -> -0x%p\n", p);
-        printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
+                        " 0x34 -> -0x%p\n", p);
+        printk(KERN_ERR
+                "If allocated object is overwritten then not detectable\n\n");
 
         validate_slab_cache(kmalloc_caches + 5);
         p = kzalloc(64, GFP_KERNEL);
···
         *p = 0x56;
         printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
                                                                         p);
-        printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
+        printk(KERN_ERR
+                "If allocated object is overwritten then not detectable\n\n");
         validate_slab_cache(kmalloc_caches + 6);
 
         printk(KERN_ERR "\nB. Corruption after free\n");
···
         p = kzalloc(256, GFP_KERNEL);
         kfree(p);
         p[50] = 0x9a;
-        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
+        printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+                                                                        p);
         validate_slab_cache(kmalloc_caches + 8);
 
         p = kzalloc(512, GFP_KERNEL);
···
 static void process_slab(struct loc_track *t, struct kmem_cache *s,
                 struct page *page, enum track_item alloc)
 {
-        void *addr = page_address(page);
+        void *addr = slab_address(page);
         DECLARE_BITMAP(map, s->objects);
         void *p;
 
···
 SLAB_ATTR(remote_node_defrag_ratio);
 #endif
 
+#ifdef CONFIG_SLUB_STATS
+
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+        unsigned long sum = 0;
+        int cpu;
+        int len;
+        int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
+
+        if (!data)
+                return -ENOMEM;
+
+        for_each_online_cpu(cpu) {
+                unsigned x = get_cpu_slab(s, cpu)->stat[si];
+
+                data[cpu] = x;
+                sum += x;
+        }
+
+        len = sprintf(buf, "%lu", sum);
+
+        for_each_online_cpu(cpu) {
+                if (data[cpu] && len < PAGE_SIZE - 20)
+                        len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+        }
+        kfree(data);
+        return len + sprintf(buf + len, "\n");
+}
+
+#define STAT_ATTR(si, text)                                     \
+static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
+{                                                               \
+        return show_stat(s, buf, si);                           \
+}                                                               \
+SLAB_ATTR_RO(text);                                             \
+
+STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
+STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_FASTPATH, free_fastpath);
+STAT_ATTR(FREE_SLOWPATH, free_slowpath);
+STAT_ATTR(FREE_FROZEN, free_frozen);
+STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
+STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
+STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
+STAT_ATTR(ALLOC_SLAB, alloc_slab);
+STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(FREE_SLAB, free_slab);
+STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
+STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
+STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
+STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
+STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
+STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+
+#endif
+
 static struct attribute *slab_attrs[] = {
         &slab_size_attr.attr,
         &object_size_attr.attr,
···
 #endif
 #ifdef CONFIG_NUMA
         &remote_node_defrag_ratio_attr.attr,
+#endif
+#ifdef CONFIG_SLUB_STATS
+        &alloc_fastpath_attr.attr,
+        &alloc_slowpath_attr.attr,
+        &free_fastpath_attr.attr,
+        &free_slowpath_attr.attr,
+        &free_frozen_attr.attr,
+        &free_add_partial_attr.attr,
+        &free_remove_partial_attr.attr,
+        &alloc_from_partial_attr.attr,
+        &alloc_slab_attr.attr,
+        &alloc_refill_attr.attr,
+        &free_slab_attr.attr,
+        &cpuslab_flush_attr.attr,
+        &deactivate_full_attr.attr,
+        &deactivate_empty_attr.attr,
+        &deactivate_to_head_attr.attr,
+        &deactivate_to_tail_attr.attr,
+        &deactivate_remote_frees_attr.attr,
 #endif
         NULL
 };
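The cmpxchg_local fast paths replace the local_irq_save()/restore() pair with a retry loop: read c->freelist, take the next pointer from the object itself, and publish it with a single cmpxchg_local, falling back to the slow path when the freelist shows the end marker or the node does not match. A rough user-space model of the allocation-side pop loop (struct cpu_freelist and fastpath_pop are invented for illustration; the GCC __sync builtin used here is a full SMP compare-and-swap, i.e. stronger than cmpxchg_local, which only has to be atomic against interrupts on the local CPU):

#include <stdio.h>

/* Toy per-cpu freelist: the first word of each free object stores the next
 * pointer, much like SLUB's in-object free pointer (offset 0 for simplicity). */
struct cpu_freelist {
        void **freelist;        /* NULL stands in for the tagged end marker */
};

/* Pop one object with a compare-and-swap retry loop, modelled on slab_alloc(). */
static void *fastpath_pop(struct cpu_freelist *c)
{
        void **object;

        do {
                object = c->freelist;
                if (!object)
                        return NULL;    /* the real code falls back to __slab_alloc() */
        } while (__sync_val_compare_and_swap(&c->freelist,
                        object, (void **)object[0]) != object);
        return object;
}

int main(void)
{
        /* Build a three-object freelist: objs[0] -> objs[1] -> objs[2] -> NULL. */
        void *objs[3][4] = { { objs[1] }, { objs[2] }, { NULL } };
        struct cpu_freelist c = { (void **)objs[0] };

        while (fastpath_pop(&c))
                printf("popped one object\n");
        return 0;
}

The free-side fast path in slab_free() uses the same loop shape; the barrier() and the page != c->page check there guard against an interrupt switching the cpu slab between the two reads, as the comment in the diff explains.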