// SPDX-License-Identifier: GPL-2.0
/*
 * SLOB Allocator: Simple List Of Blocks
 *
 * Matt Mackall <mpm@selenic.com> 12/30/03
 *
 * NUMA support by Paul Mundt, 2007.
 *
 * How SLOB works:
 *
 * The core of SLOB is a traditional K&R style heap allocator, with
 * support for returning aligned objects. The granularity of this
 * allocator is as little as 2 bytes, however typically most architectures
 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
 *
 * The slob heap is a set of linked list of pages from alloc_pages(),
 * and within each page, there is a singly-linked list of free blocks
 * (slob_t). The heap is grown on demand. To reduce fragmentation,
 * heap pages are segregated into three lists, with objects less than
 * 256 bytes, objects less than 1024 bytes, and all other objects.
 *
 * Allocation from heap involves first searching for a page with
 * sufficient free blocks (using a next-fit-like approach) followed by
 * a first-fit scan of the page. Deallocation inserts objects back
 * into the free list in address order, so this is effectively an
 * address-ordered first fit.
 *
 * Above this is an implementation of kmalloc/kfree. Blocks returned
 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
 * alloc_pages() directly, allocating compound pages so the page order
 * does not have to be separately tracked.
 * These objects are detected in kfree() because PageSlab()
 * is false for them.
 *
 * SLAB is emulated on top of SLOB by simply calling constructors and
 * destructors for every SLAB allocation. Objects are returned with the
 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
 * case the low-level allocator will fragment blocks to create the proper
 * alignment. Again, objects of page-size or greater are allocated by
 * calling alloc_pages(). As SLAB objects know their size, no separate
 * size bookkeeping is necessary and there is essentially no allocation
 * space overhead, and compound pages aren't needed for multi-page
 * allocations.
 *
 * NUMA support in SLOB is fairly simplistic, pushing most of the real
 * logic down to the page allocator, and simply doing the node accounting
 * on the upper levels. In the event that a node id is explicitly
 * provided, __alloc_pages_node() with the specified node id is used
 * instead. The common case (or when the node id isn't explicitly provided)
 * will default to the current node, as per numa_node_id().
 *
 * Node aware pages are still inserted in to the global freelist, and
 * these are scanned for by matching against the node id encoded in the
 * page flags. As a result, block allocations that can be satisfied from
 * the freelist will only be done so on pages residing on the same node,
 * in order to prevent random node placement.
 */

#include <linux/kernel.h>
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kmemleak.h>

#include <trace/events/kmem.h>

#include <linux/atomic.h>

#include "slab.h"
/*
 * slob_block has a field 'units', which indicates size of block if +ve,
 * or offset of next block if -ve (in SLOB_UNITs).
 *
 * Free blocks of size 1 unit simply contain the offset of the next block.
 * Those with larger size contain their size in the first SLOB_UNIT of
 * memory, and the offset of the next free block in the second SLOB_UNIT.
 */
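/*
 * Worked example of the encoding above (illustrative numbers, assuming a
 * 4K PAGE_SIZE so that slobidx_t is s16 and SLOB_UNIT is 2 bytes): a free
 * block of 6 units starting 64 bytes into its page, whose next free block
 * starts at byte 96 of the same page, is stored as s[0].units = 6 and
 * s[1].units = 48 (96 / SLOB_UNIT).  A 1-unit free block in the same spot
 * would instead store s[0].units = -48.  See set_slob(), slob_units() and
 * slob_next() below for the authoritative encoding.
 */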
#if PAGE_SIZE <= (32767 * 2)
typedef s16 slobidx_t;
#else
typedef s32 slobidx_t;
#endif

struct slob_block {
	slobidx_t units;
};
typedef struct slob_block slob_t;

/*
 * All partially free slob pages go on these lists.
 */
#define SLOB_BREAK1 256
#define SLOB_BREAK2 1024
static LIST_HEAD(free_slob_small);
static LIST_HEAD(free_slob_medium);
static LIST_HEAD(free_slob_large);

/*
 * slob_page_free: true for pages on free_slob_pages list.
 */
static inline int slob_page_free(struct page *sp)
{
	return PageSlobFree(sp);
}

static void set_slob_page_free(struct page *sp, struct list_head *list)
{
	list_add(&sp->slab_list, list);
	__SetPageSlobFree(sp);
}

static inline void clear_slob_page_free(struct page *sp)
{
	list_del(&sp->slab_list);
	__ClearPageSlobFree(sp);
}

#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)

/*
 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
 * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
 * the block using call_rcu.
 */
struct slob_rcu {
	struct rcu_head head;
	int size;
};

/*
 * slob_lock protects all slob allocator structures.
 */
static DEFINE_SPINLOCK(slob_lock);

/*
 * Encode the given size and next info into a free slob block s.
 */
static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t offset = next - base;

	if (size > 1) {
		s[0].units = size;
		s[1].units = offset;
	} else
		s[0].units = -offset;
}

/*
 * Return the size of a slob block.
 */
static slobidx_t slob_units(slob_t *s)
{
	if (s->units > 0)
		return s->units;
	return 1;
}

/*
 * Return the next free slob block pointer after this one.
 */
static slob_t *slob_next(slob_t *s)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t next;

	if (s[0].units < 0)
		next = -s[0].units;
	else
		next = s[1].units;
	return base+next;
}

/*
 * Returns true if s is the last free block in its page.
 */
static int slob_last(slob_t *s)
{
	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
}

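/*
 * Note that the last free block on a page has its 'next' encoded as a
 * page-aligned address (one whole page past the page base when the page
 * is freshly set up in slob_alloc() below), so slob_last() simply checks
 * that the decoded next pointer has no offset bits within the page.
 */
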
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
	struct page *page;

#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE)
		page = __alloc_pages_node(node, gfp, order);
	else
#endif
		page = alloc_pages(gfp, order);

	if (!page)
		return NULL;

	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
			    PAGE_SIZE << order);
	return page_address(page);
}

static void slob_free_pages(void *b, int order)
{
	struct page *sp = virt_to_page(b);

	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += 1 << order;

	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
			    -(PAGE_SIZE << order));
	__free_pages(sp, order);
}

/*
 * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
 * @sp: Page to look in.
 * @size: Size of the allocation.
 * @align: Allocation alignment.
 * @align_offset: Offset in the allocated block that will be aligned.
 * @page_removed_from_list: Return parameter.
 *
 * Tries to find a chunk of memory at least @size bytes big within @page.
 *
 * Return: Pointer to memory if allocated, %NULL otherwise.  If the
 *         allocation fills up @page then the page is removed from the
 *         freelist, in this case @page_removed_from_list will be set to
 *         true (set to false otherwise).
 */
static void *slob_page_alloc(struct page *sp, size_t size, int align,
			     int align_offset, bool *page_removed_from_list)
{
	slob_t *prev, *cur, *aligned = NULL;
	int delta = 0, units = SLOB_UNITS(size);

	*page_removed_from_list = false;
	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
		slobidx_t avail = slob_units(cur);

		/*
		 * 'aligned' will hold the address of the slob block so that the
		 * address 'aligned'+'align_offset' is aligned according to the
		 * 'align' parameter. This is for kmalloc() which prepends the
		 * allocated block with its size, so that the block itself is
		 * aligned when needed.
		 */
		if (align) {
			aligned = (slob_t *)
				(ALIGN((unsigned long)cur + align_offset, align)
				 - align_offset);
			delta = aligned - cur;
		}
		if (avail >= units + delta) { /* room enough? */
			slob_t *next;

			if (delta) { /* need to fragment head to align? */
				next = slob_next(cur);
				set_slob(aligned, avail - delta, next);
				set_slob(cur, delta, aligned);
				prev = cur;
				cur = aligned;
				avail = slob_units(cur);
			}

			next = slob_next(cur);
			if (avail == units) { /* exact fit? unlink. */
				if (prev)
					set_slob(prev, slob_units(prev), next);
				else
					sp->freelist = next;
			} else { /* fragment */
				if (prev)
					set_slob(prev, slob_units(prev), cur + units);
				else
					sp->freelist = cur + units;
				set_slob(cur + units, avail - units, next);
			}

			sp->units -= units;
			if (!sp->units) {
				clear_slob_page_free(sp);
				*page_removed_from_list = true;
			}
			return cur;
		}
		if (slob_last(cur))
			return NULL;
	}
}

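/*
 * Alignment example for the head fragmenting above (illustrative numbers,
 * assuming 2-byte SLOB_UNITs and an 8-byte kmalloc() minimum alignment):
 * with align == 8 and align_offset == 8, a candidate block starting at
 * byte 10 of its page yields aligned == byte 16 (ALIGN(10 + 8, 8) - 8)
 * and delta == 3 units; those 3 units are split off as a small free block
 * and the allocation proceeds at 'aligned'.
 */
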
/*
 * slob_alloc: entry point into the slob allocator.
 */
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
							int align_offset)
{
	struct page *sp;
	struct list_head *slob_list;
	slob_t *b = NULL;
	unsigned long flags;
	bool _unused;

	if (size < SLOB_BREAK1)
		slob_list = &free_slob_small;
	else if (size < SLOB_BREAK2)
		slob_list = &free_slob_medium;
	else
		slob_list = &free_slob_large;

	spin_lock_irqsave(&slob_lock, flags);
	/* Iterate through each partially free page, try to find room */
	list_for_each_entry(sp, slob_list, slab_list) {
		bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
		/*
		 * If there's a node specification, search for a partial
		 * page with a matching node id in the freelist.
		 */
		if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
			continue;
#endif
		/* Enough room on this page? */
		if (sp->units < SLOB_UNITS(size))
			continue;

		b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
		if (!b)
			continue;

		/*
		 * If slob_page_alloc() removed sp from the list then we
		 * cannot call list functions on sp.  If so allocation
		 * did not fragment the page anyway so optimisation is
		 * unnecessary.
		 */
		if (!page_removed_from_list) {
			/*
			 * Improve fragment distribution and reduce our average
			 * search time by starting our next search here. (see
			 * Knuth vol 1, sec 2.5, pg 449)
			 */
			if (!list_is_first(&sp->slab_list, slob_list))
				list_rotate_to_front(&sp->slab_list, slob_list);
		}
		break;
	}
	spin_unlock_irqrestore(&slob_lock, flags);

	/* Not enough space: must allocate a new page */
	if (!b) {
		b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
		if (!b)
			return NULL;
		sp = virt_to_page(b);
		__SetPageSlab(sp);

		spin_lock_irqsave(&slob_lock, flags);
		sp->units = SLOB_UNITS(PAGE_SIZE);
		sp->freelist = b;
		INIT_LIST_HEAD(&sp->slab_list);
		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
		set_slob_page_free(sp, slob_list);
		b = slob_page_alloc(sp, size, align, align_offset, &_unused);
		BUG_ON(!b);
		spin_unlock_irqrestore(&slob_lock, flags);
	}
	if (unlikely(gfp & __GFP_ZERO))
		memset(b, 0, size);
	return b;
}

/*
 * slob_free: entry point into the slob allocator.
 */
static void slob_free(void *block, int size)
{
	struct page *sp;
	slob_t *prev, *next, *b = (slob_t *)block;
	slobidx_t units;
	unsigned long flags;
	struct list_head *slob_list;

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	BUG_ON(!size);

	sp = virt_to_page(block);
	units = SLOB_UNITS(size);

	spin_lock_irqsave(&slob_lock, flags);

	if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
		/* Go directly to page allocator. Do not pass slob allocator */
		if (slob_page_free(sp))
			clear_slob_page_free(sp);
		spin_unlock_irqrestore(&slob_lock, flags);
		__ClearPageSlab(sp);
		page_mapcount_reset(sp);
		slob_free_pages(b, 0);
		return;
	}

	if (!slob_page_free(sp)) {
		/* This slob page is about to become partially free. Easy! */
		sp->units = units;
		sp->freelist = b;
		set_slob(b, units,
			(void *)((unsigned long)(b +
					SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
		if (size < SLOB_BREAK1)
			slob_list = &free_slob_small;
		else if (size < SLOB_BREAK2)
			slob_list = &free_slob_medium;
		else
			slob_list = &free_slob_large;
		set_slob_page_free(sp, slob_list);
		goto out;
	}

	/*
	 * Otherwise the page is already partially free, so find reinsertion
	 * point.
	 */
	sp->units += units;

	if (b < (slob_t *)sp->freelist) {
		if (b + units == sp->freelist) {
			units += slob_units(sp->freelist);
			sp->freelist = slob_next(sp->freelist);
		}
		set_slob(b, units, sp->freelist);
		sp->freelist = b;
	} else {
		prev = sp->freelist;
		next = slob_next(prev);
		while (b > next) {
			prev = next;
			next = slob_next(prev);
		}

		if (!slob_last(prev) && b + units == next) {
			units += slob_units(next);
			set_slob(b, units, slob_next(next));
		} else
			set_slob(b, units, next);

		if (prev + slob_units(prev) == b) {
			units = slob_units(b) + slob_units(prev);
			set_slob(prev, units, slob_next(b));
		} else
			set_slob(prev, slob_units(prev), b);
	}
out:
	spin_unlock_irqrestore(&slob_lock, flags);
}

#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
	kpp->kp_ptr = object;
	kpp->kp_page = page;
}
#endif

/*
 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
 */

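/*
 * Layout produced by the kmalloc() front end below, e.g. for kmalloc(100)
 * with an 8-byte minalign (illustrative; the real value is the larger of
 * ARCH_KMALLOC_MINALIGN and ARCH_SLAB_MINALIGN): slob_alloc() is asked for
 * 108 bytes, the first 4 of which record the requested size (100), and the
 * caller is handed the address 8 bytes in.  kfree() steps back those 8
 * bytes to recover the size before passing the block to slob_free().
 */
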
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
	unsigned int *m;
	int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
	void *ret;

	gfp &= gfp_allowed_mask;

	might_alloc(gfp);

	if (size < PAGE_SIZE - minalign) {
		int align = minalign;

		/*
		 * For power of two sizes, guarantee natural alignment for
		 * kmalloc()'d objects.
		 */
		if (is_power_of_2(size))
			align = max(minalign, (int) size);

		if (!size)
			return ZERO_SIZE_PTR;

		m = slob_alloc(size + minalign, gfp, align, node, minalign);

		if (!m)
			return NULL;
		*m = size;
		ret = (void *)m + minalign;

		trace_kmalloc_node(caller, ret,
				   size, size + minalign, gfp, node);
	} else {
		unsigned int order = get_order(size);

		if (likely(order))
			gfp |= __GFP_COMP;
		ret = slob_new_pages(gfp, order, node);

		trace_kmalloc_node(caller, ret,
				   size, PAGE_SIZE << order, gfp, node);
	}

	kmemleak_alloc(ret, size, 1, gfp);
	return ret;
}

void *__kmalloc(size_t size, gfp_t gfp)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);

void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);

#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
				  int node, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif

void kfree(const void *block)
{
	struct page *sp;

	trace_kfree(_RET_IP_, block);

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	kmemleak_free(block);

	sp = virt_to_page(block);
	if (PageSlab(sp)) {
		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
		unsigned int *m = (unsigned int *)(block - align);
		slob_free(m, *m + align);
	} else {
		unsigned int order = compound_order(sp);
		mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
				    -(PAGE_SIZE << order));
		__free_pages(sp, order);

	}
}
EXPORT_SYMBOL(kfree);

/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
	struct page *sp;
	int align;
	unsigned int *m;

	BUG_ON(!block);
	if (unlikely(block == ZERO_SIZE_PTR))
		return 0;

	sp = virt_to_page(block);
	if (unlikely(!PageSlab(sp)))
		return page_size(sp);

	align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
	m = (unsigned int *)(block - align);
	return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(__ksize);

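/*
 * For example, kmalloc(13) records 13 in its header, so __ksize() above
 * reports SLOB_UNITS(13) * SLOB_UNIT == 14 when SLOB_UNIT is 2 bytes,
 * i.e. the original request rounded up to whole units.
 */
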
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
	if (flags & SLAB_TYPESAFE_BY_RCU) {
		/* leave room for rcu footer at the end of object */
		c->size += sizeof(struct slob_rcu);
	}
	c->flags = flags;
	return 0;
}

static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
	void *b;

	flags &= gfp_allowed_mask;

	might_alloc(flags);

	if (c->size < PAGE_SIZE) {
		b = slob_alloc(c->size, flags, c->align, node, 0);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    SLOB_UNITS(c->size) * SLOB_UNIT,
					    flags, node);
	} else {
		b = slob_new_pages(flags, get_order(c->size), node);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    PAGE_SIZE << get_order(c->size),
					    flags, node);
	}

	if (b && c->ctor) {
		WARN_ON_ONCE(flags & __GFP_ZERO);
		c->ctor(b);
	}

	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
	return b;
}

void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc);

#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);

void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
{
	return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif

static void __kmem_cache_free(void *b, int size)
{
	if (size < PAGE_SIZE)
		slob_free(b, size);
	else
		slob_free_pages(b, get_order(size));
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
	void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));

	__kmem_cache_free(b, slob_rcu->size);
}

void kmem_cache_free(struct kmem_cache *c, void *b)
{
	kmemleak_free_recursive(b, c->flags);
	if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
		struct slob_rcu *slob_rcu;
		slob_rcu = b + (c->size - sizeof(struct slob_rcu));
		slob_rcu->size = c->size;
		call_rcu(&slob_rcu->head, kmem_rcu_free);
	} else {
		__kmem_cache_free(b, c->size);
	}

	trace_kmem_cache_free(_RET_IP_, b, c->name);
}
EXPORT_SYMBOL(kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
	__kmem_cache_free_bulk(s, size, p);
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
{
	return __kmem_cache_alloc_bulk(s, flags, size, p);
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);

int __kmem_cache_shutdown(struct kmem_cache *c)
{
	/* No way to check for remaining objects */
	return 0;
}

void __kmem_cache_release(struct kmem_cache *c)
{
}

int __kmem_cache_shrink(struct kmem_cache *d)
{
	return 0;
}

struct kmem_cache kmem_cache_boot = {
	.name = "kmem_cache",
	.size = sizeof(struct kmem_cache),
	.flags = SLAB_PANIC,
	.align = ARCH_KMALLOC_MINALIGN,
};

void __init kmem_cache_init(void)
{
	kmem_cache = &kmem_cache_boot;
	slab_state = UP;
}

void __init kmem_cache_init_late(void)
{
	slab_state = FULL;
}
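
/*
 * Example of the cache front end implemented above (illustrative only;
 * "foo" and struct foo are made-up names): a cache created with
 * kmem_cache_create("foo", sizeof(struct foo), 0, SLAB_TYPESAFE_BY_RCU, NULL)
 * has sizeof(struct slob_rcu) added to its size in __kmem_cache_create(),
 * kmem_cache_alloc() then hands out c->size bytes from slob_alloc() (or
 * whole pages once c->size reaches PAGE_SIZE), and kmem_cache_free()
 * defers the underlying slob_free() to an RCU grace period via call_rcu().
 */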