Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux

* 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux:
tools, slub: Fix off-by-one buffer corruption after readlink() call
slub: Discard slab page when node partial > minimum partial number
slub: correct comments error for per cpu partial
mm: restrict access to slab files under procfs and sysfs
slub: Code optimization in get_partial_node()
slub: doc: update the slabinfo.c file path
slub: explicitly document position of inserting slab to partial list
slub: update slabinfo tools to report per cpu partial list statistics
slub: per cpu cache for partial pages
slub: return object pointer from get_partial() / new_slab().
slub: pass kmem_cache_cpu pointer to get_partial()
slub: Prepare inuse field in new_slab()
slub: Remove useless statements in __slab_alloc
slub: free slabs without holding locks
slub: use print_hex_dump
slab: use print_hex_dump

+425 -182
-2
Documentation/vm/00-INDEX
··· 30 30 - description of page migration in NUMA systems. 31 31 pagemap.txt 32 32 - pagemap, from the userspace perspective 33 - slabinfo.c 34 - - source code for a tool to get reports about slabs. 35 33 slub.txt 36 34 - a short users guide for SLUB. 37 35 unevictable-lru.txt
+13 -1
include/linux/mm_types.h
··· 79 79 }; 80 80 81 81 /* Third double word block */ 82 - struct list_head lru; /* Pageout list, eg. active_list 82 + union { 83 + struct list_head lru; /* Pageout list, eg. active_list 83 84 * protected by zone->lru_lock ! 84 85 */ 86 + struct { /* slub per cpu partial pages */ 87 + struct page *next; /* Next partial slab */ 88 + #ifdef CONFIG_64BIT 89 + int pages; /* Nr of partial slabs left */ 90 + int pobjects; /* Approximate # of objects */ 91 + #else 92 + short int pages; 93 + short int pobjects; 94 + #endif 95 + }; 96 + }; 85 97 86 98 /* Remainder is not double word aligned */ 87 99 union {
+4
include/linux/slub_def.h
··· 36 36 ORDER_FALLBACK, /* Number of times fallback was necessary */ 37 37 CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ 38 38 CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ 39 + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ 40 + CPU_PARTIAL_FREE, /* Used cpu partial on free */ 39 41 NR_SLUB_STAT_ITEMS }; 40 42 41 43 struct kmem_cache_cpu { 42 44 void **freelist; /* Pointer to next available object */ 43 45 unsigned long tid; /* Globally unique transaction id */ 44 46 struct page *page; /* The slab from which we are allocating */ 47 + struct page *partial; /* Partially allocated frozen slabs */ 45 48 int node; /* The node of the page (or -1 for debug) */ 46 49 #ifdef CONFIG_SLUB_STATS 47 50 unsigned stat[NR_SLUB_STAT_ITEMS]; ··· 82 79 int size; /* The size of an object including meta data */ 83 80 int objsize; /* The size of an object without meta data */ 84 81 int offset; /* Free pointer offset. */ 82 + int cpu_partial; /* Number of per cpu partial objects to keep around */ 85 83 struct kmem_cache_order_objects oo; 86 84 87 85 /* Allocation and freeing of slabs */
+7 -12
mm/slab.c
··· 1851 1851 unsigned char error = 0; 1852 1852 int bad_count = 0; 1853 1853 1854 - printk(KERN_ERR "%03x:", offset); 1854 + printk(KERN_ERR "%03x: ", offset); 1855 1855 for (i = 0; i < limit; i++) { 1856 1856 if (data[offset + i] != POISON_FREE) { 1857 1857 error = data[offset + i]; 1858 1858 bad_count++; 1859 1859 } 1860 - printk(" %02x", (unsigned char)data[offset + i]); 1861 1860 } 1862 - printk("\n"); 1861 + print_hex_dump(KERN_CONT, "", 0, 16, 1, 1862 + &data[offset], limit, 1); 1863 1863 1864 1864 if (bad_count == 1) { 1865 1865 error ^= POISON_FREE; ··· 3039 3039 printk(KERN_ERR "slab: Internal list corruption detected in " 3040 3040 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 3041 3041 cachep->name, cachep->num, slabp, slabp->inuse); 3042 - for (i = 0; 3043 - i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 3044 - i++) { 3045 - if (i % 16 == 0) 3046 - printk("\n%03x:", i); 3047 - printk(" %02x", ((unsigned char *)slabp)[i]); 3048 - } 3049 - printk("\n"); 3042 + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, 3043 + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), 3044 + 1); 3050 3045 BUG(); 3051 3046 } 3052 3047 } ··· 4579 4584 4580 4585 static int __init slab_proc_init(void) 4581 4586 { 4582 - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4587 + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); 4583 4588 #ifdef CONFIG_DEBUG_SLAB_LEAK 4584 4589 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4585 4590 #endif
+392 -166
mm/slub.c
··· 467 467 */ 468 468 static void print_section(char *text, u8 *addr, unsigned int length) 469 469 { 470 - int i, offset; 471 - int newline = 1; 472 - char ascii[17]; 473 - 474 - ascii[16] = 0; 475 - 476 - for (i = 0; i < length; i++) { 477 - if (newline) { 478 - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 479 - newline = 0; 480 - } 481 - printk(KERN_CONT " %02x", addr[i]); 482 - offset = i % 16; 483 - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 484 - if (offset == 15) { 485 - printk(KERN_CONT " %s\n", ascii); 486 - newline = 1; 487 - } 488 - } 489 - if (!newline) { 490 - i %= 16; 491 - while (i < 16) { 492 - printk(KERN_CONT " "); 493 - ascii[i] = ' '; 494 - i++; 495 - } 496 - printk(KERN_CONT " %s\n", ascii); 497 - } 470 + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 471 + length, 1); 498 472 } 499 473 500 474 static struct track *get_track(struct kmem_cache *s, void *object, ··· 599 625 p, p - addr, get_freepointer(s, p)); 600 626 601 627 if (p > addr + 16) 602 - print_section("Bytes b4", p - 16, 16); 628 + print_section("Bytes b4 ", p - 16, 16); 603 629 604 - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); 605 - 630 + print_section("Object ", p, min_t(unsigned long, s->objsize, 631 + PAGE_SIZE)); 606 632 if (s->flags & SLAB_RED_ZONE) 607 - print_section("Redzone", p + s->objsize, 633 + print_section("Redzone ", p + s->objsize, 608 634 s->inuse - s->objsize); 609 635 610 636 if (s->offset) ··· 617 643 618 644 if (off != s->size) 619 645 /* Beginning of the filler is the free pointer */ 620 - print_section("Padding", p + off, s->size - off); 646 + print_section("Padding ", p + off, s->size - off); 621 647 622 648 dump_stack(); 623 649 } ··· 812 838 end--; 813 839 814 840 slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); 815 - print_section("Padding", end - remainder, remainder); 841 + print_section("Padding ", end - remainder, remainder); 816 842 817 843 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 818 844 return 0; ··· 961 987 page->freelist); 962 988 963 989 if (!alloc) 964 - print_section("Object", (void *)object, s->objsize); 990 + print_section("Object ", (void *)object, s->objsize); 965 991 966 992 dump_stack(); 967 993 } ··· 1421 1447 set_freepointer(s, last, NULL); 1422 1448 1423 1449 page->freelist = start; 1424 - page->inuse = 0; 1450 + page->inuse = page->objects; 1425 1451 page->frozen = 1; 1426 1452 out: 1427 1453 return page; ··· 1508 1534 struct page *page, int tail) 1509 1535 { 1510 1536 n->nr_partial++; 1511 - if (tail) 1537 + if (tail == DEACTIVATE_TO_TAIL) 1512 1538 list_add_tail(&page->lru, &n->partial); 1513 1539 else 1514 1540 list_add(&page->lru, &n->partial); ··· 1528 1554 * Lock slab, remove from the partial list and put the object into the 1529 1555 * per cpu freelist. 1530 1556 * 1557 + * Returns a list of objects or NULL if it fails. 1558 + * 1531 1559 * Must hold list_lock. 
1532 1560 */ 1533 - static inline int acquire_slab(struct kmem_cache *s, 1534 - struct kmem_cache_node *n, struct page *page) 1561 + static inline void *acquire_slab(struct kmem_cache *s, 1562 + struct kmem_cache_node *n, struct page *page, 1563 + int mode) 1535 1564 { 1536 1565 void *freelist; 1537 1566 unsigned long counters; ··· 1549 1572 freelist = page->freelist; 1550 1573 counters = page->counters; 1551 1574 new.counters = counters; 1552 - new.inuse = page->objects; 1575 + if (mode) 1576 + new.inuse = page->objects; 1553 1577 1554 1578 VM_BUG_ON(new.frozen); 1555 1579 new.frozen = 1; ··· 1561 1583 "lock and freeze")); 1562 1584 1563 1585 remove_partial(n, page); 1564 - 1565 - if (freelist) { 1566 - /* Populate the per cpu freelist */ 1567 - this_cpu_write(s->cpu_slab->freelist, freelist); 1568 - this_cpu_write(s->cpu_slab->page, page); 1569 - this_cpu_write(s->cpu_slab->node, page_to_nid(page)); 1570 - return 1; 1571 - } else { 1572 - /* 1573 - * Slab page came from the wrong list. No object to allocate 1574 - * from. Put it onto the correct list and continue partial 1575 - * scan. 1576 - */ 1577 - printk(KERN_ERR "SLUB: %s : Page without available objects on" 1578 - " partial list\n", s->name); 1579 - return 0; 1580 - } 1586 + return freelist; 1581 1587 } 1588 + 1589 + static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1582 1590 1583 1591 /* 1584 1592 * Try to allocate a partial slab from a specific node. 1585 1593 */ 1586 - static struct page *get_partial_node(struct kmem_cache *s, 1587 - struct kmem_cache_node *n) 1594 + static void *get_partial_node(struct kmem_cache *s, 1595 + struct kmem_cache_node *n, struct kmem_cache_cpu *c) 1588 1596 { 1589 - struct page *page; 1597 + struct page *page, *page2; 1598 + void *object = NULL; 1590 1599 1591 1600 /* 1592 1601 * Racy check. 
If we mistakenly see no partial slabs then we ··· 1585 1620 return NULL; 1586 1621 1587 1622 spin_lock(&n->list_lock); 1588 - list_for_each_entry(page, &n->partial, lru) 1589 - if (acquire_slab(s, n, page)) 1590 - goto out; 1591 - page = NULL; 1592 - out: 1623 + list_for_each_entry_safe(page, page2, &n->partial, lru) { 1624 + void *t = acquire_slab(s, n, page, object == NULL); 1625 + int available; 1626 + 1627 + if (!t) 1628 + break; 1629 + 1630 + if (!object) { 1631 + c->page = page; 1632 + c->node = page_to_nid(page); 1633 + stat(s, ALLOC_FROM_PARTIAL); 1634 + object = t; 1635 + available = page->objects - page->inuse; 1636 + } else { 1637 + page->freelist = t; 1638 + available = put_cpu_partial(s, page, 0); 1639 + } 1640 + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1641 + break; 1642 + 1643 + } 1593 1644 spin_unlock(&n->list_lock); 1594 - return page; 1645 + return object; 1595 1646 } 1596 1647 1597 1648 /* 1598 1649 * Get a page from somewhere. Search in increasing NUMA distances. 1599 1650 */ 1600 - static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1651 + static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, 1652 + struct kmem_cache_cpu *c) 1601 1653 { 1602 1654 #ifdef CONFIG_NUMA 1603 1655 struct zonelist *zonelist; 1604 1656 struct zoneref *z; 1605 1657 struct zone *zone; 1606 1658 enum zone_type high_zoneidx = gfp_zone(flags); 1607 - struct page *page; 1659 + void *object; 1608 1660 1609 1661 /* 1610 1662 * The defrag ratio allows a configuration of the tradeoffs between ··· 1654 1672 1655 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1656 1674 n->nr_partial > s->min_partial) { 1657 - page = get_partial_node(s, n); 1658 - if (page) { 1675 + object = get_partial_node(s, n, c); 1676 + if (object) { 1659 1677 put_mems_allowed(); 1660 - return page; 1678 + return object; 1661 1679 } 1662 1680 } 1663 1681 } ··· 1669 1687 /* 1670 1688 * Get a partial page, lock it and return it. 
1671 1689 */ 1672 - static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1690 + static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, 1691 + struct kmem_cache_cpu *c) 1673 1692 { 1674 - struct page *page; 1693 + void *object; 1675 1694 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1676 1695 1677 - page = get_partial_node(s, get_node(s, searchnode)); 1678 - if (page || node != NUMA_NO_NODE) 1679 - return page; 1696 + object = get_partial_node(s, get_node(s, searchnode), c); 1697 + if (object || node != NUMA_NO_NODE) 1698 + return object; 1680 1699 1681 - return get_any_partial(s, flags); 1700 + return get_any_partial(s, flags, c); 1682 1701 } 1683 1702 1684 1703 #ifdef CONFIG_PREEMPT ··· 1748 1765 for_each_possible_cpu(cpu) 1749 1766 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1750 1767 } 1751 - /* 1752 - * Remove the cpu slab 1753 - */ 1754 1768 1755 1769 /* 1756 1770 * Remove the cpu slab ··· 1761 1781 enum slab_modes l = M_NONE, m = M_NONE; 1762 1782 void *freelist; 1763 1783 void *nextfree; 1764 - int tail = 0; 1784 + int tail = DEACTIVATE_TO_HEAD; 1765 1785 struct page new; 1766 1786 struct page old; 1767 1787 1768 1788 if (page->freelist) { 1769 1789 stat(s, DEACTIVATE_REMOTE_FREES); 1770 - tail = 1; 1790 + tail = DEACTIVATE_TO_TAIL; 1771 1791 } 1772 1792 1773 1793 c->tid = next_tid(c->tid); ··· 1873 1893 if (m == M_PARTIAL) { 1874 1894 1875 1895 add_partial(n, page, tail); 1876 - stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1896 + stat(s, tail); 1877 1897 1878 1898 } else if (m == M_FULL) { 1879 1899 ··· 1900 1920 } 1901 1921 } 1902 1922 1923 + /* Unfreeze all the cpu partial slabs */ 1924 + static void unfreeze_partials(struct kmem_cache *s) 1925 + { 1926 + struct kmem_cache_node *n = NULL; 1927 + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1928 + struct page *page; 1929 + 1930 + while ((page = c->partial)) { 1931 + enum slab_modes { M_PARTIAL, M_FREE }; 1932 + enum slab_modes l, m; 1933 + struct page new; 1934 + struct page old; 1935 + 1936 + c->partial = page->next; 1937 + l = M_FREE; 1938 + 1939 + do { 1940 + 1941 + old.freelist = page->freelist; 1942 + old.counters = page->counters; 1943 + VM_BUG_ON(!old.frozen); 1944 + 1945 + new.counters = old.counters; 1946 + new.freelist = old.freelist; 1947 + 1948 + new.frozen = 0; 1949 + 1950 + if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1951 + m = M_FREE; 1952 + else { 1953 + struct kmem_cache_node *n2 = get_node(s, 1954 + page_to_nid(page)); 1955 + 1956 + m = M_PARTIAL; 1957 + if (n != n2) { 1958 + if (n) 1959 + spin_unlock(&n->list_lock); 1960 + 1961 + n = n2; 1962 + spin_lock(&n->list_lock); 1963 + } 1964 + } 1965 + 1966 + if (l != m) { 1967 + if (l == M_PARTIAL) 1968 + remove_partial(n, page); 1969 + else 1970 + add_partial(n, page, 1); 1971 + 1972 + l = m; 1973 + } 1974 + 1975 + } while (!cmpxchg_double_slab(s, page, 1976 + old.freelist, old.counters, 1977 + new.freelist, new.counters, 1978 + "unfreezing slab")); 1979 + 1980 + if (m == M_FREE) { 1981 + stat(s, DEACTIVATE_EMPTY); 1982 + discard_slab(s, page); 1983 + stat(s, FREE_SLAB); 1984 + } 1985 + } 1986 + 1987 + if (n) 1988 + spin_unlock(&n->list_lock); 1989 + } 1990 + 1991 + /* 1992 + * Put a page that was just frozen (in __slab_free) into a partial page 1993 + * slot if available. This is done without interrupts disabled and without 1994 + * preemption disabled. 
The cmpxchg is racy and may put the partial page 1995 + * onto a random cpus partial slot. 1996 + * 1997 + * If we did not find a slot then simply move all the partials to the 1998 + * per node partial list. 1999 + */ 2000 + int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 2001 + { 2002 + struct page *oldpage; 2003 + int pages; 2004 + int pobjects; 2005 + 2006 + do { 2007 + pages = 0; 2008 + pobjects = 0; 2009 + oldpage = this_cpu_read(s->cpu_slab->partial); 2010 + 2011 + if (oldpage) { 2012 + pobjects = oldpage->pobjects; 2013 + pages = oldpage->pages; 2014 + if (drain && pobjects > s->cpu_partial) { 2015 + unsigned long flags; 2016 + /* 2017 + * partial array is full. Move the existing 2018 + * set to the per node partial list. 2019 + */ 2020 + local_irq_save(flags); 2021 + unfreeze_partials(s); 2022 + local_irq_restore(flags); 2023 + pobjects = 0; 2024 + pages = 0; 2025 + } 2026 + } 2027 + 2028 + pages++; 2029 + pobjects += page->objects - page->inuse; 2030 + 2031 + page->pages = pages; 2032 + page->pobjects = pobjects; 2033 + page->next = oldpage; 2034 + 2035 + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 2036 + stat(s, CPU_PARTIAL_FREE); 2037 + return pobjects; 2038 + } 2039 + 1903 2040 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1904 2041 { 1905 2042 stat(s, CPUSLAB_FLUSH); ··· 2032 1935 { 2033 1936 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2034 1937 2035 - if (likely(c && c->page)) 2036 - flush_slab(s, c); 1938 + if (likely(c)) { 1939 + if (c->page) 1940 + flush_slab(s, c); 1941 + 1942 + unfreeze_partials(s); 1943 + } 2037 1944 } 2038 1945 2039 1946 static void flush_cpu_slab(void *d) ··· 2128 2027 } 2129 2028 } 2130 2029 2030 + static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2031 + int node, struct kmem_cache_cpu **pc) 2032 + { 2033 + void *object; 2034 + struct kmem_cache_cpu *c; 2035 + struct page *page = new_slab(s, flags, 
node); 2036 + 2037 + if (page) { 2038 + c = __this_cpu_ptr(s->cpu_slab); 2039 + if (c->page) 2040 + flush_slab(s, c); 2041 + 2042 + /* 2043 + * No other reference to the page yet so we can 2044 + * muck around with it freely without cmpxchg 2045 + */ 2046 + object = page->freelist; 2047 + page->freelist = NULL; 2048 + 2049 + stat(s, ALLOC_SLAB); 2050 + c->node = page_to_nid(page); 2051 + c->page = page; 2052 + *pc = c; 2053 + } else 2054 + object = NULL; 2055 + 2056 + return object; 2057 + } 2058 + 2131 2059 /* 2132 2060 * Slow path. The lockless freelist is empty or we need to perform 2133 2061 * debugging duties. 2134 - * 2135 - * Interrupts are disabled. 2136 2062 * 2137 2063 * Processing is still very fast if new objects have been freed to the 2138 2064 * regular freelist. In that case we simply take over the regular freelist ··· 2177 2049 unsigned long addr, struct kmem_cache_cpu *c) 2178 2050 { 2179 2051 void **object; 2180 - struct page *page; 2181 2052 unsigned long flags; 2182 2053 struct page new; 2183 2054 unsigned long counters; ··· 2191 2064 c = this_cpu_ptr(s->cpu_slab); 2192 2065 #endif 2193 2066 2194 - /* We handle __GFP_ZERO in the caller */ 2195 - gfpflags &= ~__GFP_ZERO; 2196 - 2197 - page = c->page; 2198 - if (!page) 2067 + if (!c->page) 2199 2068 goto new_slab; 2200 - 2069 + redo: 2201 2070 if (unlikely(!node_match(c, node))) { 2202 2071 stat(s, ALLOC_NODE_MISMATCH); 2203 2072 deactivate_slab(s, c); ··· 2203 2080 stat(s, ALLOC_SLOWPATH); 2204 2081 2205 2082 do { 2206 - object = page->freelist; 2207 - counters = page->counters; 2083 + object = c->page->freelist; 2084 + counters = c->page->counters; 2208 2085 new.counters = counters; 2209 2086 VM_BUG_ON(!new.frozen); 2210 2087 ··· 2216 2093 * 2217 2094 * If there are objects left then we retrieve them 2218 2095 * and use them to refill the per cpu queue. 
2219 - */ 2096 + */ 2220 2097 2221 - new.inuse = page->objects; 2098 + new.inuse = c->page->objects; 2222 2099 new.frozen = object != NULL; 2223 2100 2224 - } while (!__cmpxchg_double_slab(s, page, 2101 + } while (!__cmpxchg_double_slab(s, c->page, 2225 2102 object, counters, 2226 2103 NULL, new.counters, 2227 2104 "__slab_alloc")); 2228 2105 2229 - if (unlikely(!object)) { 2106 + if (!object) { 2230 2107 c->page = NULL; 2231 2108 stat(s, DEACTIVATE_BYPASS); 2232 2109 goto new_slab; ··· 2235 2112 stat(s, ALLOC_REFILL); 2236 2113 2237 2114 load_freelist: 2238 - VM_BUG_ON(!page->frozen); 2239 2115 c->freelist = get_freepointer(s, object); 2240 2116 c->tid = next_tid(c->tid); 2241 2117 local_irq_restore(flags); 2242 2118 return object; 2243 2119 2244 2120 new_slab: 2245 - page = get_partial(s, gfpflags, node); 2246 - if (page) { 2247 - stat(s, ALLOC_FROM_PARTIAL); 2248 - object = c->freelist; 2249 2121 2250 - if (kmem_cache_debug(s)) 2251 - goto debug; 2252 - goto load_freelist; 2122 + if (c->partial) { 2123 + c->page = c->partial; 2124 + c->partial = c->page->next; 2125 + c->node = page_to_nid(c->page); 2126 + stat(s, CPU_PARTIAL_ALLOC); 2127 + c->freelist = NULL; 2128 + goto redo; 2253 2129 } 2254 2130 2255 - page = new_slab(s, gfpflags, node); 2131 + /* Then do expensive stuff like retrieving pages from the partial lists */ 2132 + object = get_partial(s, gfpflags, node, c); 2256 2133 2257 - if (page) { 2258 - c = __this_cpu_ptr(s->cpu_slab); 2259 - if (c->page) 2260 - flush_slab(s, c); 2134 + if (unlikely(!object)) { 2261 2135 2262 - /* 2263 - * No other reference to the page yet so we can 2264 - * muck around with it freely without cmpxchg 2265 - */ 2266 - object = page->freelist; 2267 - page->freelist = NULL; 2268 - page->inuse = page->objects; 2136 + object = new_slab_objects(s, gfpflags, node, &c); 2269 2137 2270 - stat(s, ALLOC_SLAB); 2271 - c->node = page_to_nid(page); 2272 - c->page = page; 2138 + if (unlikely(!object)) { 2139 + if (!(gfpflags & 
__GFP_NOWARN) && printk_ratelimit()) 2140 + slab_out_of_memory(s, gfpflags, node); 2273 2141 2274 - if (kmem_cache_debug(s)) 2275 - goto debug; 2276 - goto load_freelist; 2142 + local_irq_restore(flags); 2143 + return NULL; 2144 + } 2277 2145 } 2278 - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2279 - slab_out_of_memory(s, gfpflags, node); 2280 - local_irq_restore(flags); 2281 - return NULL; 2282 2146 2283 - debug: 2284 - if (!object || !alloc_debug_processing(s, page, object, addr)) 2285 - goto new_slab; 2147 + if (likely(!kmem_cache_debug(s))) 2148 + goto load_freelist; 2149 + 2150 + /* Only entered in the debug case */ 2151 + if (!alloc_debug_processing(s, c->page, object, addr)) 2152 + goto new_slab; /* Slab failed checks. Next slab needed */ 2286 2153 2287 2154 c->freelist = get_freepointer(s, object); 2288 2155 deactivate_slab(s, c); 2289 - c->page = NULL; 2290 2156 c->node = NUMA_NO_NODE; 2291 2157 local_irq_restore(flags); 2292 2158 return object; ··· 2445 2333 was_frozen = new.frozen; 2446 2334 new.inuse--; 2447 2335 if ((!new.inuse || !prior) && !was_frozen && !n) { 2448 - n = get_node(s, page_to_nid(page)); 2449 - /* 2450 - * Speculatively acquire the list_lock. 2451 - * If the cmpxchg does not succeed then we may 2452 - * drop the list_lock without any processing. 2453 - * 2454 - * Otherwise the list_lock will synchronize with 2455 - * other processors updating the list of slabs. 2456 - */ 2457 - spin_lock_irqsave(&n->list_lock, flags); 2336 + 2337 + if (!kmem_cache_debug(s) && !prior) 2338 + 2339 + /* 2340 + * Slab was on no list before and will be partially empty 2341 + * We can defer the list move and instead freeze it. 2342 + */ 2343 + new.frozen = 1; 2344 + 2345 + else { /* Needs to be taken off a list */ 2346 + 2347 + n = get_node(s, page_to_nid(page)); 2348 + /* 2349 + * Speculatively acquire the list_lock. 2350 + * If the cmpxchg does not succeed then we may 2351 + * drop the list_lock without any processing. 
2352 + * 2353 + * Otherwise the list_lock will synchronize with 2354 + * other processors updating the list of slabs. 2355 + */ 2356 + spin_lock_irqsave(&n->list_lock, flags); 2357 + 2358 + } 2458 2359 } 2459 2360 inuse = new.inuse; 2460 2361 ··· 2477 2352 "__slab_free")); 2478 2353 2479 2354 if (likely(!n)) { 2480 - /* 2355 + 2356 + /* 2357 + * If we just froze the page then put it onto the 2358 + * per cpu partial list. 2359 + */ 2360 + if (new.frozen && !was_frozen) 2361 + put_cpu_partial(s, page, 1); 2362 + 2363 + /* 2481 2364 * The list lock was not taken therefore no list 2482 2365 * activity can be necessary. 2483 2366 */ ··· 2510 2377 */ 2511 2378 if (unlikely(!prior)) { 2512 2379 remove_full(s, page); 2513 - add_partial(n, page, 1); 2380 + add_partial(n, page, DEACTIVATE_TO_TAIL); 2514 2381 stat(s, FREE_ADD_PARTIAL); 2515 2382 } 2516 2383 } ··· 2554 2421 slab_free_hook(s, x); 2555 2422 2556 2423 redo: 2557 - 2558 2424 /* 2559 2425 * Determine the currently cpus per cpu slab. 2560 2426 * The cpu may change afterward. However that does not matter since ··· 2817 2685 n = page->freelist; 2818 2686 BUG_ON(!n); 2819 2687 page->freelist = get_freepointer(kmem_cache_node, n); 2820 - page->inuse++; 2688 + page->inuse = 1; 2821 2689 page->frozen = 0; 2822 2690 kmem_cache_node->node[node] = n; 2823 2691 #ifdef CONFIG_SLUB_DEBUG ··· 2827 2695 init_kmem_cache_node(n, kmem_cache_node); 2828 2696 inc_slabs_node(kmem_cache_node, node, page->objects); 2829 2697 2830 - add_partial(n, page, 0); 2698 + add_partial(n, page, DEACTIVATE_TO_HEAD); 2831 2699 } 2832 2700 2833 2701 static void free_kmem_cache_nodes(struct kmem_cache *s) ··· 3043 2911 * The larger the object size is, the more pages we want on the partial 3044 2912 * list to avoid pounding the page allocator excessively. 
3045 2913 */ 3046 - set_min_partial(s, ilog2(s->size)); 2914 + set_min_partial(s, ilog2(s->size) / 2); 2915 + 2916 + /* 2917 + * cpu_partial determined the maximum number of objects kept in the 2918 + * per cpu partial lists of a processor. 2919 + * 2920 + * Per cpu partial lists mainly contain slabs that just have one 2921 + * object freed. If they are used for allocation then they can be 2922 + * filled up again with minimal effort. The slab will never hit the 2923 + * per node partial lists and therefore no locking will be required. 2924 + * 2925 + * This setting also determines 2926 + * 2927 + * A) The number of objects from per cpu partial slabs dumped to the 2928 + * per node list when we reach the limit. 2929 + * B) The number of objects in cpu partial slabs to extract from the 2930 + * per node list when we run out of per cpu objects. We only fetch 50% 2931 + * to keep some capacity around for frees. 2932 + */ 2933 + if (s->size >= PAGE_SIZE) 2934 + s->cpu_partial = 2; 2935 + else if (s->size >= 1024) 2936 + s->cpu_partial = 6; 2937 + else if (s->size >= 256) 2938 + s->cpu_partial = 13; 2939 + else 2940 + s->cpu_partial = 30; 2941 + 3047 2942 s->refcount = 1; 3048 2943 #ifdef CONFIG_NUMA 3049 2944 s->remote_node_defrag_ratio = 1000; ··· 3129 2970 3130 2971 /* 3131 2972 * Attempt to free all partial slabs on a node. 2973 + * This is called from kmem_cache_close(). We must be the last thread 2974 + * using the cache and therefore we do not need to lock anymore. 
3132 2975 */ 3133 2976 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3134 2977 { 3135 - unsigned long flags; 3136 2978 struct page *page, *h; 3137 2979 3138 - spin_lock_irqsave(&n->list_lock, flags); 3139 2980 list_for_each_entry_safe(page, h, &n->partial, lru) { 3140 2981 if (!page->inuse) { 3141 2982 remove_partial(n, page); ··· 3145 2986 "Objects remaining on kmem_cache_close()"); 3146 2987 } 3147 2988 } 3148 - spin_unlock_irqrestore(&n->list_lock, flags); 3149 2989 } 3150 2990 3151 2991 /* ··· 3178 3020 s->refcount--; 3179 3021 if (!s->refcount) { 3180 3022 list_del(&s->list); 3023 + up_write(&slub_lock); 3181 3024 if (kmem_cache_close(s)) { 3182 3025 printk(KERN_ERR "SLUB %s: %s called for cache that " 3183 3026 "still has objects.\n", s->name, __func__); ··· 3187 3028 if (s->flags & SLAB_DESTROY_BY_RCU) 3188 3029 rcu_barrier(); 3189 3030 sysfs_slab_remove(s); 3190 - } 3191 - up_write(&slub_lock); 3031 + } else 3032 + up_write(&slub_lock); 3192 3033 } 3193 3034 EXPORT_SYMBOL(kmem_cache_destroy); 3194 3035 ··· 3506 3347 * list_lock. page->inuse here is the upper limit. 3507 3348 */ 3508 3349 list_for_each_entry_safe(page, t, &n->partial, lru) { 3509 - if (!page->inuse) { 3510 - remove_partial(n, page); 3511 - discard_slab(s, page); 3512 - } else { 3513 - list_move(&page->lru, 3514 - slabs_by_inuse + page->inuse); 3515 - } 3350 + list_move(&page->lru, slabs_by_inuse + page->inuse); 3351 + if (!page->inuse) 3352 + n->nr_partial--; 3516 3353 } 3517 3354 3518 3355 /* 3519 3356 * Rebuild the partial list with the slabs filled up most 3520 3357 * first and the least used slabs at the end. 
3521 3358 */ 3522 - for (i = objects - 1; i >= 0; i--) 3359 + for (i = objects - 1; i > 0; i--) 3523 3360 list_splice(slabs_by_inuse + i, n->partial.prev); 3524 3361 3525 3362 spin_unlock_irqrestore(&n->list_lock, flags); 3363 + 3364 + /* Release empty slabs */ 3365 + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3366 + discard_slab(s, page); 3526 3367 } 3527 3368 3528 3369 kfree(slabs_by_inuse); ··· 4478 4319 4479 4320 for_each_possible_cpu(cpu) { 4480 4321 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4322 + struct page *page; 4481 4323 4482 4324 if (!c || c->node < 0) 4483 4325 continue; ··· 4493 4333 4494 4334 total += x; 4495 4335 nodes[c->node] += x; 4336 + } 4337 + page = c->partial; 4338 + 4339 + if (page) { 4340 + x = page->pobjects; 4341 + total += x; 4342 + nodes[c->node] += x; 4496 4343 } 4497 4344 per_cpu[c->node]++; 4498 4345 } ··· 4579 4412 }; 4580 4413 4581 4414 #define SLAB_ATTR_RO(_name) \ 4582 - static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4415 + static struct slab_attribute _name##_attr = \ 4416 + __ATTR(_name, 0400, _name##_show, NULL) 4583 4417 4584 4418 #define SLAB_ATTR(_name) \ 4585 4419 static struct slab_attribute _name##_attr = \ 4586 - __ATTR(_name, 0644, _name##_show, _name##_store) 4420 + __ATTR(_name, 0600, _name##_show, _name##_store) 4587 4421 4588 4422 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4589 4423 { ··· 4653 4485 } 4654 4486 SLAB_ATTR(min_partial); 4655 4487 4488 + static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 4489 + { 4490 + return sprintf(buf, "%u\n", s->cpu_partial); 4491 + } 4492 + 4493 + static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4494 + size_t length) 4495 + { 4496 + unsigned long objects; 4497 + int err; 4498 + 4499 + err = strict_strtoul(buf, 10, &objects); 4500 + if (err) 4501 + return err; 4502 + 4503 + s->cpu_partial = objects; 4504 + flush_all(s); 4505 + return length; 4506 + } 4507 + SLAB_ATTR(cpu_partial); 
4508 + 4656 4509 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4657 4510 { 4658 4511 if (!s->ctor) ··· 4711 4522 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4712 4523 } 4713 4524 SLAB_ATTR_RO(objects_partial); 4525 + 4526 + static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 4527 + { 4528 + int objects = 0; 4529 + int pages = 0; 4530 + int cpu; 4531 + int len; 4532 + 4533 + for_each_online_cpu(cpu) { 4534 + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; 4535 + 4536 + if (page) { 4537 + pages += page->pages; 4538 + objects += page->pobjects; 4539 + } 4540 + } 4541 + 4542 + len = sprintf(buf, "%d(%d)", objects, pages); 4543 + 4544 + #ifdef CONFIG_SMP 4545 + for_each_online_cpu(cpu) { 4546 + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; 4547 + 4548 + if (page && len < PAGE_SIZE - 20) 4549 + len += sprintf(buf + len, " C%d=%d(%d)", cpu, 4550 + page->pobjects, page->pages); 4551 + } 4552 + #endif 4553 + return len + sprintf(buf + len, "\n"); 4554 + } 4555 + SLAB_ATTR_RO(slabs_cpu_partial); 4714 4556 4715 4557 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4716 4558 { ··· 5065 4845 STAT_ATTR(ORDER_FALLBACK, order_fallback); 5066 4846 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 5067 4847 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4848 + STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 4849 + STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 5068 4850 #endif 5069 4851 5070 4852 static struct attribute *slab_attrs[] = { ··· 5075 4853 &objs_per_slab_attr.attr, 5076 4854 &order_attr.attr, 5077 4855 &min_partial_attr.attr, 4856 + &cpu_partial_attr.attr, 5078 4857 &objects_attr.attr, 5079 4858 &objects_partial_attr.attr, 5080 4859 &partial_attr.attr, ··· 5088 4865 &destroy_by_rcu_attr.attr, 5089 4866 &shrink_attr.attr, 5090 4867 &reserved_attr.attr, 4868 + &slabs_cpu_partial_attr.attr, 5091 4869 #ifdef CONFIG_SLUB_DEBUG 5092 4870 &total_objects_attr.attr, 5093 
4871 &slabs_attr.attr, ··· 5130 4906 &order_fallback_attr.attr, 5131 4907 &cmpxchg_double_fail_attr.attr, 5132 4908 &cmpxchg_double_cpu_fail_attr.attr, 4909 + &cpu_partial_alloc_attr.attr, 4910 + &cpu_partial_free_attr.attr, 5133 4911 #endif 5134 4912 #ifdef CONFIG_FAILSLAB 5135 4913 &failslab_attr.attr, ··· 5483 5257 5484 5258 static int __init slab_proc_init(void) 5485 5259 { 5486 - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); 5260 + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5487 5261 return 0; 5488 5262 } 5489 5263 module_init(slab_proc_init);
+9 -1
tools/slub/slabinfo.c
··· 42 42 unsigned long deactivate_remote_frees, order_fallback; 43 43 unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail; 44 44 unsigned long alloc_node_mismatch, deactivate_bypass; 45 + unsigned long cpu_partial_alloc, cpu_partial_free; 45 46 int numa[MAX_NODES]; 46 47 int numa_partial[MAX_NODES]; 47 48 } slabinfo[MAX_SLABS]; ··· 455 454 s->alloc_from_partial, s->free_remove_partial, 456 455 s->alloc_from_partial * 100 / total_alloc, 457 456 s->free_remove_partial * 100 / total_free); 457 + 458 + printf("Cpu partial list %8lu %8lu %3lu %3lu\n", 459 + s->cpu_partial_alloc, s->cpu_partial_free, 460 + s->cpu_partial_alloc * 100 / total_alloc, 461 + s->cpu_partial_free * 100 / total_free); 458 462 459 463 printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n", 460 464 s->deactivate_remote_frees, s->free_frozen, ··· 1151 1145 switch (de->d_type) { 1152 1146 case DT_LNK: 1153 1147 alias->name = strdup(de->d_name); 1154 - count = readlink(de->d_name, buffer, sizeof(buffer)); 1148 + count = readlink(de->d_name, buffer, sizeof(buffer)-1); 1155 1149 1156 1150 if (count < 0) 1157 1151 fatal("Cannot read symlink %s\n", de->d_name); ··· 1215 1209 slab->order_fallback = get_obj("order_fallback"); 1216 1210 slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail"); 1217 1211 slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail"); 1212 + slab->cpu_partial_alloc = get_obj("cpu_partial_alloc"); 1213 + slab->cpu_partial_free = get_obj("cpu_partial_free"); 1218 1214 slab->alloc_node_mismatch = get_obj("alloc_node_mismatch"); 1219 1215 slab->deactivate_bypass = get_obj("deactivate_bypass"); 1220 1216 chdir("..");