SLUB: Alternate fast paths using cmpxchg_local

Provide an alternate implementation of the SLUB fast paths for alloc
and free using cmpxchg_local. The cmpxchg_local fast path is selected
for arches that have CONFIG_FAST_CMPXCHG_LOCAL set. An arch should only
set CONFIG_FAST_CMPXCHG_LOCAL if cmpxchg_local is faster than an
interrupt enable/disable sequence. This is known to be true for both
x86 platforms, so FAST_CMPXCHG_LOCAL is set for both arches.

Currently, a further requirement for the fast path is that the kernel be
compiled without preemption. This restriction will go away with the
introduction of a new per cpu allocator and new per cpu operations.

The advantages of a cmpxchg_local based fast path are:

1. Potentially lower cycle count (30%-60% faster)

2. There is no need to disable and enable interrupts on the fast path.
Currently interrupts have to be disabled and enabled on every
slab operation; eliminating that likely removes a significant
percentage of the interrupt off / on sequences in the kernel
(see the user-space sketch after this list).

3. The disposal of freed slabs can occur with interrupts enabled.

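To illustrate the pattern, here is an editor's sketch in user-space C11.
It is not kernel code and not part of this patch; the names (struct object,
fast_alloc, fast_free) are made up. Allocation and free retry a
compare-and-swap on the freelist head instead of bracketing the operation
with an interrupt disable/enable pair. The real fast path below uses
cmpxchg_local() on a per cpu freelist, so it only needs to be atomic with
respect to interrupts on the local cpu, not against other cpus.

/*
 * Editor's sketch, user space only: the "read head, link, cmpxchg" shape
 * of the SLUB fast path, without interrupt disabling. All names here are
 * hypothetical. Build: gcc -std=c11 -O2 -o freelist-sketch freelist-sketch.c
 */
#include <stdatomic.h>
#include <stdio.h>

struct object {
	struct object *next;		/* freelist link stored in the free object */
	char payload[56];
};

static _Atomic(struct object *) freelist;	/* stand-in for c->freelist */
static struct object pool[4];			/* stand-in for a slab's objects */

static struct object *fast_alloc(void)
{
	struct object *object = atomic_load(&freelist);

	/*
	 * Retry until the head we read is still the head when we swap in
	 * its successor. This replaces local_irq_save()/local_irq_restore().
	 */
	while (object &&
	       !atomic_compare_exchange_weak(&freelist, &object, object->next))
		;
	return object;			/* NULL would mean "take the slow path" */
}

static void fast_free(struct object *object)
{
	struct object *head = atomic_load(&freelist);

	do {
		object->next = head;	/* link object in front of the current head */
	} while (!atomic_compare_exchange_weak(&freelist, &head, object));
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		fast_free(&pool[i]);

	struct object *o = fast_alloc();

	printf("allocated object at %p\n", (void *)o);
	fast_free(o);
	return 0;
}
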
The alternate path is realized using #ifdefs. Several attempts to do the
same with macros and inline functions resulted in a mess (in particular due
to the strange way that local_irq_save() handles its argument and due
to the need to define macros/functions that sometimes disable interrupts
and sometimes do something else).
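
For background on the local_irq_save() point (editor's illustration, user
space, with a made-up fake_irq_save() stand-in rather than the kernel
macro): local_irq_save() assigns to its argument by name, so a helper that
only sometimes saves the flags cannot be a plain function taking the flags
by value; it has to take a pointer or be yet another macro, which is where
the mess comes from.

/*
 * Editor's illustration, user space only; fake_irq_save() is a stand-in.
 * Build: gcc -std=c11 -O2 -o irqsave-sketch irqsave-sketch.c
 */
#include <stdio.h>

/* Like local_irq_save(): writes to 'flags' by name, not through a pointer. */
#define fake_irq_save(flags)	do { (flags) = 0xdeadbeef; } while (0)

/* Broken wrapper: only the local copy of 'flags' is updated. */
static void maybe_save_by_value(unsigned long flags, int fastpath)
{
	if (!fastpath)
		fake_irq_save(flags);
}

/* Working wrapper has to take a pointer, changing the calling idiom. */
static void maybe_save_by_pointer(unsigned long *flags, int fastpath)
{
	if (!fastpath)
		fake_irq_save(*flags);
}

int main(void)
{
	unsigned long flags = 0;

	maybe_save_by_value(flags, 0);
	printf("by value:   flags = %#lx\n", flags);	/* still 0 */

	maybe_save_by_pointer(&flags, 0);
	printf("by pointer: flags = %#lx\n", flags);	/* now 0xdeadbeef */
	return 0;
}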

[clameter: Stripped preempt bits and disabled fastpath if preempt is enabled]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Christoph Lameter and committed by Christoph Lameter (1f84260c, 683d0baa)

+92 -5 overall

arch/x86/Kconfig (+4)
@@ -52,6 +52,10 @@
 config SEMAPHORE_SLEEPERS
 	def_bool y
 
+config FAST_CMPXCHG_LOCAL
+	bool
+	default y
+
 config MMU
 	def_bool y
 

mm/slub.c (+88 -5)
@@ -149,6 +149,13 @@
 /* Enable to test recovery from slab corruption on boot */
 #undef SLUB_RESILIENCY_TEST
 
+/*
+ * Currently fastpath is not supported if preemption is enabled.
+ */
+#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
+#define SLUB_FASTPATH
+#endif
+
 #if PAGE_SHIFT <= 12
 
 /*
@@ -1493,7 +1500,11 @@
 {
 	void **object;
 	struct page *new;
+#ifdef SLUB_FASTPATH
+	unsigned long flags;
 
+	local_irq_save(flags);
+#endif
 	if (!c->page)
 		goto new_slab;
 
@@ -1512,7 +1523,12 @@
 	c->page->inuse = s->objects;
 	c->page->freelist = c->page->end;
 	c->node = page_to_nid(c->page);
+unlock_out:
 	slab_unlock(c->page);
+out:
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return object;
 
 another_slab:
@@ -1542,7 +1558,8 @@
 		c->page = new;
 		goto load_freelist;
 	}
-	return NULL;
+	object = NULL;
+	goto out;
 debug:
 	object = c->page->freelist;
 	if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1551,8 +1568,7 @@
 	c->page->inuse++;
 	c->page->freelist = object[c->offset];
 	c->node = -1;
-	slab_unlock(c->page);
-	return object;
+	goto unlock_out;
 }
 
 /*
@@ -1569,8 +1585,35 @@
 		gfp_t gfpflags, int node, void *addr)
 {
 	void **object;
-	unsigned long flags;
 	struct kmem_cache_cpu *c;
+
+/*
+ * The SLUB_FASTPATH path is provisional and is currently disabled if the
+ * kernel is compiled with preemption or if the arch does not support
+ * fast cmpxchg operations. There are a couple of coming changes that will
+ * simplify matters and allow preemption. Ultimately we may end up making
+ * SLUB_FASTPATH the default.
+ *
+ * 1. The introduction of the per cpu allocator will avoid array lookups
+ *    through get_cpu_slab(). A special register can be used instead.
+ *
+ * 2. The introduction of per cpu atomic operations (cpu_ops) means that
+ *    we can realize the logic here entirely with per cpu atomics. The
+ *    per cpu atomic ops will take care of the preemption issues.
+ */
+
+#ifdef SLUB_FASTPATH
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	do {
+		object = c->freelist;
+		if (unlikely(is_end(object) || !node_match(c, node))) {
+			object = __slab_alloc(s, gfpflags, node, addr, c);
+			break;
+		}
+	} while (cmpxchg_local(&c->freelist, object, object[c->offset])
+								!= object);
+#else
+	unsigned long flags;
 
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
@@ -1583,6 +1626,7 @@
 		c->freelist = object[c->offset];
 	}
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, c->objsize);
@@ -1618,6 +1662,11 @@
 	void *prior;
 	void **object = (void *)x;
 
+#ifdef SLUB_FASTPATH
+	unsigned long flags;
+
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
 
 	if (unlikely(SlabDebug(page)))
@@ -1643,6 +1692,9 @@
 
 out_unlock:
 	slab_unlock(page);
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1653,6 +1705,9 @@
 		remove_partial(s, page);
 
 	slab_unlock(page);
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	discard_slab(s, page);
 	return;
 
@@ -1677,8 +1732,35 @@
 		struct page *page, void *x, void *addr)
 {
 	void **object = (void *)x;
-	unsigned long flags;
 	struct kmem_cache_cpu *c;
+
+#ifdef SLUB_FASTPATH
+	void **freelist;
+
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	debug_check_no_locks_freed(object, s->objsize);
+	do {
+		freelist = c->freelist;
+		barrier();
+		/*
+		 * If the compiler would reorder the retrieval of c->page to
+		 * come before c->freelist then an interrupt could
+		 * change the cpu slab before we retrieve c->freelist. We
+		 * could be matching on a page no longer active and put the
+		 * object onto the freelist of the wrong slab.
+		 *
+		 * On the other hand: If we already have the freelist pointer
+		 * then any change of cpu_slab will cause the cmpxchg to fail
+		 * since the freelist pointers are unique per slab.
+		 */
+		if (unlikely(page != c->page || c->node < 0)) {
+			__slab_free(s, page, x, addr, c->offset);
+			break;
+		}
+		object[c->offset] = freelist;
+	} while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
+#else
+	unsigned long flags;
 
 	local_irq_save(flags);
 	debug_check_no_locks_freed(object, s->objsize);
@@ -1690,6 +1772,7 @@
 		__slab_free(s, page, x, addr, c->offset);
 
 	local_irq_restore(flags);
+#endif
 }

void kmem_cache_free(struct kmem_cache *s, void *x)