Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

zsmalloc: support compaction

This patch provides core functions for migration of zsmalloc. The migration
policy is simple, as follows.

for each size class {
while {
src_page = get zs_page from ZS_ALMOST_EMPTY
if (!src_page)
break;
dst_page = get zs_page from ZS_ALMOST_FULL
if (!dst_page)
dst_page = get zs_page from ZS_ALMOST_EMPTY
if (!dst_page)
break;
migrate(from src_page, to dst_page);
}
}

For migration, we need to identify which objects in a zspage are allocated
so we can migrate them out. We could determine this by iterating over the
free objects in a zspage, because the first_page of a zspage keeps a
singly-linked list of free objects, but that is not efficient. Instead, this
patch adds a tag (ie, OBJ_ALLOCATED_TAG) in the header of each object (ie,
handle) so we can easily check whether an object is allocated.

This patch adds another status bit in the handle to synchronize user
access through zs_map_object with migration. During migration, we cannot
move objects that users are currently using, due to data coherency between
the old object and the new object.

[akpm@linux-foundation.org: zsmalloc.c needs sched.h for cond_resched()]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Gunho Lee <gunho.lee@lge.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjennings@variantweb.net>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Minchan Kim and committed by
Linus Torvalds
312fcae2 c7806261

+362 -21
+1
include/linux/zsmalloc.h
··· 47 47 void zs_unmap_object(struct zs_pool *pool, unsigned long handle); 48 48 49 49 unsigned long zs_get_total_pages(struct zs_pool *pool); 50 + unsigned long zs_compact(struct zs_pool *pool); 50 51 51 52 #endif
+361 -21
mm/zsmalloc.c
··· 78 78 79 79 #include <linux/module.h> 80 80 #include <linux/kernel.h> 81 + #include <linux/sched.h> 81 82 #include <linux/bitops.h> 82 83 #include <linux/errno.h> 83 84 #include <linux/highmem.h> ··· 136 135 #endif 137 136 #endif 138 137 #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 139 - #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 138 + 139 + /* 140 + * Memory for allocating for handle keeps object position by 141 + * encoding <page, obj_idx> and the encoded value has a room 142 + * in least bit(ie, look at obj_to_location). 143 + * We use the bit to synchronize between object access by 144 + * user and migration. 145 + */ 146 + #define HANDLE_PIN_BIT 0 147 + 148 + /* 149 + * Head in allocated object should have OBJ_ALLOCATED_TAG 150 + * to identify the object was allocated or not. 151 + * It's okay to add the status bit in the least bit because 152 + * header keeps handle which is 4byte-aligned address so we 153 + * have room for two bit at least. 154 + */ 155 + #define OBJ_ALLOCATED_TAG 1 156 + #define OBJ_TAG_BITS 1 157 + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) 140 158 #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 141 159 142 160 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) ··· 630 610 631 611 /* 632 612 * Encode <page, obj_idx> as a single handle value. 633 - * On hardware platforms with physical memory starting at 0x0 the pfn 634 - * could be 0 so we ensure that the handle will never be 0 by adjusting the 635 - * encoded obj_idx value before encoding. 613 + * We use the least bit of handle for tagging. 
636 614 */ 637 - static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) 615 + static void *location_to_obj(struct page *page, unsigned long obj_idx) 638 616 { 639 - unsigned long handle; 617 + unsigned long obj; 640 618 641 619 if (!page) { 642 620 BUG_ON(obj_idx); 643 621 return NULL; 644 622 } 645 623 646 - handle = page_to_pfn(page) << OBJ_INDEX_BITS; 647 - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); 624 + obj = page_to_pfn(page) << OBJ_INDEX_BITS; 625 + obj |= ((obj_idx) & OBJ_INDEX_MASK); 626 + obj <<= OBJ_TAG_BITS; 648 627 649 - return (void *)handle; 628 + return (void *)obj; 650 629 } 651 630 652 631 /* 653 632 * Decode <page, obj_idx> pair from the given object handle. We adjust the 654 633 * decoded obj_idx back to its original value since it was adjusted in 655 - * obj_location_to_handle(). 634 + * location_to_obj(). 656 635 */ 657 - static void obj_to_location(unsigned long handle, struct page **page, 636 + static void obj_to_location(unsigned long obj, struct page **page, 658 637 unsigned long *obj_idx) 659 638 { 660 - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); 661 - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; 639 + obj >>= OBJ_TAG_BITS; 640 + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); 641 + *obj_idx = (obj & OBJ_INDEX_MASK); 662 642 } 663 643 664 644 static unsigned long handle_to_obj(unsigned long handle) 665 645 { 666 646 return *(unsigned long *)handle; 647 + } 648 + 649 + unsigned long obj_to_head(void *obj) 650 + { 651 + return *(unsigned long *)obj; 667 652 } 668 653 669 654 static unsigned long obj_idx_to_offset(struct page *page, ··· 680 655 off = page->index; 681 656 682 657 return off + obj_idx * class_size; 658 + } 659 + 660 + static inline int trypin_tag(unsigned long handle) 661 + { 662 + unsigned long *ptr = (unsigned long *)handle; 663 + 664 + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); 665 + } 666 + 667 + static void pin_tag(unsigned long handle) 668 + { 669 + while (!trypin_tag(handle)); 670 + } 671 + 
672 + static void unpin_tag(unsigned long handle) 673 + { 674 + unsigned long *ptr = (unsigned long *)handle; 675 + 676 + clear_bit_unlock(HANDLE_PIN_BIT, ptr); 683 677 } 684 678 685 679 static void reset_page(struct page *page) ··· 762 718 link = (struct link_free *)vaddr + off / sizeof(*link); 763 719 764 720 while ((off += class->size) < PAGE_SIZE) { 765 - link->next = obj_location_to_handle(page, i++); 721 + link->next = location_to_obj(page, i++); 766 722 link += class->size / sizeof(*link); 767 723 } 768 724 ··· 772 728 * page (if present) 773 729 */ 774 730 next_page = get_next_page(page); 775 - link->next = obj_location_to_handle(next_page, 0); 731 + link->next = location_to_obj(next_page, 0); 776 732 kunmap_atomic(vaddr); 777 733 page = next_page; 778 734 off %= PAGE_SIZE; ··· 826 782 827 783 init_zspage(first_page, class); 828 784 829 - first_page->freelist = obj_location_to_handle(first_page, 0); 785 + first_page->freelist = location_to_obj(first_page, 0); 830 786 /* Maximum number of objects we can store in this zspage */ 831 787 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 832 788 ··· 1061 1017 return true; 1062 1018 } 1063 1019 1020 + static bool zspage_full(struct page *page) 1021 + { 1022 + BUG_ON(!is_first_page(page)); 1023 + 1024 + return page->inuse == page->objects; 1025 + } 1026 + 1064 1027 #ifdef CONFIG_ZSMALLOC_STAT 1065 1028 1066 1029 static inline void zs_stat_inc(struct size_class *class, ··· 1270 1219 */ 1271 1220 BUG_ON(in_interrupt()); 1272 1221 1222 + /* From now on, migration cannot move the object */ 1223 + pin_tag(handle); 1224 + 1273 1225 obj = handle_to_obj(handle); 1274 1226 obj_to_location(obj, &page, &obj_idx); 1275 1227 get_zspage_mapping(get_first_page(page), &class_idx, &fg); ··· 1330 1276 __zs_unmap_object(area, pages, off, class->size); 1331 1277 } 1332 1278 put_cpu_var(zs_map_area); 1279 + unpin_tag(handle); 1333 1280 } 1334 1281 EXPORT_SYMBOL_GPL(zs_unmap_object); 1335 1282 ··· 1344 1289 
unsigned long m_objidx, m_offset; 1345 1290 void *vaddr; 1346 1291 1292 + handle |= OBJ_ALLOCATED_TAG; 1347 1293 obj = (unsigned long)first_page->freelist; 1348 1294 obj_to_location(obj, &m_page, &m_objidx); 1349 1295 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); ··· 1430 1374 1431 1375 BUG_ON(!obj); 1432 1376 1377 + obj &= ~OBJ_ALLOCATED_TAG; 1433 1378 obj_to_location(obj, &f_page, &f_objidx); 1434 1379 first_page = get_first_page(f_page); 1435 1380 ··· 1459 1402 if (unlikely(!handle)) 1460 1403 return; 1461 1404 1405 + pin_tag(handle); 1462 1406 obj = handle_to_obj(handle); 1463 - free_handle(pool, handle); 1464 1407 obj_to_location(obj, &f_page, &f_objidx); 1465 1408 first_page = get_first_page(f_page); 1466 1409 ··· 1470 1413 spin_lock(&class->lock); 1471 1414 obj_free(pool, class, obj); 1472 1415 fullness = fix_fullness_group(class, first_page); 1473 - if (fullness == ZS_EMPTY) 1416 + if (fullness == ZS_EMPTY) { 1474 1417 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1475 1418 class->size, class->pages_per_zspage)); 1476 - spin_unlock(&class->lock); 1477 - 1478 - if (fullness == ZS_EMPTY) { 1479 1419 atomic_long_sub(class->pages_per_zspage, 1480 1420 &pool->pages_allocated); 1481 1421 free_zspage(first_page); 1482 1422 } 1423 + spin_unlock(&class->lock); 1424 + unpin_tag(handle); 1425 + 1426 + free_handle(pool, handle); 1483 1427 } 1484 1428 EXPORT_SYMBOL_GPL(zs_free); 1429 + 1430 + static void zs_object_copy(unsigned long src, unsigned long dst, 1431 + struct size_class *class) 1432 + { 1433 + struct page *s_page, *d_page; 1434 + unsigned long s_objidx, d_objidx; 1435 + unsigned long s_off, d_off; 1436 + void *s_addr, *d_addr; 1437 + int s_size, d_size, size; 1438 + int written = 0; 1439 + 1440 + s_size = d_size = class->size; 1441 + 1442 + obj_to_location(src, &s_page, &s_objidx); 1443 + obj_to_location(dst, &d_page, &d_objidx); 1444 + 1445 + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); 1446 + d_off = 
obj_idx_to_offset(d_page, d_objidx, class->size); 1447 + 1448 + if (s_off + class->size > PAGE_SIZE) 1449 + s_size = PAGE_SIZE - s_off; 1450 + 1451 + if (d_off + class->size > PAGE_SIZE) 1452 + d_size = PAGE_SIZE - d_off; 1453 + 1454 + s_addr = kmap_atomic(s_page); 1455 + d_addr = kmap_atomic(d_page); 1456 + 1457 + while (1) { 1458 + size = min(s_size, d_size); 1459 + memcpy(d_addr + d_off, s_addr + s_off, size); 1460 + written += size; 1461 + 1462 + if (written == class->size) 1463 + break; 1464 + 1465 + if (s_off + size >= PAGE_SIZE) { 1466 + kunmap_atomic(d_addr); 1467 + kunmap_atomic(s_addr); 1468 + s_page = get_next_page(s_page); 1469 + BUG_ON(!s_page); 1470 + s_addr = kmap_atomic(s_page); 1471 + d_addr = kmap_atomic(d_page); 1472 + s_size = class->size - written; 1473 + s_off = 0; 1474 + } else { 1475 + s_off += size; 1476 + s_size -= size; 1477 + } 1478 + 1479 + if (d_off + size >= PAGE_SIZE) { 1480 + kunmap_atomic(d_addr); 1481 + d_page = get_next_page(d_page); 1482 + BUG_ON(!d_page); 1483 + d_addr = kmap_atomic(d_page); 1484 + d_size = class->size - written; 1485 + d_off = 0; 1486 + } else { 1487 + d_off += size; 1488 + d_size -= size; 1489 + } 1490 + } 1491 + 1492 + kunmap_atomic(d_addr); 1493 + kunmap_atomic(s_addr); 1494 + } 1495 + 1496 + /* 1497 + * Find alloced object in zspage from index object and 1498 + * return handle. 
1499 + */ 1500 + static unsigned long find_alloced_obj(struct page *page, int index, 1501 + struct size_class *class) 1502 + { 1503 + unsigned long head; 1504 + int offset = 0; 1505 + unsigned long handle = 0; 1506 + void *addr = kmap_atomic(page); 1507 + 1508 + if (!is_first_page(page)) 1509 + offset = page->index; 1510 + offset += class->size * index; 1511 + 1512 + while (offset < PAGE_SIZE) { 1513 + head = obj_to_head(addr + offset); 1514 + if (head & OBJ_ALLOCATED_TAG) { 1515 + handle = head & ~OBJ_ALLOCATED_TAG; 1516 + if (trypin_tag(handle)) 1517 + break; 1518 + handle = 0; 1519 + } 1520 + 1521 + offset += class->size; 1522 + index++; 1523 + } 1524 + 1525 + kunmap_atomic(addr); 1526 + return handle; 1527 + } 1528 + 1529 + struct zs_compact_control { 1530 + /* Source page for migration which could be a subpage of zspage. */ 1531 + struct page *s_page; 1532 + /* Destination page for migration which should be a first page 1533 + * of zspage. */ 1534 + struct page *d_page; 1535 + /* Starting object index within @s_page which used for live object 1536 + * in the subpage. 
*/ 1537 + int index; 1538 + /* how many of objects are migrated */ 1539 + int nr_migrated; 1540 + }; 1541 + 1542 + static int migrate_zspage(struct zs_pool *pool, struct size_class *class, 1543 + struct zs_compact_control *cc) 1544 + { 1545 + unsigned long used_obj, free_obj; 1546 + unsigned long handle; 1547 + struct page *s_page = cc->s_page; 1548 + struct page *d_page = cc->d_page; 1549 + unsigned long index = cc->index; 1550 + int nr_migrated = 0; 1551 + int ret = 0; 1552 + 1553 + while (1) { 1554 + handle = find_alloced_obj(s_page, index, class); 1555 + if (!handle) { 1556 + s_page = get_next_page(s_page); 1557 + if (!s_page) 1558 + break; 1559 + index = 0; 1560 + continue; 1561 + } 1562 + 1563 + /* Stop if there is no more space */ 1564 + if (zspage_full(d_page)) { 1565 + unpin_tag(handle); 1566 + ret = -ENOMEM; 1567 + break; 1568 + } 1569 + 1570 + used_obj = handle_to_obj(handle); 1571 + free_obj = obj_malloc(d_page, class, handle); 1572 + zs_object_copy(used_obj, free_obj, class); 1573 + index++; 1574 + record_obj(handle, free_obj); 1575 + unpin_tag(handle); 1576 + obj_free(pool, class, used_obj); 1577 + nr_migrated++; 1578 + } 1579 + 1580 + /* Remember last position in this iteration */ 1581 + cc->s_page = s_page; 1582 + cc->index = index; 1583 + cc->nr_migrated = nr_migrated; 1584 + 1585 + return ret; 1586 + } 1587 + 1588 + static struct page *alloc_target_page(struct size_class *class) 1589 + { 1590 + int i; 1591 + struct page *page; 1592 + 1593 + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { 1594 + page = class->fullness_list[i]; 1595 + if (page) { 1596 + remove_zspage(page, class, i); 1597 + break; 1598 + } 1599 + } 1600 + 1601 + return page; 1602 + } 1603 + 1604 + static void putback_zspage(struct zs_pool *pool, struct size_class *class, 1605 + struct page *first_page) 1606 + { 1607 + int class_idx; 1608 + enum fullness_group fullness; 1609 + 1610 + BUG_ON(!is_first_page(first_page)); 1611 + 1612 + get_zspage_mapping(first_page, &class_idx, 
&fullness); 1613 + insert_zspage(first_page, class, fullness); 1614 + fullness = fix_fullness_group(class, first_page); 1615 + if (fullness == ZS_EMPTY) { 1616 + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1617 + class->size, class->pages_per_zspage)); 1618 + atomic_long_sub(class->pages_per_zspage, 1619 + &pool->pages_allocated); 1620 + 1621 + free_zspage(first_page); 1622 + } 1623 + } 1624 + 1625 + static struct page *isolate_source_page(struct size_class *class) 1626 + { 1627 + struct page *page; 1628 + 1629 + page = class->fullness_list[ZS_ALMOST_EMPTY]; 1630 + if (page) 1631 + remove_zspage(page, class, ZS_ALMOST_EMPTY); 1632 + 1633 + return page; 1634 + } 1635 + 1636 + static unsigned long __zs_compact(struct zs_pool *pool, 1637 + struct size_class *class) 1638 + { 1639 + int nr_to_migrate; 1640 + struct zs_compact_control cc; 1641 + struct page *src_page; 1642 + struct page *dst_page = NULL; 1643 + unsigned long nr_total_migrated = 0; 1644 + 1645 + cond_resched(); 1646 + 1647 + spin_lock(&class->lock); 1648 + while ((src_page = isolate_source_page(class))) { 1649 + 1650 + BUG_ON(!is_first_page(src_page)); 1651 + 1652 + /* The goal is to migrate all live objects in source page */ 1653 + nr_to_migrate = src_page->inuse; 1654 + cc.index = 0; 1655 + cc.s_page = src_page; 1656 + 1657 + while ((dst_page = alloc_target_page(class))) { 1658 + cc.d_page = dst_page; 1659 + /* 1660 + * If there is no more space in dst_page, try to 1661 + * allocate another zspage. 
1662 + */ 1663 + if (!migrate_zspage(pool, class, &cc)) 1664 + break; 1665 + 1666 + putback_zspage(pool, class, dst_page); 1667 + nr_total_migrated += cc.nr_migrated; 1668 + nr_to_migrate -= cc.nr_migrated; 1669 + } 1670 + 1671 + /* Stop if we couldn't find slot */ 1672 + if (dst_page == NULL) 1673 + break; 1674 + 1675 + putback_zspage(pool, class, dst_page); 1676 + putback_zspage(pool, class, src_page); 1677 + spin_unlock(&class->lock); 1678 + nr_total_migrated += cc.nr_migrated; 1679 + cond_resched(); 1680 + spin_lock(&class->lock); 1681 + } 1682 + 1683 + if (src_page) 1684 + putback_zspage(pool, class, src_page); 1685 + 1686 + spin_unlock(&class->lock); 1687 + 1688 + return nr_total_migrated; 1689 + } 1690 + 1691 + unsigned long zs_compact(struct zs_pool *pool) 1692 + { 1693 + int i; 1694 + unsigned long nr_migrated = 0; 1695 + struct size_class *class; 1696 + 1697 + for (i = zs_size_classes - 1; i >= 0; i--) { 1698 + class = pool->size_class[i]; 1699 + if (!class) 1700 + continue; 1701 + if (class->index != i) 1702 + continue; 1703 + nr_migrated += __zs_compact(pool, class); 1704 + } 1705 + 1706 + synchronize_rcu(); 1707 + 1708 + return nr_migrated; 1709 + } 1710 + EXPORT_SYMBOL_GPL(zs_compact); 1485 1711 1486 1712 /** 1487 1713 * zs_create_pool - Creates an allocation pool to work from.