Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

swap: change swap_list_head to plist, add swap_avail_head

Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
which both were intended to point to the highest priority active swap
target that was not full. The first patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; it starts scanning at the highest
priority entry each time, even if the entry is full.

Replace the manually ordered swap_list_head with a plist, swap_active_head.
Add a new plist, swap_avail_head. The original swap_active_head plist
contains all active swap_info_structs, as before, while the new
swap_avail_head plist contains only swap_info_structs that are active and
available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect
the swap_avail_head list.

Mel Gorman suggested using plists since they internally handle ordering
the list entries based on priority, which is exactly what swap was doing
manually. All the ordering code is now removed, and swap_info_struct
entries are simply added to their corresponding plist and automatically
ordered correctly.

Using a new plist for available swap_info_structs simplifies and
optimizes get_swap_page(), which no longer has to iterate over full
swap_info_structs. Using a new spinlock for swap_avail_head plist
allows swap_info_structs to add or remove themselves from the
plist when they become full or not-full; previously they could not
do so because the swap_info_struct->lock is held when they change
from full<->not-full, and the swap_lock protecting the main
swap_active_head must be ordered before any swap_info_struct->lock.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shli@fusionio.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Weijie Yang <weijieut@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Dan Streetman and committed by
Linus Torvalds
18ab4d4c a75f232c

+97 -59
+2 -1
include/linux/swap.h
··· 214 214 struct swap_info_struct { 215 215 unsigned long flags; /* SWP_USED etc: see above */ 216 216 signed short prio; /* swap priority of this type */ 217 - struct list_head list; /* entry in swap list */ 217 + struct plist_node list; /* entry in swap_active_head */ 218 + struct plist_node avail_list; /* entry in swap_avail_head */ 218 219 signed char type; /* strange name for an index */ 219 220 unsigned int max; /* extent of the swap_map */ 220 221 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+1 -1
include/linux/swapfile.h
··· 6 6 * want to expose them to the dozens of source files that include swap.h 7 7 */ 8 8 extern spinlock_t swap_lock; 9 - extern struct list_head swap_list_head; 9 + extern struct plist_head swap_active_head; 10 10 extern struct swap_info_struct *swap_info[]; 11 11 extern int try_to_unuse(unsigned int, bool, unsigned long); 12 12
+3 -3
mm/frontswap.c
··· 331 331 struct swap_info_struct *si = NULL; 332 332 333 333 assert_spin_locked(&swap_lock); 334 - list_for_each_entry(si, &swap_list_head, list) 334 + plist_for_each_entry(si, &swap_active_head, list) 335 335 totalpages += atomic_read(&si->frontswap_pages); 336 336 return totalpages; 337 337 } ··· 346 346 unsigned long pages = 0, pages_to_unuse = 0; 347 347 348 348 assert_spin_locked(&swap_lock); 349 - list_for_each_entry(si, &swap_list_head, list) { 349 + plist_for_each_entry(si, &swap_active_head, list) { 350 350 si_frontswap_pages = atomic_read(&si->frontswap_pages); 351 351 if (total_pages_to_unuse < si_frontswap_pages) { 352 352 pages = pages_to_unuse = total_pages_to_unuse; ··· 408 408 /* 409 409 * we don't want to hold swap_lock while doing a very 410 410 * lengthy try_to_unuse, but swap_list may change 411 - * so restart scan from swap_list_head each time 411 + * so restart scan from swap_active_head each time 412 412 */ 413 413 spin_lock(&swap_lock); 414 414 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+91 -54
mm/swapfile.c
··· 61 61 * all active swap_info_structs 62 62 * protected with swap_lock, and ordered by priority. 63 63 */ 64 - LIST_HEAD(swap_list_head); 64 + PLIST_HEAD(swap_active_head); 65 + 66 + /* 67 + * all available (active, not full) swap_info_structs 68 + * protected with swap_avail_lock, ordered by priority. 69 + * This is used by get_swap_page() instead of swap_active_head 70 + * because swap_active_head includes all swap_info_structs, 71 + * but get_swap_page() doesn't need to look at full ones. 72 + * This uses its own lock instead of swap_lock because when a 73 + * swap_info_struct changes between not-full/full, it needs to 74 + * add/remove itself to/from this list, but the swap_info_struct->lock 75 + * is held and the locking order requires swap_lock to be taken 76 + * before any swap_info_struct->lock. 77 + */ 78 + static PLIST_HEAD(swap_avail_head); 79 + static DEFINE_SPINLOCK(swap_avail_lock); 65 80 66 81 struct swap_info_struct *swap_info[MAX_SWAPFILES]; 67 82 ··· 609 594 if (si->inuse_pages == si->pages) { 610 595 si->lowest_bit = si->max; 611 596 si->highest_bit = 0; 597 + spin_lock(&swap_avail_lock); 598 + plist_del(&si->avail_list, &swap_avail_head); 599 + spin_unlock(&swap_avail_lock); 612 600 } 613 601 si->swap_map[offset] = usage; 614 602 inc_cluster_info_page(si, si->cluster_info, offset); ··· 663 645 { 664 646 struct swap_info_struct *si, *next; 665 647 pgoff_t offset; 666 - struct list_head *tmp; 667 648 668 - spin_lock(&swap_lock); 669 649 if (atomic_long_read(&nr_swap_pages) <= 0) 670 650 goto noswap; 671 651 atomic_long_dec(&nr_swap_pages); 672 652 673 - list_for_each(tmp, &swap_list_head) { 674 - si = list_entry(tmp, typeof(*si), list); 653 + spin_lock(&swap_avail_lock); 654 + 655 + start_over: 656 + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 657 + /* requeue si to after same-priority siblings */ 658 + plist_requeue(&si->avail_list, &swap_avail_head); 659 + spin_unlock(&swap_avail_lock); 675 660 spin_lock(&si->lock); 
676 661 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 662 + spin_lock(&swap_avail_lock); 663 + if (plist_node_empty(&si->avail_list)) { 664 + spin_unlock(&si->lock); 665 + goto nextsi; 666 + } 667 + WARN(!si->highest_bit, 668 + "swap_info %d in list but !highest_bit\n", 669 + si->type); 670 + WARN(!(si->flags & SWP_WRITEOK), 671 + "swap_info %d in list but !SWP_WRITEOK\n", 672 + si->type); 673 + plist_del(&si->avail_list, &swap_avail_head); 677 674 spin_unlock(&si->lock); 678 - continue; 675 + goto nextsi; 679 676 } 680 677 681 - /* 682 - * rotate the current swap_info that we're going to use 683 - * to after any other swap_info that have the same prio, 684 - * so that all equal-priority swap_info get used equally 685 - */ 686 - next = si; 687 - list_for_each_entry_continue(next, &swap_list_head, list) { 688 - if (si->prio != next->prio) 689 - break; 690 - list_rotate_left(&si->list); 691 - next = si; 692 - } 693 - 694 - spin_unlock(&swap_lock); 695 678 /* This is called for allocating swap entry for cache */ 696 679 offset = scan_swap_map(si, SWAP_HAS_CACHE); 697 680 spin_unlock(&si->lock); 698 681 if (offset) 699 682 return swp_entry(si->type, offset); 700 - spin_lock(&swap_lock); 683 + pr_debug("scan_swap_map of si %d failed to find offset\n", 684 + si->type); 685 + spin_lock(&swap_avail_lock); 686 + nextsi: 701 687 /* 702 688 * if we got here, it's likely that si was almost full before, 703 689 * and since scan_swap_map() can drop the si->lock, multiple 704 690 * callers probably all tried to get a page from the same si 705 - * and it filled up before we could get one. So we need to 706 - * try again. Since we dropped the swap_lock, there may now 707 - * be non-full higher priority swap_infos, and this si may have 708 - * even been removed from the list (although very unlikely). 709 - * Let's start over. 691 + * and it filled up before we could get one; or, the si filled 692 + * up between us dropping swap_avail_lock and taking si->lock. 
693 + * Since we dropped the swap_avail_lock, the swap_avail_head 694 + * list may have been modified; so if next is still in the 695 + * swap_avail_head list then try it, otherwise start over. 710 696 */ 711 - tmp = &swap_list_head; 697 + if (plist_node_empty(&next->avail_list)) 698 + goto start_over; 712 699 } 700 + 701 + spin_unlock(&swap_avail_lock); 713 702 714 703 atomic_long_inc(&nr_swap_pages); 715 704 noswap: 716 - spin_unlock(&swap_lock); 717 705 return (swp_entry_t) {0}; 718 706 } 719 707 ··· 822 798 dec_cluster_info_page(p, p->cluster_info, offset); 823 799 if (offset < p->lowest_bit) 824 800 p->lowest_bit = offset; 825 - if (offset > p->highest_bit) 801 + if (offset > p->highest_bit) { 802 + bool was_full = !p->highest_bit; 826 803 p->highest_bit = offset; 804 + if (was_full && (p->flags & SWP_WRITEOK)) { 805 + spin_lock(&swap_avail_lock); 806 + WARN_ON(!plist_node_empty(&p->avail_list)); 807 + if (plist_node_empty(&p->avail_list)) 808 + plist_add(&p->avail_list, 809 + &swap_avail_head); 810 + spin_unlock(&swap_avail_lock); 811 + } 812 + } 827 813 atomic_long_inc(&nr_swap_pages); 828 814 p->inuse_pages--; 829 815 frontswap_invalidate_page(p->type, offset); ··· 1768 1734 unsigned char *swap_map, 1769 1735 struct swap_cluster_info *cluster_info) 1770 1736 { 1771 - struct swap_info_struct *si; 1772 - 1773 1737 if (prio >= 0) 1774 1738 p->prio = prio; 1775 1739 else 1776 1740 p->prio = --least_priority; 1741 + /* 1742 + * the plist prio is negated because plist ordering is 1743 + * low-to-high, while swap ordering is high-to-low 1744 + */ 1745 + p->list.prio = -p->prio; 1746 + p->avail_list.prio = -p->prio; 1777 1747 p->swap_map = swap_map; 1778 1748 p->cluster_info = cluster_info; 1779 1749 p->flags |= SWP_WRITEOK; ··· 1785 1747 total_swap_pages += p->pages; 1786 1748 1787 1749 assert_spin_locked(&swap_lock); 1788 - BUG_ON(!list_empty(&p->list)); 1789 1750 /* 1790 - * insert into swap list; the list is in priority order, 1791 - * so that get_swap_page() 
can get a page from the highest 1792 - * priority swap_info_struct with available page(s), and 1793 - * swapoff can adjust the auto-assigned (i.e. negative) prio 1794 - * values for any lower-priority swap_info_structs when 1795 - * removing a negative-prio swap_info_struct 1751 + * both lists are plists, and thus priority ordered. 1752 + * swap_active_head needs to be priority ordered for swapoff(), 1753 + * which on removal of any swap_info_struct with an auto-assigned 1754 + * (i.e. negative) priority increments the auto-assigned priority 1755 + * of any lower-priority swap_info_structs. 1756 + * swap_avail_head needs to be priority ordered for get_swap_page(), 1757 + * which allocates swap pages from the highest available priority 1758 + * swap_info_struct. 1796 1759 */ 1797 - list_for_each_entry(si, &swap_list_head, list) { 1798 - if (p->prio >= si->prio) { 1799 - list_add_tail(&p->list, &si->list); 1800 - return; 1801 - } 1802 - } 1803 - /* 1804 - * this covers two cases: 1805 - * 1) p->prio is less than all existing prio 1806 - * 2) the swap list is empty 1807 - */ 1808 - list_add_tail(&p->list, &swap_list_head); 1760 + plist_add(&p->list, &swap_active_head); 1761 + spin_lock(&swap_avail_lock); 1762 + plist_add(&p->avail_list, &swap_avail_head); 1763 + spin_unlock(&swap_avail_lock); 1809 1764 } 1810 1765 1811 1766 static void enable_swap_info(struct swap_info_struct *p, int prio, ··· 1852 1821 1853 1822 mapping = victim->f_mapping; 1854 1823 spin_lock(&swap_lock); 1855 - list_for_each_entry(p, &swap_list_head, list) { 1824 + plist_for_each_entry(p, &swap_active_head, list) { 1856 1825 if (p->flags & SWP_WRITEOK) { 1857 1826 if (p->swap_file->f_mapping == mapping) { 1858 1827 found = 1; ··· 1872 1841 spin_unlock(&swap_lock); 1873 1842 goto out_dput; 1874 1843 } 1844 + spin_lock(&swap_avail_lock); 1845 + plist_del(&p->avail_list, &swap_avail_head); 1846 + spin_unlock(&swap_avail_lock); 1875 1847 spin_lock(&p->lock); 1876 1848 if (p->prio < 0) { 1877 1849 
struct swap_info_struct *si = p; 1878 1850 1879 - list_for_each_entry_continue(si, &swap_list_head, list) { 1851 + plist_for_each_entry_continue(si, &swap_active_head, list) { 1880 1852 si->prio++; 1853 + si->list.prio--; 1854 + si->avail_list.prio--; 1881 1855 } 1882 1856 least_priority++; 1883 1857 } 1884 - list_del_init(&p->list); 1858 + plist_del(&p->list, &swap_active_head); 1885 1859 atomic_long_sub(p->pages, &nr_swap_pages); 1886 1860 total_swap_pages -= p->pages; 1887 1861 p->flags &= ~SWP_WRITEOK; ··· 2151 2115 */ 2152 2116 } 2153 2117 INIT_LIST_HEAD(&p->first_swap_extent.list); 2154 - INIT_LIST_HEAD(&p->list); 2118 + plist_node_init(&p->list, 0); 2119 + plist_node_init(&p->avail_list, 0); 2155 2120 p->flags = SWP_USED; 2156 2121 spin_unlock(&swap_lock); 2157 2122 spin_lock_init(&p->lock);