Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: frontswap: core swap subsystem hooks and headers

This patch, 2 of 4, contains the changes to the core swap subsystem.
This includes:

(1) makes available core swap data structures (swap_lock, swap_list and
swap_info) that are needed by frontswap.c but we don't need to expose them
to the dozens of files that include swap.h so we create a new swapfile.h
just to extern-ify these and modify their declarations to non-static

(2) adds frontswap-related elements to swap_info_struct. Frontswap_map
points to vzalloc'ed one-bit-per-swap-page metadata that indicates
whether the swap page is in frontswap or in the device and frontswap_pages
counts how many pages are in frontswap.

(3) adds hooks in the swap subsystem and extends try_to_unuse so that
frontswap_shrink can do a "partial swapoff".

Note that a failed frontswap_map allocation is safe... failure is noted
by lack of "FS" in the subsequent printk.

---

[v14: rebase to 3.4-rc2]
[v10: no change]
[v9: akpm@linux-foundation.org: mark some statics __read_mostly]
[v9: akpm@linux-foundation.org: add clarifying comments]
[v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse]
[v9: error27@gmail.com: remove superfluous check for NULL]
[v8: rebase to 3.0-rc4]
[v8: kamezawa.hiroyu@jp.fujitsu.com: change counter to atomic_t to avoid races]
[v8: kamezawa.hiroyu@jp.fujitsu.com: comment to clarify informational counters]
[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: add new swap struct elements only if config'd]
[v6: rebase to 3.0-rc1]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracle.com: various checks and code clarifications/comments]
[v5: no change from v4]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Reviewed-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[v11: Rebased, fixed mm/swapfile.c context change]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

authored by

Dan Magenheimer and committed by
Konrad Rzeszutek Wilk
38b5faf4 c3ba9698

+70 -13
+4
include/linux/swap.h
··· 197 197 struct block_device *bdev; /* swap device or bdev of swap file */ 198 198 struct file *swap_file; /* seldom referenced */ 199 199 unsigned int old_block_size; /* seldom referenced */ 200 + #ifdef CONFIG_FRONTSWAP 201 + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ 202 + atomic_t frontswap_pages; /* frontswap pages in-use counter */ 203 + #endif 200 204 }; 201 205 202 206 struct swap_list_t {
+13
include/linux/swapfile.h
··· 1 + #ifndef _LINUX_SWAPFILE_H 2 + #define _LINUX_SWAPFILE_H 3 + 4 + /* 5 + * these were static in swapfile.c but frontswap.c needs them and we don't 6 + * want to expose them to the dozens of source files that include swap.h 7 + */ 8 + extern spinlock_t swap_lock; 9 + extern struct swap_list_t swap_list; 10 + extern struct swap_info_struct *swap_info[]; 11 + extern int try_to_unuse(unsigned int, bool, unsigned long); 12 + 13 + #endif /* _LINUX_SWAPFILE_H */
+12
mm/page_io.c
··· 18 18 #include <linux/bio.h> 19 19 #include <linux/swapops.h> 20 20 #include <linux/writeback.h> 21 + #include <linux/frontswap.h> 21 22 #include <asm/pgtable.h> 22 23 23 24 static struct bio *get_swap_bio(gfp_t gfp_flags, ··· 99 98 unlock_page(page); 100 99 goto out; 101 100 } 101 + if (frontswap_put_page(page) == 0) { 102 + set_page_writeback(page); 103 + unlock_page(page); 104 + end_page_writeback(page); 105 + goto out; 106 + } 102 107 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 103 108 if (bio == NULL) { 104 109 set_page_dirty(page); ··· 129 122 130 123 VM_BUG_ON(!PageLocked(page)); 131 124 VM_BUG_ON(PageUptodate(page)); 125 + if (frontswap_get_page(page) == 0) { 126 + SetPageUptodate(page); 127 + unlock_page(page); 128 + goto out; 129 + } 132 130 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 133 131 if (bio == NULL) { 134 132 unlock_page(page);
+41 -13
mm/swapfile.c
··· 31 31 #include <linux/memcontrol.h> 32 32 #include <linux/poll.h> 33 33 #include <linux/oom.h> 34 + #include <linux/frontswap.h> 35 + #include <linux/swapfile.h> 34 36 35 37 #include <asm/pgtable.h> 36 38 #include <asm/tlbflush.h> ··· 44 42 static void free_swap_count_continuations(struct swap_info_struct *); 45 43 static sector_t map_swap_entry(swp_entry_t, struct block_device**); 46 44 47 - static DEFINE_SPINLOCK(swap_lock); 45 + DEFINE_SPINLOCK(swap_lock); 48 46 static unsigned int nr_swapfiles; 49 47 long nr_swap_pages; 50 48 long total_swap_pages; ··· 55 53 static const char Bad_offset[] = "Bad swap offset entry "; 56 54 static const char Unused_offset[] = "Unused swap offset entry "; 57 55 58 - static struct swap_list_t swap_list = {-1, -1}; 56 + struct swap_list_t swap_list = {-1, -1}; 59 57 60 - static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 58 + struct swap_info_struct *swap_info[MAX_SWAPFILES]; 61 59 62 60 static DEFINE_MUTEX(swapon_mutex); 63 61 ··· 558 556 swap_list.next = p->type; 559 557 nr_swap_pages++; 560 558 p->inuse_pages--; 559 + frontswap_invalidate_page(p->type, offset); 561 560 if ((p->flags & SWP_BLKDEV) && 562 561 disk->fops->swap_slot_free_notify) 563 562 disk->fops->swap_slot_free_notify(p->bdev, offset); ··· 1019 1016 } 1020 1017 1021 1018 /* 1022 - * Scan swap_map from current position to next entry still in use. 1019 + * Scan swap_map (or frontswap_map if frontswap parameter is true) 1020 + * from current position to next entry still in use. 1023 1021 * Recycle to start on reaching the end, returning 0 when empty. 
1024 1022 */ 1025 1023 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 1026 - unsigned int prev) 1024 + unsigned int prev, bool frontswap) 1027 1025 { 1028 1026 unsigned int max = si->max; 1029 1027 unsigned int i = prev; ··· 1050 1046 prev = 0; 1051 1047 i = 1; 1052 1048 } 1049 + if (frontswap) { 1050 + if (frontswap_test(si, i)) 1051 + break; 1052 + else 1053 + continue; 1054 + } 1053 1055 count = si->swap_map[i]; 1054 1056 if (count && swap_count(count) != SWAP_MAP_BAD) 1055 1057 break; ··· 1067 1057 * We completely avoid races by reading each swap page in advance, 1068 1058 * and then search for the process using it. All the necessary 1069 1059 * page table adjustments can then be made atomically. 1060 + * 1061 + * if the boolean frontswap is true, only unuse pages_to_unuse pages; 1062 + * pages_to_unuse==0 means all pages; ignored if frontswap is false 1070 1063 */ 1071 - static int try_to_unuse(unsigned int type) 1064 + int try_to_unuse(unsigned int type, bool frontswap, 1065 + unsigned long pages_to_unuse) 1072 1066 { 1073 1067 struct swap_info_struct *si = swap_info[type]; 1074 1068 struct mm_struct *start_mm; ··· 1105 1091 * one pass through swap_map is enough, but not necessarily: 1106 1092 * there are races when an instance of an entry might be missed. 1107 1093 */ 1108 - while ((i = find_next_to_unuse(si, i)) != 0) { 1094 + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { 1109 1095 if (signal_pending(current)) { 1110 1096 retval = -EINTR; 1111 1097 break; ··· 1272 1258 * interactive performance. 
1273 1259 */ 1274 1260 cond_resched(); 1261 + if (frontswap && pages_to_unuse > 0) { 1262 + if (!--pages_to_unuse) 1263 + break; 1264 + } 1275 1265 } 1276 1266 1277 1267 mmput(start_mm); ··· 1535 1517 } 1536 1518 1537 1519 static void enable_swap_info(struct swap_info_struct *p, int prio, 1538 - unsigned char *swap_map) 1520 + unsigned char *swap_map, 1521 + unsigned long *frontswap_map) 1539 1522 { 1540 1523 int i, prev; 1541 1524 ··· 1546 1527 else 1547 1528 p->prio = --least_priority; 1548 1529 p->swap_map = swap_map; 1530 + frontswap_map_set(p, frontswap_map); 1549 1531 p->flags |= SWP_WRITEOK; 1550 1532 nr_swap_pages += p->pages; 1551 1533 total_swap_pages += p->pages; ··· 1563 1543 swap_list.head = swap_list.next = p->type; 1564 1544 else 1565 1545 swap_info[prev]->next = p->type; 1546 + frontswap_init(p->type); 1566 1547 spin_unlock(&swap_lock); 1567 1548 } 1568 1549 ··· 1637 1616 spin_unlock(&swap_lock); 1638 1617 1639 1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1640 - err = try_to_unuse(type); 1619 + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1641 1620 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1642 1621 1643 1622 if (err) { ··· 1648 1627 * sys_swapoff for this swap_info_struct at this point. 
1649 1628 */ 1650 1629 /* re-insert swap space back into swap_list */ 1651 - enable_swap_info(p, p->prio, p->swap_map); 1630 + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1652 1631 goto out_dput; 1653 1632 } 1654 1633 ··· 1674 1653 swap_map = p->swap_map; 1675 1654 p->swap_map = NULL; 1676 1655 p->flags = 0; 1656 + frontswap_invalidate_area(type); 1677 1657 spin_unlock(&swap_lock); 1678 1658 mutex_unlock(&swapon_mutex); 1679 1659 vfree(swap_map); 1660 + vfree(frontswap_map_get(p)); 1680 1661 /* Destroy swap account informatin */ 1681 1662 swap_cgroup_swapoff(type); 1682 1663 ··· 2042 2019 sector_t span; 2043 2020 unsigned long maxpages; 2044 2021 unsigned char *swap_map = NULL; 2022 + unsigned long *frontswap_map = NULL; 2045 2023 struct page *page = NULL; 2046 2024 struct inode *inode = NULL; 2047 2025 ··· 2126 2102 error = nr_extents; 2127 2103 goto bad_swap; 2128 2104 } 2105 + /* frontswap enabled? set up bit-per-page map for frontswap */ 2106 + if (frontswap_enabled) 2107 + frontswap_map = vzalloc(maxpages / sizeof(long)); 2129 2108 2130 2109 if (p->bdev) { 2131 2110 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { ··· 2144 2117 if (swap_flags & SWAP_FLAG_PREFER) 2145 2118 prio = 2146 2119 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2147 - enable_swap_info(p, prio, swap_map); 2120 + enable_swap_info(p, prio, swap_map, frontswap_map); 2148 2121 2149 2122 printk(KERN_INFO "Adding %uk swap on %s. " 2150 - "Priority:%d extents:%d across:%lluk %s%s\n", 2123 + "Priority:%d extents:%d across:%lluk %s%s%s\n", 2151 2124 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2152 2125 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2153 2126 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2154 - (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2127 + (p->flags & SWP_DISCARDABLE) ? "D" : "", 2128 + (frontswap_map) ? "FS" : ""); 2155 2129 2156 2130 mutex_unlock(&swapon_mutex); 2157 2131 atomic_inc(&proc_poll_event);