Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] swap: swap_lock replace list+device

The idea of a swap_device_lock per device, and a swap_list_lock over them all,
is appealing; but in practice almost every holder of swap_device_lock must
already hold swap_list_lock, which defeats the purpose of the split.

The only exceptions have been swap_duplicate, valid_swaphandles and an
untrodden path in try_to_unuse (plus a few places added in this series).
valid_swaphandles doesn't show up high in profiles, but swap_duplicate does
demand attention. However, with the hold time in get_swap_pages so much
reduced, I've not yet found a load and set of swap device priorities to show
even swap_duplicate benefitting from the split. Certainly the split is mere
overhead in the common case of a single swap device.

So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock
(generally we seem to prefer an _ in the name, and not hide in a macro).

If someone can show a regression in swap_duplicate, then probably we should
add a hashlock for the swap_map entries alone (shorts being anatomic), so as
to help the case of the single swap device too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Hugh Dickins; committed by Linus Torvalds.
5d337b91 048c27fd

+70 -99
+7 -8
Documentation/vm/locking
··· 83 83 vmtruncate) does not lose sending ipi's to cloned threads that might 84 84 be spawned underneath it and go to user mode to drag in pte's into tlbs. 85 85 86 - swap_list_lock/swap_device_lock 87 - ------------------------------- 86 + swap_lock 87 + -------------- 88 88 The swap devices are chained in priority order from the "swap_list" header. 89 89 The "swap_list" is used for the round-robin swaphandle allocation strategy. 90 90 The #free swaphandles is maintained in "nr_swap_pages". These two together 91 - are protected by the swap_list_lock. 91 + are protected by the swap_lock. 92 92 93 - The swap_device_lock, which is per swap device, protects the reference 94 - counts on the corresponding swaphandles, maintained in the "swap_map" 95 - array, and the "highest_bit" and "lowest_bit" fields. 93 + The swap_lock also protects all the device reference counts on the 94 + corresponding swaphandles, maintained in the "swap_map" array, and the 95 + "highest_bit" and "lowest_bit" fields. 96 96 97 - Both of these are spinlocks, and are never acquired from intr level. The 98 - locking hierarchy is swap_list_lock -> swap_device_lock. 97 + The swap_lock is a spinlock, and is never acquired from intr level. 99 98 100 99 To prevent races between swap space deletion or async readahead swapins 101 100 deciding whether a swap handle is being used, ie worthy of being read in
+2 -9
include/linux/swap.h
··· 121 121 */ 122 122 struct swap_info_struct { 123 123 unsigned int flags; 124 - spinlock_t sdev_lock; 124 + int prio; /* swap priority */ 125 125 struct file *swap_file; 126 126 struct block_device *bdev; 127 127 struct list_head extent_list; ··· 135 135 unsigned int pages; 136 136 unsigned int max; 137 137 unsigned int inuse_pages; 138 - int prio; /* swap priority */ 139 138 int next; /* next entry on swap list */ 140 139 }; 141 140 ··· 220 221 extern int remove_exclusive_swap_page(struct page *); 221 222 struct backing_dev_info; 222 223 223 - extern struct swap_list_t swap_list; 224 - extern spinlock_t swaplock; 225 - 226 - #define swap_list_lock() spin_lock(&swaplock) 227 - #define swap_list_unlock() spin_unlock(&swaplock) 228 - #define swap_device_lock(p) spin_lock(&p->sdev_lock) 229 - #define swap_device_unlock(p) spin_unlock(&p->sdev_lock) 224 + extern spinlock_t swap_lock; 230 225 231 226 /* linux/mm/thrash.c */ 232 227 extern struct mm_struct * swap_token_mm;
+3 -4
mm/filemap.c
··· 54 54 * 55 55 * ->i_mmap_lock (vmtruncate) 56 56 * ->private_lock (__free_pte->__set_page_dirty_buffers) 57 - * ->swap_list_lock 58 - * ->swap_device_lock (exclusive_swap_page, others) 59 - * ->mapping->tree_lock 57 + * ->swap_lock (exclusive_swap_page, others) 58 + * ->mapping->tree_lock 60 59 * 61 60 * ->i_sem 62 61 * ->i_mmap_lock (truncate->unmap_mapping_range) ··· 85 86 * ->page_table_lock (anon_vma_prepare and various) 86 87 * 87 88 * ->page_table_lock 88 - * ->swap_device_lock (try_to_unmap_one) 89 + * ->swap_lock (try_to_unmap_one) 89 90 * ->private_lock (try_to_unmap_one) 90 91 * ->tree_lock (try_to_unmap_one) 91 92 * ->zone.lru_lock (follow_page->mark_page_accessed)
+1 -2
mm/rmap.c
··· 34 34 * anon_vma->lock 35 35 * mm->page_table_lock 36 36 * zone->lru_lock (in mark_page_accessed) 37 - * swap_list_lock (in swap_free etc's swap_info_get) 37 + * swap_lock (in swap_duplicate, swap_info_get) 38 38 * mmlist_lock (in mmput, drain_mmlist and others) 39 - * swap_device_lock (in swap_duplicate, swap_info_get) 40 39 * mapping->private_lock (in __set_page_dirty_buffers) 41 40 * inode_lock (in set_page_dirty's __mark_inode_dirty) 42 41 * sb_lock (within inode_lock in fs/fs-writeback.c)
+57 -76
mm/swapfile.c
··· 31 31 #include <asm/tlbflush.h> 32 32 #include <linux/swapops.h> 33 33 34 - DEFINE_SPINLOCK(swaplock); 34 + DEFINE_SPINLOCK(swap_lock); 35 35 unsigned int nr_swapfiles; 36 36 long total_swap_pages; 37 37 static int swap_overflow; ··· 51 51 52 52 /* 53 53 * We need this because the bdev->unplug_fn can sleep and we cannot 54 - * hold swap_list_lock while calling the unplug_fn. And swap_list_lock 54 + * hold swap_lock while calling the unplug_fn. And swap_lock 55 55 * cannot be turned into a semaphore. 56 56 */ 57 57 static DECLARE_RWSEM(swap_unplug_sem); ··· 105 105 si->cluster_nr = SWAPFILE_CLUSTER - 1; 106 106 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 107 107 goto lowest; 108 - swap_device_unlock(si); 108 + spin_unlock(&swap_lock); 109 109 110 110 offset = si->lowest_bit; 111 111 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; ··· 115 115 if (si->swap_map[offset]) 116 116 last_in_cluster = offset + SWAPFILE_CLUSTER; 117 117 else if (offset == last_in_cluster) { 118 - swap_device_lock(si); 118 + spin_lock(&swap_lock); 119 119 si->cluster_next = offset-SWAPFILE_CLUSTER-1; 120 120 goto cluster; 121 121 } ··· 124 124 latency_ration = LATENCY_LIMIT; 125 125 } 126 126 } 127 - swap_device_lock(si); 127 + spin_lock(&swap_lock); 128 128 goto lowest; 129 129 } 130 130 ··· 153 153 return offset; 154 154 } 155 155 156 - swap_device_unlock(si); 156 + spin_unlock(&swap_lock); 157 157 while (++offset <= si->highest_bit) { 158 158 if (!si->swap_map[offset]) { 159 - swap_device_lock(si); 159 + spin_lock(&swap_lock); 160 160 goto checks; 161 161 } 162 162 if (unlikely(--latency_ration < 0)) { ··· 164 164 latency_ration = LATENCY_LIMIT; 165 165 } 166 166 } 167 - swap_device_lock(si); 167 + spin_lock(&swap_lock); 168 168 goto lowest; 169 169 170 170 no_page: ··· 179 179 int type, next; 180 180 int wrapped = 0; 181 181 182 - swap_list_lock(); 182 + spin_lock(&swap_lock); 183 183 if (nr_swap_pages <= 0) 184 184 goto noswap; 185 185 nr_swap_pages--; ··· 199 199 continue; 
200 200 201 201 swap_list.next = next; 202 - swap_device_lock(si); 203 - swap_list_unlock(); 204 202 offset = scan_swap_map(si); 205 - swap_device_unlock(si); 206 - if (offset) 203 + if (offset) { 204 + spin_unlock(&swap_lock); 207 205 return swp_entry(type, offset); 208 - swap_list_lock(); 206 + } 209 207 next = swap_list.next; 210 208 } 211 209 212 210 nr_swap_pages++; 213 211 noswap: 214 - swap_list_unlock(); 212 + spin_unlock(&swap_lock); 215 213 return (swp_entry_t) {0}; 216 214 } 217 215 ··· 231 233 goto bad_offset; 232 234 if (!p->swap_map[offset]) 233 235 goto bad_free; 234 - swap_list_lock(); 235 - swap_device_lock(p); 236 + spin_lock(&swap_lock); 236 237 return p; 237 238 238 239 bad_free: ··· 248 251 out: 249 252 return NULL; 250 253 } 251 - 252 - static void swap_info_put(struct swap_info_struct * p) 253 - { 254 - swap_device_unlock(p); 255 - swap_list_unlock(); 256 - } 257 254 258 255 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 259 256 { ··· 281 290 p = swap_info_get(entry); 282 291 if (p) { 283 292 swap_entry_free(p, swp_offset(entry)); 284 - swap_info_put(p); 293 + spin_unlock(&swap_lock); 285 294 } 286 295 } 287 296 ··· 299 308 if (p) { 300 309 /* Subtract the 1 for the swap cache itself */ 301 310 count = p->swap_map[swp_offset(entry)] - 1; 302 - swap_info_put(p); 311 + spin_unlock(&swap_lock); 303 312 } 304 313 return count; 305 314 } ··· 356 365 } 357 366 write_unlock_irq(&swapper_space.tree_lock); 358 367 } 359 - swap_info_put(p); 368 + spin_unlock(&swap_lock); 360 369 361 370 if (retval) { 362 371 swap_free(entry); ··· 379 388 if (p) { 380 389 if (swap_entry_free(p, swp_offset(entry)) == 1) 381 390 page = find_trylock_page(&swapper_space, entry.val); 382 - swap_info_put(p); 391 + spin_unlock(&swap_lock); 383 392 } 384 393 if (page) { 385 394 int one_user; ··· 549 558 int count; 550 559 551 560 /* 552 - * No need for swap_device_lock(si) here: we're just looking 561 + * No need for swap_lock here: we're just 
looking 553 562 * for whether an entry is in use, not modifying it; false 554 563 * hits are okay, and sys_swapoff() has already prevented new 555 - * allocations from this area (while holding swap_list_lock()). 564 + * allocations from this area (while holding swap_lock). 556 565 */ 557 566 for (;;) { 558 567 if (++i >= max) { ··· 742 751 * report them; but do report if we reset SWAP_MAP_MAX. 743 752 */ 744 753 if (*swap_map == SWAP_MAP_MAX) { 745 - swap_device_lock(si); 754 + spin_lock(&swap_lock); 746 755 *swap_map = 1; 747 - swap_device_unlock(si); 756 + spin_unlock(&swap_lock); 748 757 reset_overflow = 1; 749 758 } 750 759 ··· 808 817 } 809 818 810 819 /* 811 - * After a successful try_to_unuse, if no swap is now in use, we know we 812 - * can empty the mmlist. swap_list_lock must be held on entry and exit. 813 - * Note that mmlist_lock nests inside swap_list_lock, and an mm must be 820 + * After a successful try_to_unuse, if no swap is now in use, we know 821 + * we can empty the mmlist. swap_lock must be held on entry and exit. 822 + * Note that mmlist_lock nests inside swap_lock, and an mm must be 814 823 * added to the mmlist just after page_duplicate - before would be racy. 
815 824 */ 816 825 static void drain_mmlist(void) ··· 1083 1092 1084 1093 mapping = victim->f_mapping; 1085 1094 prev = -1; 1086 - swap_list_lock(); 1095 + spin_lock(&swap_lock); 1087 1096 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1088 1097 p = swap_info + type; 1089 1098 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { ··· 1094 1103 } 1095 1104 if (type < 0) { 1096 1105 err = -EINVAL; 1097 - swap_list_unlock(); 1106 + spin_unlock(&swap_lock); 1098 1107 goto out_dput; 1099 1108 } 1100 1109 if (!security_vm_enough_memory(p->pages)) 1101 1110 vm_unacct_memory(p->pages); 1102 1111 else { 1103 1112 err = -ENOMEM; 1104 - swap_list_unlock(); 1113 + spin_unlock(&swap_lock); 1105 1114 goto out_dput; 1106 1115 } 1107 1116 if (prev < 0) { ··· 1115 1124 } 1116 1125 nr_swap_pages -= p->pages; 1117 1126 total_swap_pages -= p->pages; 1118 - swap_device_lock(p); 1119 1127 p->flags &= ~SWP_WRITEOK; 1120 - swap_device_unlock(p); 1121 - swap_list_unlock(); 1128 + spin_unlock(&swap_lock); 1122 1129 1123 1130 current->flags |= PF_SWAPOFF; 1124 1131 err = try_to_unuse(type); ··· 1124 1135 1125 1136 if (err) { 1126 1137 /* re-insert swap space back into swap_list */ 1127 - swap_list_lock(); 1138 + spin_lock(&swap_lock); 1128 1139 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1129 1140 if (p->prio >= swap_info[i].prio) 1130 1141 break; ··· 1135 1146 swap_info[prev].next = p - swap_info; 1136 1147 nr_swap_pages += p->pages; 1137 1148 total_swap_pages += p->pages; 1138 - swap_device_lock(p); 1139 1149 p->flags |= SWP_WRITEOK; 1140 - swap_device_unlock(p); 1141 - swap_list_unlock(); 1150 + spin_unlock(&swap_lock); 1142 1151 goto out_dput; 1143 1152 } 1144 1153 ··· 1144 1157 down_write(&swap_unplug_sem); 1145 1158 up_write(&swap_unplug_sem); 1146 1159 1147 - /* wait for anyone still in scan_swap_map */ 1148 - swap_device_lock(p); 1149 - p->highest_bit = 0; /* cuts scans short */ 1150 - while (p->flags >= SWP_SCANNING) { 1151 - 
swap_device_unlock(p); 1152 - set_current_state(TASK_UNINTERRUPTIBLE); 1153 - schedule_timeout(1); 1154 - swap_device_lock(p); 1155 - } 1156 - swap_device_unlock(p); 1157 - 1158 1160 destroy_swap_extents(p); 1159 1161 down(&swapon_sem); 1160 - swap_list_lock(); 1162 + spin_lock(&swap_lock); 1161 1163 drain_mmlist(); 1162 - swap_device_lock(p); 1164 + 1165 + /* wait for anyone still in scan_swap_map */ 1166 + p->highest_bit = 0; /* cuts scans short */ 1167 + while (p->flags >= SWP_SCANNING) { 1168 + spin_unlock(&swap_lock); 1169 + set_current_state(TASK_UNINTERRUPTIBLE); 1170 + schedule_timeout(1); 1171 + spin_lock(&swap_lock); 1172 + } 1173 + 1163 1174 swap_file = p->swap_file; 1164 1175 p->swap_file = NULL; 1165 1176 p->max = 0; 1166 1177 swap_map = p->swap_map; 1167 1178 p->swap_map = NULL; 1168 1179 p->flags = 0; 1169 - swap_device_unlock(p); 1170 - swap_list_unlock(); 1180 + spin_unlock(&swap_lock); 1171 1181 up(&swapon_sem); 1172 1182 vfree(swap_map); 1173 1183 inode = mapping->host; ··· 1308 1324 1309 1325 if (!capable(CAP_SYS_ADMIN)) 1310 1326 return -EPERM; 1311 - swap_list_lock(); 1327 + spin_lock(&swap_lock); 1312 1328 p = swap_info; 1313 1329 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1314 1330 if (!(p->flags & SWP_USED)) ··· 1327 1343 * swp_entry_t or the architecture definition of a swap pte. 
1328 1344 */ 1329 1345 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { 1330 - swap_list_unlock(); 1346 + spin_unlock(&swap_lock); 1331 1347 goto out; 1332 1348 } 1333 1349 if (type >= nr_swapfiles) ··· 1341 1357 p->highest_bit = 0; 1342 1358 p->cluster_nr = 0; 1343 1359 p->inuse_pages = 0; 1344 - spin_lock_init(&p->sdev_lock); 1345 1360 p->next = -1; 1346 1361 if (swap_flags & SWAP_FLAG_PREFER) { 1347 1362 p->prio = ··· 1348 1365 } else { 1349 1366 p->prio = --least_priority; 1350 1367 } 1351 - swap_list_unlock(); 1368 + spin_unlock(&swap_lock); 1352 1369 name = getname(specialfile); 1353 1370 error = PTR_ERR(name); 1354 1371 if (IS_ERR(name)) { ··· 1525 1542 } 1526 1543 1527 1544 down(&swapon_sem); 1528 - swap_list_lock(); 1529 - swap_device_lock(p); 1545 + spin_lock(&swap_lock); 1530 1546 p->flags = SWP_ACTIVE; 1531 1547 nr_swap_pages += nr_good_pages; 1532 1548 total_swap_pages += nr_good_pages; ··· 1549 1567 } else { 1550 1568 swap_info[prev].next = p - swap_info; 1551 1569 } 1552 - swap_device_unlock(p); 1553 - swap_list_unlock(); 1570 + spin_unlock(&swap_lock); 1554 1571 up(&swapon_sem); 1555 1572 error = 0; 1556 1573 goto out; ··· 1560 1579 } 1561 1580 destroy_swap_extents(p); 1562 1581 bad_swap_2: 1563 - swap_list_lock(); 1582 + spin_lock(&swap_lock); 1564 1583 swap_map = p->swap_map; 1565 1584 p->swap_file = NULL; 1566 1585 p->swap_map = NULL; 1567 1586 p->flags = 0; 1568 1587 if (!(swap_flags & SWAP_FLAG_PREFER)) 1569 1588 ++least_priority; 1570 - swap_list_unlock(); 1589 + spin_unlock(&swap_lock); 1571 1590 vfree(swap_map); 1572 1591 if (swap_file) 1573 1592 filp_close(swap_file, NULL); ··· 1591 1610 unsigned int i; 1592 1611 unsigned long nr_to_be_unused = 0; 1593 1612 1594 - swap_list_lock(); 1613 + spin_lock(&swap_lock); 1595 1614 for (i = 0; i < nr_swapfiles; i++) { 1596 1615 if (!(swap_info[i].flags & SWP_USED) || 1597 1616 (swap_info[i].flags & SWP_WRITEOK)) ··· 1600 1619 } 1601 1620 val->freeswap = nr_swap_pages + 
nr_to_be_unused; 1602 1621 val->totalswap = total_swap_pages + nr_to_be_unused; 1603 - swap_list_unlock(); 1622 + spin_unlock(&swap_lock); 1604 1623 } 1605 1624 1606 1625 /* ··· 1621 1640 p = type + swap_info; 1622 1641 offset = swp_offset(entry); 1623 1642 1624 - swap_device_lock(p); 1643 + spin_lock(&swap_lock); 1625 1644 if (offset < p->max && p->swap_map[offset]) { 1626 1645 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 1627 1646 p->swap_map[offset]++; ··· 1633 1652 result = 1; 1634 1653 } 1635 1654 } 1636 - swap_device_unlock(p); 1655 + spin_unlock(&swap_lock); 1637 1656 out: 1638 1657 return result; 1639 1658 ··· 1649 1668 } 1650 1669 1651 1670 /* 1652 - * swap_device_lock prevents swap_map being freed. Don't grab an extra 1671 + * swap_lock prevents swap_map being freed. Don't grab an extra 1653 1672 * reference on the swaphandle, it doesn't matter if it becomes unused. 1654 1673 */ 1655 1674 int valid_swaphandles(swp_entry_t entry, unsigned long *offset) ··· 1665 1684 toff++, i--; 1666 1685 *offset = toff; 1667 1686 1668 - swap_device_lock(swapdev); 1687 + spin_lock(&swap_lock); 1669 1688 do { 1670 1689 /* Don't read-ahead past the end of the swap area */ 1671 1690 if (toff >= swapdev->max) ··· 1678 1697 toff++; 1679 1698 ret++; 1680 1699 } while (--i); 1681 - swap_device_unlock(swapdev); 1700 + spin_unlock(&swap_lock); 1682 1701 return ret; 1683 1702 }