Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ufs: remove the BKL

This introduces a new per-superblock mutex in UFS to replace
the big kernel lock. I have been careful to avoid nested
calls to lock_ufs and to get the lock order right with
respect to other mutexes, in particular lock_super.

I did not attempt to prove that the big kernel lock is
actually unnecessary at any particular place in the code,
although that may well be the case in many of them.

Taking the mutex carries a significant performance cost, so
it is only compiled in on SMP or PREEMPT configurations.

As Nick Piggin noticed, any allocation performed while holding
the lock may deadlock if memory reclaim re-enters the filesystem
through ufs_getfrag_block, so we now use GFP_NOFS.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Nick Bowler <nbowler@elliptictech.com>
Cc: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Nick Piggin <npiggin@gmail.com>

+83 -108
-1
fs/ufs/Kconfig
··· 1 1 config UFS_FS 2 2 tristate "UFS file system support (read only)" 3 3 depends on BLOCK 4 - depends on BKL # probably fixable 5 4 help 6 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 7 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
+21 -57
fs/ufs/inode.c
··· 34 34 #include <linux/stat.h> 35 35 #include <linux/string.h> 36 36 #include <linux/mm.h> 37 - #include <linux/smp_lock.h> 38 37 #include <linux/buffer_head.h> 39 38 #include <linux/writeback.h> 40 39 ··· 42 43 #include "swab.h" 43 44 #include "util.h" 44 45 45 - static u64 ufs_frag_map(struct inode *inode, sector_t frag); 46 + static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); 46 47 47 48 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 48 49 { ··· 81 82 * the begining of the filesystem. 82 83 */ 83 84 84 - static u64 ufs_frag_map(struct inode *inode, sector_t frag) 85 + static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) 85 86 { 86 87 struct ufs_inode_info *ufsi = UFS_I(inode); 87 88 struct super_block *sb = inode->i_sb; ··· 106 107 107 108 p = offsets; 108 109 109 - lock_kernel(); 110 + if (needs_lock) 111 + lock_ufs(sb); 110 112 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 113 goto ufs2; 112 114 ··· 152 152 ret = temp + (u64) (frag & uspi->s_fpbmask); 153 153 154 154 out: 155 - unlock_kernel(); 155 + if (needs_lock) 156 + unlock_ufs(sb); 156 157 return ret; 157 158 } 158 159 ··· 416 415 int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 417 416 { 418 417 struct super_block * sb = inode->i_sb; 419 - struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 418 + struct ufs_sb_info * sbi = UFS_SB(sb); 419 + struct ufs_sb_private_info * uspi = sbi->s_uspi; 420 420 struct buffer_head * bh; 421 421 int ret, err, new; 422 422 unsigned long ptr,phys; 423 423 u64 phys64 = 0; 424 + bool needs_lock = (sbi->mutex_owner != current); 424 425 425 426 if (!create) { 426 - phys64 = ufs_frag_map(inode, fragment); 427 + phys64 = ufs_frag_map(inode, fragment, needs_lock); 427 428 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 428 429 if (phys64) 429 430 map_bh(bh_result, sb, phys64); ··· 439 436 ret = 0; 440 437 bh = NULL; 
441 438 442 - lock_kernel(); 439 + if (needs_lock) 440 + lock_ufs(sb); 443 441 444 442 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 443 if (fragment > ··· 502 498 set_buffer_new(bh_result); 503 499 map_bh(bh_result, sb, phys); 504 500 abort: 505 - unlock_kernel(); 501 + if (needs_lock) 502 + unlock_ufs(sb); 503 + 506 504 return err; 507 505 508 506 abort_too_big: 509 507 ufs_warning(sb, "ufs_get_block", "block > big"); 510 508 goto abort; 511 - } 512 - 513 - static struct buffer_head *ufs_getfrag(struct inode *inode, 514 - unsigned int fragment, 515 - int create, int *err) 516 - { 517 - struct buffer_head dummy; 518 - int error; 519 - 520 - dummy.b_state = 0; 521 - dummy.b_blocknr = -1000; 522 - error = ufs_getfrag_block(inode, fragment, &dummy, create); 523 - *err = error; 524 - if (!error && buffer_mapped(&dummy)) { 525 - struct buffer_head *bh; 526 - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 527 - if (buffer_new(&dummy)) { 528 - memset(bh->b_data, 0, inode->i_sb->s_blocksize); 529 - set_buffer_uptodate(bh); 530 - mark_buffer_dirty(bh); 531 - } 532 - return bh; 533 - } 534 - return NULL; 535 - } 536 - 537 - struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment, 538 - int create, int * err) 539 - { 540 - struct buffer_head * bh; 541 - 542 - UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment); 543 - bh = ufs_getfrag (inode, fragment, create, err); 544 - if (!bh || buffer_uptodate(bh)) 545 - return bh; 546 - ll_rw_block (READ, 1, &bh); 547 - wait_on_buffer (bh); 548 - if (buffer_uptodate(bh)) 549 - return bh; 550 - brelse (bh); 551 - *err = -EIO; 552 - return NULL; 553 509 } 554 510 555 511 static int ufs_writepage(struct page *page, struct writeback_control *wbc) ··· 864 900 int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 865 901 { 866 902 int ret; 867 - lock_kernel(); 903 + lock_ufs(inode->i_sb); 868 904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 
869 - unlock_kernel(); 905 + unlock_ufs(inode->i_sb); 870 906 return ret; 871 907 } 872 908 ··· 886 922 if (want_delete) { 887 923 loff_t old_i_size; 888 924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 889 - lock_kernel(); 925 + lock_ufs(inode->i_sb); 890 926 mark_inode_dirty(inode); 891 927 ufs_update_inode(inode, IS_SYNC(inode)); 892 928 old_i_size = inode->i_size; 893 929 inode->i_size = 0; 894 930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 895 931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 896 - unlock_kernel(); 932 + unlock_ufs(inode->i_sb); 897 933 } 898 934 899 935 invalidate_inode_buffers(inode); 900 936 end_writeback(inode); 901 937 902 938 if (want_delete) { 903 - lock_kernel(); 939 + lock_ufs(inode->i_sb); 904 940 ufs_free_inode (inode); 905 - unlock_kernel(); 941 + unlock_ufs(inode->i_sb); 906 942 } 907 943 }
+17 -18
fs/ufs/namei.c
··· 29 29 30 30 #include <linux/time.h> 31 31 #include <linux/fs.h> 32 - #include <linux/smp_lock.h> 33 32 34 33 #include "ufs_fs.h" 35 34 #include "ufs.h" ··· 54 55 if (dentry->d_name.len > UFS_MAXNAMLEN) 55 56 return ERR_PTR(-ENAMETOOLONG); 56 57 57 - lock_kernel(); 58 + lock_ufs(dir->i_sb); 58 59 ino = ufs_inode_by_name(dir, &dentry->d_name); 59 60 if (ino) { 60 61 inode = ufs_iget(dir->i_sb, ino); 61 62 if (IS_ERR(inode)) { 62 - unlock_kernel(); 63 + unlock_ufs(dir->i_sb); 63 64 return ERR_CAST(inode); 64 65 } 65 66 } 66 - unlock_kernel(); 67 + unlock_ufs(dir->i_sb); 67 68 d_add(dentry, inode); 68 69 return NULL; 69 70 } ··· 92 93 inode->i_fop = &ufs_file_operations; 93 94 inode->i_mapping->a_ops = &ufs_aops; 94 95 mark_inode_dirty(inode); 95 - lock_kernel(); 96 + lock_ufs(dir->i_sb); 96 97 err = ufs_add_nondir(dentry, inode); 97 - unlock_kernel(); 98 + unlock_ufs(dir->i_sb); 98 99 } 99 100 UFSD("END: err=%d\n", err); 100 101 return err; ··· 114 115 init_special_inode(inode, mode, rdev); 115 116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 116 117 mark_inode_dirty(inode); 117 - lock_kernel(); 118 + lock_ufs(dir->i_sb); 118 119 err = ufs_add_nondir(dentry, inode); 119 - unlock_kernel(); 120 + unlock_ufs(dir->i_sb); 120 121 } 121 122 return err; 122 123 } ··· 132 133 if (l > sb->s_blocksize) 133 134 goto out_notlocked; 134 135 135 - lock_kernel(); 136 + lock_ufs(dir->i_sb); 136 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 138 err = PTR_ERR(inode); 138 139 if (IS_ERR(inode)) ··· 155 156 156 157 err = ufs_add_nondir(dentry, inode); 157 158 out: 158 - unlock_kernel(); 159 + unlock_ufs(dir->i_sb); 159 160 out_notlocked: 160 161 return err; 161 162 ··· 171 172 struct inode *inode = old_dentry->d_inode; 172 173 int error; 173 174 174 - lock_kernel(); 175 + lock_ufs(dir->i_sb); 175 176 if (inode->i_nlink >= UFS_LINK_MAX) { 176 - unlock_kernel(); 177 + unlock_ufs(dir->i_sb); 177 178 return -EMLINK; 178 179 } 179 180 ··· 182 183 ihold(inode); 183 184 
184 185 error = ufs_add_nondir(dentry, inode); 185 - unlock_kernel(); 186 + unlock_ufs(dir->i_sb); 186 187 return error; 187 188 } 188 189 ··· 194 195 if (dir->i_nlink >= UFS_LINK_MAX) 195 196 goto out; 196 197 197 - lock_kernel(); 198 + lock_ufs(dir->i_sb); 198 199 inode_inc_link_count(dir); 199 200 200 201 inode = ufs_new_inode(dir, S_IFDIR|mode); ··· 215 216 err = ufs_add_link(dentry, inode); 216 217 if (err) 217 218 goto out_fail; 218 - unlock_kernel(); 219 + unlock_ufs(dir->i_sb); 219 220 220 221 d_instantiate(dentry, inode); 221 222 out: ··· 227 228 iput (inode); 228 229 out_dir: 229 230 inode_dec_link_count(dir); 230 - unlock_kernel(); 231 + unlock_ufs(dir->i_sb); 231 232 goto out; 232 233 } 233 234 ··· 258 259 struct inode * inode = dentry->d_inode; 259 260 int err= -ENOTEMPTY; 260 261 261 - lock_kernel(); 262 + lock_ufs(dir->i_sb); 262 263 if (ufs_empty_dir (inode)) { 263 264 err = ufs_unlink(dir, dentry); 264 265 if (!err) { ··· 267 268 inode_dec_link_count(dir); 268 269 } 269 270 } 270 - unlock_kernel(); 271 + unlock_ufs(dir->i_sb); 271 272 return err; 272 273 } 273 274
+37 -27
fs/ufs/super.c
··· 84 84 #include <linux/blkdev.h> 85 85 #include <linux/init.h> 86 86 #include <linux/parser.h> 87 - #include <linux/smp_lock.h> 88 87 #include <linux/buffer_head.h> 89 88 #include <linux/vfs.h> 90 89 #include <linux/log2.h> ··· 94 95 #include "ufs.h" 95 96 #include "swab.h" 96 97 #include "util.h" 98 + 99 + void lock_ufs(struct super_block *sb) 100 + { 101 + #if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) 102 + struct ufs_sb_info *sbi = UFS_SB(sb); 103 + 104 + mutex_lock(&sbi->mutex); 105 + sbi->mutex_owner = current; 106 + #endif 107 + } 108 + 109 + void unlock_ufs(struct super_block *sb) 110 + { 111 + #if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) 112 + struct ufs_sb_info *sbi = UFS_SB(sb); 113 + 114 + sbi->mutex_owner = NULL; 115 + mutex_unlock(&sbi->mutex); 116 + #endif 117 + } 97 118 98 119 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 99 120 { ··· 332 313 struct ufs_super_block_first * usb1; 333 314 va_list args; 334 315 335 - lock_kernel(); 336 316 uspi = UFS_SB(sb)->s_uspi; 337 317 usb1 = ubh_get_usb_first(uspi); 338 318 ··· 539 521 */ 540 522 size = uspi->s_cssize; 541 523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 542 - base = space = kmalloc(size, GFP_KERNEL); 524 + base = space = kmalloc(size, GFP_NOFS); 543 525 if (!base) 544 526 goto failed; 545 527 sbi->s_csp = (struct ufs_csum *)space; ··· 564 546 * Read cylinder group (we read only first fragment from block 565 547 * at this time) and prepare internal data structures for cg caching. 
566 548 */ 567 - if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) 549 + if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) 568 550 goto failed; 569 551 for (i = 0; i < uspi->s_ncg; i++) 570 552 sbi->s_ucg[i] = NULL; ··· 582 564 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 583 565 } 584 566 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 585 - if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 567 + if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) 586 568 goto failed; 587 569 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 588 570 } ··· 664 646 665 647 UFSD("ENTER\n"); 666 648 667 - lock_kernel(); 668 - 669 649 ufs_put_cstotal(sb); 670 650 size = uspi->s_cssize; 671 651 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; ··· 692 676 kfree (sbi->s_ucg); 693 677 kfree (base); 694 678 695 - unlock_kernel(); 696 - 697 679 UFSD("EXIT\n"); 698 680 } 699 681 ··· 709 695 unsigned super_block_offset; 710 696 unsigned maxsymlen; 711 697 int ret = -EINVAL; 712 - 713 - lock_kernel(); 714 698 715 699 uspi = NULL; 716 700 ubh = NULL; ··· 730 718 goto failed; 731 719 } 732 720 #endif 721 + mutex_init(&sbi->mutex); 733 722 /* 734 723 * Set default mount options 735 724 * Parse mount options ··· 1178 1165 goto failed; 1179 1166 1180 1167 UFSD("EXIT\n"); 1181 - unlock_kernel(); 1182 1168 return 0; 1183 1169 1184 1170 dalloc_failed: ··· 1189 1177 kfree(sbi); 1190 1178 sb->s_fs_info = NULL; 1191 1179 UFSD("EXIT (FAILED)\n"); 1192 - unlock_kernel(); 1193 1180 return ret; 1194 1181 1195 1182 failed_nomem: 1196 1183 UFSD("EXIT (NOMEM)\n"); 1197 - unlock_kernel(); 1198 1184 return -ENOMEM; 1199 1185 } 1200 1186 ··· 1203 1193 struct ufs_super_block_third * usb3; 1204 1194 unsigned flags; 1205 1195 1196 + lock_ufs(sb); 1206 1197 lock_super(sb); 1207 - lock_kernel(); 1208 1198 1209 1199 UFSD("ENTER\n"); 1210 1200 ··· 1223 1213 sb->s_dirt = 0; 
1224 1214 1225 1215 UFSD("EXIT\n"); 1226 - unlock_kernel(); 1227 1216 unlock_super(sb); 1217 + unlock_ufs(sb); 1228 1218 1229 1219 return 0; 1230 1220 } ··· 1266 1256 unsigned new_mount_opt, ufstype; 1267 1257 unsigned flags; 1268 1258 1269 - lock_kernel(); 1259 + lock_ufs(sb); 1270 1260 lock_super(sb); 1271 1261 uspi = UFS_SB(sb)->s_uspi; 1272 1262 flags = UFS_SB(sb)->s_flags; ··· 1282 1272 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1283 1273 if (!ufs_parse_options (data, &new_mount_opt)) { 1284 1274 unlock_super(sb); 1285 - unlock_kernel(); 1275 + unlock_ufs(sb); 1286 1276 return -EINVAL; 1287 1277 } 1288 1278 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { ··· 1290 1280 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1291 1281 printk("ufstype can't be changed during remount\n"); 1292 1282 unlock_super(sb); 1293 - unlock_kernel(); 1283 + unlock_ufs(sb); 1294 1284 return -EINVAL; 1295 1285 } 1296 1286 1297 1287 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1298 1288 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1299 1289 unlock_super(sb); 1300 - unlock_kernel(); 1290 + unlock_ufs(sb); 1301 1291 return 0; 1302 1292 } 1303 1293 ··· 1323 1313 printk("ufs was compiled with read-only support, " 1324 1314 "can't be mounted as read-write\n"); 1325 1315 unlock_super(sb); 1326 - unlock_kernel(); 1316 + unlock_ufs(sb); 1327 1317 return -EINVAL; 1328 1318 #else 1329 1319 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && ··· 1333 1323 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1334 1324 printk("this ufstype is read-only supported\n"); 1335 1325 unlock_super(sb); 1336 - unlock_kernel(); 1326 + unlock_ufs(sb); 1337 1327 return -EINVAL; 1338 1328 } 1339 1329 if (!ufs_read_cylinder_structures(sb)) { 1340 1330 printk("failed during remounting\n"); 1341 1331 unlock_super(sb); 1342 - unlock_kernel(); 1332 + unlock_ufs(sb); 1343 1333 return -EPERM; 1344 1334 } 1345 1335 sb->s_flags &= ~MS_RDONLY; ··· 1347 1337 } 1348 1338 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1349 1339 
unlock_super(sb); 1350 - unlock_kernel(); 1340 + unlock_ufs(sb); 1351 1341 return 0; 1352 1342 } 1353 1343 ··· 1381 1371 struct ufs_super_block_third *usb3; 1382 1372 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1383 1373 1384 - lock_kernel(); 1374 + lock_ufs(sb); 1385 1375 1386 1376 usb1 = ubh_get_usb_first(uspi); 1387 1377 usb2 = ubh_get_usb_second(uspi); ··· 1405 1395 buf->f_fsid.val[0] = (u32)id; 1406 1396 buf->f_fsid.val[1] = (u32)(id >> 32); 1407 1397 1408 - unlock_kernel(); 1398 + unlock_ufs(sb); 1409 1399 1410 1400 return 0; 1411 1401 } ··· 1415 1405 static struct inode *ufs_alloc_inode(struct super_block *sb) 1416 1406 { 1417 1407 struct ufs_inode_info *ei; 1418 - ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); 1408 + ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); 1419 1409 if (!ei) 1420 1410 return NULL; 1421 1411 ei->vfs_inode.i_version = 1;
+2 -3
fs/ufs/truncate.c
··· 40 40 #include <linux/time.h> 41 41 #include <linux/stat.h> 42 42 #include <linux/string.h> 43 - #include <linux/smp_lock.h> 44 43 #include <linux/buffer_head.h> 45 44 #include <linux/blkdev.h> 46 45 #include <linux/sched.h> ··· 466 467 467 468 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 468 469 469 - lock_kernel(); 470 470 while (1) { 471 471 retry = ufs_trunc_direct(inode); 472 472 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, ··· 485 487 486 488 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 487 489 ufsi->i_lastfrag = DIRECT_FRAGMENT; 488 - unlock_kernel(); 489 490 mark_inode_dirty(inode); 490 491 out: 491 492 UFSD("EXIT: err %d\n", err); ··· 507 510 /* XXX(truncate): truncate_setsize should be called last */ 508 511 truncate_setsize(inode, attr->ia_size); 509 512 513 + lock_ufs(inode->i_sb); 510 514 error = ufs_truncate(inode, old_i_size); 515 + unlock_ufs(inode->i_sb); 511 516 if (error) 512 517 return error; 513 518 }
+5 -1
fs/ufs/ufs.h
··· 18 18 unsigned s_cgno[UFS_MAX_GROUP_LOADED]; 19 19 unsigned short s_cg_loaded; 20 20 unsigned s_mount_opt; 21 + struct mutex mutex; 22 + struct task_struct *mutex_owner; 21 23 }; 22 24 23 25 struct ufs_inode_info { ··· 111 109 extern int ufs_write_inode (struct inode *, struct writeback_control *); 112 110 extern int ufs_sync_inode (struct inode *); 113 111 extern void ufs_evict_inode (struct inode *); 114 - extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 115 112 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 116 113 117 114 /* namei.c */ ··· 154 153 { 155 154 return do_div(b, uspi->s_fpg); 156 155 } 156 + 157 + extern void lock_ufs(struct super_block *sb); 158 + extern void unlock_ufs(struct super_block *sb); 157 159 158 160 #endif /* _UFS_UFS_H */
+1 -1
fs/ufs/util.c
··· 27 27 if (count > UFS_MAXFRAG) 28 28 return NULL; 29 29 ubh = (struct ufs_buffer_head *) 30 - kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); 30 + kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); 31 31 if (!ubh) 32 32 return NULL; 33 33 ubh->fragment = fragment;