Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ufs: don't use lock_ufs() for block pointers tree protection

* stores to block pointers are under per-inode seqlock (meta_lock) and
mutex (truncate_mutex)
* fetches of block pointers are either under truncate_mutex, or wrapped
into seqretry loop on meta_lock
* all changes of ->i_size are under truncate_mutex and i_mutex
* all changes of ->i_lastfrag are under truncate_mutex

It's similar to what ext2 is doing; the main difference is that unlike
ext2 we can't rely upon the atomicity of stores into block pointers -
on UFS2 they are 64bit. So we can't cut the corner when switching
a pointer from NULL to non-NULL as we could in ext2_splice_branch()
and need to use meta_lock on all modifications.

We use seqlock where ext2 uses rwlock; ext2 could probably also benefit
from such change...

Another non-trivial difference is that with UFS we *cannot* have a reader
grab truncate_mutex in case of race - it has to keep retrying. That
might be possible to change, but not until we lift tail unpacking
several levels up in the call chain.

After that commit we do *NOT* hold fs-wide serialization on accesses
to block pointers anymore. Moreover, lock_ufs() can become a normal
mutex now - it's only used on statfs, remount and sync_fs and none
of those uses are recursive. As a matter of fact, *now* it can be
collapsed with ->s_lock, and be eventually replaced with saner
per-cylinder-group spinlocks, but that's a separate story.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Al Viro 724bb09f 4af7b2c0

+121 -47
+4
fs/ufs/balloc.c
··· 417 417 if (oldcount == 0) { 418 418 result = ufs_alloc_fragments (inode, cgno, goal, count, err); 419 419 if (result) { 420 + write_seqlock(&UFS_I(inode)->meta_lock); 420 421 ufs_cpu_to_data_ptr(sb, p, result); 422 + write_sequnlock(&UFS_I(inode)->meta_lock); 421 423 *err = 0; 422 424 UFS_I(inode)->i_lastfrag = 423 425 max(UFS_I(inode)->i_lastfrag, fragment + count); ··· 475 473 ufs_change_blocknr(inode, fragment - oldcount, oldcount, 476 474 uspi->s_sbbase + tmp, 477 475 uspi->s_sbbase + result, locked_page); 476 + write_seqlock(&UFS_I(inode)->meta_lock); 478 477 ufs_cpu_to_data_ptr(sb, p, result); 478 + write_sequnlock(&UFS_I(inode)->meta_lock); 479 479 *err = 0; 480 480 UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, 481 481 fragment + count);
+93 -45
fs/ufs/inode.c
··· 41 41 #include "swab.h" 42 42 #include "util.h" 43 43 44 - static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); 45 - 46 44 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47 45 { 48 46 struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; ··· 73 75 return n; 74 76 } 75 77 78 + typedef struct { 79 + void *p; 80 + union { 81 + __fs32 key32; 82 + __fs64 key64; 83 + }; 84 + struct buffer_head *bh; 85 + } Indirect; 86 + 87 + static inline int grow_chain32(struct ufs_inode_info *ufsi, 88 + struct buffer_head *bh, __fs32 *v, 89 + Indirect *from, Indirect *to) 90 + { 91 + Indirect *p; 92 + unsigned seq; 93 + to->bh = bh; 94 + do { 95 + seq = read_seqbegin(&ufsi->meta_lock); 96 + to->key32 = *(__fs32 *)(to->p = v); 97 + for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++) 98 + ; 99 + } while (read_seqretry(&ufsi->meta_lock, seq)); 100 + return (p > to); 101 + } 102 + 103 + static inline int grow_chain64(struct ufs_inode_info *ufsi, 104 + struct buffer_head *bh, __fs64 *v, 105 + Indirect *from, Indirect *to) 106 + { 107 + Indirect *p; 108 + unsigned seq; 109 + to->bh = bh; 110 + do { 111 + seq = read_seqbegin(&ufsi->meta_lock); 112 + to->key64 = *(__fs64 *)(to->p = v); 113 + for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++) 114 + ; 115 + } while (read_seqretry(&ufsi->meta_lock, seq)); 116 + return (p > to); 117 + } 118 + 76 119 /* 77 120 * Returns the location of the fragment from 78 121 * the beginning of the filesystem. 
79 122 */ 80 123 81 - static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) 124 + static u64 ufs_frag_map(struct inode *inode, sector_t frag) 82 125 { 83 126 struct ufs_inode_info *ufsi = UFS_I(inode); 84 127 struct super_block *sb = inode->i_sb; ··· 127 88 u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; 128 89 int shift = uspi->s_apbshift-uspi->s_fpbshift; 129 90 sector_t offsets[4], *p; 91 + Indirect chain[4], *q = chain; 130 92 int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); 131 - u64 ret = 0L; 132 - __fs32 block; 133 - __fs64 u2_block = 0L; 134 93 unsigned flags = UFS_SB(sb)->s_flags; 135 - u64 temp = 0L; 94 + u64 res = 0; 136 95 137 96 UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); 138 97 UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", ··· 138 101 (unsigned long long)mask); 139 102 140 103 if (depth == 0) 141 - return 0; 104 + goto no_block; 142 105 106 + again: 143 107 p = offsets; 144 108 145 - if (needs_lock) 146 - lock_ufs(sb); 147 109 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 148 110 goto ufs2; 149 111 150 - block = ufsi->i_u1.i_data[*p++]; 151 - if (!block) 152 - goto out; 112 + if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q)) 113 + goto changed; 114 + if (!q->key32) 115 + goto no_block; 153 116 while (--depth) { 117 + __fs32 *ptr; 154 118 struct buffer_head *bh; 155 119 sector_t n = *p++; 156 120 157 - bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); 121 + bh = sb_bread(sb, uspi->s_sbbase + 122 + fs32_to_cpu(sb, q->key32) + (n>>shift)); 158 123 if (!bh) 159 - goto out; 160 - block = ((__fs32 *) bh->b_data)[n & mask]; 161 - brelse (bh); 162 - if (!block) 163 - goto out; 124 + goto no_block; 125 + ptr = (__fs32 *)bh->b_data + (n & mask); 126 + if (!grow_chain32(ufsi, bh, ptr, chain, ++q)) 127 + goto changed; 128 + if (!q->key32) 129 + goto no_block; 164 130 } 165 - ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + 
(frag & uspi->s_fpbmask)); 166 - goto out; 131 + res = fs32_to_cpu(sb, q->key32); 132 + goto found; 133 + 167 134 ufs2: 168 - u2_block = ufsi->i_u1.u2_i_data[*p++]; 169 - if (!u2_block) 170 - goto out; 171 - 135 + if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q)) 136 + goto changed; 137 + if (!q->key64) 138 + goto no_block; 172 139 173 140 while (--depth) { 141 + __fs64 *ptr; 174 142 struct buffer_head *bh; 175 143 sector_t n = *p++; 176 144 177 - 178 - temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); 179 - bh = sb_bread(sb, temp +(u64) (n>>shift)); 145 + bh = sb_bread(sb, uspi->s_sbbase + 146 + fs64_to_cpu(sb, q->key64) + (n>>shift)); 180 147 if (!bh) 181 - goto out; 182 - u2_block = ((__fs64 *)bh->b_data)[n & mask]; 183 - brelse(bh); 184 - if (!u2_block) 185 - goto out; 148 + goto no_block; 149 + ptr = (__fs64 *)bh->b_data + (n & mask); 150 + if (!grow_chain64(ufsi, bh, ptr, chain, ++q)) 151 + goto changed; 152 + if (!q->key64) 153 + goto no_block; 186 154 } 187 - temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); 188 - ret = temp + (u64) (frag & uspi->s_fpbmask); 155 + res = fs64_to_cpu(sb, q->key64); 156 + found: 157 + res += uspi->s_sbbase + (frag & uspi->s_fpbmask); 158 + no_block: 159 + while (q > chain) { 160 + brelse(q->bh); 161 + q--; 162 + } 163 + return res; 189 164 190 - out: 191 - if (needs_lock) 192 - unlock_ufs(sb); 193 - return ret; 165 + changed: 166 + while (q > chain) { 167 + brelse(q->bh); 168 + q--; 169 + } 170 + goto again; 194 171 } 195 172 196 173 /** ··· 472 421 int ret, err, new; 473 422 unsigned long ptr,phys; 474 423 u64 phys64 = 0; 475 - bool needs_lock = (sbi->mutex_owner != current); 476 424 477 425 if (!create) { 478 - phys64 = ufs_frag_map(inode, fragment, needs_lock); 426 + phys64 = ufs_frag_map(inode, fragment); 479 427 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 480 428 if (phys64) 481 429 map_bh(bh_result, sb, phys64); ··· 488 438 ret = 0; 489 439 bh = NULL; 490 440 491 - if 
(needs_lock) 492 - lock_ufs(sb); 441 + mutex_lock(&UFS_I(inode)->truncate_mutex); 493 442 494 443 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 495 444 if (fragment > ··· 550 501 set_buffer_new(bh_result); 551 502 map_bh(bh_result, sb, phys); 552 503 abort: 553 - if (needs_lock) 554 - unlock_ufs(sb); 504 + mutex_unlock(&UFS_I(inode)->truncate_mutex); 555 505 556 506 return err; 557 507
+2
fs/ufs/super.c
··· 1429 1429 return NULL; 1430 1430 1431 1431 ei->vfs_inode.i_version = 1; 1432 + seqlock_init(&ei->meta_lock); 1433 + mutex_init(&ei->truncate_mutex); 1432 1434 return &ei->vfs_inode; 1433 1435 } 1434 1436
+20 -2
fs/ufs/truncate.c
··· 128 128 tmp = ufs_data_ptr_to_cpu(sb, p); 129 129 if (!tmp) 130 130 continue; 131 + write_seqlock(&ufsi->meta_lock); 131 132 ufs_data_ptr_clear(uspi, p); 133 + write_sequnlock(&ufsi->meta_lock); 132 134 133 135 if (free_count == 0) { 134 136 frag_to_free = tmp; ··· 159 157 if (!tmp ) 160 158 ufs_panic(sb, "ufs_truncate_direct", "internal error"); 161 159 frag4 = ufs_fragnum (frag4); 160 + write_seqlock(&ufsi->meta_lock); 162 161 ufs_data_ptr_clear(uspi, p); 162 + write_sequnlock(&ufsi->meta_lock); 163 163 164 164 ufs_free_fragments (inode, tmp, frag4); 165 165 mark_inode_dirty(inode); ··· 203 199 return 1; 204 200 } 205 201 if (!ind_ubh) { 202 + write_seqlock(&UFS_I(inode)->meta_lock); 206 203 ufs_data_ptr_clear(uspi, p); 204 + write_sequnlock(&UFS_I(inode)->meta_lock); 207 205 return 0; 208 206 } 209 207 ··· 216 210 if (!tmp) 217 211 continue; 218 212 213 + write_seqlock(&UFS_I(inode)->meta_lock); 219 214 ufs_data_ptr_clear(uspi, ind); 215 + write_sequnlock(&UFS_I(inode)->meta_lock); 220 216 ubh_mark_buffer_dirty(ind_ubh); 221 217 if (free_count == 0) { 222 218 frag_to_free = tmp; ··· 243 235 break; 244 236 if (i >= uspi->s_apb) { 245 237 tmp = ufs_data_ptr_to_cpu(sb, p); 238 + write_seqlock(&UFS_I(inode)->meta_lock); 246 239 ufs_data_ptr_clear(uspi, p); 240 + write_sequnlock(&UFS_I(inode)->meta_lock); 247 241 248 242 ubh_bforget(ind_ubh); 249 243 ufs_free_blocks (inode, tmp, uspi->s_fpb); ··· 288 278 return 1; 289 279 } 290 280 if (!dind_bh) { 281 + write_seqlock(&UFS_I(inode)->meta_lock); 291 282 ufs_data_ptr_clear(uspi, p); 283 + write_sequnlock(&UFS_I(inode)->meta_lock); 292 284 return 0; 293 285 } 294 286 ··· 309 297 break; 310 298 if (i >= uspi->s_apb) { 311 299 tmp = ufs_data_ptr_to_cpu(sb, p); 300 + write_seqlock(&UFS_I(inode)->meta_lock); 312 301 ufs_data_ptr_clear(uspi, p); 302 + write_sequnlock(&UFS_I(inode)->meta_lock); 313 303 314 304 ubh_bforget(dind_bh); 315 305 ufs_free_blocks(inode, tmp, uspi->s_fpb); ··· 353 339 return 1; 354 340 } 355 341 if 
(!tind_bh) { 342 + write_seqlock(&ufsi->meta_lock); 356 343 ufs_data_ptr_clear(uspi, p); 344 + write_sequnlock(&ufsi->meta_lock); 357 345 return 0; 358 346 } 359 347 ··· 371 355 break; 372 356 if (i >= uspi->s_apb) { 373 357 tmp = ufs_data_ptr_to_cpu(sb, p); 358 + write_seqlock(&ufsi->meta_lock); 374 359 ufs_data_ptr_clear(uspi, p); 360 + write_sequnlock(&ufsi->meta_lock); 375 361 376 362 ubh_bforget(tind_bh); 377 363 ufs_free_blocks(inode, tmp, uspi->s_fpb); ··· 465 447 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 466 448 int retry; 467 449 468 - lock_ufs(sb); 450 + mutex_lock(&ufsi->truncate_mutex); 469 451 while (1) { 470 452 retry = ufs_trunc_direct(inode); 471 453 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, ··· 483 465 } 484 466 485 467 ufsi->i_lastfrag = DIRECT_FRAGMENT; 486 - unlock_ufs(sb); 468 + mutex_unlock(&ufsi->truncate_mutex); 487 469 } 488 470 489 471 int ufs_truncate(struct inode *inode, loff_t size)
+2
fs/ufs/ufs.h
··· 46 46 __u32 i_oeftflag; 47 47 __u16 i_osync; 48 48 __u64 i_lastfrag; 49 + seqlock_t meta_lock; 50 + struct mutex truncate_mutex; 49 51 __u32 i_dir_start_lookup; 50 52 struct inode vfs_inode; 51 53 };