ext4: Fix circular locking dependency with migrate and rm.

In order to prevent a circular locking dependency when an unlink
operation is racing with an ext4 migration, we delay taking i_data_sem
until just before switching the inode format, and use i_mutex to prevent
writes and truncates during the first part of the migration operation.
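
As an illustration only (not part of the patch itself), the resulting lock
ordering can be sketched roughly as below; the function names are the ones
used in fs/ext4/migrate.c, the bodies are heavily abridged, and the elided
steps are only summarized in comments:

int ext4_ext_migrate(struct inode *inode, struct file *filp,
                     unsigned int cmd, unsigned long arg)
{
        handle_t *handle;
        struct inode *tmp_inode;
        int retval;

        /* ... allocate tmp_inode with the extents flag set, put it on the orphan list ... */

        mutex_lock(&inode->i_mutex);    /* keeps writes and truncates out for the whole walk */
        handle = ext4_journal_start(inode, 1);
        /* ... walk the direct/indirect blocks, building the extent tree in tmp_inode ... */
        retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
        /* ... */
        ext4_journal_stop(handle);
        mutex_unlock(&inode->i_mutex);

        iput(tmp_inode);
        return retval;
}

static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
                                    struct inode *tmp_inode)
{
        __le32 i_data[3];
        int retval;

        /* ... save the inode's IND/DIND/TIND block numbers in i_data[] ... */

        down_write(&EXT4_I(inode)->i_data_sem); /* held only while switching the inode format */
        /* ... copy tmp_inode's extent i_data into inode, update i_blocks ... */
        up_write(&EXT4_I(inode)->i_data_sem);

        /* the old indirect meta-data blocks are freed only after i_data_sem is dropped */
        retval = free_ind_block(handle, inode, i_data);
        ext4_mark_inode_dirty(handle, inode);
        return retval;
}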

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>


+74 -43
fs/ext4/migrate.c
···
                retval = ext4_journal_restart(handle, needed);
                if (retval)
                        goto err_out;
-       }
-       if (needed) {
+       } else if (needed) {
                retval = ext4_journal_extend(handle, needed);
-               if (retval != 0) {
+               if (retval) {
                        /*
                         * IF not able to extend the journal restart the journal
                         */
···
 
 }
 
+static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
+{
+       int retval = 0, needed;
+
+       if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+               return 0;
+       /*
+        * We are freeing a blocks. During this we touch
+        * superblock, group descriptor and block bitmap.
+        * So allocate a credit of 3. We may update
+        * quota (user and group).
+        */
+       needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+
+       if (ext4_journal_extend(handle, needed) != 0)
+               retval = ext4_journal_restart(handle, needed);
+
+       return retval;
+}
+
 static int free_dind_blocks(handle_t *handle,
                                struct inode *inode, __le32 i_data)
 {
···
 
        tmp_idata = (__le32 *)bh->b_data;
        for (i = 0; i < max_entries; i++) {
-               if (tmp_idata[i])
+               if (tmp_idata[i]) {
+                       extend_credit_for_blkdel(handle, inode);
                        ext4_free_blocks(handle, inode,
                                        le32_to_cpu(tmp_idata[i]), 1, 1);
+               }
        }
        put_bh(bh);
+       extend_credit_for_blkdel(handle, inode);
        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
        return 0;
 }
···
                }
        }
        put_bh(bh);
+       extend_credit_for_blkdel(handle, inode);
        ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
        return 0;
 }
 
-static int free_ind_block(handle_t *handle, struct inode *inode)
+static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
 {
        int retval;
-       struct ext4_inode_info *ei = EXT4_I(inode);
 
-       if (ei->i_data[EXT4_IND_BLOCK])
+       /* ei->i_data[EXT4_IND_BLOCK] */
+       if (i_data[0]) {
+               extend_credit_for_blkdel(handle, inode);
                ext4_free_blocks(handle, inode,
-                       le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
+                               le32_to_cpu(i_data[0]), 1, 1);
+       }
 
-       if (ei->i_data[EXT4_DIND_BLOCK]) {
-               retval = free_dind_blocks(handle, inode,
-                               ei->i_data[EXT4_DIND_BLOCK]);
+       /* ei->i_data[EXT4_DIND_BLOCK] */
+       if (i_data[1]) {
+               retval = free_dind_blocks(handle, inode, i_data[1]);
                if (retval)
                        return retval;
        }
 
-       if (ei->i_data[EXT4_TIND_BLOCK]) {
-               retval = free_tind_blocks(handle, inode,
-                               ei->i_data[EXT4_TIND_BLOCK]);
+       /* ei->i_data[EXT4_TIND_BLOCK] */
+       if (i_data[2]) {
+               retval = free_tind_blocks(handle, inode, i_data[2]);
                if (retval)
                        return retval;
        }
···
 }
 
 static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
-                               struct inode *tmp_inode, int retval)
+                                               struct inode *tmp_inode)
 {
+       int retval;
+       __le32  i_data[3];
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
-
-       retval = free_ind_block(handle, inode);
-       if (retval)
-               goto err_out;
 
        /*
         * One credit accounted for writing the
···
                goto err_out;
        }
 
+       i_data[0] = ei->i_data[EXT4_IND_BLOCK];
+       i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
+       i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
+
+       down_write(&EXT4_I(inode)->i_data_sem);
        /*
         * We have the extent map build with the tmp inode.
         * Now copy the i_data across
···
        spin_lock(&inode->i_lock);
        inode->i_blocks += tmp_inode->i_blocks;
        spin_unlock(&inode->i_lock);
+       up_write(&EXT4_I(inode)->i_data_sem);
 
+       /*
+        * We mark the inode dirty after, because we decrement the
+        * i_blocks when freeing the indirect meta-data blocks
+        */
+       retval = free_ind_block(handle, inode, i_data);
        ext4_mark_inode_dirty(handle, inode);
+
 err_out:
        return retval;
 }
···
                }
        }
        put_bh(bh);
+       extend_credit_for_blkdel(handle, inode);
        ext4_free_blocks(handle, inode, block, 1, 1);
        return retval;
 }
···
                 */
                return retval;
 
-       down_write(&EXT4_I(inode)->i_data_sem);
        handle = ext4_journal_start(inode,
                                        EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
···
        ext4_orphan_add(handle, tmp_inode);
        ext4_journal_stop(handle);
 
-       ei = EXT4_I(inode);
-       i_data = ei->i_data;
-       memset(&lb, 0, sizeof(lb));
-
-       /* 32 bit block address 4 bytes */
-       max_entries = inode->i_sb->s_blocksize >> 2;
-
        /*
         * start with one credit accounted for
         * superblock modification.
···
         * trascation that created the inode. Later as and
         * when we add extents we extent the journal
         */
+       /*
+        * inode_mutex prevent write and truncate on the file. Read still goes
+        * through. We take i_data_sem in ext4_ext_swap_inode_data before we
+        * switch the inode format to prevent read.
+        */
+       mutex_lock(&(inode->i_mutex));
        handle = ext4_journal_start(inode, 1);
+
+       ei = EXT4_I(inode);
+       i_data = ei->i_data;
+       memset(&lb, 0, sizeof(lb));
+
+       /* 32 bit block address 4 bytes */
+       max_entries = inode->i_sb->s_blocksize >> 2;
        for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
                if (i_data[i]) {
                        retval = update_extent_range(handle, tmp_inode,
···
         */
        retval = finish_range(handle, tmp_inode, &lb);
 err_out:
-       /*
-        * We are either freeing extent information or indirect
-        * blocks. During this we touch superblock, group descriptor
-        * and block bitmap. Later we mark the tmp_inode dirty
-        * via ext4_ext_tree_init. So allocate a credit of 4
-        * We may update quota (user and group).
-        *
-        * FIXME!! we may be touching bitmaps in different block groups.
-        */
-       if (ext4_journal_extend(handle,
-                       4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
-               ext4_journal_restart(handle,
-                               4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
        if (retval)
                /*
                 * Failure case delete the extent information with the
···
                free_ext_block(handle, tmp_inode);
        else
                retval = ext4_ext_swap_inode_data(handle, inode,
-                                                       tmp_inode, retval);
+                                                       tmp_inode);
+
+       /* We mark the tmp_inode dirty via ext4_ext_tree_init. */
+       if (ext4_journal_extend(handle, 1) != 0)
+               ext4_journal_restart(handle, 1);
 
        /*
         * Mark the tmp_inode as of size zero
···
        tmp_inode->i_nlink = 0;
 
        ext4_journal_stop(handle);
-
-       up_write(&EXT4_I(inode)->i_data_sem);
+       mutex_unlock(&(inode->i_mutex));
 
        if (tmp_inode)
                iput(tmp_inode);