ext4: Fix circular locking dependency with migrate and rm.

In order to prevent a circular locking dependency when an unlink
operation is racing with an ext4 migration, we delay taking i_data_sem
until just before switching the inode format, and use i_mutex to prevent
writes and truncates during the first part of the migration operation.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Aneesh Kumar K.V and committed by Theodore Ts'o 8009f9fb 0040d987

+74 -43
+74 -43
fs/ext4/migrate.c
··· 61 61 retval = ext4_journal_restart(handle, needed); 62 62 if (retval) 63 63 goto err_out; 64 - } 65 - if (needed) { 64 + } else if (needed) { 66 65 retval = ext4_journal_extend(handle, needed); 67 - if (retval != 0) { 66 + if (retval) { 68 67 /* 69 68 * IF not able to extend the journal restart the journal 70 69 */ ··· 219 220 220 221 } 221 222 223 + static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) 224 + { 225 + int retval = 0, needed; 226 + 227 + if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 228 + return 0; 229 + /* 230 + * We are freeing a blocks. During this we touch 231 + * superblock, group descriptor and block bitmap. 232 + * So allocate a credit of 3. We may update 233 + * quota (user and group). 234 + */ 235 + needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 236 + 237 + if (ext4_journal_extend(handle, needed) != 0) 238 + retval = ext4_journal_restart(handle, needed); 239 + 240 + return retval; 241 + } 242 + 222 243 static int free_dind_blocks(handle_t *handle, 223 244 struct inode *inode, __le32 i_data) 224 245 { ··· 253 234 254 235 tmp_idata = (__le32 *)bh->b_data; 255 236 for (i = 0; i < max_entries; i++) { 256 - if (tmp_idata[i]) 237 + if (tmp_idata[i]) { 238 + extend_credit_for_blkdel(handle, inode); 257 239 ext4_free_blocks(handle, inode, 258 240 le32_to_cpu(tmp_idata[i]), 1, 1); 241 + } 259 242 } 260 243 put_bh(bh); 244 + extend_credit_for_blkdel(handle, inode); 261 245 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 262 246 return 0; 263 247 } ··· 289 267 } 290 268 } 291 269 put_bh(bh); 270 + extend_credit_for_blkdel(handle, inode); 292 271 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 293 272 return 0; 294 273 } 295 274 296 - static int free_ind_block(handle_t *handle, struct inode *inode) 275 + static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) 297 276 { 298 277 int retval; 299 - struct ext4_inode_info *ei = EXT4_I(inode); 300 278 301 - if 
(ei->i_data[EXT4_IND_BLOCK]) 279 + /* ei->i_data[EXT4_IND_BLOCK] */ 280 + if (i_data[0]) { 281 + extend_credit_for_blkdel(handle, inode); 302 282 ext4_free_blocks(handle, inode, 303 - le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1); 283 + le32_to_cpu(i_data[0]), 1, 1); 284 + } 304 285 305 - if (ei->i_data[EXT4_DIND_BLOCK]) { 306 - retval = free_dind_blocks(handle, inode, 307 - ei->i_data[EXT4_DIND_BLOCK]); 286 + /* ei->i_data[EXT4_DIND_BLOCK] */ 287 + if (i_data[1]) { 288 + retval = free_dind_blocks(handle, inode, i_data[1]); 308 289 if (retval) 309 290 return retval; 310 291 } 311 292 312 - if (ei->i_data[EXT4_TIND_BLOCK]) { 313 - retval = free_tind_blocks(handle, inode, 314 - ei->i_data[EXT4_TIND_BLOCK]); 293 + /* ei->i_data[EXT4_TIND_BLOCK] */ 294 + if (i_data[2]) { 295 + retval = free_tind_blocks(handle, inode, i_data[2]); 315 296 if (retval) 316 297 return retval; 317 298 } ··· 322 297 } 323 298 324 299 static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, 325 - struct inode *tmp_inode, int retval) 300 + struct inode *tmp_inode) 326 301 { 302 + int retval; 303 + __le32 i_data[3]; 327 304 struct ext4_inode_info *ei = EXT4_I(inode); 328 305 struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); 329 - 330 - retval = free_ind_block(handle, inode); 331 - if (retval) 332 - goto err_out; 333 306 334 307 /* 335 308 * One credit accounted for writing the ··· 340 317 goto err_out; 341 318 } 342 319 320 + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; 321 + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; 322 + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; 323 + 324 + down_write(&EXT4_I(inode)->i_data_sem); 343 325 /* 344 326 * We have the extent map build with the tmp inode. 
345 327 * Now copy the i_data across ··· 364 336 spin_lock(&inode->i_lock); 365 337 inode->i_blocks += tmp_inode->i_blocks; 366 338 spin_unlock(&inode->i_lock); 339 + up_write(&EXT4_I(inode)->i_data_sem); 367 340 341 + /* 342 + * We mark the inode dirty after, because we decrement the 343 + * i_blocks when freeing the indirect meta-data blocks 344 + */ 345 + retval = free_ind_block(handle, inode, i_data); 368 346 ext4_mark_inode_dirty(handle, inode); 347 + 369 348 err_out: 370 349 return retval; 371 350 } ··· 400 365 } 401 366 } 402 367 put_bh(bh); 368 + extend_credit_for_blkdel(handle, inode); 403 369 ext4_free_blocks(handle, inode, block, 1, 1); 404 370 return retval; 405 371 } ··· 456 420 */ 457 421 return retval; 458 422 459 - down_write(&EXT4_I(inode)->i_data_sem); 460 423 handle = ext4_journal_start(inode, 461 424 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 462 425 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + ··· 489 454 ext4_orphan_add(handle, tmp_inode); 490 455 ext4_journal_stop(handle); 491 456 492 - ei = EXT4_I(inode); 493 - i_data = ei->i_data; 494 - memset(&lb, 0, sizeof(lb)); 495 - 496 - /* 32 bit block address 4 bytes */ 497 - max_entries = inode->i_sb->s_blocksize >> 2; 498 - 499 457 /* 500 458 * start with one credit accounted for 501 459 * superblock modification. ··· 497 469 * trascation that created the inode. Later as and 498 470 * when we add extents we extent the journal 499 471 */ 472 + /* 473 + * inode_mutex prevent write and truncate on the file. Read still goes 474 + * through. We take i_data_sem in ext4_ext_swap_inode_data before we 475 + * switch the inode format to prevent read. 
476 + */ 477 + mutex_lock(&(inode->i_mutex)); 500 478 handle = ext4_journal_start(inode, 1); 479 + 480 + ei = EXT4_I(inode); 481 + i_data = ei->i_data; 482 + memset(&lb, 0, sizeof(lb)); 483 + 484 + /* 32 bit block address 4 bytes */ 485 + max_entries = inode->i_sb->s_blocksize >> 2; 501 486 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { 502 487 if (i_data[i]) { 503 488 retval = update_extent_range(handle, tmp_inode, ··· 548 507 */ 549 508 retval = finish_range(handle, tmp_inode, &lb); 550 509 err_out: 551 - /* 552 - * We are either freeing extent information or indirect 553 - * blocks. During this we touch superblock, group descriptor 554 - * and block bitmap. Later we mark the tmp_inode dirty 555 - * via ext4_ext_tree_init. So allocate a credit of 4 556 - * We may update quota (user and group). 557 - * 558 - * FIXME!! we may be touching bitmaps in different block groups. 559 - */ 560 - if (ext4_journal_extend(handle, 561 - 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0) 562 - ext4_journal_restart(handle, 563 - 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 564 510 if (retval) 565 511 /* 566 512 * Failure case delete the extent information with the ··· 556 528 free_ext_block(handle, tmp_inode); 557 529 else 558 530 retval = ext4_ext_swap_inode_data(handle, inode, 559 - tmp_inode, retval); 531 + tmp_inode); 532 + 533 + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ 534 + if (ext4_journal_extend(handle, 1) != 0) 535 + ext4_journal_restart(handle, 1); 560 536 561 537 /* 562 538 * Mark the tmp_inode as of size zero ··· 588 556 tmp_inode->i_nlink = 0; 589 557 590 558 ext4_journal_stop(handle); 591 - 592 - up_write(&EXT4_I(inode)->i_data_sem); 559 + mutex_unlock(&(inode->i_mutex)); 593 560 594 561 if (tmp_inode) 595 562 iput(tmp_inode);