Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Log finsert/fcollapse operations

Now that we have the logged operations btree, we can make
finsert/fcollapse atomic w.r.t. unclean shutdown as well.

This adds bch_logged_op_finsert to represent the state of an finsert or
fcollapse, which is a bit more complicated than truncate since we need
to track our position in the "shift extents" operation.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+194 -137
+21 -2
fs/bcachefs/bcachefs_format.h
··· 371 371 x(inode_v3, 29) \ 372 372 x(bucket_gens, 30) \ 373 373 x(snapshot_tree, 31) \ 374 - x(logged_op_truncate, 32) 374 + x(logged_op_truncate, 32) \ 375 + x(logged_op_finsert, 33) 375 376 376 377 enum bch_bkey_type { 377 378 #define x(name, nr) KEY_TYPE_##name = nr, ··· 1193 1192 __le32 pad; 1194 1193 __le64 inum; 1195 1194 __le64 new_i_size; 1195 + }; 1196 + 1197 + enum logged_op_finsert_state { 1198 + LOGGED_OP_FINSERT_start, 1199 + LOGGED_OP_FINSERT_shift_extents, 1200 + LOGGED_OP_FINSERT_finish, 1201 + }; 1202 + 1203 + struct bch_logged_op_finsert { 1204 + struct bch_val v; 1205 + __u8 state; 1206 + __u8 pad[3]; 1207 + __le32 subvol; 1208 + __le64 inum; 1209 + __le64 dst_offset; 1210 + __le64 src_offset; 1211 + __le64 pos; 1196 1212 }; 1197 1213 1198 1214 /* Optional/variable size superblock sections: */ ··· 2280 2262 x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ 2281 2263 BIT_ULL(KEY_TYPE_set)) \ 2282 2264 x(logged_ops, 17, 0, \ 2283 - BIT_ULL(KEY_TYPE_logged_op_truncate)) 2265 + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ 2266 + BIT_ULL(KEY_TYPE_logged_op_finsert)) 2284 2267 2285 2268 enum btree_id { 2286 2269 #define x(name, nr, ...) BTREE_ID_##name = nr,
+161 -134
fs/bcachefs/io_misc.c
··· 293 293 __bch2_resume_logged_op_truncate(&trans, &op.k_i, i_sectors_delta)); 294 294 } 295 295 296 + /* finsert/fcollapse: */ 297 + 298 + void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 299 + { 300 + struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); 301 + 302 + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); 303 + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); 304 + prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); 305 + prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); 306 + } 307 + 296 308 static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) 297 309 { 298 310 struct btree_iter iter; ··· 339 327 return ret; 340 328 } 341 329 330 + static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, 331 + struct bkey_i *op_k, 332 + u64 *i_sectors_delta) 333 + { 334 + struct bch_fs *c = trans->c; 335 + struct btree_iter iter; 336 + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); 337 + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; 338 + u64 dst_offset = le64_to_cpu(op->v.dst_offset); 339 + u64 src_offset = le64_to_cpu(op->v.src_offset); 340 + s64 shift = dst_offset - src_offset; 341 + u64 len = abs(shift); 342 + u64 pos = le64_to_cpu(op->v.pos); 343 + bool insert = shift > 0; 344 + int ret = 0; 345 + 346 + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 347 + POS(inum.inum, 0), 348 + BTREE_ITER_INTENT); 349 + 350 + switch (op->v.state) { 351 + case LOGGED_OP_FINSERT_start: 352 + op->v.state = LOGGED_OP_FINSERT_shift_extents; 353 + 354 + if (insert) { 355 + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, 356 + adjust_i_size(trans, inum, src_offset, len) ?: 357 + bch2_logged_op_update(trans, &op->k_i)); 358 + if (ret) 359 + goto err; 360 + } else { 361 + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); 362 + 363 + ret = 
bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); 364 + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 365 + goto err; 366 + 367 + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, 368 + bch2_logged_op_update(trans, &op->k_i)); 369 + } 370 + 371 + fallthrough; 372 + case LOGGED_OP_FINSERT_shift_extents: 373 + while (1) { 374 + struct disk_reservation disk_res = 375 + bch2_disk_reservation_init(c, 0); 376 + struct bkey_i delete, *copy; 377 + struct bkey_s_c k; 378 + struct bpos src_pos = POS(inum.inum, src_offset); 379 + u32 snapshot; 380 + 381 + bch2_trans_begin(trans); 382 + 383 + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 384 + if (ret) 385 + goto btree_err; 386 + 387 + bch2_btree_iter_set_snapshot(&iter, snapshot); 388 + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); 389 + 390 + k = insert 391 + ? bch2_btree_iter_peek_prev(&iter) 392 + : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); 393 + if ((ret = bkey_err(k))) 394 + goto btree_err; 395 + 396 + if (!k.k || 397 + k.k->p.inode != inum.inum || 398 + bkey_le(k.k->p, POS(inum.inum, src_offset))) 399 + break; 400 + 401 + copy = bch2_bkey_make_mut_noupdate(trans, k); 402 + if ((ret = PTR_ERR_OR_ZERO(copy))) 403 + goto btree_err; 404 + 405 + if (insert && 406 + bkey_lt(bkey_start_pos(k.k), src_pos)) { 407 + bch2_cut_front(src_pos, copy); 408 + 409 + /* Splitting compressed extent? */ 410 + bch2_disk_reservation_add(c, &disk_res, 411 + copy->k.size * 412 + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), 413 + BCH_DISK_RESERVATION_NOFAIL); 414 + } 415 + 416 + bkey_init(&delete.k); 417 + delete.k.p = copy->k.p; 418 + delete.k.p.snapshot = snapshot; 419 + delete.k.size = copy->k.size; 420 + 421 + copy->k.p.offset += shift; 422 + copy->k.p.snapshot = snapshot; 423 + 424 + op->v.pos = cpu_to_le64(insert ? 
bkey_start_offset(&delete.k) : delete.k.p.offset); 425 + 426 + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: 427 + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: 428 + bch2_logged_op_update(trans, &op->k_i) ?: 429 + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); 430 + btree_err: 431 + bch2_disk_reservation_put(c, &disk_res); 432 + 433 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 434 + continue; 435 + if (ret) 436 + goto err; 437 + 438 + pos = le64_to_cpu(op->v.pos); 439 + } 440 + 441 + op->v.state = LOGGED_OP_FINSERT_finish; 442 + 443 + if (!insert) { 444 + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, 445 + adjust_i_size(trans, inum, src_offset, shift) ?: 446 + bch2_logged_op_update(trans, &op->k_i)); 447 + } else { 448 + /* We need an inode update to update bi_journal_seq for fsync: */ 449 + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, 450 + adjust_i_size(trans, inum, 0, 0) ?: 451 + bch2_logged_op_update(trans, &op->k_i)); 452 + } 453 + 454 + fallthrough; 455 + case LOGGED_OP_FINSERT_finish: 456 + break; 457 + } 458 + err: 459 + bch2_logged_op_finish(trans, op_k); 460 + bch2_trans_iter_exit(trans, &iter); 461 + return ret; 462 + } 463 + 464 + int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) 465 + { 466 + return __bch2_resume_logged_op_finsert(trans, op_k, NULL); 467 + } 468 + 342 469 int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, 343 470 u64 offset, u64 len, bool insert, 344 471 s64 *i_sectors_delta) 345 472 { 346 - struct bkey_buf copy; 347 - struct btree_trans trans; 348 - struct btree_iter src = { NULL }, dst = { NULL }, del = { NULL }; 473 + struct bkey_i_logged_op_finsert op; 349 474 s64 shift = insert ? 
len : -len; 350 - int ret = 0; 351 475 352 - bch2_bkey_buf_init(&copy); 353 - bch2_trans_init(&trans, c, 0, 1024); 476 + bkey_logged_op_finsert_init(&op.k_i); 477 + op.v.subvol = cpu_to_le32(inum.subvol); 478 + op.v.inum = cpu_to_le64(inum.inum); 479 + op.v.dst_offset = cpu_to_le64(offset + shift); 480 + op.v.src_offset = cpu_to_le64(offset); 481 + op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); 354 482 355 - bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, 356 - POS(inum.inum, U64_MAX), 357 - BTREE_ITER_INTENT); 358 - bch2_trans_copy_iter(&dst, &src); 359 - bch2_trans_copy_iter(&del, &src); 360 - 361 - if (insert) { 362 - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, 363 - adjust_i_size(&trans, inum, offset, len)); 364 - if (ret) 365 - goto err; 366 - } else { 367 - bch2_btree_iter_set_pos(&src, POS(inum.inum, offset)); 368 - 369 - ret = bch2_fpunch_at(&trans, &src, inum, offset + len, i_sectors_delta); 370 - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 371 - goto err; 372 - 373 - bch2_btree_iter_set_pos(&src, POS(inum.inum, offset + len)); 374 - } 375 - 376 - while (ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 377 - struct disk_reservation disk_res = 378 - bch2_disk_reservation_init(c, 0); 379 - struct bkey_i delete; 380 - struct bkey_s_c k; 381 - struct bpos next_pos; 382 - struct bpos move_pos = POS(inum.inum, offset); 383 - struct bpos atomic_end; 384 - unsigned trigger_flags = 0; 385 - u32 snapshot; 386 - 387 - bch2_trans_begin(&trans); 388 - 389 - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 390 - if (ret) 391 - continue; 392 - 393 - bch2_btree_iter_set_snapshot(&src, snapshot); 394 - bch2_btree_iter_set_snapshot(&dst, snapshot); 395 - bch2_btree_iter_set_snapshot(&del, snapshot); 396 - 397 - bch2_trans_begin(&trans); 398 - 399 - k = insert 400 - ? 
bch2_btree_iter_peek_prev(&src) 401 - : bch2_btree_iter_peek_upto(&src, POS(inum.inum, U64_MAX)); 402 - if ((ret = bkey_err(k))) 403 - continue; 404 - 405 - if (!k.k || k.k->p.inode != inum.inum) 406 - break; 407 - 408 - if (insert && 409 - bkey_le(k.k->p, POS(inum.inum, offset))) 410 - break; 411 - reassemble: 412 - bch2_bkey_buf_reassemble(&copy, c, k); 413 - 414 - if (insert && 415 - bkey_lt(bkey_start_pos(k.k), move_pos)) 416 - bch2_cut_front(move_pos, copy.k); 417 - 418 - copy.k->k.p.offset += shift; 419 - bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k)); 420 - 421 - ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); 422 - if (ret) 423 - continue; 424 - 425 - if (!bkey_eq(atomic_end, copy.k->k.p)) { 426 - if (insert) { 427 - move_pos = atomic_end; 428 - move_pos.offset -= shift; 429 - goto reassemble; 430 - } else { 431 - bch2_cut_back(atomic_end, copy.k); 432 - } 433 - } 434 - 435 - bkey_init(&delete.k); 436 - delete.k.p = copy.k->k.p; 437 - delete.k.size = copy.k->k.size; 438 - delete.k.p.offset -= shift; 439 - bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); 440 - 441 - next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; 442 - 443 - if (copy.k->k.size != k.k->size) { 444 - /* We might end up splitting compressed extents: */ 445 - unsigned nr_ptrs = 446 - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); 447 - 448 - ret = bch2_disk_reservation_get(c, &disk_res, 449 - copy.k->k.size, nr_ptrs, 450 - BCH_DISK_RESERVATION_NOFAIL); 451 - BUG_ON(ret); 452 - } 453 - 454 - ret = bch2_btree_iter_traverse(&del) ?: 455 - bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: 456 - bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: 457 - bch2_trans_commit(&trans, &disk_res, NULL, 458 - BTREE_INSERT_NOFAIL); 459 - bch2_disk_reservation_put(c, &disk_res); 460 - 461 - if (!ret) 462 - bch2_btree_iter_set_pos(&src, next_pos); 463 - } 464 - 465 - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 466 - goto err; 467 - 468 - if (!insert) { 469 - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, 470 - adjust_i_size(&trans, inum, offset, -len)); 471 - } else { 472 - /* We need an inode update to update bi_journal_seq for fsync: */ 473 - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, 474 - adjust_i_size(&trans, inum, 0, 0)); 475 - } 476 - err: 477 - bch2_trans_iter_exit(&trans, &del); 478 - bch2_trans_iter_exit(&trans, &dst); 479 - bch2_trans_iter_exit(&trans, &src); 480 - bch2_trans_exit(&trans); 481 - bch2_bkey_buf_exit(&copy, c); 482 - return ret; 483 + return bch2_trans_run(c, 484 + bch2_logged_op_start(&trans, &op.k_i) ?: 485 + __bch2_resume_logged_op_finsert(&trans, &op.k_i, i_sectors_delta)); 483 486 }
+10
fs/bcachefs/io_misc.h
··· 19 19 int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); 20 20 21 21 int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); 22 + 23 + void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); 24 + 25 + #define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ 26 + .val_to_text = bch2_logged_op_finsert_to_text, \ 27 + .min_val_size = 24, \ 28 + }) 29 + 30 + int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); 31 + 22 32 int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); 23 33 24 34 #endif /* _BCACHEFS_IO_MISC_H */
+2 -1
fs/bcachefs/logged_ops.h
··· 5 5 #include "bkey.h" 6 6 7 7 #define BCH_LOGGED_OPS() \ 8 - x(truncate) 8 + x(truncate) \ 9 + x(finsert) 9 10 10 11 static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) 11 12 {