Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: Split up fs-io.[ch]

fs-io.c is too big - time for some reorganization
- fs-io-buffered.c: buffered io (readahead/readpage, writepages, buffered writes)
- fs-io-direct.c: direct io
- fs-io-pagecache.c: pagecache data structures (bch_folio), utility code

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+2956 -2803
+3
fs/bcachefs/Makefile
··· 38 38 fs-common.o \ 39 39 fs-ioctl.o \ 40 40 fs-io.o \ 41 + fs-io-buffered.o \ 42 + fs-io-direct.o \ 43 + fs-io-pagecache.o \ 41 44 fsck.o \ 42 45 inode.o \ 43 46 io.o \
+1098
fs/bcachefs/fs-io-buffered.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef NO_BCACHEFS_FS 3 + 4 + #include "bcachefs.h" 5 + #include "alloc_foreground.h" 6 + #include "bkey_buf.h" 7 + #include "fs-io.h" 8 + #include "fs-io-buffered.h" 9 + #include "fs-io-direct.h" 10 + #include "fs-io-pagecache.h" 11 + #include "io.h" 12 + 13 + #include <linux/backing-dev.h> 14 + #include <linux/pagemap.h> 15 + #include <linux/writeback.h> 16 + 17 + static inline bool bio_full(struct bio *bio, unsigned len) 18 + { 19 + if (bio->bi_vcnt >= bio->bi_max_vecs) 20 + return true; 21 + if (bio->bi_iter.bi_size > UINT_MAX - len) 22 + return true; 23 + return false; 24 + } 25 + 26 + /* readpage(s): */ 27 + 28 + static void bch2_readpages_end_io(struct bio *bio) 29 + { 30 + struct folio_iter fi; 31 + 32 + bio_for_each_folio_all(fi, bio) { 33 + if (!bio->bi_status) { 34 + folio_mark_uptodate(fi.folio); 35 + } else { 36 + folio_clear_uptodate(fi.folio); 37 + folio_set_error(fi.folio); 38 + } 39 + folio_unlock(fi.folio); 40 + } 41 + 42 + bio_put(bio); 43 + } 44 + 45 + struct readpages_iter { 46 + struct address_space *mapping; 47 + unsigned idx; 48 + folios folios; 49 + }; 50 + 51 + static int readpages_iter_init(struct readpages_iter *iter, 52 + struct readahead_control *ractl) 53 + { 54 + struct folio **fi; 55 + int ret; 56 + 57 + memset(iter, 0, sizeof(*iter)); 58 + 59 + iter->mapping = ractl->mapping; 60 + 61 + ret = bch2_filemap_get_contig_folios_d(iter->mapping, 62 + ractl->_index << PAGE_SHIFT, 63 + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, 64 + 0, mapping_gfp_mask(iter->mapping), 65 + &iter->folios); 66 + if (ret) 67 + return ret; 68 + 69 + darray_for_each(iter->folios, fi) { 70 + ractl->_nr_pages -= 1U << folio_order(*fi); 71 + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); 72 + folio_put(*fi); 73 + folio_put(*fi); 74 + } 75 + 76 + return 0; 77 + } 78 + 79 + static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) 80 + { 81 + if (iter->idx >= iter->folios.nr) 82 + return 
NULL; 83 + return iter->folios.data[iter->idx]; 84 + } 85 + 86 + static inline void readpage_iter_advance(struct readpages_iter *iter) 87 + { 88 + iter->idx++; 89 + } 90 + 91 + static bool extent_partial_reads_expensive(struct bkey_s_c k) 92 + { 93 + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 94 + struct bch_extent_crc_unpacked crc; 95 + const union bch_extent_entry *i; 96 + 97 + bkey_for_each_crc(k.k, ptrs, crc, i) 98 + if (crc.csum_type || crc.compression_type) 99 + return true; 100 + return false; 101 + } 102 + 103 + static int readpage_bio_extend(struct btree_trans *trans, 104 + struct readpages_iter *iter, 105 + struct bio *bio, 106 + unsigned sectors_this_extent, 107 + bool get_more) 108 + { 109 + /* Don't hold btree locks while allocating memory: */ 110 + bch2_trans_unlock(trans); 111 + 112 + while (bio_sectors(bio) < sectors_this_extent && 113 + bio->bi_vcnt < bio->bi_max_vecs) { 114 + struct folio *folio = readpage_iter_peek(iter); 115 + int ret; 116 + 117 + if (folio) { 118 + readpage_iter_advance(iter); 119 + } else { 120 + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; 121 + 122 + if (!get_more) 123 + break; 124 + 125 + folio = xa_load(&iter->mapping->i_pages, folio_offset); 126 + if (folio && !xa_is_value(folio)) 127 + break; 128 + 129 + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); 130 + if (!folio) 131 + break; 132 + 133 + if (!__bch2_folio_create(folio, GFP_KERNEL)) { 134 + folio_put(folio); 135 + break; 136 + } 137 + 138 + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); 139 + if (ret) { 140 + __bch2_folio_release(folio); 141 + folio_put(folio); 142 + break; 143 + } 144 + 145 + folio_put(folio); 146 + } 147 + 148 + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); 149 + 150 + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); 151 + } 152 + 153 + return bch2_trans_relock(trans); 154 + } 155 + 156 + static void bchfs_read(struct btree_trans *trans, 157 + struct bch_read_bio 
*rbio, 158 + subvol_inum inum, 159 + struct readpages_iter *readpages_iter) 160 + { 161 + struct bch_fs *c = trans->c; 162 + struct btree_iter iter; 163 + struct bkey_buf sk; 164 + int flags = BCH_READ_RETRY_IF_STALE| 165 + BCH_READ_MAY_PROMOTE; 166 + u32 snapshot; 167 + int ret = 0; 168 + 169 + rbio->c = c; 170 + rbio->start_time = local_clock(); 171 + rbio->subvol = inum.subvol; 172 + 173 + bch2_bkey_buf_init(&sk); 174 + retry: 175 + bch2_trans_begin(trans); 176 + iter = (struct btree_iter) { NULL }; 177 + 178 + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 179 + if (ret) 180 + goto err; 181 + 182 + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 183 + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), 184 + BTREE_ITER_SLOTS); 185 + while (1) { 186 + struct bkey_s_c k; 187 + unsigned bytes, sectors, offset_into_extent; 188 + enum btree_id data_btree = BTREE_ID_extents; 189 + 190 + /* 191 + * read_extent -> io_time_reset may cause a transaction restart 192 + * without returning an error, we need to check for that here: 193 + */ 194 + ret = bch2_trans_relock(trans); 195 + if (ret) 196 + break; 197 + 198 + bch2_btree_iter_set_pos(&iter, 199 + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); 200 + 201 + k = bch2_btree_iter_peek_slot(&iter); 202 + ret = bkey_err(k); 203 + if (ret) 204 + break; 205 + 206 + offset_into_extent = iter.pos.offset - 207 + bkey_start_offset(k.k); 208 + sectors = k.k->size - offset_into_extent; 209 + 210 + bch2_bkey_buf_reassemble(&sk, c, k); 211 + 212 + ret = bch2_read_indirect_extent(trans, &data_btree, 213 + &offset_into_extent, &sk); 214 + if (ret) 215 + break; 216 + 217 + k = bkey_i_to_s_c(sk.k); 218 + 219 + sectors = min(sectors, k.k->size - offset_into_extent); 220 + 221 + if (readpages_iter) { 222 + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, 223 + extent_partial_reads_expensive(k)); 224 + if (ret) 225 + break; 226 + } 227 + 228 + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; 
229 + swap(rbio->bio.bi_iter.bi_size, bytes); 230 + 231 + if (rbio->bio.bi_iter.bi_size == bytes) 232 + flags |= BCH_READ_LAST_FRAGMENT; 233 + 234 + bch2_bio_page_state_set(&rbio->bio, k); 235 + 236 + bch2_read_extent(trans, rbio, iter.pos, 237 + data_btree, k, offset_into_extent, flags); 238 + 239 + if (flags & BCH_READ_LAST_FRAGMENT) 240 + break; 241 + 242 + swap(rbio->bio.bi_iter.bi_size, bytes); 243 + bio_advance(&rbio->bio, bytes); 244 + 245 + ret = btree_trans_too_many_iters(trans); 246 + if (ret) 247 + break; 248 + } 249 + err: 250 + bch2_trans_iter_exit(trans, &iter); 251 + 252 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 253 + goto retry; 254 + 255 + if (ret) { 256 + bch_err_inum_offset_ratelimited(c, 257 + iter.pos.inode, 258 + iter.pos.offset << 9, 259 + "read error %i from btree lookup", ret); 260 + rbio->bio.bi_status = BLK_STS_IOERR; 261 + bio_endio(&rbio->bio); 262 + } 263 + 264 + bch2_bkey_buf_exit(&sk, c); 265 + } 266 + 267 + void bch2_readahead(struct readahead_control *ractl) 268 + { 269 + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); 270 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 271 + struct bch_io_opts opts; 272 + struct btree_trans trans; 273 + struct folio *folio; 274 + struct readpages_iter readpages_iter; 275 + int ret; 276 + 277 + bch2_inode_opts_get(&opts, c, &inode->ei_inode); 278 + 279 + ret = readpages_iter_init(&readpages_iter, ractl); 280 + BUG_ON(ret); 281 + 282 + bch2_trans_init(&trans, c, 0, 0); 283 + 284 + bch2_pagecache_add_get(inode); 285 + 286 + while ((folio = readpage_iter_peek(&readpages_iter))) { 287 + unsigned n = min_t(unsigned, 288 + readpages_iter.folios.nr - 289 + readpages_iter.idx, 290 + BIO_MAX_VECS); 291 + struct bch_read_bio *rbio = 292 + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, 293 + GFP_KERNEL, &c->bio_read), 294 + opts); 295 + 296 + readpage_iter_advance(&readpages_iter); 297 + 298 + rbio->bio.bi_iter.bi_sector = folio_sector(folio); 299 + rbio->bio.bi_end_io = 
bch2_readpages_end_io; 300 + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 301 + 302 + bchfs_read(&trans, rbio, inode_inum(inode), 303 + &readpages_iter); 304 + bch2_trans_unlock(&trans); 305 + } 306 + 307 + bch2_pagecache_add_put(inode); 308 + 309 + bch2_trans_exit(&trans); 310 + darray_exit(&readpages_iter.folios); 311 + } 312 + 313 + static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, 314 + subvol_inum inum, struct folio *folio) 315 + { 316 + struct btree_trans trans; 317 + 318 + bch2_folio_create(folio, __GFP_NOFAIL); 319 + 320 + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; 321 + rbio->bio.bi_iter.bi_sector = folio_sector(folio); 322 + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 323 + 324 + bch2_trans_init(&trans, c, 0, 0); 325 + bchfs_read(&trans, rbio, inum, NULL); 326 + bch2_trans_exit(&trans); 327 + } 328 + 329 + static void bch2_read_single_folio_end_io(struct bio *bio) 330 + { 331 + complete(bio->bi_private); 332 + } 333 + 334 + int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) 335 + { 336 + struct bch_inode_info *inode = to_bch_ei(mapping->host); 337 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 338 + struct bch_read_bio *rbio; 339 + struct bch_io_opts opts; 340 + int ret; 341 + DECLARE_COMPLETION_ONSTACK(done); 342 + 343 + bch2_inode_opts_get(&opts, c, &inode->ei_inode); 344 + 345 + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), 346 + opts); 347 + rbio->bio.bi_private = &done; 348 + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; 349 + 350 + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); 351 + wait_for_completion(&done); 352 + 353 + ret = blk_status_to_errno(rbio->bio.bi_status); 354 + bio_put(&rbio->bio); 355 + 356 + if (ret < 0) 357 + return ret; 358 + 359 + folio_mark_uptodate(folio); 360 + return 0; 361 + } 362 + 363 + int bch2_read_folio(struct file *file, struct folio *folio) 364 + { 365 + int ret; 366 + 367 + 
ret = bch2_read_single_folio(folio, folio->mapping); 368 + folio_unlock(folio); 369 + return bch2_err_class(ret); 370 + } 371 + 372 + /* writepages: */ 373 + 374 + struct bch_writepage_io { 375 + struct bch_inode_info *inode; 376 + 377 + /* must be last: */ 378 + struct bch_write_op op; 379 + }; 380 + 381 + struct bch_writepage_state { 382 + struct bch_writepage_io *io; 383 + struct bch_io_opts opts; 384 + struct bch_folio_sector *tmp; 385 + unsigned tmp_sectors; 386 + }; 387 + 388 + static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, 389 + struct bch_inode_info *inode) 390 + { 391 + struct bch_writepage_state ret = { 0 }; 392 + 393 + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); 394 + return ret; 395 + } 396 + 397 + static void bch2_writepage_io_done(struct bch_write_op *op) 398 + { 399 + struct bch_writepage_io *io = 400 + container_of(op, struct bch_writepage_io, op); 401 + struct bch_fs *c = io->op.c; 402 + struct bio *bio = &io->op.wbio.bio; 403 + struct folio_iter fi; 404 + unsigned i; 405 + 406 + if (io->op.error) { 407 + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); 408 + 409 + bio_for_each_folio_all(fi, bio) { 410 + struct bch_folio *s; 411 + 412 + folio_set_error(fi.folio); 413 + mapping_set_error(fi.folio->mapping, -EIO); 414 + 415 + s = __bch2_folio(fi.folio); 416 + spin_lock(&s->lock); 417 + for (i = 0; i < folio_sectors(fi.folio); i++) 418 + s->s[i].nr_replicas = 0; 419 + spin_unlock(&s->lock); 420 + } 421 + } 422 + 423 + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { 424 + bio_for_each_folio_all(fi, bio) { 425 + struct bch_folio *s; 426 + 427 + s = __bch2_folio(fi.folio); 428 + spin_lock(&s->lock); 429 + for (i = 0; i < folio_sectors(fi.folio); i++) 430 + s->s[i].nr_replicas = 0; 431 + spin_unlock(&s->lock); 432 + } 433 + } 434 + 435 + /* 436 + * racing with fallocate can cause us to add fewer sectors than 437 + * expected - but we shouldn't add more sectors than expected: 438 + */ 439 + 
WARN_ON_ONCE(io->op.i_sectors_delta > 0); 440 + 441 + /* 442 + * (error (due to going RO) halfway through a page can screw that up 443 + * slightly) 444 + * XXX wtf? 445 + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); 446 + */ 447 + 448 + /* 449 + * PageWriteback is effectively our ref on the inode - fixup i_blocks 450 + * before calling end_page_writeback: 451 + */ 452 + bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); 453 + 454 + bio_for_each_folio_all(fi, bio) { 455 + struct bch_folio *s = __bch2_folio(fi.folio); 456 + 457 + if (atomic_dec_and_test(&s->write_count)) 458 + folio_end_writeback(fi.folio); 459 + } 460 + 461 + bio_put(&io->op.wbio.bio); 462 + } 463 + 464 + static void bch2_writepage_do_io(struct bch_writepage_state *w) 465 + { 466 + struct bch_writepage_io *io = w->io; 467 + 468 + w->io = NULL; 469 + closure_call(&io->op.cl, bch2_write, NULL, NULL); 470 + } 471 + 472 + /* 473 + * Get a bch_writepage_io and add @page to it - appending to an existing one if 474 + * possible, else allocating a new one: 475 + */ 476 + static void bch2_writepage_io_alloc(struct bch_fs *c, 477 + struct writeback_control *wbc, 478 + struct bch_writepage_state *w, 479 + struct bch_inode_info *inode, 480 + u64 sector, 481 + unsigned nr_replicas) 482 + { 483 + struct bch_write_op *op; 484 + 485 + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, 486 + REQ_OP_WRITE, 487 + GFP_KERNEL, 488 + &c->writepage_bioset), 489 + struct bch_writepage_io, op.wbio.bio); 490 + 491 + w->io->inode = inode; 492 + op = &w->io->op; 493 + bch2_write_op_init(op, c, w->opts); 494 + op->target = w->opts.foreground_target; 495 + op->nr_replicas = nr_replicas; 496 + op->res.nr_replicas = nr_replicas; 497 + op->write_point = writepoint_hashed(inode->ei_last_dirtied); 498 + op->subvol = inode->ei_subvol; 499 + op->pos = POS(inode->v.i_ino, sector); 500 + op->end_io = bch2_writepage_io_done; 501 + op->devs_need_flush = &inode->ei_devs_need_flush; 502 + 
op->wbio.bio.bi_iter.bi_sector = sector; 503 + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); 504 + } 505 + 506 + static int __bch2_writepage(struct folio *folio, 507 + struct writeback_control *wbc, 508 + void *data) 509 + { 510 + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 511 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 512 + struct bch_writepage_state *w = data; 513 + struct bch_folio *s; 514 + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; 515 + loff_t i_size = i_size_read(&inode->v); 516 + int ret; 517 + 518 + EBUG_ON(!folio_test_uptodate(folio)); 519 + 520 + /* Is the folio fully inside i_size? */ 521 + if (folio_end_pos(folio) <= i_size) 522 + goto do_io; 523 + 524 + /* Is the folio fully outside i_size? (truncate in progress) */ 525 + if (folio_pos(folio) >= i_size) { 526 + folio_unlock(folio); 527 + return 0; 528 + } 529 + 530 + /* 531 + * The folio straddles i_size. It must be zeroed out on each and every 532 + * writepage invocation because it may be mmapped. "A file is mapped 533 + * in multiples of the folio size. For a file that is not a multiple of 534 + * the folio size, the remaining memory is zeroed when mapped, and 535 + * writes to that region are not written out to the file." 
536 + */ 537 + folio_zero_segment(folio, 538 + i_size - folio_pos(folio), 539 + folio_size(folio)); 540 + do_io: 541 + f_sectors = folio_sectors(folio); 542 + s = bch2_folio(folio); 543 + 544 + if (f_sectors > w->tmp_sectors) { 545 + kfree(w->tmp); 546 + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); 547 + w->tmp_sectors = f_sectors; 548 + } 549 + 550 + /* 551 + * Things get really hairy with errors during writeback: 552 + */ 553 + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); 554 + BUG_ON(ret); 555 + 556 + /* Before unlocking the page, get copy of reservations: */ 557 + spin_lock(&s->lock); 558 + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); 559 + 560 + for (i = 0; i < f_sectors; i++) { 561 + if (s->s[i].state < SECTOR_dirty) 562 + continue; 563 + 564 + nr_replicas_this_write = 565 + min_t(unsigned, nr_replicas_this_write, 566 + s->s[i].nr_replicas + 567 + s->s[i].replicas_reserved); 568 + } 569 + 570 + for (i = 0; i < f_sectors; i++) { 571 + if (s->s[i].state < SECTOR_dirty) 572 + continue; 573 + 574 + s->s[i].nr_replicas = w->opts.compression 575 + ? 
0 : nr_replicas_this_write; 576 + 577 + s->s[i].replicas_reserved = 0; 578 + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); 579 + } 580 + spin_unlock(&s->lock); 581 + 582 + BUG_ON(atomic_read(&s->write_count)); 583 + atomic_set(&s->write_count, 1); 584 + 585 + BUG_ON(folio_test_writeback(folio)); 586 + folio_start_writeback(folio); 587 + 588 + folio_unlock(folio); 589 + 590 + offset = 0; 591 + while (1) { 592 + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; 593 + u64 sector; 594 + 595 + while (offset < f_sectors && 596 + w->tmp[offset].state < SECTOR_dirty) 597 + offset++; 598 + 599 + if (offset == f_sectors) 600 + break; 601 + 602 + while (offset + sectors < f_sectors && 603 + w->tmp[offset + sectors].state >= SECTOR_dirty) { 604 + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; 605 + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; 606 + sectors++; 607 + } 608 + BUG_ON(!sectors); 609 + 610 + sector = folio_sector(folio) + offset; 611 + 612 + if (w->io && 613 + (w->io->op.res.nr_replicas != nr_replicas_this_write || 614 + bio_full(&w->io->op.wbio.bio, sectors << 9) || 615 + w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= 616 + (BIO_MAX_VECS * PAGE_SIZE) || 617 + bio_end_sector(&w->io->op.wbio.bio) != sector)) 618 + bch2_writepage_do_io(w); 619 + 620 + if (!w->io) 621 + bch2_writepage_io_alloc(c, wbc, w, inode, sector, 622 + nr_replicas_this_write); 623 + 624 + atomic_inc(&s->write_count); 625 + 626 + BUG_ON(inode != w->io->inode); 627 + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, 628 + sectors << 9, offset << 9)); 629 + 630 + /* Check for writing past i_size: */ 631 + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > 632 + round_up(i_size, block_bytes(c)) && 633 + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), 634 + "writing past i_size: %llu > %llu (unrounded %llu)\n", 635 + bio_end_sector(&w->io->op.wbio.bio) << 9, 636 + round_up(i_size, block_bytes(c)), 637 + i_size); 638 + 639 + 
w->io->op.res.sectors += reserved_sectors; 640 + w->io->op.i_sectors_delta -= dirty_sectors; 641 + w->io->op.new_i_size = i_size; 642 + 643 + offset += sectors; 644 + } 645 + 646 + if (atomic_dec_and_test(&s->write_count)) 647 + folio_end_writeback(folio); 648 + 649 + return 0; 650 + } 651 + 652 + int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) 653 + { 654 + struct bch_fs *c = mapping->host->i_sb->s_fs_info; 655 + struct bch_writepage_state w = 656 + bch_writepage_state_init(c, to_bch_ei(mapping->host)); 657 + struct blk_plug plug; 658 + int ret; 659 + 660 + blk_start_plug(&plug); 661 + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); 662 + if (w.io) 663 + bch2_writepage_do_io(&w); 664 + blk_finish_plug(&plug); 665 + kfree(w.tmp); 666 + return bch2_err_class(ret); 667 + } 668 + 669 + /* buffered writes: */ 670 + 671 + int bch2_write_begin(struct file *file, struct address_space *mapping, 672 + loff_t pos, unsigned len, 673 + struct page **pagep, void **fsdata) 674 + { 675 + struct bch_inode_info *inode = to_bch_ei(mapping->host); 676 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 677 + struct bch2_folio_reservation *res; 678 + struct folio *folio; 679 + unsigned offset; 680 + int ret = -ENOMEM; 681 + 682 + res = kmalloc(sizeof(*res), GFP_KERNEL); 683 + if (!res) 684 + return -ENOMEM; 685 + 686 + bch2_folio_reservation_init(c, inode, res); 687 + *fsdata = res; 688 + 689 + bch2_pagecache_add_get(inode); 690 + 691 + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, 692 + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, 693 + mapping_gfp_mask(mapping)); 694 + if (IS_ERR_OR_NULL(folio)) 695 + goto err_unlock; 696 + 697 + if (folio_test_uptodate(folio)) 698 + goto out; 699 + 700 + offset = pos - folio_pos(folio); 701 + len = min_t(size_t, len, folio_end_pos(folio) - pos); 702 + 703 + /* If we're writing entire folio, don't need to read it in first: */ 704 + if (!offset && len == folio_size(folio)) 705 + goto out; 706 + 707 + if 
(!offset && pos + len >= inode->v.i_size) { 708 + folio_zero_segment(folio, len, folio_size(folio)); 709 + flush_dcache_folio(folio); 710 + goto out; 711 + } 712 + 713 + if (folio_pos(folio) >= inode->v.i_size) { 714 + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); 715 + flush_dcache_folio(folio); 716 + goto out; 717 + } 718 + readpage: 719 + ret = bch2_read_single_folio(folio, mapping); 720 + if (ret) 721 + goto err; 722 + out: 723 + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 724 + if (ret) 725 + goto err; 726 + 727 + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); 728 + if (ret) { 729 + if (!folio_test_uptodate(folio)) { 730 + /* 731 + * If the folio hasn't been read in, we won't know if we 732 + * actually need a reservation - we don't actually need 733 + * to read here, we just need to check if the folio is 734 + * fully backed by uncompressed data: 735 + */ 736 + goto readpage; 737 + } 738 + 739 + goto err; 740 + } 741 + 742 + *pagep = &folio->page; 743 + return 0; 744 + err: 745 + folio_unlock(folio); 746 + folio_put(folio); 747 + *pagep = NULL; 748 + err_unlock: 749 + bch2_pagecache_add_put(inode); 750 + kfree(res); 751 + *fsdata = NULL; 752 + return bch2_err_class(ret); 753 + } 754 + 755 + int bch2_write_end(struct file *file, struct address_space *mapping, 756 + loff_t pos, unsigned len, unsigned copied, 757 + struct page *page, void *fsdata) 758 + { 759 + struct bch_inode_info *inode = to_bch_ei(mapping->host); 760 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 761 + struct bch2_folio_reservation *res = fsdata; 762 + struct folio *folio = page_folio(page); 763 + unsigned offset = pos - folio_pos(folio); 764 + 765 + lockdep_assert_held(&inode->v.i_rwsem); 766 + BUG_ON(offset + copied > folio_size(folio)); 767 + 768 + if (unlikely(copied < len && !folio_test_uptodate(folio))) { 769 + /* 770 + * The folio needs to be read in, but that would destroy 771 + * our partial write - simplest thing is to just 
force 772 + * userspace to redo the write: 773 + */ 774 + folio_zero_range(folio, 0, folio_size(folio)); 775 + flush_dcache_folio(folio); 776 + copied = 0; 777 + } 778 + 779 + spin_lock(&inode->v.i_lock); 780 + if (pos + copied > inode->v.i_size) 781 + i_size_write(&inode->v, pos + copied); 782 + spin_unlock(&inode->v.i_lock); 783 + 784 + if (copied) { 785 + if (!folio_test_uptodate(folio)) 786 + folio_mark_uptodate(folio); 787 + 788 + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); 789 + 790 + inode->ei_last_dirtied = (unsigned long) current; 791 + } 792 + 793 + folio_unlock(folio); 794 + folio_put(folio); 795 + bch2_pagecache_add_put(inode); 796 + 797 + bch2_folio_reservation_put(c, inode, res); 798 + kfree(res); 799 + 800 + return copied; 801 + } 802 + 803 + static noinline void folios_trunc(folios *folios, struct folio **fi) 804 + { 805 + while (folios->data + folios->nr > fi) { 806 + struct folio *f = darray_pop(folios); 807 + 808 + folio_unlock(f); 809 + folio_put(f); 810 + } 811 + } 812 + 813 + static int __bch2_buffered_write(struct bch_inode_info *inode, 814 + struct address_space *mapping, 815 + struct iov_iter *iter, 816 + loff_t pos, unsigned len) 817 + { 818 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 819 + struct bch2_folio_reservation res; 820 + folios folios; 821 + struct folio **fi, *f; 822 + unsigned copied = 0, f_offset; 823 + u64 end = pos + len, f_pos; 824 + loff_t last_folio_pos = inode->v.i_size; 825 + int ret = 0; 826 + 827 + BUG_ON(!len); 828 + 829 + bch2_folio_reservation_init(c, inode, &res); 830 + darray_init(&folios); 831 + 832 + ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, 833 + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, 834 + mapping_gfp_mask(mapping), 835 + &folios); 836 + if (ret) 837 + goto out; 838 + 839 + BUG_ON(!folios.nr); 840 + 841 + f = darray_first(folios); 842 + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { 843 + ret = bch2_read_single_folio(f, mapping); 844 + if (ret) 845 + goto out; 846 + 
} 847 + 848 + f = darray_last(folios); 849 + end = min(end, folio_end_pos(f)); 850 + last_folio_pos = folio_pos(f); 851 + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { 852 + if (end >= inode->v.i_size) { 853 + folio_zero_range(f, 0, folio_size(f)); 854 + } else { 855 + ret = bch2_read_single_folio(f, mapping); 856 + if (ret) 857 + goto out; 858 + } 859 + } 860 + 861 + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); 862 + if (ret) 863 + goto out; 864 + 865 + f_pos = pos; 866 + f_offset = pos - folio_pos(darray_first(folios)); 867 + darray_for_each(folios, fi) { 868 + struct folio *f = *fi; 869 + u64 f_len = min(end, folio_end_pos(f)) - f_pos; 870 + 871 + /* 872 + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're 873 + * supposed to write as much as we have disk space for. 874 + * 875 + * On failure here we should still write out a partial page if 876 + * we aren't completely out of disk space - we don't do that 877 + * yet: 878 + */ 879 + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); 880 + if (unlikely(ret)) { 881 + folios_trunc(&folios, fi); 882 + if (!folios.nr) 883 + goto out; 884 + 885 + end = min(end, folio_end_pos(darray_last(folios))); 886 + break; 887 + } 888 + 889 + f_pos = folio_end_pos(f); 890 + f_offset = 0; 891 + } 892 + 893 + if (mapping_writably_mapped(mapping)) 894 + darray_for_each(folios, fi) 895 + flush_dcache_folio(*fi); 896 + 897 + f_pos = pos; 898 + f_offset = pos - folio_pos(darray_first(folios)); 899 + darray_for_each(folios, fi) { 900 + struct folio *f = *fi; 901 + u64 f_len = min(end, folio_end_pos(f)) - f_pos; 902 + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); 903 + 904 + if (!f_copied) { 905 + folios_trunc(&folios, fi); 906 + break; 907 + } 908 + 909 + if (!folio_test_uptodate(f) && 910 + f_copied != folio_size(f) && 911 + pos + copied + f_copied < inode->v.i_size) { 912 + folio_zero_range(f, 0, folio_size(f)); 913 + folios_trunc(&folios, 
fi); 914 + break; 915 + } 916 + 917 + flush_dcache_folio(f); 918 + copied += f_copied; 919 + 920 + if (f_copied != f_len) { 921 + folios_trunc(&folios, fi + 1); 922 + break; 923 + } 924 + 925 + f_pos = folio_end_pos(f); 926 + f_offset = 0; 927 + } 928 + 929 + if (!copied) 930 + goto out; 931 + 932 + end = pos + copied; 933 + 934 + spin_lock(&inode->v.i_lock); 935 + if (end > inode->v.i_size) 936 + i_size_write(&inode->v, end); 937 + spin_unlock(&inode->v.i_lock); 938 + 939 + f_pos = pos; 940 + f_offset = pos - folio_pos(darray_first(folios)); 941 + darray_for_each(folios, fi) { 942 + struct folio *f = *fi; 943 + u64 f_len = min(end, folio_end_pos(f)) - f_pos; 944 + 945 + if (!folio_test_uptodate(f)) 946 + folio_mark_uptodate(f); 947 + 948 + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); 949 + 950 + f_pos = folio_end_pos(f); 951 + f_offset = 0; 952 + } 953 + 954 + inode->ei_last_dirtied = (unsigned long) current; 955 + out: 956 + darray_for_each(folios, fi) { 957 + folio_unlock(*fi); 958 + folio_put(*fi); 959 + } 960 + 961 + /* 962 + * If the last folio added to the mapping starts beyond current EOF, we 963 + * performed a short write but left around at least one post-EOF folio. 964 + * Clean up the mapping before we return. 
965 + */ 966 + if (last_folio_pos >= inode->v.i_size) 967 + truncate_pagecache(&inode->v, inode->v.i_size); 968 + 969 + darray_exit(&folios); 970 + bch2_folio_reservation_put(c, inode, &res); 971 + 972 + return copied ?: ret; 973 + } 974 + 975 + static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) 976 + { 977 + struct file *file = iocb->ki_filp; 978 + struct address_space *mapping = file->f_mapping; 979 + struct bch_inode_info *inode = file_bch_inode(file); 980 + loff_t pos = iocb->ki_pos; 981 + ssize_t written = 0; 982 + int ret = 0; 983 + 984 + bch2_pagecache_add_get(inode); 985 + 986 + do { 987 + unsigned offset = pos & (PAGE_SIZE - 1); 988 + unsigned bytes = iov_iter_count(iter); 989 + again: 990 + /* 991 + * Bring in the user page that we will copy from _first_. 992 + * Otherwise there's a nasty deadlock on copying from the 993 + * same page as we're writing to, without it being marked 994 + * up-to-date. 995 + * 996 + * Not only is this an optimisation, but it is also required 997 + * to check that the address is actually valid, when atomic 998 + * usercopies are used, below. 999 + */ 1000 + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 1001 + bytes = min_t(unsigned long, iov_iter_count(iter), 1002 + PAGE_SIZE - offset); 1003 + 1004 + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 1005 + ret = -EFAULT; 1006 + break; 1007 + } 1008 + } 1009 + 1010 + if (unlikely(fatal_signal_pending(current))) { 1011 + ret = -EINTR; 1012 + break; 1013 + } 1014 + 1015 + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 1016 + if (unlikely(ret < 0)) 1017 + break; 1018 + 1019 + cond_resched(); 1020 + 1021 + if (unlikely(ret == 0)) { 1022 + /* 1023 + * If we were unable to copy any data at all, we must 1024 + * fall back to a single segment length write. 1025 + * 1026 + * If we didn't fallback here, we could livelock 1027 + * because not all segments in the iov can be copied at 1028 + * once without a pagefault. 
1029 + */ 1030 + bytes = min_t(unsigned long, PAGE_SIZE - offset, 1031 + iov_iter_single_seg_count(iter)); 1032 + goto again; 1033 + } 1034 + pos += ret; 1035 + written += ret; 1036 + ret = 0; 1037 + 1038 + balance_dirty_pages_ratelimited(mapping); 1039 + } while (iov_iter_count(iter)); 1040 + 1041 + bch2_pagecache_add_put(inode); 1042 + 1043 + return written ? written : ret; 1044 + } 1045 + 1046 + ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 1047 + { 1048 + struct file *file = iocb->ki_filp; 1049 + struct bch_inode_info *inode = file_bch_inode(file); 1050 + ssize_t ret; 1051 + 1052 + if (iocb->ki_flags & IOCB_DIRECT) { 1053 + ret = bch2_direct_write(iocb, from); 1054 + goto out; 1055 + } 1056 + 1057 + inode_lock(&inode->v); 1058 + 1059 + ret = generic_write_checks(iocb, from); 1060 + if (ret <= 0) 1061 + goto unlock; 1062 + 1063 + ret = file_remove_privs(file); 1064 + if (ret) 1065 + goto unlock; 1066 + 1067 + ret = file_update_time(file); 1068 + if (ret) 1069 + goto unlock; 1070 + 1071 + ret = bch2_buffered_write(iocb, from); 1072 + if (likely(ret > 0)) 1073 + iocb->ki_pos += ret; 1074 + unlock: 1075 + inode_unlock(&inode->v); 1076 + 1077 + if (ret > 0) 1078 + ret = generic_write_sync(iocb, ret); 1079 + out: 1080 + return bch2_err_class(ret); 1081 + } 1082 + 1083 + void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) 1084 + { 1085 + bioset_exit(&c->writepage_bioset); 1086 + } 1087 + 1088 + int bch2_fs_fs_io_buffered_init(struct bch_fs *c) 1089 + { 1090 + if (bioset_init(&c->writepage_bioset, 1091 + 4, offsetof(struct bch_writepage_io, op.wbio.bio), 1092 + BIOSET_NEED_BVECS)) 1093 + return -BCH_ERR_ENOMEM_writepage_bioset_init; 1094 + 1095 + return 0; 1096 + } 1097 + 1098 + #endif /* NO_BCACHEFS_FS */
+27
fs/bcachefs/fs-io-buffered.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_FS_IO_BUFFERED_H 3 + #define _BCACHEFS_FS_IO_BUFFERED_H 4 + 5 + #ifndef NO_BCACHEFS_FS 6 + 7 + int bch2_read_single_folio(struct folio *, struct address_space *); 8 + int bch2_read_folio(struct file *, struct folio *); 9 + 10 + int bch2_writepages(struct address_space *, struct writeback_control *); 11 + void bch2_readahead(struct readahead_control *); 12 + 13 + int bch2_write_begin(struct file *, struct address_space *, loff_t, 14 + unsigned, struct page **, void **); 15 + int bch2_write_end(struct file *, struct address_space *, loff_t, 16 + unsigned, unsigned, struct page *, void *); 17 + 18 + ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); 19 + 20 + void bch2_fs_fs_io_buffered_exit(struct bch_fs *); 21 + int bch2_fs_fs_io_buffered_init(struct bch_fs *); 22 + #else 23 + static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} 24 + static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } 25 + #endif 26 + 27 + #endif /* _BCACHEFS_FS_IO_BUFFERED_H */
+679
fs/bcachefs/fs-io-direct.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef NO_BCACHEFS_FS 3 + 4 + #include "bcachefs.h" 5 + #include "alloc_foreground.h" 6 + #include "fs.h" 7 + #include "fs-io.h" 8 + #include "fs-io-direct.h" 9 + #include "fs-io-pagecache.h" 10 + #include "io.h" 11 + 12 + #include <linux/kthread.h> 13 + #include <linux/pagemap.h> 14 + #include <linux/task_io_accounting_ops.h> 15 + 16 + /* O_DIRECT reads */ 17 + 18 + struct dio_read { 19 + struct closure cl; 20 + struct kiocb *req; 21 + long ret; 22 + bool should_dirty; 23 + struct bch_read_bio rbio; 24 + }; 25 + 26 + static void bio_check_or_release(struct bio *bio, bool check_dirty) 27 + { 28 + if (check_dirty) { 29 + bio_check_pages_dirty(bio); 30 + } else { 31 + bio_release_pages(bio, false); 32 + bio_put(bio); 33 + } 34 + } 35 + 36 + static void bch2_dio_read_complete(struct closure *cl) 37 + { 38 + struct dio_read *dio = container_of(cl, struct dio_read, cl); 39 + 40 + dio->req->ki_complete(dio->req, dio->ret); 41 + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 42 + } 43 + 44 + static void bch2_direct_IO_read_endio(struct bio *bio) 45 + { 46 + struct dio_read *dio = bio->bi_private; 47 + 48 + if (bio->bi_status) 49 + dio->ret = blk_status_to_errno(bio->bi_status); 50 + 51 + closure_put(&dio->cl); 52 + } 53 + 54 + static void bch2_direct_IO_read_split_endio(struct bio *bio) 55 + { 56 + struct dio_read *dio = bio->bi_private; 57 + bool should_dirty = dio->should_dirty; 58 + 59 + bch2_direct_IO_read_endio(bio); 60 + bio_check_or_release(bio, should_dirty); 61 + } 62 + 63 + static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) 64 + { 65 + struct file *file = req->ki_filp; 66 + struct bch_inode_info *inode = file_bch_inode(file); 67 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 68 + struct bch_io_opts opts; 69 + struct dio_read *dio; 70 + struct bio *bio; 71 + loff_t offset = req->ki_pos; 72 + bool sync = is_sync_kiocb(req); 73 + size_t shorten; 74 + ssize_t ret; 75 + 76 + 
bch2_inode_opts_get(&opts, c, &inode->ei_inode); 77 + 78 + if ((offset|iter->count) & (block_bytes(c) - 1)) 79 + return -EINVAL; 80 + 81 + ret = min_t(loff_t, iter->count, 82 + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); 83 + 84 + if (!ret) 85 + return ret; 86 + 87 + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); 88 + iter->count -= shorten; 89 + 90 + bio = bio_alloc_bioset(NULL, 91 + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 92 + REQ_OP_READ, 93 + GFP_KERNEL, 94 + &c->dio_read_bioset); 95 + 96 + bio->bi_end_io = bch2_direct_IO_read_endio; 97 + 98 + dio = container_of(bio, struct dio_read, rbio.bio); 99 + closure_init(&dio->cl, NULL); 100 + 101 + /* 102 + * this is a _really_ horrible hack just to avoid an atomic sub at the 103 + * end: 104 + */ 105 + if (!sync) { 106 + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); 107 + atomic_set(&dio->cl.remaining, 108 + CLOSURE_REMAINING_INITIALIZER - 109 + CLOSURE_RUNNING + 110 + CLOSURE_DESTRUCTOR); 111 + } else { 112 + atomic_set(&dio->cl.remaining, 113 + CLOSURE_REMAINING_INITIALIZER + 1); 114 + } 115 + 116 + dio->req = req; 117 + dio->ret = ret; 118 + /* 119 + * This is one of the sketchier things I've encountered: we have to skip 120 + * the dirtying of requests that are internal from the kernel (i.e. from 121 + * loopback), because we'll deadlock on page_lock. 
122 + */ 123 + dio->should_dirty = iter_is_iovec(iter); 124 + 125 + goto start; 126 + while (iter->count) { 127 + bio = bio_alloc_bioset(NULL, 128 + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 129 + REQ_OP_READ, 130 + GFP_KERNEL, 131 + &c->bio_read); 132 + bio->bi_end_io = bch2_direct_IO_read_split_endio; 133 + start: 134 + bio->bi_opf = REQ_OP_READ|REQ_SYNC; 135 + bio->bi_iter.bi_sector = offset >> 9; 136 + bio->bi_private = dio; 137 + 138 + ret = bio_iov_iter_get_pages(bio, iter); 139 + if (ret < 0) { 140 + /* XXX: fault inject this path */ 141 + bio->bi_status = BLK_STS_RESOURCE; 142 + bio_endio(bio); 143 + break; 144 + } 145 + 146 + offset += bio->bi_iter.bi_size; 147 + 148 + if (dio->should_dirty) 149 + bio_set_pages_dirty(bio); 150 + 151 + if (iter->count) 152 + closure_get(&dio->cl); 153 + 154 + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); 155 + } 156 + 157 + iter->count += shorten; 158 + 159 + if (sync) { 160 + closure_sync(&dio->cl); 161 + closure_debug_destroy(&dio->cl); 162 + ret = dio->ret; 163 + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 164 + return ret; 165 + } else { 166 + return -EIOCBQUEUED; 167 + } 168 + } 169 + 170 + ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) 171 + { 172 + struct file *file = iocb->ki_filp; 173 + struct bch_inode_info *inode = file_bch_inode(file); 174 + struct address_space *mapping = file->f_mapping; 175 + size_t count = iov_iter_count(iter); 176 + ssize_t ret; 177 + 178 + if (!count) 179 + return 0; /* skip atime */ 180 + 181 + if (iocb->ki_flags & IOCB_DIRECT) { 182 + struct blk_plug plug; 183 + 184 + if (unlikely(mapping->nrpages)) { 185 + ret = filemap_write_and_wait_range(mapping, 186 + iocb->ki_pos, 187 + iocb->ki_pos + count - 1); 188 + if (ret < 0) 189 + goto out; 190 + } 191 + 192 + file_accessed(file); 193 + 194 + blk_start_plug(&plug); 195 + ret = bch2_direct_IO_read(iocb, iter); 196 + blk_finish_plug(&plug); 197 + 198 + if (ret >= 0) 199 + iocb->ki_pos += ret; 200 + } 
else { 201 + bch2_pagecache_add_get(inode); 202 + ret = generic_file_read_iter(iocb, iter); 203 + bch2_pagecache_add_put(inode); 204 + } 205 + out: 206 + return bch2_err_class(ret); 207 + } 208 + 209 + /* O_DIRECT writes */ 210 + 211 + struct dio_write { 212 + struct kiocb *req; 213 + struct address_space *mapping; 214 + struct bch_inode_info *inode; 215 + struct mm_struct *mm; 216 + unsigned loop:1, 217 + extending:1, 218 + sync:1, 219 + flush:1, 220 + free_iov:1; 221 + struct quota_res quota_res; 222 + u64 written; 223 + 224 + struct iov_iter iter; 225 + struct iovec inline_vecs[2]; 226 + 227 + /* must be last: */ 228 + struct bch_write_op op; 229 + }; 230 + 231 + static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, 232 + u64 offset, u64 size, 233 + unsigned nr_replicas, bool compressed) 234 + { 235 + struct btree_trans trans; 236 + struct btree_iter iter; 237 + struct bkey_s_c k; 238 + u64 end = offset + size; 239 + u32 snapshot; 240 + bool ret = true; 241 + int err; 242 + 243 + bch2_trans_init(&trans, c, 0, 0); 244 + retry: 245 + bch2_trans_begin(&trans); 246 + 247 + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 248 + if (err) 249 + goto err; 250 + 251 + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 252 + SPOS(inum.inum, offset, snapshot), 253 + BTREE_ITER_SLOTS, k, err) { 254 + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) 255 + break; 256 + 257 + if (k.k->p.snapshot != snapshot || 258 + nr_replicas > bch2_bkey_replicas(c, k) || 259 + (!compressed && bch2_bkey_sectors_compressed(k))) { 260 + ret = false; 261 + break; 262 + } 263 + } 264 + 265 + offset = iter.pos.offset; 266 + bch2_trans_iter_exit(&trans, &iter); 267 + err: 268 + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) 269 + goto retry; 270 + bch2_trans_exit(&trans); 271 + 272 + return err ? 
false : ret; 273 + } 274 + 275 + static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) 276 + { 277 + struct bch_fs *c = dio->op.c; 278 + struct bch_inode_info *inode = dio->inode; 279 + struct bio *bio = &dio->op.wbio.bio; 280 + 281 + return bch2_check_range_allocated(c, inode_inum(inode), 282 + dio->op.pos.offset, bio_sectors(bio), 283 + dio->op.opts.data_replicas, 284 + dio->op.opts.compression != 0); 285 + } 286 + 287 + static void bch2_dio_write_loop_async(struct bch_write_op *); 288 + static __always_inline long bch2_dio_write_done(struct dio_write *dio); 289 + 290 + /* 291 + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the 292 + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the 293 + * caller's stack, we're not guaranteed that it will live for the duration of 294 + * the IO: 295 + */ 296 + static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) 297 + { 298 + struct iovec *iov = dio->inline_vecs; 299 + 300 + /* 301 + * iov_iter has a single embedded iovec - nothing to do: 302 + */ 303 + if (iter_is_ubuf(&dio->iter)) 304 + return 0; 305 + 306 + /* 307 + * We don't currently handle non-iovec iov_iters here - return an error, 308 + * and we'll fall back to doing the IO synchronously: 309 + */ 310 + if (!iter_is_iovec(&dio->iter)) 311 + return -1; 312 + 313 + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { 314 + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), 315 + GFP_KERNEL); 316 + if (unlikely(!iov)) 317 + return -ENOMEM; 318 + 319 + dio->free_iov = true; 320 + } 321 + 322 + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); 323 + dio->iter.__iov = iov; 324 + return 0; 325 + } 326 + 327 + static void bch2_dio_write_flush_done(struct closure *cl) 328 + { 329 + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); 330 + struct bch_fs *c = dio->op.c; 331 + 332 + closure_debug_destroy(cl); 333 + 334 + dio->op.error = 
bch2_journal_error(&c->journal); 335 + 336 + bch2_dio_write_done(dio); 337 + } 338 + 339 + static noinline void bch2_dio_write_flush(struct dio_write *dio) 340 + { 341 + struct bch_fs *c = dio->op.c; 342 + struct bch_inode_unpacked inode; 343 + int ret; 344 + 345 + dio->flush = 0; 346 + 347 + closure_init(&dio->op.cl, NULL); 348 + 349 + if (!dio->op.error) { 350 + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); 351 + if (ret) { 352 + dio->op.error = ret; 353 + } else { 354 + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, 355 + &dio->op.cl); 356 + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); 357 + } 358 + } 359 + 360 + if (dio->sync) { 361 + closure_sync(&dio->op.cl); 362 + closure_debug_destroy(&dio->op.cl); 363 + } else { 364 + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); 365 + } 366 + } 367 + 368 + static __always_inline long bch2_dio_write_done(struct dio_write *dio) 369 + { 370 + struct kiocb *req = dio->req; 371 + struct bch_inode_info *inode = dio->inode; 372 + bool sync = dio->sync; 373 + long ret; 374 + 375 + if (unlikely(dio->flush)) { 376 + bch2_dio_write_flush(dio); 377 + if (!sync) 378 + return -EIOCBQUEUED; 379 + } 380 + 381 + bch2_pagecache_block_put(inode); 382 + 383 + if (dio->free_iov) 384 + kfree(dio->iter.__iov); 385 + 386 + ret = dio->op.error ?: ((long) dio->written << 9); 387 + bio_put(&dio->op.wbio.bio); 388 + 389 + /* inode->i_dio_count is our ref on inode and thus bch_fs */ 390 + inode_dio_end(&inode->v); 391 + 392 + if (ret < 0) 393 + ret = bch2_err_class(ret); 394 + 395 + if (!sync) { 396 + req->ki_complete(req, ret); 397 + ret = -EIOCBQUEUED; 398 + } 399 + return ret; 400 + } 401 + 402 + static __always_inline void bch2_dio_write_end(struct dio_write *dio) 403 + { 404 + struct bch_fs *c = dio->op.c; 405 + struct kiocb *req = dio->req; 406 + struct bch_inode_info *inode = dio->inode; 407 + struct bio *bio = &dio->op.wbio.bio; 408 + 409 + req->ki_pos += (u64) 
dio->op.written << 9; 410 + dio->written += dio->op.written; 411 + 412 + if (dio->extending) { 413 + spin_lock(&inode->v.i_lock); 414 + if (req->ki_pos > inode->v.i_size) 415 + i_size_write(&inode->v, req->ki_pos); 416 + spin_unlock(&inode->v.i_lock); 417 + } 418 + 419 + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { 420 + mutex_lock(&inode->ei_quota_lock); 421 + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); 422 + __bch2_quota_reservation_put(c, inode, &dio->quota_res); 423 + mutex_unlock(&inode->ei_quota_lock); 424 + } 425 + 426 + bio_release_pages(bio, false); 427 + 428 + if (unlikely(dio->op.error)) 429 + set_bit(EI_INODE_ERROR, &inode->ei_flags); 430 + } 431 + 432 + static __always_inline long bch2_dio_write_loop(struct dio_write *dio) 433 + { 434 + struct bch_fs *c = dio->op.c; 435 + struct kiocb *req = dio->req; 436 + struct address_space *mapping = dio->mapping; 437 + struct bch_inode_info *inode = dio->inode; 438 + struct bch_io_opts opts; 439 + struct bio *bio = &dio->op.wbio.bio; 440 + unsigned unaligned, iter_count; 441 + bool sync = dio->sync, dropped_locks; 442 + long ret; 443 + 444 + bch2_inode_opts_get(&opts, c, &inode->ei_inode); 445 + 446 + while (1) { 447 + iter_count = dio->iter.count; 448 + 449 + EBUG_ON(current->faults_disabled_mapping); 450 + current->faults_disabled_mapping = mapping; 451 + 452 + ret = bio_iov_iter_get_pages(bio, &dio->iter); 453 + 454 + dropped_locks = fdm_dropped_locks(); 455 + 456 + current->faults_disabled_mapping = NULL; 457 + 458 + /* 459 + * If the fault handler returned an error but also signalled 460 + * that it dropped & retook ei_pagecache_lock, we just need to 461 + * re-shoot down the page cache and retry: 462 + */ 463 + if (dropped_locks && ret) 464 + ret = 0; 465 + 466 + if (unlikely(ret < 0)) 467 + goto err; 468 + 469 + if (unlikely(dropped_locks)) { 470 + ret = bch2_write_invalidate_inode_pages_range(mapping, 471 + req->ki_pos, 472 + req->ki_pos + iter_count - 1); 473 + 
if (unlikely(ret)) 474 + goto err; 475 + 476 + if (!bio->bi_iter.bi_size) 477 + continue; 478 + } 479 + 480 + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); 481 + bio->bi_iter.bi_size -= unaligned; 482 + iov_iter_revert(&dio->iter, unaligned); 483 + 484 + if (!bio->bi_iter.bi_size) { 485 + /* 486 + * bio_iov_iter_get_pages was only able to get < 487 + * blocksize worth of pages: 488 + */ 489 + ret = -EFAULT; 490 + goto err; 491 + } 492 + 493 + bch2_write_op_init(&dio->op, c, opts); 494 + dio->op.end_io = sync 495 + ? NULL 496 + : bch2_dio_write_loop_async; 497 + dio->op.target = dio->op.opts.foreground_target; 498 + dio->op.write_point = writepoint_hashed((unsigned long) current); 499 + dio->op.nr_replicas = dio->op.opts.data_replicas; 500 + dio->op.subvol = inode->ei_subvol; 501 + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); 502 + dio->op.devs_need_flush = &inode->ei_devs_need_flush; 503 + 504 + if (sync) 505 + dio->op.flags |= BCH_WRITE_SYNC; 506 + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; 507 + 508 + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, 509 + bio_sectors(bio), true); 510 + if (unlikely(ret)) 511 + goto err; 512 + 513 + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), 514 + dio->op.opts.data_replicas, 0); 515 + if (unlikely(ret) && 516 + !bch2_dio_write_check_allocated(dio)) 517 + goto err; 518 + 519 + task_io_account_write(bio->bi_iter.bi_size); 520 + 521 + if (unlikely(dio->iter.count) && 522 + !dio->sync && 523 + !dio->loop && 524 + bch2_dio_write_copy_iov(dio)) 525 + dio->sync = sync = true; 526 + 527 + dio->loop = true; 528 + closure_call(&dio->op.cl, bch2_write, NULL, NULL); 529 + 530 + if (!sync) 531 + return -EIOCBQUEUED; 532 + 533 + bch2_dio_write_end(dio); 534 + 535 + if (likely(!dio->iter.count) || dio->op.error) 536 + break; 537 + 538 + bio_reset(bio, NULL, REQ_OP_WRITE); 539 + } 540 + out: 541 + return bch2_dio_write_done(dio); 542 + err: 543 + dio->op.error = ret; 544 + 545 + 
bio_release_pages(bio, false); 546 + 547 + bch2_quota_reservation_put(c, inode, &dio->quota_res); 548 + goto out; 549 + } 550 + 551 + static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) 552 + { 553 + struct mm_struct *mm = dio->mm; 554 + 555 + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); 556 + 557 + if (mm) 558 + kthread_use_mm(mm); 559 + bch2_dio_write_loop(dio); 560 + if (mm) 561 + kthread_unuse_mm(mm); 562 + } 563 + 564 + static void bch2_dio_write_loop_async(struct bch_write_op *op) 565 + { 566 + struct dio_write *dio = container_of(op, struct dio_write, op); 567 + 568 + bch2_dio_write_end(dio); 569 + 570 + if (likely(!dio->iter.count) || dio->op.error) 571 + bch2_dio_write_done(dio); 572 + else 573 + bch2_dio_write_continue(dio); 574 + } 575 + 576 + ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) 577 + { 578 + struct file *file = req->ki_filp; 579 + struct address_space *mapping = file->f_mapping; 580 + struct bch_inode_info *inode = file_bch_inode(file); 581 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 582 + struct dio_write *dio; 583 + struct bio *bio; 584 + bool locked = true, extending; 585 + ssize_t ret; 586 + 587 + prefetch(&c->opts); 588 + prefetch((void *) &c->opts + 64); 589 + prefetch(&inode->ei_inode); 590 + prefetch((void *) &inode->ei_inode + 64); 591 + 592 + inode_lock(&inode->v); 593 + 594 + ret = generic_write_checks(req, iter); 595 + if (unlikely(ret <= 0)) 596 + goto err; 597 + 598 + ret = file_remove_privs(file); 599 + if (unlikely(ret)) 600 + goto err; 601 + 602 + ret = file_update_time(file); 603 + if (unlikely(ret)) 604 + goto err; 605 + 606 + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) 607 + goto err; 608 + 609 + inode_dio_begin(&inode->v); 610 + bch2_pagecache_block_get(inode); 611 + 612 + extending = req->ki_pos + iter->count > inode->v.i_size; 613 + if (!extending) { 614 + inode_unlock(&inode->v); 615 + locked = false; 616 + } 617 + 618 + bio = 
bio_alloc_bioset(NULL, 619 + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 620 + REQ_OP_WRITE, 621 + GFP_KERNEL, 622 + &c->dio_write_bioset); 623 + dio = container_of(bio, struct dio_write, op.wbio.bio); 624 + dio->req = req; 625 + dio->mapping = mapping; 626 + dio->inode = inode; 627 + dio->mm = current->mm; 628 + dio->loop = false; 629 + dio->extending = extending; 630 + dio->sync = is_sync_kiocb(req) || extending; 631 + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; 632 + dio->free_iov = false; 633 + dio->quota_res.sectors = 0; 634 + dio->written = 0; 635 + dio->iter = *iter; 636 + dio->op.c = c; 637 + 638 + if (unlikely(mapping->nrpages)) { 639 + ret = bch2_write_invalidate_inode_pages_range(mapping, 640 + req->ki_pos, 641 + req->ki_pos + iter->count - 1); 642 + if (unlikely(ret)) 643 + goto err_put_bio; 644 + } 645 + 646 + ret = bch2_dio_write_loop(dio); 647 + err: 648 + if (locked) 649 + inode_unlock(&inode->v); 650 + return ret; 651 + err_put_bio: 652 + bch2_pagecache_block_put(inode); 653 + bio_put(bio); 654 + inode_dio_end(&inode->v); 655 + goto err; 656 + } 657 + 658 + void bch2_fs_fs_io_direct_exit(struct bch_fs *c) 659 + { 660 + bioset_exit(&c->dio_write_bioset); 661 + bioset_exit(&c->dio_read_bioset); 662 + } 663 + 664 + int bch2_fs_fs_io_direct_init(struct bch_fs *c) 665 + { 666 + if (bioset_init(&c->dio_read_bioset, 667 + 4, offsetof(struct dio_read, rbio.bio), 668 + BIOSET_NEED_BVECS)) 669 + return -BCH_ERR_ENOMEM_dio_read_bioset_init; 670 + 671 + if (bioset_init(&c->dio_write_bioset, 672 + 4, offsetof(struct dio_write, op.wbio.bio), 673 + BIOSET_NEED_BVECS)) 674 + return -BCH_ERR_ENOMEM_dio_write_bioset_init; 675 + 676 + return 0; 677 + } 678 + 679 + #endif /* NO_BCACHEFS_FS */
+16
fs/bcachefs/fs-io-direct.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IO_DIRECT_H
#define _BCACHEFS_FS_IO_DIRECT_H

#ifndef NO_BCACHEFS_FS
/* O_DIRECT write entry point (dispatched to from bch2_write_iter()). */
ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
/* ->read_iter file op: handles both O_DIRECT and buffered reads. */
ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);

/* Filesystem-wide init/teardown of the dio_read/dio_write biosets. */
void bch2_fs_fs_io_direct_exit(struct bch_fs *);
int bch2_fs_fs_io_direct_init(struct bch_fs *);
#else
/* Stubs for builds without the fs layer (NO_BCACHEFS_FS). */
static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
#endif

#endif /* _BCACHEFS_FS_IO_DIRECT_H */
+780
fs/bcachefs/fs-io-pagecache.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef NO_BCACHEFS_FS 3 + 4 + #include "bcachefs.h" 5 + #include "btree_iter.h" 6 + #include "extents.h" 7 + #include "fs-io.h" 8 + #include "fs-io-pagecache.h" 9 + #include "subvolume.h" 10 + 11 + #include <linux/pagevec.h> 12 + #include <linux/writeback.h> 13 + 14 + int bch2_filemap_get_contig_folios_d(struct address_space *mapping, 15 + loff_t start, u64 end, 16 + int fgp_flags, gfp_t gfp, 17 + folios *folios) 18 + { 19 + struct folio *f; 20 + u64 pos = start; 21 + int ret = 0; 22 + 23 + while (pos < end) { 24 + if ((u64) pos >= (u64) start + (1ULL << 20)) 25 + fgp_flags &= ~FGP_CREAT; 26 + 27 + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); 28 + if (ret) 29 + break; 30 + 31 + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); 32 + if (IS_ERR_OR_NULL(f)) 33 + break; 34 + 35 + BUG_ON(folios->nr && folio_pos(f) != pos); 36 + 37 + pos = folio_end_pos(f); 38 + darray_push(folios, f); 39 + } 40 + 41 + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) 42 + ret = -ENOMEM; 43 + 44 + return folios->nr ? 
0 : ret; 45 + } 46 + 47 + /* pagecache_block must be held */ 48 + int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, 49 + loff_t start, loff_t end) 50 + { 51 + int ret; 52 + 53 + /* 54 + * XXX: the way this is currently implemented, we can spin if a process 55 + * is continually redirtying a specific page 56 + */ 57 + do { 58 + if (!mapping->nrpages) 59 + return 0; 60 + 61 + ret = filemap_write_and_wait_range(mapping, start, end); 62 + if (ret) 63 + break; 64 + 65 + if (!mapping->nrpages) 66 + return 0; 67 + 68 + ret = invalidate_inode_pages2_range(mapping, 69 + start >> PAGE_SHIFT, 70 + end >> PAGE_SHIFT); 71 + } while (ret == -EBUSY); 72 + 73 + return ret; 74 + } 75 + 76 + static const char * const bch2_folio_sector_states[] = { 77 + #define x(n) #n, 78 + BCH_FOLIO_SECTOR_STATE() 79 + #undef x 80 + NULL 81 + }; 82 + 83 + static inline enum bch_folio_sector_state 84 + folio_sector_dirty(enum bch_folio_sector_state state) 85 + { 86 + switch (state) { 87 + case SECTOR_unallocated: 88 + return SECTOR_dirty; 89 + case SECTOR_reserved: 90 + return SECTOR_dirty_reserved; 91 + default: 92 + return state; 93 + } 94 + } 95 + 96 + static inline enum bch_folio_sector_state 97 + folio_sector_undirty(enum bch_folio_sector_state state) 98 + { 99 + switch (state) { 100 + case SECTOR_dirty: 101 + return SECTOR_unallocated; 102 + case SECTOR_dirty_reserved: 103 + return SECTOR_reserved; 104 + default: 105 + return state; 106 + } 107 + } 108 + 109 + static inline enum bch_folio_sector_state 110 + folio_sector_reserve(enum bch_folio_sector_state state) 111 + { 112 + switch (state) { 113 + case SECTOR_unallocated: 114 + return SECTOR_reserved; 115 + case SECTOR_dirty: 116 + return SECTOR_dirty_reserved; 117 + default: 118 + return state; 119 + } 120 + } 121 + 122 + /* for newly allocated folios: */ 123 + struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) 124 + { 125 + struct bch_folio *s; 126 + 127 + s = kzalloc(sizeof(*s) + 128 + 
sizeof(struct bch_folio_sector) * 129 + folio_sectors(folio), gfp); 130 + if (!s) 131 + return NULL; 132 + 133 + spin_lock_init(&s->lock); 134 + folio_attach_private(folio, s); 135 + return s; 136 + } 137 + 138 + struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) 139 + { 140 + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); 141 + } 142 + 143 + static unsigned bkey_to_sector_state(struct bkey_s_c k) 144 + { 145 + if (bkey_extent_is_reservation(k)) 146 + return SECTOR_reserved; 147 + if (bkey_extent_is_allocation(k.k)) 148 + return SECTOR_allocated; 149 + return SECTOR_unallocated; 150 + } 151 + 152 + static void __bch2_folio_set(struct folio *folio, 153 + unsigned pg_offset, unsigned pg_len, 154 + unsigned nr_ptrs, unsigned state) 155 + { 156 + struct bch_folio *s = bch2_folio(folio); 157 + unsigned i, sectors = folio_sectors(folio); 158 + 159 + BUG_ON(pg_offset >= sectors); 160 + BUG_ON(pg_offset + pg_len > sectors); 161 + 162 + spin_lock(&s->lock); 163 + 164 + for (i = pg_offset; i < pg_offset + pg_len; i++) { 165 + s->s[i].nr_replicas = nr_ptrs; 166 + bch2_folio_sector_set(folio, s, i, state); 167 + } 168 + 169 + if (i == sectors) 170 + s->uptodate = true; 171 + 172 + spin_unlock(&s->lock); 173 + } 174 + 175 + /* 176 + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the 177 + * extents btree: 178 + */ 179 + int bch2_folio_set(struct bch_fs *c, subvol_inum inum, 180 + struct folio **folios, unsigned nr_folios) 181 + { 182 + struct btree_trans trans; 183 + struct btree_iter iter; 184 + struct bkey_s_c k; 185 + struct bch_folio *s; 186 + u64 offset = folio_sector(folios[0]); 187 + unsigned folio_idx; 188 + u32 snapshot; 189 + bool need_set = false; 190 + int ret; 191 + 192 + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { 193 + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); 194 + if (!s) 195 + return -ENOMEM; 196 + 197 + need_set |= !s->uptodate; 198 + } 199 + 200 + if (!need_set) 201 + return 0; 
202 + 203 + folio_idx = 0; 204 + bch2_trans_init(&trans, c, 0, 0); 205 + retry: 206 + bch2_trans_begin(&trans); 207 + 208 + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 209 + if (ret) 210 + goto err; 211 + 212 + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 213 + SPOS(inum.inum, offset, snapshot), 214 + BTREE_ITER_SLOTS, k, ret) { 215 + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); 216 + unsigned state = bkey_to_sector_state(k); 217 + 218 + while (folio_idx < nr_folios) { 219 + struct folio *folio = folios[folio_idx]; 220 + u64 folio_start = folio_sector(folio); 221 + u64 folio_end = folio_end_sector(folio); 222 + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - 223 + folio_start; 224 + unsigned folio_len = min(k.k->p.offset, folio_end) - 225 + folio_offset - folio_start; 226 + 227 + BUG_ON(k.k->p.offset < folio_start); 228 + BUG_ON(bkey_start_offset(k.k) > folio_end); 229 + 230 + if (!bch2_folio(folio)->uptodate) 231 + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); 232 + 233 + if (k.k->p.offset < folio_end) 234 + break; 235 + folio_idx++; 236 + } 237 + 238 + if (folio_idx == nr_folios) 239 + break; 240 + } 241 + 242 + offset = iter.pos.offset; 243 + bch2_trans_iter_exit(&trans, &iter); 244 + err: 245 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 246 + goto retry; 247 + bch2_trans_exit(&trans); 248 + 249 + return ret; 250 + } 251 + 252 + void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) 253 + { 254 + struct bvec_iter iter; 255 + struct folio_vec fv; 256 + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v 257 + ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); 258 + unsigned state = bkey_to_sector_state(k); 259 + 260 + bio_for_each_folio(fv, bio, iter) 261 + __bch2_folio_set(fv.fv_folio, 262 + fv.fv_offset >> 9, 263 + fv.fv_len >> 9, 264 + nr_ptrs, state); 265 + } 266 + 267 + void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, 268 + u64 start, u64 end) 269 + { 270 + pgoff_t index = start >> PAGE_SECTORS_SHIFT; 271 + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 272 + struct folio_batch fbatch; 273 + unsigned i, j; 274 + 275 + if (end <= start) 276 + return; 277 + 278 + folio_batch_init(&fbatch); 279 + 280 + while (filemap_get_folios(inode->v.i_mapping, 281 + &index, end_index, &fbatch)) { 282 + for (i = 0; i < folio_batch_count(&fbatch); i++) { 283 + struct folio *folio = fbatch.folios[i]; 284 + u64 folio_start = folio_sector(folio); 285 + u64 folio_end = folio_end_sector(folio); 286 + unsigned folio_offset = max(start, folio_start) - folio_start; 287 + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 288 + struct bch_folio *s; 289 + 290 + BUG_ON(end <= folio_start); 291 + 292 + folio_lock(folio); 293 + s = bch2_folio(folio); 294 + 295 + if (s) { 296 + spin_lock(&s->lock); 297 + for (j = folio_offset; j < folio_offset + folio_len; j++) 298 + s->s[j].nr_replicas = 0; 299 + spin_unlock(&s->lock); 300 + } 301 + 302 + folio_unlock(folio); 303 + } 304 + folio_batch_release(&fbatch); 305 + cond_resched(); 306 + } 307 + } 308 + 309 + void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, 310 + u64 start, u64 end) 311 + { 312 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 313 + pgoff_t index = start >> PAGE_SECTORS_SHIFT; 314 + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 315 + struct folio_batch fbatch; 316 + s64 i_sectors_delta = 0; 317 + unsigned i, j; 318 + 319 + if (end <= start) 320 + return; 321 + 322 + folio_batch_init(&fbatch); 323 + 324 + while (filemap_get_folios(inode->v.i_mapping, 325 + &index, end_index, &fbatch)) 
{ 326 + for (i = 0; i < folio_batch_count(&fbatch); i++) { 327 + struct folio *folio = fbatch.folios[i]; 328 + u64 folio_start = folio_sector(folio); 329 + u64 folio_end = folio_end_sector(folio); 330 + unsigned folio_offset = max(start, folio_start) - folio_start; 331 + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 332 + struct bch_folio *s; 333 + 334 + BUG_ON(end <= folio_start); 335 + 336 + folio_lock(folio); 337 + s = bch2_folio(folio); 338 + 339 + if (s) { 340 + spin_lock(&s->lock); 341 + for (j = folio_offset; j < folio_offset + folio_len; j++) { 342 + i_sectors_delta -= s->s[j].state == SECTOR_dirty; 343 + bch2_folio_sector_set(folio, s, j, 344 + folio_sector_reserve(s->s[j].state)); 345 + } 346 + spin_unlock(&s->lock); 347 + } 348 + 349 + folio_unlock(folio); 350 + } 351 + folio_batch_release(&fbatch); 352 + cond_resched(); 353 + } 354 + 355 + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 356 + } 357 + 358 + static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, 359 + unsigned nr_replicas) 360 + { 361 + return max(0, (int) nr_replicas - 362 + s->nr_replicas - 363 + s->replicas_reserved); 364 + } 365 + 366 + int bch2_get_folio_disk_reservation(struct bch_fs *c, 367 + struct bch_inode_info *inode, 368 + struct folio *folio, bool check_enospc) 369 + { 370 + struct bch_folio *s = bch2_folio_create(folio, 0); 371 + unsigned nr_replicas = inode_nr_replicas(c, inode); 372 + struct disk_reservation disk_res = { 0 }; 373 + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; 374 + int ret; 375 + 376 + if (!s) 377 + return -ENOMEM; 378 + 379 + for (i = 0; i < sectors; i++) 380 + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); 381 + 382 + if (!disk_res_sectors) 383 + return 0; 384 + 385 + ret = bch2_disk_reservation_get(c, &disk_res, 386 + disk_res_sectors, 1, 387 + !check_enospc 388 + ? 
BCH_DISK_RESERVATION_NOFAIL 389 + : 0); 390 + if (unlikely(ret)) 391 + return ret; 392 + 393 + for (i = 0; i < sectors; i++) 394 + s->s[i].replicas_reserved += 395 + sectors_to_reserve(&s->s[i], nr_replicas); 396 + 397 + return 0; 398 + } 399 + 400 + void bch2_folio_reservation_put(struct bch_fs *c, 401 + struct bch_inode_info *inode, 402 + struct bch2_folio_reservation *res) 403 + { 404 + bch2_disk_reservation_put(c, &res->disk); 405 + bch2_quota_reservation_put(c, inode, &res->quota); 406 + } 407 + 408 + int bch2_folio_reservation_get(struct bch_fs *c, 409 + struct bch_inode_info *inode, 410 + struct folio *folio, 411 + struct bch2_folio_reservation *res, 412 + unsigned offset, unsigned len) 413 + { 414 + struct bch_folio *s = bch2_folio_create(folio, 0); 415 + unsigned i, disk_sectors = 0, quota_sectors = 0; 416 + int ret; 417 + 418 + if (!s) 419 + return -ENOMEM; 420 + 421 + BUG_ON(!s->uptodate); 422 + 423 + for (i = round_down(offset, block_bytes(c)) >> 9; 424 + i < round_up(offset + len, block_bytes(c)) >> 9; 425 + i++) { 426 + disk_sectors += sectors_to_reserve(&s->s[i], 427 + res->disk.nr_replicas); 428 + quota_sectors += s->s[i].state == SECTOR_unallocated; 429 + } 430 + 431 + if (disk_sectors) { 432 + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); 433 + if (unlikely(ret)) 434 + return ret; 435 + } 436 + 437 + if (quota_sectors) { 438 + ret = bch2_quota_reservation_add(c, inode, &res->quota, 439 + quota_sectors, true); 440 + if (unlikely(ret)) { 441 + struct disk_reservation tmp = { 442 + .sectors = disk_sectors 443 + }; 444 + 445 + bch2_disk_reservation_put(c, &tmp); 446 + res->disk.sectors -= disk_sectors; 447 + return ret; 448 + } 449 + } 450 + 451 + return 0; 452 + } 453 + 454 + static void bch2_clear_folio_bits(struct folio *folio) 455 + { 456 + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 457 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 458 + struct bch_folio *s = bch2_folio(folio); 459 + struct disk_reservation 
disk_res = { 0 }; 460 + int i, sectors = folio_sectors(folio), dirty_sectors = 0; 461 + 462 + if (!s) 463 + return; 464 + 465 + EBUG_ON(!folio_test_locked(folio)); 466 + EBUG_ON(folio_test_writeback(folio)); 467 + 468 + for (i = 0; i < sectors; i++) { 469 + disk_res.sectors += s->s[i].replicas_reserved; 470 + s->s[i].replicas_reserved = 0; 471 + 472 + dirty_sectors -= s->s[i].state == SECTOR_dirty; 473 + bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); 474 + } 475 + 476 + bch2_disk_reservation_put(c, &disk_res); 477 + 478 + bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); 479 + 480 + bch2_folio_release(folio); 481 + } 482 + 483 + void bch2_set_folio_dirty(struct bch_fs *c, 484 + struct bch_inode_info *inode, 485 + struct folio *folio, 486 + struct bch2_folio_reservation *res, 487 + unsigned offset, unsigned len) 488 + { 489 + struct bch_folio *s = bch2_folio(folio); 490 + unsigned i, dirty_sectors = 0; 491 + 492 + WARN_ON((u64) folio_pos(folio) + offset + len > 493 + round_up((u64) i_size_read(&inode->v), block_bytes(c))); 494 + 495 + BUG_ON(!s->uptodate); 496 + 497 + spin_lock(&s->lock); 498 + 499 + for (i = round_down(offset, block_bytes(c)) >> 9; 500 + i < round_up(offset + len, block_bytes(c)) >> 9; 501 + i++) { 502 + unsigned sectors = sectors_to_reserve(&s->s[i], 503 + res->disk.nr_replicas); 504 + 505 + /* 506 + * This can happen if we race with the error path in 507 + * bch2_writepage_io_done(): 508 + */ 509 + sectors = min_t(unsigned, sectors, res->disk.sectors); 510 + 511 + s->s[i].replicas_reserved += sectors; 512 + res->disk.sectors -= sectors; 513 + 514 + dirty_sectors += s->s[i].state == SECTOR_unallocated; 515 + 516 + bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); 517 + } 518 + 519 + spin_unlock(&s->lock); 520 + 521 + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); 522 + 523 + if (!folio_test_dirty(folio)) 524 + filemap_dirty_folio(inode->v.i_mapping, folio); 525 + } 526 + 527 + vm_fault_t 
bch2_page_fault(struct vm_fault *vmf) 528 + { 529 + struct file *file = vmf->vma->vm_file; 530 + struct address_space *mapping = file->f_mapping; 531 + struct address_space *fdm = faults_disabled_mapping(); 532 + struct bch_inode_info *inode = file_bch_inode(file); 533 + vm_fault_t ret; 534 + 535 + if (fdm == mapping) 536 + return VM_FAULT_SIGBUS; 537 + 538 + /* Lock ordering: */ 539 + if (fdm > mapping) { 540 + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); 541 + 542 + if (bch2_pagecache_add_tryget(inode)) 543 + goto got_lock; 544 + 545 + bch2_pagecache_block_put(fdm_host); 546 + 547 + bch2_pagecache_add_get(inode); 548 + bch2_pagecache_add_put(inode); 549 + 550 + bch2_pagecache_block_get(fdm_host); 551 + 552 + /* Signal that lock has been dropped: */ 553 + set_fdm_dropped_locks(); 554 + return VM_FAULT_SIGBUS; 555 + } 556 + 557 + bch2_pagecache_add_get(inode); 558 + got_lock: 559 + ret = filemap_fault(vmf); 560 + bch2_pagecache_add_put(inode); 561 + 562 + return ret; 563 + } 564 + 565 + vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) 566 + { 567 + struct folio *folio = page_folio(vmf->page); 568 + struct file *file = vmf->vma->vm_file; 569 + struct bch_inode_info *inode = file_bch_inode(file); 570 + struct address_space *mapping = file->f_mapping; 571 + struct bch_fs *c = inode->v.i_sb->s_fs_info; 572 + struct bch2_folio_reservation res; 573 + unsigned len; 574 + loff_t isize; 575 + vm_fault_t ret; 576 + 577 + bch2_folio_reservation_init(c, inode, &res); 578 + 579 + sb_start_pagefault(inode->v.i_sb); 580 + file_update_time(file); 581 + 582 + /* 583 + * Not strictly necessary, but helps avoid dio writes livelocking in 584 + * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get 585 + * a bch2_write_invalidate_inode_pages_range() that works without dropping 586 + * page lock before invalidating page 587 + */ 588 + bch2_pagecache_add_get(inode); 589 + 590 + folio_lock(folio); 591 + isize = i_size_read(&inode->v); 592 + 593 + if 
(folio->mapping != mapping || folio_pos(folio) >= isize) { 594 + folio_unlock(folio); 595 + ret = VM_FAULT_NOPAGE; 596 + goto out; 597 + } 598 + 599 + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); 600 + 601 + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: 602 + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { 603 + folio_unlock(folio); 604 + ret = VM_FAULT_SIGBUS; 605 + goto out; 606 + } 607 + 608 + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); 609 + bch2_folio_reservation_put(c, inode, &res); 610 + 611 + folio_wait_stable(folio); 612 + ret = VM_FAULT_LOCKED; 613 + out: 614 + bch2_pagecache_add_put(inode); 615 + sb_end_pagefault(inode->v.i_sb); 616 + 617 + return ret; 618 + } 619 + 620 + void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) 621 + { 622 + if (offset || length < folio_size(folio)) 623 + return; 624 + 625 + bch2_clear_folio_bits(folio); 626 + } 627 + 628 + bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) 629 + { 630 + if (folio_test_dirty(folio) || folio_test_writeback(folio)) 631 + return false; 632 + 633 + bch2_clear_folio_bits(folio); 634 + return true; 635 + } 636 + 637 + /* fseek: */ 638 + 639 + static int folio_data_offset(struct folio *folio, loff_t pos, 640 + unsigned min_replicas) 641 + { 642 + struct bch_folio *s = bch2_folio(folio); 643 + unsigned i, sectors = folio_sectors(folio); 644 + 645 + if (s) 646 + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) 647 + if (s->s[i].state >= SECTOR_dirty && 648 + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) 649 + return i << SECTOR_SHIFT; 650 + 651 + return -1; 652 + } 653 + 654 + loff_t bch2_seek_pagecache_data(struct inode *vinode, 655 + loff_t start_offset, 656 + loff_t end_offset, 657 + unsigned min_replicas, 658 + bool nonblock) 659 + { 660 + struct folio_batch fbatch; 661 + pgoff_t start_index = start_offset >> PAGE_SHIFT; 662 + pgoff_t end_index = end_offset >> PAGE_SHIFT; 663 + pgoff_t 
index = start_index; 664 + unsigned i; 665 + loff_t ret; 666 + int offset; 667 + 668 + folio_batch_init(&fbatch); 669 + 670 + while (filemap_get_folios(vinode->i_mapping, 671 + &index, end_index, &fbatch)) { 672 + for (i = 0; i < folio_batch_count(&fbatch); i++) { 673 + struct folio *folio = fbatch.folios[i]; 674 + 675 + if (!nonblock) { 676 + folio_lock(folio); 677 + } else if (!folio_trylock(folio)) { 678 + folio_batch_release(&fbatch); 679 + return -EAGAIN; 680 + } 681 + 682 + offset = folio_data_offset(folio, 683 + max(folio_pos(folio), start_offset), 684 + min_replicas); 685 + if (offset >= 0) { 686 + ret = clamp(folio_pos(folio) + offset, 687 + start_offset, end_offset); 688 + folio_unlock(folio); 689 + folio_batch_release(&fbatch); 690 + return ret; 691 + } 692 + folio_unlock(folio); 693 + } 694 + folio_batch_release(&fbatch); 695 + cond_resched(); 696 + } 697 + 698 + return end_offset; 699 + } 700 + 701 + static int folio_hole_offset(struct address_space *mapping, loff_t *offset, 702 + unsigned min_replicas, bool nonblock) 703 + { 704 + struct folio *folio; 705 + struct bch_folio *s; 706 + unsigned i, sectors; 707 + bool ret = true; 708 + 709 + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, 710 + FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); 711 + if (folio == ERR_PTR(-EAGAIN)) 712 + return -EAGAIN; 713 + if (IS_ERR_OR_NULL(folio)) 714 + return true; 715 + 716 + s = bch2_folio(folio); 717 + if (!s) 718 + goto unlock; 719 + 720 + sectors = folio_sectors(folio); 721 + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) 722 + if (s->s[i].state < SECTOR_dirty || 723 + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { 724 + *offset = max(*offset, 725 + folio_pos(folio) + (i << SECTOR_SHIFT)); 726 + goto unlock; 727 + } 728 + 729 + *offset = folio_end_pos(folio); 730 + ret = false; 731 + unlock: 732 + folio_unlock(folio); 733 + folio_put(folio); 734 + return ret; 735 + } 736 + 737 + loff_t bch2_seek_pagecache_hole(struct inode *vinode, 738 + loff_t start_offset, 739 + loff_t end_offset, 740 + unsigned min_replicas, 741 + bool nonblock) 742 + { 743 + struct address_space *mapping = vinode->i_mapping; 744 + loff_t offset = start_offset; 745 + 746 + while (offset < end_offset && 747 + !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) 748 + ; 749 + 750 + return min(offset, end_offset); 751 + } 752 + 753 + int bch2_clamp_data_hole(struct inode *inode, 754 + u64 *hole_start, 755 + u64 *hole_end, 756 + unsigned min_replicas, 757 + bool nonblock) 758 + { 759 + loff_t ret; 760 + 761 + ret = bch2_seek_pagecache_hole(inode, 762 + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; 763 + if (ret < 0) 764 + return ret; 765 + 766 + *hole_start = ret; 767 + 768 + if (*hole_start == *hole_end) 769 + return 0; 770 + 771 + ret = bch2_seek_pagecache_data(inode, 772 + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; 773 + if (ret < 0) 774 + return ret; 775 + 776 + *hole_end = ret; 777 + return 0; 778 + } 779 + 780 + #endif /* NO_BCACHEFS_FS */
+176
fs/bcachefs/fs-io-pagecache.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BCACHEFS_FS_IO_PAGECACHE_H 3 + #define _BCACHEFS_FS_IO_PAGECACHE_H 4 + 5 + #include <linux/pagemap.h> 6 + 7 + typedef DARRAY(struct folio *) folios; 8 + 9 + int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, 10 + u64, int, gfp_t, folios *); 11 + int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); 12 + 13 + /* 14 + * Use u64 for the end pos and sector helpers because if the folio covers the 15 + * max supported range of the mapping, the start offset of the next folio 16 + * overflows loff_t. This breaks much of the range based processing in the 17 + * buffered write path. 18 + */ 19 + static inline u64 folio_end_pos(struct folio *folio) 20 + { 21 + return folio_pos(folio) + folio_size(folio); 22 + } 23 + 24 + static inline size_t folio_sectors(struct folio *folio) 25 + { 26 + return PAGE_SECTORS << folio_order(folio); 27 + } 28 + 29 + static inline loff_t folio_sector(struct folio *folio) 30 + { 31 + return folio_pos(folio) >> 9; 32 + } 33 + 34 + static inline u64 folio_end_sector(struct folio *folio) 35 + { 36 + return folio_end_pos(folio) >> 9; 37 + } 38 + 39 + #define BCH_FOLIO_SECTOR_STATE() \ 40 + x(unallocated) \ 41 + x(reserved) \ 42 + x(dirty) \ 43 + x(dirty_reserved) \ 44 + x(allocated) 45 + 46 + enum bch_folio_sector_state { 47 + #define x(n) SECTOR_##n, 48 + BCH_FOLIO_SECTOR_STATE() 49 + #undef x 50 + }; 51 + 52 + struct bch_folio_sector { 53 + /* Uncompressed, fully allocated replicas (or on disk reservation): */ 54 + unsigned nr_replicas:4; 55 + 56 + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ 57 + unsigned replicas_reserved:4; 58 + 59 + /* i_sectors: */ 60 + enum bch_folio_sector_state state:8; 61 + }; 62 + 63 + struct bch_folio { 64 + spinlock_t lock; 65 + atomic_t write_count; 66 + /* 67 + * Is the sector state up to date with the btree? 
68 + * (Not the data itself) 69 + */ 70 + bool uptodate; 71 + struct bch_folio_sector s[]; 72 + }; 73 + 74 + /* Helper for when we need to add debug instrumentation: */ 75 + static inline void bch2_folio_sector_set(struct folio *folio, 76 + struct bch_folio *s, 77 + unsigned i, unsigned n) 78 + { 79 + s->s[i].state = n; 80 + } 81 + 82 + /* file offset (to folio offset) to bch_folio_sector index */ 83 + static inline int folio_pos_to_s(struct folio *folio, loff_t pos) 84 + { 85 + u64 f_offset = pos - folio_pos(folio); 86 + 87 + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); 88 + return f_offset >> SECTOR_SHIFT; 89 + } 90 + 91 + /* for newly allocated folios: */ 92 + static inline void __bch2_folio_release(struct folio *folio) 93 + { 94 + kfree(folio_detach_private(folio)); 95 + } 96 + 97 + static inline void bch2_folio_release(struct folio *folio) 98 + { 99 + EBUG_ON(!folio_test_locked(folio)); 100 + __bch2_folio_release(folio); 101 + } 102 + 103 + static inline struct bch_folio *__bch2_folio(struct folio *folio) 104 + { 105 + return folio_has_private(folio) 106 + ? (struct bch_folio *) folio_get_private(folio) 107 + : NULL; 108 + } 109 + 110 + static inline struct bch_folio *bch2_folio(struct folio *folio) 111 + { 112 + EBUG_ON(!folio_test_locked(folio)); 113 + 114 + return __bch2_folio(folio); 115 + } 116 + 117 + struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); 118 + struct bch_folio *bch2_folio_create(struct folio *, gfp_t); 119 + 120 + struct bch2_folio_reservation { 121 + struct disk_reservation disk; 122 + struct quota_res quota; 123 + }; 124 + 125 + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) 126 + { 127 + /* XXX: this should not be open coded */ 128 + return inode->ei_inode.bi_data_replicas 129 + ? 
inode->ei_inode.bi_data_replicas - 1 130 + : c->opts.data_replicas; 131 + } 132 + 133 + static inline void bch2_folio_reservation_init(struct bch_fs *c, 134 + struct bch_inode_info *inode, 135 + struct bch2_folio_reservation *res) 136 + { 137 + memset(res, 0, sizeof(*res)); 138 + 139 + res->disk.nr_replicas = inode_nr_replicas(c, inode); 140 + } 141 + 142 + int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); 143 + void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); 144 + 145 + void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); 146 + void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); 147 + 148 + int bch2_get_folio_disk_reservation(struct bch_fs *, 149 + struct bch_inode_info *, 150 + struct folio *, bool); 151 + 152 + void bch2_folio_reservation_put(struct bch_fs *, 153 + struct bch_inode_info *, 154 + struct bch2_folio_reservation *); 155 + int bch2_folio_reservation_get(struct bch_fs *, 156 + struct bch_inode_info *, 157 + struct folio *, 158 + struct bch2_folio_reservation *, 159 + unsigned, unsigned); 160 + 161 + void bch2_set_folio_dirty(struct bch_fs *, 162 + struct bch_inode_info *, 163 + struct folio *, 164 + struct bch2_folio_reservation *, 165 + unsigned, unsigned); 166 + 167 + vm_fault_t bch2_page_fault(struct vm_fault *); 168 + vm_fault_t bch2_page_mkwrite(struct vm_fault *); 169 + void bch2_invalidate_folio(struct folio *, size_t, size_t); 170 + bool bch2_release_folio(struct folio *, gfp_t); 171 + 172 + loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); 173 + loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); 174 + int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); 175 + 176 + #endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
+18 -2783
fs/bcachefs/fs-io.c
··· 3 3 4 4 #include "bcachefs.h" 5 5 #include "alloc_foreground.h" 6 - #include "bkey_buf.h" 7 6 #include "btree_update.h" 8 7 #include "buckets.h" 9 8 #include "clock.h" ··· 11 12 #include "extent_update.h" 12 13 #include "fs.h" 13 14 #include "fs-io.h" 15 + #include "fs-io-buffered.h" 16 + #include "fs-io-pagecache.h" 14 17 #include "fsck.h" 15 18 #include "inode.h" 16 19 #include "journal.h" ··· 32 31 #include <linux/sched/signal.h> 33 32 #include <linux/task_io_accounting_ops.h> 34 33 #include <linux/uio.h> 35 - #include <linux/writeback.h> 36 34 37 35 #include <trace/events/writeback.h> 38 - 39 - static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); 40 - 41 - struct folio_vec { 42 - struct folio *fv_folio; 43 - size_t fv_offset; 44 - size_t fv_len; 45 - }; 46 - 47 - static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) 48 - { 49 - 50 - struct folio *folio = page_folio(bv.bv_page); 51 - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + 52 - bv.bv_offset; 53 - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); 54 - 55 - return (struct folio_vec) { 56 - .fv_folio = folio, 57 - .fv_offset = offset, 58 - .fv_len = len, 59 - }; 60 - } 61 - 62 - static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, 63 - struct bvec_iter iter) 64 - { 65 - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); 66 - } 67 - 68 - #define __bio_for_each_folio(bvl, bio, iter, start) \ 69 - for (iter = (start); \ 70 - (iter).bi_size && \ 71 - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ 72 - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) 73 - 74 - /** 75 - * bio_for_each_folio - iterate over folios within a bio 76 - * 77 - * Like other non-_all versions, this iterates over what bio->bi_iter currently 78 - * points to. This version is for drivers, where the bio may have previously 79 - * been split or cloned. 
80 - */ 81 - #define bio_for_each_folio(bvl, bio, iter) \ 82 - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) 83 - 84 - /* 85 - * Use u64 for the end pos and sector helpers because if the folio covers the 86 - * max supported range of the mapping, the start offset of the next folio 87 - * overflows loff_t. This breaks much of the range based processing in the 88 - * buffered write path. 89 - */ 90 - static inline u64 folio_end_pos(struct folio *folio) 91 - { 92 - return folio_pos(folio) + folio_size(folio); 93 - } 94 - 95 - static inline size_t folio_sectors(struct folio *folio) 96 - { 97 - return PAGE_SECTORS << folio_order(folio); 98 - } 99 - 100 - static inline loff_t folio_sector(struct folio *folio) 101 - { 102 - return folio_pos(folio) >> 9; 103 - } 104 - 105 - static inline u64 folio_end_sector(struct folio *folio) 106 - { 107 - return folio_end_pos(folio) >> 9; 108 - } 109 - 110 - typedef DARRAY(struct folio *) folios; 111 - 112 - static int filemap_get_contig_folios_d(struct address_space *mapping, 113 - loff_t start, u64 end, 114 - int fgp_flags, gfp_t gfp, 115 - folios *folios) 116 - { 117 - struct folio *f; 118 - u64 pos = start; 119 - int ret = 0; 120 - 121 - while (pos < end) { 122 - if ((u64) pos >= (u64) start + (1ULL << 20)) 123 - fgp_flags &= ~FGP_CREAT; 124 - 125 - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); 126 - if (ret) 127 - break; 128 - 129 - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); 130 - if (IS_ERR_OR_NULL(f)) 131 - break; 132 - 133 - BUG_ON(folios->nr && folio_pos(f) != pos); 134 - 135 - pos = folio_end_pos(f); 136 - darray_push(folios, f); 137 - } 138 - 139 - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) 140 - ret = -ENOMEM; 141 - 142 - return folios->nr ? 
0 : ret; 143 - } 144 36 145 37 struct nocow_flush { 146 38 struct closure *cl; ··· 51 157 bio_put(&bio->bio); 52 158 } 53 159 54 - static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 55 - struct bch_inode_info *inode, 56 - struct closure *cl) 160 + void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, 161 + struct bch_inode_info *inode, 162 + struct closure *cl) 57 163 { 58 164 struct nocow_flush *bio; 59 165 struct bch_dev *ca; ··· 101 207 return 0; 102 208 } 103 209 104 - static inline bool bio_full(struct bio *bio, unsigned len) 105 - { 106 - if (bio->bi_vcnt >= bio->bi_max_vecs) 107 - return true; 108 - if (bio->bi_iter.bi_size > UINT_MAX - len) 109 - return true; 110 - return false; 111 - } 112 - 113 - static inline struct address_space *faults_disabled_mapping(void) 114 - { 115 - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); 116 - } 117 - 118 - static inline void set_fdm_dropped_locks(void) 119 - { 120 - current->faults_disabled_mapping = 121 - (void *) (((unsigned long) current->faults_disabled_mapping)|1); 122 - } 123 - 124 - static inline bool fdm_dropped_locks(void) 125 - { 126 - return ((unsigned long) current->faults_disabled_mapping) & 1; 127 - } 128 - 129 - struct quota_res { 130 - u64 sectors; 131 - }; 132 - 133 - struct bch_writepage_io { 134 - struct bch_inode_info *inode; 135 - 136 - /* must be last: */ 137 - struct bch_write_op op; 138 - }; 139 - 140 - struct dio_write { 141 - struct kiocb *req; 142 - struct address_space *mapping; 143 - struct bch_inode_info *inode; 144 - struct mm_struct *mm; 145 - unsigned loop:1, 146 - extending:1, 147 - sync:1, 148 - flush:1, 149 - free_iov:1; 150 - struct quota_res quota_res; 151 - u64 written; 152 - 153 - struct iov_iter iter; 154 - struct iovec inline_vecs[2]; 155 - 156 - /* must be last: */ 157 - struct bch_write_op op; 158 - }; 159 - 160 - struct dio_read { 161 - struct closure cl; 162 - struct kiocb *req; 163 - long ret; 164 - bool should_dirty; 165 - 
struct bch_read_bio rbio; 166 - }; 167 - 168 - /* pagecache_block must be held */ 169 - static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, 170 - loff_t start, loff_t end) 171 - { 172 - int ret; 173 - 174 - /* 175 - * XXX: the way this is currently implemented, we can spin if a process 176 - * is continually redirtying a specific page 177 - */ 178 - do { 179 - if (!mapping->nrpages) 180 - return 0; 181 - 182 - ret = filemap_write_and_wait_range(mapping, start, end); 183 - if (ret) 184 - break; 185 - 186 - if (!mapping->nrpages) 187 - return 0; 188 - 189 - ret = invalidate_inode_pages2_range(mapping, 190 - start >> PAGE_SHIFT, 191 - end >> PAGE_SHIFT); 192 - } while (ret == -EBUSY); 193 - 194 - return ret; 195 - } 196 - 197 - /* quotas */ 198 - 199 - #ifdef CONFIG_BCACHEFS_QUOTA 200 - 201 - static void __bch2_quota_reservation_put(struct bch_fs *c, 202 - struct bch_inode_info *inode, 203 - struct quota_res *res) 204 - { 205 - BUG_ON(res->sectors > inode->ei_quota_reserved); 206 - 207 - bch2_quota_acct(c, inode->ei_qid, Q_SPC, 208 - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); 209 - inode->ei_quota_reserved -= res->sectors; 210 - res->sectors = 0; 211 - } 212 - 213 - static void bch2_quota_reservation_put(struct bch_fs *c, 214 - struct bch_inode_info *inode, 215 - struct quota_res *res) 216 - { 217 - if (res->sectors) { 218 - mutex_lock(&inode->ei_quota_lock); 219 - __bch2_quota_reservation_put(c, inode, res); 220 - mutex_unlock(&inode->ei_quota_lock); 221 - } 222 - } 223 - 224 - static int bch2_quota_reservation_add(struct bch_fs *c, 225 - struct bch_inode_info *inode, 226 - struct quota_res *res, 227 - u64 sectors, 228 - bool check_enospc) 229 - { 230 - int ret; 231 - 232 - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) 233 - return 0; 234 - 235 - mutex_lock(&inode->ei_quota_lock); 236 - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, 237 - check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); 238 - if (likely(!ret)) { 239 - inode->ei_quota_reserved += sectors; 240 - res->sectors += sectors; 241 - } 242 - mutex_unlock(&inode->ei_quota_lock); 243 - 244 - return ret; 245 - } 246 - 247 - #else 248 - 249 - static void __bch2_quota_reservation_put(struct bch_fs *c, 250 - struct bch_inode_info *inode, 251 - struct quota_res *res) {} 252 - 253 - static void bch2_quota_reservation_put(struct bch_fs *c, 254 - struct bch_inode_info *inode, 255 - struct quota_res *res) {} 256 - 257 - static int bch2_quota_reservation_add(struct bch_fs *c, 258 - struct bch_inode_info *inode, 259 - struct quota_res *res, 260 - unsigned sectors, 261 - bool check_enospc) 262 - { 263 - return 0; 264 - } 265 - 266 - #endif 267 - 268 210 /* i_size updates: */ 269 211 270 212 struct inode_new_size { ··· 139 409 return bch2_write_inode(c, inode, inode_set_size, &s, fields); 140 410 } 141 411 142 - static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 412 + void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 143 413 struct quota_res *quota_res, s64 sectors) 144 414 { 145 415 bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, ··· 161 431 bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); 162 432 } 163 433 #endif 164 - } 165 - 166 - static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 167 - struct quota_res *quota_res, s64 sectors) 168 - { 169 - if (sectors) { 170 - mutex_lock(&inode->ei_quota_lock); 171 - __i_sectors_acct(c, inode, quota_res, sectors); 172 - mutex_unlock(&inode->ei_quota_lock); 173 - } 174 - } 175 - 176 - /* page state: */ 177 - 178 - /* stored in page->private: */ 179 - 180 - #define BCH_FOLIO_SECTOR_STATE() \ 181 - x(unallocated) \ 182 - x(reserved) \ 183 - x(dirty) \ 184 - x(dirty_reserved) \ 185 - x(allocated) 186 - 187 - enum bch_folio_sector_state { 188 - #define x(n) SECTOR_##n, 189 - BCH_FOLIO_SECTOR_STATE() 190 - 
#undef x 191 - }; 192 - 193 - static const char * const bch2_folio_sector_states[] = { 194 - #define x(n) #n, 195 - BCH_FOLIO_SECTOR_STATE() 196 - #undef x 197 - NULL 198 - }; 199 - 200 - static inline enum bch_folio_sector_state 201 - folio_sector_dirty(enum bch_folio_sector_state state) 202 - { 203 - switch (state) { 204 - case SECTOR_unallocated: 205 - return SECTOR_dirty; 206 - case SECTOR_reserved: 207 - return SECTOR_dirty_reserved; 208 - default: 209 - return state; 210 - } 211 - } 212 - 213 - static inline enum bch_folio_sector_state 214 - folio_sector_undirty(enum bch_folio_sector_state state) 215 - { 216 - switch (state) { 217 - case SECTOR_dirty: 218 - return SECTOR_unallocated; 219 - case SECTOR_dirty_reserved: 220 - return SECTOR_reserved; 221 - default: 222 - return state; 223 - } 224 - } 225 - 226 - static inline enum bch_folio_sector_state 227 - folio_sector_reserve(enum bch_folio_sector_state state) 228 - { 229 - switch (state) { 230 - case SECTOR_unallocated: 231 - return SECTOR_reserved; 232 - case SECTOR_dirty: 233 - return SECTOR_dirty_reserved; 234 - default: 235 - return state; 236 - } 237 - } 238 - 239 - struct bch_folio_sector { 240 - /* Uncompressed, fully allocated replicas (or on disk reservation): */ 241 - unsigned nr_replicas:4; 242 - 243 - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ 244 - unsigned replicas_reserved:4; 245 - 246 - /* i_sectors: */ 247 - enum bch_folio_sector_state state:8; 248 - }; 249 - 250 - struct bch_folio { 251 - spinlock_t lock; 252 - atomic_t write_count; 253 - /* 254 - * Is the sector state up to date with the btree? 
255 - * (Not the data itself) 256 - */ 257 - bool uptodate; 258 - struct bch_folio_sector s[]; 259 - }; 260 - 261 - static inline void folio_sector_set(struct folio *folio, 262 - struct bch_folio *s, 263 - unsigned i, unsigned n) 264 - { 265 - s->s[i].state = n; 266 - } 267 - 268 - /* file offset (to folio offset) to bch_folio_sector index */ 269 - static inline int folio_pos_to_s(struct folio *folio, loff_t pos) 270 - { 271 - u64 f_offset = pos - folio_pos(folio); 272 - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); 273 - return f_offset >> SECTOR_SHIFT; 274 - } 275 - 276 - static inline struct bch_folio *__bch2_folio(struct folio *folio) 277 - { 278 - return folio_has_private(folio) 279 - ? (struct bch_folio *) folio_get_private(folio) 280 - : NULL; 281 - } 282 - 283 - static inline struct bch_folio *bch2_folio(struct folio *folio) 284 - { 285 - EBUG_ON(!folio_test_locked(folio)); 286 - 287 - return __bch2_folio(folio); 288 - } 289 - 290 - /* for newly allocated folios: */ 291 - static void __bch2_folio_release(struct folio *folio) 292 - { 293 - kfree(folio_detach_private(folio)); 294 - } 295 - 296 - static void bch2_folio_release(struct folio *folio) 297 - { 298 - EBUG_ON(!folio_test_locked(folio)); 299 - __bch2_folio_release(folio); 300 - } 301 - 302 - /* for newly allocated folios: */ 303 - static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) 304 - { 305 - struct bch_folio *s; 306 - 307 - s = kzalloc(sizeof(*s) + 308 - sizeof(struct bch_folio_sector) * 309 - folio_sectors(folio), gfp); 310 - if (!s) 311 - return NULL; 312 - 313 - spin_lock_init(&s->lock); 314 - folio_attach_private(folio, s); 315 - return s; 316 - } 317 - 318 - static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) 319 - { 320 - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); 321 - } 322 - 323 - static unsigned bkey_to_sector_state(struct bkey_s_c k) 324 - { 325 - if (bkey_extent_is_reservation(k)) 326 - return 
SECTOR_reserved; 327 - if (bkey_extent_is_allocation(k.k)) 328 - return SECTOR_allocated; 329 - return SECTOR_unallocated; 330 - } 331 - 332 - static void __bch2_folio_set(struct folio *folio, 333 - unsigned pg_offset, unsigned pg_len, 334 - unsigned nr_ptrs, unsigned state) 335 - { 336 - struct bch_folio *s = bch2_folio(folio); 337 - unsigned i, sectors = folio_sectors(folio); 338 - 339 - BUG_ON(pg_offset >= sectors); 340 - BUG_ON(pg_offset + pg_len > sectors); 341 - 342 - spin_lock(&s->lock); 343 - 344 - for (i = pg_offset; i < pg_offset + pg_len; i++) { 345 - s->s[i].nr_replicas = nr_ptrs; 346 - folio_sector_set(folio, s, i, state); 347 - } 348 - 349 - if (i == sectors) 350 - s->uptodate = true; 351 - 352 - spin_unlock(&s->lock); 353 - } 354 - 355 - /* 356 - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the 357 - * extents btree: 358 - */ 359 - static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, 360 - struct folio **folios, unsigned nr_folios) 361 - { 362 - struct btree_trans trans; 363 - struct btree_iter iter; 364 - struct bkey_s_c k; 365 - struct bch_folio *s; 366 - u64 offset = folio_sector(folios[0]); 367 - unsigned folio_idx; 368 - u32 snapshot; 369 - bool need_set = false; 370 - int ret; 371 - 372 - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { 373 - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); 374 - if (!s) 375 - return -ENOMEM; 376 - 377 - need_set |= !s->uptodate; 378 - } 379 - 380 - if (!need_set) 381 - return 0; 382 - 383 - folio_idx = 0; 384 - bch2_trans_init(&trans, c, 0, 0); 385 - retry: 386 - bch2_trans_begin(&trans); 387 - 388 - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 389 - if (ret) 390 - goto err; 391 - 392 - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 393 - SPOS(inum.inum, offset, snapshot), 394 - BTREE_ITER_SLOTS, k, ret) { 395 - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); 396 - unsigned state = bkey_to_sector_state(k); 397 - 398 - 
while (folio_idx < nr_folios) { 399 - struct folio *folio = folios[folio_idx]; 400 - u64 folio_start = folio_sector(folio); 401 - u64 folio_end = folio_end_sector(folio); 402 - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; 403 - unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; 404 - 405 - BUG_ON(k.k->p.offset < folio_start); 406 - BUG_ON(bkey_start_offset(k.k) > folio_end); 407 - 408 - if (!bch2_folio(folio)->uptodate) 409 - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); 410 - 411 - if (k.k->p.offset < folio_end) 412 - break; 413 - folio_idx++; 414 - } 415 - 416 - if (folio_idx == nr_folios) 417 - break; 418 - } 419 - 420 - offset = iter.pos.offset; 421 - bch2_trans_iter_exit(&trans, &iter); 422 - err: 423 - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 424 - goto retry; 425 - bch2_trans_exit(&trans); 426 - 427 - return ret; 428 - } 429 - 430 - static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) 431 - { 432 - struct bvec_iter iter; 433 - struct folio_vec fv; 434 - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v 435 - ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); 436 - unsigned state = bkey_to_sector_state(k); 437 - 438 - bio_for_each_folio(fv, bio, iter) 439 - __bch2_folio_set(fv.fv_folio, 440 - fv.fv_offset >> 9, 441 - fv.fv_len >> 9, 442 - nr_ptrs, state); 443 - } 444 - 445 - static void mark_pagecache_unallocated(struct bch_inode_info *inode, 446 - u64 start, u64 end) 447 - { 448 - pgoff_t index = start >> PAGE_SECTORS_SHIFT; 449 - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 450 - struct folio_batch fbatch; 451 - unsigned i, j; 452 - 453 - if (end <= start) 454 - return; 455 - 456 - folio_batch_init(&fbatch); 457 - 458 - while (filemap_get_folios(inode->v.i_mapping, 459 - &index, end_index, &fbatch)) { 460 - for (i = 0; i < folio_batch_count(&fbatch); i++) { 461 - struct folio *folio = fbatch.folios[i]; 462 - u64 folio_start = folio_sector(folio); 463 - u64 folio_end = folio_end_sector(folio); 464 - unsigned folio_offset = max(start, folio_start) - folio_start; 465 - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 466 - struct bch_folio *s; 467 - 468 - BUG_ON(end <= folio_start); 469 - 470 - folio_lock(folio); 471 - s = bch2_folio(folio); 472 - 473 - if (s) { 474 - spin_lock(&s->lock); 475 - for (j = folio_offset; j < folio_offset + folio_len; j++) 476 - s->s[j].nr_replicas = 0; 477 - spin_unlock(&s->lock); 478 - } 479 - 480 - folio_unlock(folio); 481 - } 482 - folio_batch_release(&fbatch); 483 - cond_resched(); 484 - } 485 - } 486 - 487 - static void mark_pagecache_reserved(struct bch_inode_info *inode, 488 - u64 start, u64 end) 489 - { 490 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 491 - pgoff_t index = start >> PAGE_SECTORS_SHIFT; 492 - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; 493 - struct folio_batch fbatch; 494 - s64 i_sectors_delta = 0; 495 - unsigned i, j; 496 - 497 - if (end <= start) 498 - return; 499 - 500 - folio_batch_init(&fbatch); 501 - 502 - while (filemap_get_folios(inode->v.i_mapping, 503 - &index, end_index, 
&fbatch)) { 504 - for (i = 0; i < folio_batch_count(&fbatch); i++) { 505 - struct folio *folio = fbatch.folios[i]; 506 - u64 folio_start = folio_sector(folio); 507 - u64 folio_end = folio_end_sector(folio); 508 - unsigned folio_offset = max(start, folio_start) - folio_start; 509 - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; 510 - struct bch_folio *s; 511 - 512 - BUG_ON(end <= folio_start); 513 - 514 - folio_lock(folio); 515 - s = bch2_folio(folio); 516 - 517 - if (s) { 518 - spin_lock(&s->lock); 519 - for (j = folio_offset; j < folio_offset + folio_len; j++) { 520 - i_sectors_delta -= s->s[j].state == SECTOR_dirty; 521 - folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); 522 - } 523 - spin_unlock(&s->lock); 524 - } 525 - 526 - folio_unlock(folio); 527 - } 528 - folio_batch_release(&fbatch); 529 - cond_resched(); 530 - } 531 - 532 - i_sectors_acct(c, inode, NULL, i_sectors_delta); 533 - } 534 - 535 - static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) 536 - { 537 - /* XXX: this should not be open coded */ 538 - return inode->ei_inode.bi_data_replicas 539 - ? 
inode->ei_inode.bi_data_replicas - 1 540 - : c->opts.data_replicas; 541 - } 542 - 543 - static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, 544 - unsigned nr_replicas) 545 - { 546 - return max(0, (int) nr_replicas - 547 - s->nr_replicas - 548 - s->replicas_reserved); 549 - } 550 - 551 - static int bch2_get_folio_disk_reservation(struct bch_fs *c, 552 - struct bch_inode_info *inode, 553 - struct folio *folio, bool check_enospc) 554 - { 555 - struct bch_folio *s = bch2_folio_create(folio, 0); 556 - unsigned nr_replicas = inode_nr_replicas(c, inode); 557 - struct disk_reservation disk_res = { 0 }; 558 - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; 559 - int ret; 560 - 561 - if (!s) 562 - return -ENOMEM; 563 - 564 - for (i = 0; i < sectors; i++) 565 - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); 566 - 567 - if (!disk_res_sectors) 568 - return 0; 569 - 570 - ret = bch2_disk_reservation_get(c, &disk_res, 571 - disk_res_sectors, 1, 572 - !check_enospc 573 - ? 
BCH_DISK_RESERVATION_NOFAIL 574 - : 0); 575 - if (unlikely(ret)) 576 - return ret; 577 - 578 - for (i = 0; i < sectors; i++) 579 - s->s[i].replicas_reserved += 580 - sectors_to_reserve(&s->s[i], nr_replicas); 581 - 582 - return 0; 583 - } 584 - 585 - struct bch2_folio_reservation { 586 - struct disk_reservation disk; 587 - struct quota_res quota; 588 - }; 589 - 590 - static void bch2_folio_reservation_init(struct bch_fs *c, 591 - struct bch_inode_info *inode, 592 - struct bch2_folio_reservation *res) 593 - { 594 - memset(res, 0, sizeof(*res)); 595 - 596 - res->disk.nr_replicas = inode_nr_replicas(c, inode); 597 - } 598 - 599 - static void bch2_folio_reservation_put(struct bch_fs *c, 600 - struct bch_inode_info *inode, 601 - struct bch2_folio_reservation *res) 602 - { 603 - bch2_disk_reservation_put(c, &res->disk); 604 - bch2_quota_reservation_put(c, inode, &res->quota); 605 - } 606 - 607 - static int bch2_folio_reservation_get(struct bch_fs *c, 608 - struct bch_inode_info *inode, 609 - struct folio *folio, 610 - struct bch2_folio_reservation *res, 611 - unsigned offset, unsigned len) 612 - { 613 - struct bch_folio *s = bch2_folio_create(folio, 0); 614 - unsigned i, disk_sectors = 0, quota_sectors = 0; 615 - int ret; 616 - 617 - if (!s) 618 - return -ENOMEM; 619 - 620 - BUG_ON(!s->uptodate); 621 - 622 - for (i = round_down(offset, block_bytes(c)) >> 9; 623 - i < round_up(offset + len, block_bytes(c)) >> 9; 624 - i++) { 625 - disk_sectors += sectors_to_reserve(&s->s[i], 626 - res->disk.nr_replicas); 627 - quota_sectors += s->s[i].state == SECTOR_unallocated; 628 - } 629 - 630 - if (disk_sectors) { 631 - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); 632 - if (unlikely(ret)) 633 - return ret; 634 - } 635 - 636 - if (quota_sectors) { 637 - ret = bch2_quota_reservation_add(c, inode, &res->quota, 638 - quota_sectors, true); 639 - if (unlikely(ret)) { 640 - struct disk_reservation tmp = { 641 - .sectors = disk_sectors 642 - }; 643 - 644 - 
bch2_disk_reservation_put(c, &tmp); 645 - res->disk.sectors -= disk_sectors; 646 - return ret; 647 - } 648 - } 649 - 650 - return 0; 651 - } 652 - 653 - static void bch2_clear_folio_bits(struct folio *folio) 654 - { 655 - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 656 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 657 - struct bch_folio *s = bch2_folio(folio); 658 - struct disk_reservation disk_res = { 0 }; 659 - int i, sectors = folio_sectors(folio), dirty_sectors = 0; 660 - 661 - if (!s) 662 - return; 663 - 664 - EBUG_ON(!folio_test_locked(folio)); 665 - EBUG_ON(folio_test_writeback(folio)); 666 - 667 - for (i = 0; i < sectors; i++) { 668 - disk_res.sectors += s->s[i].replicas_reserved; 669 - s->s[i].replicas_reserved = 0; 670 - 671 - dirty_sectors -= s->s[i].state == SECTOR_dirty; 672 - folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); 673 - } 674 - 675 - bch2_disk_reservation_put(c, &disk_res); 676 - 677 - i_sectors_acct(c, inode, NULL, dirty_sectors); 678 - 679 - bch2_folio_release(folio); 680 - } 681 - 682 - static void bch2_set_folio_dirty(struct bch_fs *c, 683 - struct bch_inode_info *inode, 684 - struct folio *folio, 685 - struct bch2_folio_reservation *res, 686 - unsigned offset, unsigned len) 687 - { 688 - struct bch_folio *s = bch2_folio(folio); 689 - unsigned i, dirty_sectors = 0; 690 - 691 - WARN_ON((u64) folio_pos(folio) + offset + len > 692 - round_up((u64) i_size_read(&inode->v), block_bytes(c))); 693 - 694 - BUG_ON(!s->uptodate); 695 - 696 - spin_lock(&s->lock); 697 - 698 - for (i = round_down(offset, block_bytes(c)) >> 9; 699 - i < round_up(offset + len, block_bytes(c)) >> 9; 700 - i++) { 701 - unsigned sectors = sectors_to_reserve(&s->s[i], 702 - res->disk.nr_replicas); 703 - 704 - /* 705 - * This can happen if we race with the error path in 706 - * bch2_writepage_io_done(): 707 - */ 708 - sectors = min_t(unsigned, sectors, res->disk.sectors); 709 - 710 - s->s[i].replicas_reserved += sectors; 711 - 
res->disk.sectors -= sectors; 712 - 713 - dirty_sectors += s->s[i].state == SECTOR_unallocated; 714 - 715 - folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); 716 - } 717 - 718 - spin_unlock(&s->lock); 719 - 720 - i_sectors_acct(c, inode, &res->quota, dirty_sectors); 721 - 722 - if (!folio_test_dirty(folio)) 723 - filemap_dirty_folio(inode->v.i_mapping, folio); 724 - } 725 - 726 - vm_fault_t bch2_page_fault(struct vm_fault *vmf) 727 - { 728 - struct file *file = vmf->vma->vm_file; 729 - struct address_space *mapping = file->f_mapping; 730 - struct address_space *fdm = faults_disabled_mapping(); 731 - struct bch_inode_info *inode = file_bch_inode(file); 732 - vm_fault_t ret; 733 - 734 - if (fdm == mapping) 735 - return VM_FAULT_SIGBUS; 736 - 737 - /* Lock ordering: */ 738 - if (fdm > mapping) { 739 - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); 740 - 741 - if (bch2_pagecache_add_tryget(inode)) 742 - goto got_lock; 743 - 744 - bch2_pagecache_block_put(fdm_host); 745 - 746 - bch2_pagecache_add_get(inode); 747 - bch2_pagecache_add_put(inode); 748 - 749 - bch2_pagecache_block_get(fdm_host); 750 - 751 - /* Signal that lock has been dropped: */ 752 - set_fdm_dropped_locks(); 753 - return VM_FAULT_SIGBUS; 754 - } 755 - 756 - bch2_pagecache_add_get(inode); 757 - got_lock: 758 - ret = filemap_fault(vmf); 759 - bch2_pagecache_add_put(inode); 760 - 761 - return ret; 762 - } 763 - 764 - vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) 765 - { 766 - struct folio *folio = page_folio(vmf->page); 767 - struct file *file = vmf->vma->vm_file; 768 - struct bch_inode_info *inode = file_bch_inode(file); 769 - struct address_space *mapping = file->f_mapping; 770 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 771 - struct bch2_folio_reservation res; 772 - unsigned len; 773 - loff_t isize; 774 - vm_fault_t ret; 775 - 776 - bch2_folio_reservation_init(c, inode, &res); 777 - 778 - sb_start_pagefault(inode->v.i_sb); 779 - file_update_time(file); 780 - 781 - /* 782 
- * Not strictly necessary, but helps avoid dio writes livelocking in 783 - * write_invalidate_inode_pages_range() - can drop this if/when we get 784 - * a write_invalidate_inode_pages_range() that works without dropping 785 - * page lock before invalidating page 786 - */ 787 - bch2_pagecache_add_get(inode); 788 - 789 - folio_lock(folio); 790 - isize = i_size_read(&inode->v); 791 - 792 - if (folio->mapping != mapping || folio_pos(folio) >= isize) { 793 - folio_unlock(folio); 794 - ret = VM_FAULT_NOPAGE; 795 - goto out; 796 - } 797 - 798 - len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); 799 - 800 - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: 801 - bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { 802 - folio_unlock(folio); 803 - ret = VM_FAULT_SIGBUS; 804 - goto out; 805 - } 806 - 807 - bch2_set_folio_dirty(c, inode, folio, &res, 0, len); 808 - bch2_folio_reservation_put(c, inode, &res); 809 - 810 - folio_wait_stable(folio); 811 - ret = VM_FAULT_LOCKED; 812 - out: 813 - bch2_pagecache_add_put(inode); 814 - sb_end_pagefault(inode->v.i_sb); 815 - 816 - return ret; 817 - } 818 - 819 - void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) 820 - { 821 - if (offset || length < folio_size(folio)) 822 - return; 823 - 824 - bch2_clear_folio_bits(folio); 825 - } 826 - 827 - bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) 828 - { 829 - if (folio_test_dirty(folio) || folio_test_writeback(folio)) 830 - return false; 831 - 832 - bch2_clear_folio_bits(folio); 833 - return true; 834 - } 835 - 836 - /* readpage(s): */ 837 - 838 - static void bch2_readpages_end_io(struct bio *bio) 839 - { 840 - struct folio_iter fi; 841 - 842 - bio_for_each_folio_all(fi, bio) { 843 - if (!bio->bi_status) { 844 - folio_mark_uptodate(fi.folio); 845 - } else { 846 - folio_clear_uptodate(fi.folio); 847 - folio_set_error(fi.folio); 848 - } 849 - folio_unlock(fi.folio); 850 - } 851 - 852 - bio_put(bio); 853 - } 854 - 855 - struct 
readpages_iter { 856 - struct address_space *mapping; 857 - unsigned idx; 858 - folios folios; 859 - }; 860 - 861 - static int readpages_iter_init(struct readpages_iter *iter, 862 - struct readahead_control *ractl) 863 - { 864 - struct folio **fi; 865 - int ret; 866 - 867 - memset(iter, 0, sizeof(*iter)); 868 - 869 - iter->mapping = ractl->mapping; 870 - 871 - ret = filemap_get_contig_folios_d(iter->mapping, 872 - ractl->_index << PAGE_SHIFT, 873 - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, 874 - 0, mapping_gfp_mask(iter->mapping), 875 - &iter->folios); 876 - if (ret) 877 - return ret; 878 - 879 - darray_for_each(iter->folios, fi) { 880 - ractl->_nr_pages -= 1U << folio_order(*fi); 881 - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); 882 - folio_put(*fi); 883 - folio_put(*fi); 884 - } 885 - 886 - return 0; 887 - } 888 - 889 - static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) 890 - { 891 - if (iter->idx >= iter->folios.nr) 892 - return NULL; 893 - return iter->folios.data[iter->idx]; 894 - } 895 - 896 - static inline void readpage_iter_advance(struct readpages_iter *iter) 897 - { 898 - iter->idx++; 899 - } 900 - 901 - static bool extent_partial_reads_expensive(struct bkey_s_c k) 902 - { 903 - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 904 - struct bch_extent_crc_unpacked crc; 905 - const union bch_extent_entry *i; 906 - 907 - bkey_for_each_crc(k.k, ptrs, crc, i) 908 - if (crc.csum_type || crc.compression_type) 909 - return true; 910 - return false; 911 - } 912 - 913 - static int readpage_bio_extend(struct btree_trans *trans, 914 - struct readpages_iter *iter, 915 - struct bio *bio, 916 - unsigned sectors_this_extent, 917 - bool get_more) 918 - { 919 - /* Don't hold btree locks while allocating memory: */ 920 - bch2_trans_unlock(trans); 921 - 922 - while (bio_sectors(bio) < sectors_this_extent && 923 - bio->bi_vcnt < bio->bi_max_vecs) { 924 - struct folio *folio = readpage_iter_peek(iter); 925 - int ret; 926 - 927 - if (folio) 
{ 928 - readpage_iter_advance(iter); 929 - } else { 930 - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; 931 - 932 - if (!get_more) 933 - break; 934 - 935 - folio = xa_load(&iter->mapping->i_pages, folio_offset); 936 - if (folio && !xa_is_value(folio)) 937 - break; 938 - 939 - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); 940 - if (!folio) 941 - break; 942 - 943 - if (!__bch2_folio_create(folio, GFP_KERNEL)) { 944 - folio_put(folio); 945 - break; 946 - } 947 - 948 - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); 949 - if (ret) { 950 - __bch2_folio_release(folio); 951 - folio_put(folio); 952 - break; 953 - } 954 - 955 - folio_put(folio); 956 - } 957 - 958 - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); 959 - 960 - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); 961 - } 962 - 963 - return bch2_trans_relock(trans); 964 - } 965 - 966 - static void bchfs_read(struct btree_trans *trans, 967 - struct bch_read_bio *rbio, 968 - subvol_inum inum, 969 - struct readpages_iter *readpages_iter) 970 - { 971 - struct bch_fs *c = trans->c; 972 - struct btree_iter iter; 973 - struct bkey_buf sk; 974 - int flags = BCH_READ_RETRY_IF_STALE| 975 - BCH_READ_MAY_PROMOTE; 976 - u32 snapshot; 977 - int ret = 0; 978 - 979 - rbio->c = c; 980 - rbio->start_time = local_clock(); 981 - rbio->subvol = inum.subvol; 982 - 983 - bch2_bkey_buf_init(&sk); 984 - retry: 985 - bch2_trans_begin(trans); 986 - iter = (struct btree_iter) { NULL }; 987 - 988 - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 989 - if (ret) 990 - goto err; 991 - 992 - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 993 - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), 994 - BTREE_ITER_SLOTS); 995 - while (1) { 996 - struct bkey_s_c k; 997 - unsigned bytes, sectors, offset_into_extent; 998 - enum btree_id data_btree = BTREE_ID_extents; 999 - 1000 - /* 1001 - * read_extent -> io_time_reset may cause a transaction 
restart 1002 - * without returning an error, we need to check for that here: 1003 - */ 1004 - ret = bch2_trans_relock(trans); 1005 - if (ret) 1006 - break; 1007 - 1008 - bch2_btree_iter_set_pos(&iter, 1009 - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); 1010 - 1011 - k = bch2_btree_iter_peek_slot(&iter); 1012 - ret = bkey_err(k); 1013 - if (ret) 1014 - break; 1015 - 1016 - offset_into_extent = iter.pos.offset - 1017 - bkey_start_offset(k.k); 1018 - sectors = k.k->size - offset_into_extent; 1019 - 1020 - bch2_bkey_buf_reassemble(&sk, c, k); 1021 - 1022 - ret = bch2_read_indirect_extent(trans, &data_btree, 1023 - &offset_into_extent, &sk); 1024 - if (ret) 1025 - break; 1026 - 1027 - k = bkey_i_to_s_c(sk.k); 1028 - 1029 - sectors = min(sectors, k.k->size - offset_into_extent); 1030 - 1031 - if (readpages_iter) { 1032 - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, 1033 - extent_partial_reads_expensive(k)); 1034 - if (ret) 1035 - break; 1036 - } 1037 - 1038 - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; 1039 - swap(rbio->bio.bi_iter.bi_size, bytes); 1040 - 1041 - if (rbio->bio.bi_iter.bi_size == bytes) 1042 - flags |= BCH_READ_LAST_FRAGMENT; 1043 - 1044 - bch2_bio_page_state_set(&rbio->bio, k); 1045 - 1046 - bch2_read_extent(trans, rbio, iter.pos, 1047 - data_btree, k, offset_into_extent, flags); 1048 - 1049 - if (flags & BCH_READ_LAST_FRAGMENT) 1050 - break; 1051 - 1052 - swap(rbio->bio.bi_iter.bi_size, bytes); 1053 - bio_advance(&rbio->bio, bytes); 1054 - 1055 - ret = btree_trans_too_many_iters(trans); 1056 - if (ret) 1057 - break; 1058 - } 1059 - err: 1060 - bch2_trans_iter_exit(trans, &iter); 1061 - 1062 - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1063 - goto retry; 1064 - 1065 - if (ret) { 1066 - bch_err_inum_offset_ratelimited(c, 1067 - iter.pos.inode, 1068 - iter.pos.offset << 9, 1069 - "read error %i from btree lookup", ret); 1070 - rbio->bio.bi_status = BLK_STS_IOERR; 1071 - bio_endio(&rbio->bio); 1072 - } 1073 - 
1074 - bch2_bkey_buf_exit(&sk, c); 1075 - } 1076 - 1077 - void bch2_readahead(struct readahead_control *ractl) 1078 - { 1079 - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); 1080 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1081 - struct bch_io_opts opts; 1082 - struct btree_trans trans; 1083 - struct folio *folio; 1084 - struct readpages_iter readpages_iter; 1085 - int ret; 1086 - 1087 - bch2_inode_opts_get(&opts, c, &inode->ei_inode); 1088 - 1089 - ret = readpages_iter_init(&readpages_iter, ractl); 1090 - BUG_ON(ret); 1091 - 1092 - bch2_trans_init(&trans, c, 0, 0); 1093 - 1094 - bch2_pagecache_add_get(inode); 1095 - 1096 - while ((folio = readpage_iter_peek(&readpages_iter))) { 1097 - unsigned n = min_t(unsigned, 1098 - readpages_iter.folios.nr - 1099 - readpages_iter.idx, 1100 - BIO_MAX_VECS); 1101 - struct bch_read_bio *rbio = 1102 - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, 1103 - GFP_KERNEL, &c->bio_read), 1104 - opts); 1105 - 1106 - readpage_iter_advance(&readpages_iter); 1107 - 1108 - rbio->bio.bi_iter.bi_sector = folio_sector(folio); 1109 - rbio->bio.bi_end_io = bch2_readpages_end_io; 1110 - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 1111 - 1112 - bchfs_read(&trans, rbio, inode_inum(inode), 1113 - &readpages_iter); 1114 - bch2_trans_unlock(&trans); 1115 - } 1116 - 1117 - bch2_pagecache_add_put(inode); 1118 - 1119 - bch2_trans_exit(&trans); 1120 - darray_exit(&readpages_iter.folios); 1121 - } 1122 - 1123 - static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, 1124 - subvol_inum inum, struct folio *folio) 1125 - { 1126 - struct btree_trans trans; 1127 - 1128 - bch2_folio_create(folio, __GFP_NOFAIL); 1129 - 1130 - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; 1131 - rbio->bio.bi_iter.bi_sector = folio_sector(folio); 1132 - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); 1133 - 1134 - bch2_trans_init(&trans, c, 0, 0); 1135 - bchfs_read(&trans, rbio, inum, NULL); 1136 - 
bch2_trans_exit(&trans); 1137 - } 1138 - 1139 - static void bch2_read_single_folio_end_io(struct bio *bio) 1140 - { 1141 - complete(bio->bi_private); 1142 - } 1143 - 1144 - static int bch2_read_single_folio(struct folio *folio, 1145 - struct address_space *mapping) 1146 - { 1147 - struct bch_inode_info *inode = to_bch_ei(mapping->host); 1148 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1149 - struct bch_read_bio *rbio; 1150 - struct bch_io_opts opts; 1151 - int ret; 1152 - DECLARE_COMPLETION_ONSTACK(done); 1153 - 1154 - bch2_inode_opts_get(&opts, c, &inode->ei_inode); 1155 - 1156 - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), 1157 - opts); 1158 - rbio->bio.bi_private = &done; 1159 - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; 1160 - 1161 - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); 1162 - wait_for_completion(&done); 1163 - 1164 - ret = blk_status_to_errno(rbio->bio.bi_status); 1165 - bio_put(&rbio->bio); 1166 - 1167 - if (ret < 0) 1168 - return ret; 1169 - 1170 - folio_mark_uptodate(folio); 1171 - return 0; 1172 - } 1173 - 1174 - int bch2_read_folio(struct file *file, struct folio *folio) 1175 - { 1176 - int ret; 1177 - 1178 - ret = bch2_read_single_folio(folio, folio->mapping); 1179 - folio_unlock(folio); 1180 - return bch2_err_class(ret); 1181 - } 1182 - 1183 - /* writepages: */ 1184 - 1185 - struct bch_writepage_state { 1186 - struct bch_writepage_io *io; 1187 - struct bch_io_opts opts; 1188 - struct bch_folio_sector *tmp; 1189 - unsigned tmp_sectors; 1190 - }; 1191 - 1192 - static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, 1193 - struct bch_inode_info *inode) 1194 - { 1195 - struct bch_writepage_state ret = { 0 }; 1196 - 1197 - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); 1198 - return ret; 1199 - } 1200 - 1201 - static void bch2_writepage_io_done(struct bch_write_op *op) 1202 - { 1203 - struct bch_writepage_io *io = 1204 - container_of(op, struct 
bch_writepage_io, op); 1205 - struct bch_fs *c = io->op.c; 1206 - struct bio *bio = &io->op.wbio.bio; 1207 - struct folio_iter fi; 1208 - unsigned i; 1209 - 1210 - if (io->op.error) { 1211 - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); 1212 - 1213 - bio_for_each_folio_all(fi, bio) { 1214 - struct bch_folio *s; 1215 - 1216 - folio_set_error(fi.folio); 1217 - mapping_set_error(fi.folio->mapping, -EIO); 1218 - 1219 - s = __bch2_folio(fi.folio); 1220 - spin_lock(&s->lock); 1221 - for (i = 0; i < folio_sectors(fi.folio); i++) 1222 - s->s[i].nr_replicas = 0; 1223 - spin_unlock(&s->lock); 1224 - } 1225 - } 1226 - 1227 - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { 1228 - bio_for_each_folio_all(fi, bio) { 1229 - struct bch_folio *s; 1230 - 1231 - s = __bch2_folio(fi.folio); 1232 - spin_lock(&s->lock); 1233 - for (i = 0; i < folio_sectors(fi.folio); i++) 1234 - s->s[i].nr_replicas = 0; 1235 - spin_unlock(&s->lock); 1236 - } 1237 - } 1238 - 1239 - /* 1240 - * racing with fallocate can cause us to add fewer sectors than 1241 - * expected - but we shouldn't add more sectors than expected: 1242 - */ 1243 - WARN_ON_ONCE(io->op.i_sectors_delta > 0); 1244 - 1245 - /* 1246 - * (error (due to going RO) halfway through a page can screw that up 1247 - * slightly) 1248 - * XXX wtf? 
1249 - BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); 1250 - */ 1251 - 1252 - /* 1253 - * PageWriteback is effectively our ref on the inode - fixup i_blocks 1254 - * before calling end_page_writeback: 1255 - */ 1256 - i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); 1257 - 1258 - bio_for_each_folio_all(fi, bio) { 1259 - struct bch_folio *s = __bch2_folio(fi.folio); 1260 - 1261 - if (atomic_dec_and_test(&s->write_count)) 1262 - folio_end_writeback(fi.folio); 1263 - } 1264 - 1265 - bio_put(&io->op.wbio.bio); 1266 - } 1267 - 1268 - static void bch2_writepage_do_io(struct bch_writepage_state *w) 1269 - { 1270 - struct bch_writepage_io *io = w->io; 1271 - 1272 - w->io = NULL; 1273 - closure_call(&io->op.cl, bch2_write, NULL, NULL); 1274 - } 1275 - 1276 - /* 1277 - * Get a bch_writepage_io and add @page to it - appending to an existing one if 1278 - * possible, else allocating a new one: 1279 - */ 1280 - static void bch2_writepage_io_alloc(struct bch_fs *c, 1281 - struct writeback_control *wbc, 1282 - struct bch_writepage_state *w, 1283 - struct bch_inode_info *inode, 1284 - u64 sector, 1285 - unsigned nr_replicas) 1286 - { 1287 - struct bch_write_op *op; 1288 - 1289 - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, 1290 - REQ_OP_WRITE, 1291 - GFP_KERNEL, 1292 - &c->writepage_bioset), 1293 - struct bch_writepage_io, op.wbio.bio); 1294 - 1295 - w->io->inode = inode; 1296 - op = &w->io->op; 1297 - bch2_write_op_init(op, c, w->opts); 1298 - op->target = w->opts.foreground_target; 1299 - op->nr_replicas = nr_replicas; 1300 - op->res.nr_replicas = nr_replicas; 1301 - op->write_point = writepoint_hashed(inode->ei_last_dirtied); 1302 - op->subvol = inode->ei_subvol; 1303 - op->pos = POS(inode->v.i_ino, sector); 1304 - op->end_io = bch2_writepage_io_done; 1305 - op->devs_need_flush = &inode->ei_devs_need_flush; 1306 - op->wbio.bio.bi_iter.bi_sector = sector; 1307 - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); 1308 - } 1309 - 1310 - static int 
__bch2_writepage(struct folio *folio, 1311 - struct writeback_control *wbc, 1312 - void *data) 1313 - { 1314 - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); 1315 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1316 - struct bch_writepage_state *w = data; 1317 - struct bch_folio *s; 1318 - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; 1319 - loff_t i_size = i_size_read(&inode->v); 1320 - int ret; 1321 - 1322 - EBUG_ON(!folio_test_uptodate(folio)); 1323 - 1324 - /* Is the folio fully inside i_size? */ 1325 - if (folio_end_pos(folio) <= i_size) 1326 - goto do_io; 1327 - 1328 - /* Is the folio fully outside i_size? (truncate in progress) */ 1329 - if (folio_pos(folio) >= i_size) { 1330 - folio_unlock(folio); 1331 - return 0; 1332 - } 1333 - 1334 - /* 1335 - * The folio straddles i_size. It must be zeroed out on each and every 1336 - * writepage invocation because it may be mmapped. "A file is mapped 1337 - * in multiples of the folio size. For a file that is not a multiple of 1338 - * the folio size, the remaining memory is zeroed when mapped, and 1339 - * writes to that region are not written out to the file." 
1340 - */ 1341 - folio_zero_segment(folio, 1342 - i_size - folio_pos(folio), 1343 - folio_size(folio)); 1344 - do_io: 1345 - f_sectors = folio_sectors(folio); 1346 - s = bch2_folio(folio); 1347 - 1348 - if (f_sectors > w->tmp_sectors) { 1349 - kfree(w->tmp); 1350 - w->tmp = kzalloc(sizeof(struct bch_folio_sector) * 1351 - f_sectors, __GFP_NOFAIL); 1352 - w->tmp_sectors = f_sectors; 1353 - } 1354 - 1355 - /* 1356 - * Things get really hairy with errors during writeback: 1357 - */ 1358 - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); 1359 - BUG_ON(ret); 1360 - 1361 - /* Before unlocking the page, get copy of reservations: */ 1362 - spin_lock(&s->lock); 1363 - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); 1364 - 1365 - for (i = 0; i < f_sectors; i++) { 1366 - if (s->s[i].state < SECTOR_dirty) 1367 - continue; 1368 - 1369 - nr_replicas_this_write = 1370 - min_t(unsigned, nr_replicas_this_write, 1371 - s->s[i].nr_replicas + 1372 - s->s[i].replicas_reserved); 1373 - } 1374 - 1375 - for (i = 0; i < f_sectors; i++) { 1376 - if (s->s[i].state < SECTOR_dirty) 1377 - continue; 1378 - 1379 - s->s[i].nr_replicas = w->opts.compression 1380 - ? 
0 : nr_replicas_this_write; 1381 - 1382 - s->s[i].replicas_reserved = 0; 1383 - folio_sector_set(folio, s, i, SECTOR_allocated); 1384 - } 1385 - spin_unlock(&s->lock); 1386 - 1387 - BUG_ON(atomic_read(&s->write_count)); 1388 - atomic_set(&s->write_count, 1); 1389 - 1390 - BUG_ON(folio_test_writeback(folio)); 1391 - folio_start_writeback(folio); 1392 - 1393 - folio_unlock(folio); 1394 - 1395 - offset = 0; 1396 - while (1) { 1397 - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; 1398 - u64 sector; 1399 - 1400 - while (offset < f_sectors && 1401 - w->tmp[offset].state < SECTOR_dirty) 1402 - offset++; 1403 - 1404 - if (offset == f_sectors) 1405 - break; 1406 - 1407 - while (offset + sectors < f_sectors && 1408 - w->tmp[offset + sectors].state >= SECTOR_dirty) { 1409 - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; 1410 - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; 1411 - sectors++; 1412 - } 1413 - BUG_ON(!sectors); 1414 - 1415 - sector = folio_sector(folio) + offset; 1416 - 1417 - if (w->io && 1418 - (w->io->op.res.nr_replicas != nr_replicas_this_write || 1419 - bio_full(&w->io->op.wbio.bio, sectors << 9) || 1420 - w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= 1421 - (BIO_MAX_VECS * PAGE_SIZE) || 1422 - bio_end_sector(&w->io->op.wbio.bio) != sector)) 1423 - bch2_writepage_do_io(w); 1424 - 1425 - if (!w->io) 1426 - bch2_writepage_io_alloc(c, wbc, w, inode, sector, 1427 - nr_replicas_this_write); 1428 - 1429 - atomic_inc(&s->write_count); 1430 - 1431 - BUG_ON(inode != w->io->inode); 1432 - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, 1433 - sectors << 9, offset << 9)); 1434 - 1435 - /* Check for writing past i_size: */ 1436 - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > 1437 - round_up(i_size, block_bytes(c)) && 1438 - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), 1439 - "writing past i_size: %llu > %llu (unrounded %llu)\n", 1440 - bio_end_sector(&w->io->op.wbio.bio) << 9, 1441 - round_up(i_size, 
block_bytes(c)), 1442 - i_size); 1443 - 1444 - w->io->op.res.sectors += reserved_sectors; 1445 - w->io->op.i_sectors_delta -= dirty_sectors; 1446 - w->io->op.new_i_size = i_size; 1447 - 1448 - offset += sectors; 1449 - } 1450 - 1451 - if (atomic_dec_and_test(&s->write_count)) 1452 - folio_end_writeback(folio); 1453 - 1454 - return 0; 1455 - } 1456 - 1457 - int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) 1458 - { 1459 - struct bch_fs *c = mapping->host->i_sb->s_fs_info; 1460 - struct bch_writepage_state w = 1461 - bch_writepage_state_init(c, to_bch_ei(mapping->host)); 1462 - struct blk_plug plug; 1463 - int ret; 1464 - 1465 - blk_start_plug(&plug); 1466 - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); 1467 - if (w.io) 1468 - bch2_writepage_do_io(&w); 1469 - blk_finish_plug(&plug); 1470 - kfree(w.tmp); 1471 - return bch2_err_class(ret); 1472 - } 1473 - 1474 - /* buffered writes: */ 1475 - 1476 - int bch2_write_begin(struct file *file, struct address_space *mapping, 1477 - loff_t pos, unsigned len, 1478 - struct page **pagep, void **fsdata) 1479 - { 1480 - struct bch_inode_info *inode = to_bch_ei(mapping->host); 1481 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1482 - struct bch2_folio_reservation *res; 1483 - struct folio *folio; 1484 - unsigned offset; 1485 - int ret = -ENOMEM; 1486 - 1487 - res = kmalloc(sizeof(*res), GFP_KERNEL); 1488 - if (!res) 1489 - return -ENOMEM; 1490 - 1491 - bch2_folio_reservation_init(c, inode, res); 1492 - *fsdata = res; 1493 - 1494 - bch2_pagecache_add_get(inode); 1495 - 1496 - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, 1497 - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, 1498 - mapping_gfp_mask(mapping)); 1499 - if (IS_ERR_OR_NULL(folio)) 1500 - goto err_unlock; 1501 - 1502 - if (folio_test_uptodate(folio)) 1503 - goto out; 1504 - 1505 - offset = pos - folio_pos(folio); 1506 - len = min_t(size_t, len, folio_end_pos(folio) - pos); 1507 - 1508 - /* If we're writing entire folio, 
don't need to read it in first: */ 1509 - if (!offset && len == folio_size(folio)) 1510 - goto out; 1511 - 1512 - if (!offset && pos + len >= inode->v.i_size) { 1513 - folio_zero_segment(folio, len, folio_size(folio)); 1514 - flush_dcache_folio(folio); 1515 - goto out; 1516 - } 1517 - 1518 - if (folio_pos(folio) >= inode->v.i_size) { 1519 - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); 1520 - flush_dcache_folio(folio); 1521 - goto out; 1522 - } 1523 - readpage: 1524 - ret = bch2_read_single_folio(folio, mapping); 1525 - if (ret) 1526 - goto err; 1527 - out: 1528 - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); 1529 - if (ret) 1530 - goto err; 1531 - 1532 - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); 1533 - if (ret) { 1534 - if (!folio_test_uptodate(folio)) { 1535 - /* 1536 - * If the folio hasn't been read in, we won't know if we 1537 - * actually need a reservation - we don't actually need 1538 - * to read here, we just need to check if the folio is 1539 - * fully backed by uncompressed data: 1540 - */ 1541 - goto readpage; 1542 - } 1543 - 1544 - goto err; 1545 - } 1546 - 1547 - *pagep = &folio->page; 1548 - return 0; 1549 - err: 1550 - folio_unlock(folio); 1551 - folio_put(folio); 1552 - *pagep = NULL; 1553 - err_unlock: 1554 - bch2_pagecache_add_put(inode); 1555 - kfree(res); 1556 - *fsdata = NULL; 1557 - return bch2_err_class(ret); 1558 - } 1559 - 1560 - int bch2_write_end(struct file *file, struct address_space *mapping, 1561 - loff_t pos, unsigned len, unsigned copied, 1562 - struct page *page, void *fsdata) 1563 - { 1564 - struct bch_inode_info *inode = to_bch_ei(mapping->host); 1565 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1566 - struct bch2_folio_reservation *res = fsdata; 1567 - struct folio *folio = page_folio(page); 1568 - unsigned offset = pos - folio_pos(folio); 1569 - 1570 - lockdep_assert_held(&inode->v.i_rwsem); 1571 - BUG_ON(offset + copied > folio_size(folio)); 1572 - 1573 - if 
(unlikely(copied < len && !folio_test_uptodate(folio))) { 1574 - /* 1575 - * The folio needs to be read in, but that would destroy 1576 - * our partial write - simplest thing is to just force 1577 - * userspace to redo the write: 1578 - */ 1579 - folio_zero_range(folio, 0, folio_size(folio)); 1580 - flush_dcache_folio(folio); 1581 - copied = 0; 1582 - } 1583 - 1584 - spin_lock(&inode->v.i_lock); 1585 - if (pos + copied > inode->v.i_size) 1586 - i_size_write(&inode->v, pos + copied); 1587 - spin_unlock(&inode->v.i_lock); 1588 - 1589 - if (copied) { 1590 - if (!folio_test_uptodate(folio)) 1591 - folio_mark_uptodate(folio); 1592 - 1593 - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); 1594 - 1595 - inode->ei_last_dirtied = (unsigned long) current; 1596 - } 1597 - 1598 - folio_unlock(folio); 1599 - folio_put(folio); 1600 - bch2_pagecache_add_put(inode); 1601 - 1602 - bch2_folio_reservation_put(c, inode, res); 1603 - kfree(res); 1604 - 1605 - return copied; 1606 - } 1607 - 1608 - static noinline void folios_trunc(folios *folios, struct folio **fi) 1609 - { 1610 - while (folios->data + folios->nr > fi) { 1611 - struct folio *f = darray_pop(folios); 1612 - 1613 - folio_unlock(f); 1614 - folio_put(f); 1615 - } 1616 - } 1617 - 1618 - static int __bch2_buffered_write(struct bch_inode_info *inode, 1619 - struct address_space *mapping, 1620 - struct iov_iter *iter, 1621 - loff_t pos, unsigned len) 1622 - { 1623 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1624 - struct bch2_folio_reservation res; 1625 - folios folios; 1626 - struct folio **fi, *f; 1627 - unsigned copied = 0, f_offset; 1628 - u64 end = pos + len, f_pos; 1629 - loff_t last_folio_pos = inode->v.i_size; 1630 - int ret = 0; 1631 - 1632 - BUG_ON(!len); 1633 - 1634 - bch2_folio_reservation_init(c, inode, &res); 1635 - darray_init(&folios); 1636 - 1637 - ret = filemap_get_contig_folios_d(mapping, pos, end, 1638 - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, 1639 - mapping_gfp_mask(mapping), 1640 - &folios); 
1641 - if (ret) 1642 - goto out; 1643 - 1644 - BUG_ON(!folios.nr); 1645 - 1646 - f = darray_first(folios); 1647 - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { 1648 - ret = bch2_read_single_folio(f, mapping); 1649 - if (ret) 1650 - goto out; 1651 - } 1652 - 1653 - f = darray_last(folios); 1654 - end = min(end, folio_end_pos(f)); 1655 - last_folio_pos = folio_pos(f); 1656 - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { 1657 - if (end >= inode->v.i_size) { 1658 - folio_zero_range(f, 0, folio_size(f)); 1659 - } else { 1660 - ret = bch2_read_single_folio(f, mapping); 1661 - if (ret) 1662 - goto out; 1663 - } 1664 - } 1665 - 1666 - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); 1667 - if (ret) 1668 - goto out; 1669 - 1670 - f_pos = pos; 1671 - f_offset = pos - folio_pos(darray_first(folios)); 1672 - darray_for_each(folios, fi) { 1673 - struct folio *f = *fi; 1674 - u64 f_len = min(end, folio_end_pos(f)) - f_pos; 1675 - 1676 - /* 1677 - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're 1678 - * supposed to write as much as we have disk space for. 
1679 - * 1680 - * On failure here we should still write out a partial page if 1681 - * we aren't completely out of disk space - we don't do that 1682 - * yet: 1683 - */ 1684 - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); 1685 - if (unlikely(ret)) { 1686 - folios_trunc(&folios, fi); 1687 - if (!folios.nr) 1688 - goto out; 1689 - 1690 - end = min(end, folio_end_pos(darray_last(folios))); 1691 - break; 1692 - } 1693 - 1694 - f_pos = folio_end_pos(f); 1695 - f_offset = 0; 1696 - } 1697 - 1698 - if (mapping_writably_mapped(mapping)) 1699 - darray_for_each(folios, fi) 1700 - flush_dcache_folio(*fi); 1701 - 1702 - f_pos = pos; 1703 - f_offset = pos - folio_pos(darray_first(folios)); 1704 - darray_for_each(folios, fi) { 1705 - struct folio *f = *fi; 1706 - u64 f_len = min(end, folio_end_pos(f)) - f_pos; 1707 - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); 1708 - 1709 - if (!f_copied) { 1710 - folios_trunc(&folios, fi); 1711 - break; 1712 - } 1713 - 1714 - if (!folio_test_uptodate(f) && 1715 - f_copied != folio_size(f) && 1716 - pos + copied + f_copied < inode->v.i_size) { 1717 - folio_zero_range(f, 0, folio_size(f)); 1718 - folios_trunc(&folios, fi); 1719 - break; 1720 - } 1721 - 1722 - flush_dcache_folio(f); 1723 - copied += f_copied; 1724 - 1725 - if (f_copied != f_len) { 1726 - folios_trunc(&folios, fi + 1); 1727 - break; 1728 - } 1729 - 1730 - f_pos = folio_end_pos(f); 1731 - f_offset = 0; 1732 - } 1733 - 1734 - if (!copied) 1735 - goto out; 1736 - 1737 - end = pos + copied; 1738 - 1739 - spin_lock(&inode->v.i_lock); 1740 - if (end > inode->v.i_size) 1741 - i_size_write(&inode->v, end); 1742 - spin_unlock(&inode->v.i_lock); 1743 - 1744 - f_pos = pos; 1745 - f_offset = pos - folio_pos(darray_first(folios)); 1746 - darray_for_each(folios, fi) { 1747 - struct folio *f = *fi; 1748 - u64 f_len = min(end, folio_end_pos(f)) - f_pos; 1749 - 1750 - if (!folio_test_uptodate(f)) 1751 - folio_mark_uptodate(f); 1752 - 1753 
- bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); 1754 - 1755 - f_pos = folio_end_pos(f); 1756 - f_offset = 0; 1757 - } 1758 - 1759 - inode->ei_last_dirtied = (unsigned long) current; 1760 - out: 1761 - darray_for_each(folios, fi) { 1762 - folio_unlock(*fi); 1763 - folio_put(*fi); 1764 - } 1765 - 1766 - /* 1767 - * If the last folio added to the mapping starts beyond current EOF, we 1768 - * performed a short write but left around at least one post-EOF folio. 1769 - * Clean up the mapping before we return. 1770 - */ 1771 - if (last_folio_pos >= inode->v.i_size) 1772 - truncate_pagecache(&inode->v, inode->v.i_size); 1773 - 1774 - darray_exit(&folios); 1775 - bch2_folio_reservation_put(c, inode, &res); 1776 - 1777 - return copied ?: ret; 1778 - } 1779 - 1780 - static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) 1781 - { 1782 - struct file *file = iocb->ki_filp; 1783 - struct address_space *mapping = file->f_mapping; 1784 - struct bch_inode_info *inode = file_bch_inode(file); 1785 - loff_t pos = iocb->ki_pos; 1786 - ssize_t written = 0; 1787 - int ret = 0; 1788 - 1789 - bch2_pagecache_add_get(inode); 1790 - 1791 - do { 1792 - unsigned offset = pos & (PAGE_SIZE - 1); 1793 - unsigned bytes = iov_iter_count(iter); 1794 - again: 1795 - /* 1796 - * Bring in the user page that we will copy from _first_. 1797 - * Otherwise there's a nasty deadlock on copying from the 1798 - * same page as we're writing to, without it being marked 1799 - * up-to-date. 1800 - * 1801 - * Not only is this an optimisation, but it is also required 1802 - * to check that the address is actually valid, when atomic 1803 - * usercopies are used, below. 
1804 - */ 1805 - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 1806 - bytes = min_t(unsigned long, iov_iter_count(iter), 1807 - PAGE_SIZE - offset); 1808 - 1809 - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { 1810 - ret = -EFAULT; 1811 - break; 1812 - } 1813 - } 1814 - 1815 - if (unlikely(fatal_signal_pending(current))) { 1816 - ret = -EINTR; 1817 - break; 1818 - } 1819 - 1820 - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 1821 - if (unlikely(ret < 0)) 1822 - break; 1823 - 1824 - cond_resched(); 1825 - 1826 - if (unlikely(ret == 0)) { 1827 - /* 1828 - * If we were unable to copy any data at all, we must 1829 - * fall back to a single segment length write. 1830 - * 1831 - * If we didn't fallback here, we could livelock 1832 - * because not all segments in the iov can be copied at 1833 - * once without a pagefault. 1834 - */ 1835 - bytes = min_t(unsigned long, PAGE_SIZE - offset, 1836 - iov_iter_single_seg_count(iter)); 1837 - goto again; 1838 - } 1839 - pos += ret; 1840 - written += ret; 1841 - ret = 0; 1842 - 1843 - balance_dirty_pages_ratelimited(mapping); 1844 - } while (iov_iter_count(iter)); 1845 - 1846 - bch2_pagecache_add_put(inode); 1847 - 1848 - return written ? 
written : ret; 1849 - } 1850 - 1851 - /* O_DIRECT reads */ 1852 - 1853 - static void bio_check_or_release(struct bio *bio, bool check_dirty) 1854 - { 1855 - if (check_dirty) { 1856 - bio_check_pages_dirty(bio); 1857 - } else { 1858 - bio_release_pages(bio, false); 1859 - bio_put(bio); 1860 - } 1861 - } 1862 - 1863 - static void bch2_dio_read_complete(struct closure *cl) 1864 - { 1865 - struct dio_read *dio = container_of(cl, struct dio_read, cl); 1866 - 1867 - dio->req->ki_complete(dio->req, dio->ret); 1868 - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 1869 - } 1870 - 1871 - static void bch2_direct_IO_read_endio(struct bio *bio) 1872 - { 1873 - struct dio_read *dio = bio->bi_private; 1874 - 1875 - if (bio->bi_status) 1876 - dio->ret = blk_status_to_errno(bio->bi_status); 1877 - 1878 - closure_put(&dio->cl); 1879 - } 1880 - 1881 - static void bch2_direct_IO_read_split_endio(struct bio *bio) 1882 - { 1883 - struct dio_read *dio = bio->bi_private; 1884 - bool should_dirty = dio->should_dirty; 1885 - 1886 - bch2_direct_IO_read_endio(bio); 1887 - bio_check_or_release(bio, should_dirty); 1888 - } 1889 - 1890 - static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) 1891 - { 1892 - struct file *file = req->ki_filp; 1893 - struct bch_inode_info *inode = file_bch_inode(file); 1894 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 1895 - struct bch_io_opts opts; 1896 - struct dio_read *dio; 1897 - struct bio *bio; 1898 - loff_t offset = req->ki_pos; 1899 - bool sync = is_sync_kiocb(req); 1900 - size_t shorten; 1901 - ssize_t ret; 1902 - 1903 - bch2_inode_opts_get(&opts, c, &inode->ei_inode); 1904 - 1905 - if ((offset|iter->count) & (block_bytes(c) - 1)) 1906 - return -EINVAL; 1907 - 1908 - ret = min_t(loff_t, iter->count, 1909 - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); 1910 - 1911 - if (!ret) 1912 - return ret; 1913 - 1914 - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); 1915 - iter->count -= shorten; 1916 - 1917 - bio = 
bio_alloc_bioset(NULL, 1918 - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 1919 - REQ_OP_READ, 1920 - GFP_KERNEL, 1921 - &c->dio_read_bioset); 1922 - 1923 - bio->bi_end_io = bch2_direct_IO_read_endio; 1924 - 1925 - dio = container_of(bio, struct dio_read, rbio.bio); 1926 - closure_init(&dio->cl, NULL); 1927 - 1928 - /* 1929 - * this is a _really_ horrible hack just to avoid an atomic sub at the 1930 - * end: 1931 - */ 1932 - if (!sync) { 1933 - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); 1934 - atomic_set(&dio->cl.remaining, 1935 - CLOSURE_REMAINING_INITIALIZER - 1936 - CLOSURE_RUNNING + 1937 - CLOSURE_DESTRUCTOR); 1938 - } else { 1939 - atomic_set(&dio->cl.remaining, 1940 - CLOSURE_REMAINING_INITIALIZER + 1); 1941 - } 1942 - 1943 - dio->req = req; 1944 - dio->ret = ret; 1945 - /* 1946 - * This is one of the sketchier things I've encountered: we have to skip 1947 - * the dirtying of requests that are internal from the kernel (i.e. from 1948 - * loopback), because we'll deadlock on page_lock. 
1949 - */ 1950 - dio->should_dirty = iter_is_iovec(iter); 1951 - 1952 - goto start; 1953 - while (iter->count) { 1954 - bio = bio_alloc_bioset(NULL, 1955 - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 1956 - REQ_OP_READ, 1957 - GFP_KERNEL, 1958 - &c->bio_read); 1959 - bio->bi_end_io = bch2_direct_IO_read_split_endio; 1960 - start: 1961 - bio->bi_opf = REQ_OP_READ|REQ_SYNC; 1962 - bio->bi_iter.bi_sector = offset >> 9; 1963 - bio->bi_private = dio; 1964 - 1965 - ret = bio_iov_iter_get_pages(bio, iter); 1966 - if (ret < 0) { 1967 - /* XXX: fault inject this path */ 1968 - bio->bi_status = BLK_STS_RESOURCE; 1969 - bio_endio(bio); 1970 - break; 1971 - } 1972 - 1973 - offset += bio->bi_iter.bi_size; 1974 - 1975 - if (dio->should_dirty) 1976 - bio_set_pages_dirty(bio); 1977 - 1978 - if (iter->count) 1979 - closure_get(&dio->cl); 1980 - 1981 - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); 1982 - } 1983 - 1984 - iter->count += shorten; 1985 - 1986 - if (sync) { 1987 - closure_sync(&dio->cl); 1988 - closure_debug_destroy(&dio->cl); 1989 - ret = dio->ret; 1990 - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); 1991 - return ret; 1992 - } else { 1993 - return -EIOCBQUEUED; 1994 - } 1995 - } 1996 - 1997 - ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) 1998 - { 1999 - struct file *file = iocb->ki_filp; 2000 - struct bch_inode_info *inode = file_bch_inode(file); 2001 - struct address_space *mapping = file->f_mapping; 2002 - size_t count = iov_iter_count(iter); 2003 - ssize_t ret; 2004 - 2005 - if (!count) 2006 - return 0; /* skip atime */ 2007 - 2008 - if (iocb->ki_flags & IOCB_DIRECT) { 2009 - struct blk_plug plug; 2010 - 2011 - if (unlikely(mapping->nrpages)) { 2012 - ret = filemap_write_and_wait_range(mapping, 2013 - iocb->ki_pos, 2014 - iocb->ki_pos + count - 1); 2015 - if (ret < 0) 2016 - goto out; 2017 - } 2018 - 2019 - file_accessed(file); 2020 - 2021 - blk_start_plug(&plug); 2022 - ret = bch2_direct_IO_read(iocb, iter); 2023 - 
blk_finish_plug(&plug); 2024 - 2025 - if (ret >= 0) 2026 - iocb->ki_pos += ret; 2027 - } else { 2028 - bch2_pagecache_add_get(inode); 2029 - ret = generic_file_read_iter(iocb, iter); 2030 - bch2_pagecache_add_put(inode); 2031 - } 2032 - out: 2033 - return bch2_err_class(ret); 2034 - } 2035 - 2036 - /* O_DIRECT writes */ 2037 - 2038 - static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, 2039 - u64 offset, u64 size, 2040 - unsigned nr_replicas, bool compressed) 2041 - { 2042 - struct btree_trans trans; 2043 - struct btree_iter iter; 2044 - struct bkey_s_c k; 2045 - u64 end = offset + size; 2046 - u32 snapshot; 2047 - bool ret = true; 2048 - int err; 2049 - 2050 - bch2_trans_init(&trans, c, 0, 0); 2051 - retry: 2052 - bch2_trans_begin(&trans); 2053 - 2054 - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 2055 - if (err) 2056 - goto err; 2057 - 2058 - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, 2059 - SPOS(inum.inum, offset, snapshot), 2060 - BTREE_ITER_SLOTS, k, err) { 2061 - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) 2062 - break; 2063 - 2064 - if (k.k->p.snapshot != snapshot || 2065 - nr_replicas > bch2_bkey_replicas(c, k) || 2066 - (!compressed && bch2_bkey_sectors_compressed(k))) { 2067 - ret = false; 2068 - break; 2069 - } 2070 - } 2071 - 2072 - offset = iter.pos.offset; 2073 - bch2_trans_iter_exit(&trans, &iter); 2074 - err: 2075 - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) 2076 - goto retry; 2077 - bch2_trans_exit(&trans); 2078 - 2079 - return err ? 
false : ret; 2080 - } 2081 - 2082 - static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) 2083 - { 2084 - struct bch_fs *c = dio->op.c; 2085 - struct bch_inode_info *inode = dio->inode; 2086 - struct bio *bio = &dio->op.wbio.bio; 2087 - 2088 - return bch2_check_range_allocated(c, inode_inum(inode), 2089 - dio->op.pos.offset, bio_sectors(bio), 2090 - dio->op.opts.data_replicas, 2091 - dio->op.opts.compression != 0); 2092 - } 2093 - 2094 - static void bch2_dio_write_loop_async(struct bch_write_op *); 2095 - static __always_inline long bch2_dio_write_done(struct dio_write *dio); 2096 - 2097 - /* 2098 - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the 2099 - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the 2100 - * caller's stack, we're not guaranteed that it will live for the duration of 2101 - * the IO: 2102 - */ 2103 - static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) 2104 - { 2105 - struct iovec *iov = dio->inline_vecs; 2106 - 2107 - /* 2108 - * iov_iter has a single embedded iovec - nothing to do: 2109 - */ 2110 - if (iter_is_ubuf(&dio->iter)) 2111 - return 0; 2112 - 2113 - /* 2114 - * We don't currently handle non-iovec iov_iters here - return an error, 2115 - * and we'll fall back to doing the IO synchronously: 2116 - */ 2117 - if (!iter_is_iovec(&dio->iter)) 2118 - return -1; 2119 - 2120 - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { 2121 - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), 2122 - GFP_KERNEL); 2123 - if (unlikely(!iov)) 2124 - return -ENOMEM; 2125 - 2126 - dio->free_iov = true; 2127 - } 2128 - 2129 - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); 2130 - dio->iter.__iov = iov; 2131 - return 0; 2132 - } 2133 - 2134 - static void bch2_dio_write_flush_done(struct closure *cl) 2135 - { 2136 - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); 2137 - struct bch_fs *c = dio->op.c; 2138 - 2139 - 
closure_debug_destroy(cl); 2140 - 2141 - dio->op.error = bch2_journal_error(&c->journal); 2142 - 2143 - bch2_dio_write_done(dio); 2144 - } 2145 - 2146 - static noinline void bch2_dio_write_flush(struct dio_write *dio) 2147 - { 2148 - struct bch_fs *c = dio->op.c; 2149 - struct bch_inode_unpacked inode; 2150 - int ret; 2151 - 2152 - dio->flush = 0; 2153 - 2154 - closure_init(&dio->op.cl, NULL); 2155 - 2156 - if (!dio->op.error) { 2157 - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); 2158 - if (ret) { 2159 - dio->op.error = ret; 2160 - } else { 2161 - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); 2162 - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); 2163 - } 2164 - } 2165 - 2166 - if (dio->sync) { 2167 - closure_sync(&dio->op.cl); 2168 - closure_debug_destroy(&dio->op.cl); 2169 - } else { 2170 - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); 2171 - } 2172 - } 2173 - 2174 - static __always_inline long bch2_dio_write_done(struct dio_write *dio) 2175 - { 2176 - struct kiocb *req = dio->req; 2177 - struct bch_inode_info *inode = dio->inode; 2178 - bool sync = dio->sync; 2179 - long ret; 2180 - 2181 - if (unlikely(dio->flush)) { 2182 - bch2_dio_write_flush(dio); 2183 - if (!sync) 2184 - return -EIOCBQUEUED; 2185 - } 2186 - 2187 - bch2_pagecache_block_put(inode); 2188 - 2189 - if (dio->free_iov) 2190 - kfree(dio->iter.__iov); 2191 - 2192 - ret = dio->op.error ?: ((long) dio->written << 9); 2193 - bio_put(&dio->op.wbio.bio); 2194 - 2195 - /* inode->i_dio_count is our ref on inode and thus bch_fs */ 2196 - inode_dio_end(&inode->v); 2197 - 2198 - if (ret < 0) 2199 - ret = bch2_err_class(ret); 2200 - 2201 - if (!sync) { 2202 - req->ki_complete(req, ret); 2203 - ret = -EIOCBQUEUED; 2204 - } 2205 - return ret; 2206 - } 2207 - 2208 - static __always_inline void bch2_dio_write_end(struct dio_write *dio) 2209 - { 2210 - struct bch_fs *c = dio->op.c; 2211 - struct kiocb *req = dio->req; 2212 - struct 
bch_inode_info *inode = dio->inode; 2213 - struct bio *bio = &dio->op.wbio.bio; 2214 - 2215 - req->ki_pos += (u64) dio->op.written << 9; 2216 - dio->written += dio->op.written; 2217 - 2218 - if (dio->extending) { 2219 - spin_lock(&inode->v.i_lock); 2220 - if (req->ki_pos > inode->v.i_size) 2221 - i_size_write(&inode->v, req->ki_pos); 2222 - spin_unlock(&inode->v.i_lock); 2223 - } 2224 - 2225 - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { 2226 - mutex_lock(&inode->ei_quota_lock); 2227 - __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); 2228 - __bch2_quota_reservation_put(c, inode, &dio->quota_res); 2229 - mutex_unlock(&inode->ei_quota_lock); 2230 - } 2231 - 2232 - bio_release_pages(bio, false); 2233 - 2234 - if (unlikely(dio->op.error)) 2235 - set_bit(EI_INODE_ERROR, &inode->ei_flags); 2236 - } 2237 - 2238 - static __always_inline long bch2_dio_write_loop(struct dio_write *dio) 2239 - { 2240 - struct bch_fs *c = dio->op.c; 2241 - struct kiocb *req = dio->req; 2242 - struct address_space *mapping = dio->mapping; 2243 - struct bch_inode_info *inode = dio->inode; 2244 - struct bch_io_opts opts; 2245 - struct bio *bio = &dio->op.wbio.bio; 2246 - unsigned unaligned, iter_count; 2247 - bool sync = dio->sync, dropped_locks; 2248 - long ret; 2249 - 2250 - bch2_inode_opts_get(&opts, c, &inode->ei_inode); 2251 - 2252 - while (1) { 2253 - iter_count = dio->iter.count; 2254 - 2255 - EBUG_ON(current->faults_disabled_mapping); 2256 - current->faults_disabled_mapping = mapping; 2257 - 2258 - ret = bio_iov_iter_get_pages(bio, &dio->iter); 2259 - 2260 - dropped_locks = fdm_dropped_locks(); 2261 - 2262 - current->faults_disabled_mapping = NULL; 2263 - 2264 - /* 2265 - * If the fault handler returned an error but also signalled 2266 - * that it dropped & retook ei_pagecache_lock, we just need to 2267 - * re-shoot down the page cache and retry: 2268 - */ 2269 - if (dropped_locks && ret) 2270 - ret = 0; 2271 - 2272 - if (unlikely(ret < 0)) 2273 - goto 
err; 2274 - 2275 - if (unlikely(dropped_locks)) { 2276 - ret = write_invalidate_inode_pages_range(mapping, 2277 - req->ki_pos, 2278 - req->ki_pos + iter_count - 1); 2279 - if (unlikely(ret)) 2280 - goto err; 2281 - 2282 - if (!bio->bi_iter.bi_size) 2283 - continue; 2284 - } 2285 - 2286 - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); 2287 - bio->bi_iter.bi_size -= unaligned; 2288 - iov_iter_revert(&dio->iter, unaligned); 2289 - 2290 - if (!bio->bi_iter.bi_size) { 2291 - /* 2292 - * bio_iov_iter_get_pages was only able to get < 2293 - * blocksize worth of pages: 2294 - */ 2295 - ret = -EFAULT; 2296 - goto err; 2297 - } 2298 - 2299 - bch2_write_op_init(&dio->op, c, opts); 2300 - dio->op.end_io = sync 2301 - ? NULL 2302 - : bch2_dio_write_loop_async; 2303 - dio->op.target = dio->op.opts.foreground_target; 2304 - dio->op.write_point = writepoint_hashed((unsigned long) current); 2305 - dio->op.nr_replicas = dio->op.opts.data_replicas; 2306 - dio->op.subvol = inode->ei_subvol; 2307 - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); 2308 - dio->op.devs_need_flush = &inode->ei_devs_need_flush; 2309 - 2310 - if (sync) 2311 - dio->op.flags |= BCH_WRITE_SYNC; 2312 - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; 2313 - 2314 - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, 2315 - bio_sectors(bio), true); 2316 - if (unlikely(ret)) 2317 - goto err; 2318 - 2319 - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), 2320 - dio->op.opts.data_replicas, 0); 2321 - if (unlikely(ret) && 2322 - !bch2_dio_write_check_allocated(dio)) 2323 - goto err; 2324 - 2325 - task_io_account_write(bio->bi_iter.bi_size); 2326 - 2327 - if (unlikely(dio->iter.count) && 2328 - !dio->sync && 2329 - !dio->loop && 2330 - bch2_dio_write_copy_iov(dio)) 2331 - dio->sync = sync = true; 2332 - 2333 - dio->loop = true; 2334 - closure_call(&dio->op.cl, bch2_write, NULL, NULL); 2335 - 2336 - if (!sync) 2337 - return -EIOCBQUEUED; 2338 - 2339 - bch2_dio_write_end(dio); 2340 - 
2341 - if (likely(!dio->iter.count) || dio->op.error) 2342 - break; 2343 - 2344 - bio_reset(bio, NULL, REQ_OP_WRITE); 2345 - } 2346 - out: 2347 - return bch2_dio_write_done(dio); 2348 - err: 2349 - dio->op.error = ret; 2350 - 2351 - bio_release_pages(bio, false); 2352 - 2353 - bch2_quota_reservation_put(c, inode, &dio->quota_res); 2354 - goto out; 2355 - } 2356 - 2357 - static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) 2358 - { 2359 - struct mm_struct *mm = dio->mm; 2360 - 2361 - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); 2362 - 2363 - if (mm) 2364 - kthread_use_mm(mm); 2365 - bch2_dio_write_loop(dio); 2366 - if (mm) 2367 - kthread_unuse_mm(mm); 2368 - } 2369 - 2370 - static void bch2_dio_write_loop_async(struct bch_write_op *op) 2371 - { 2372 - struct dio_write *dio = container_of(op, struct dio_write, op); 2373 - 2374 - bch2_dio_write_end(dio); 2375 - 2376 - if (likely(!dio->iter.count) || dio->op.error) 2377 - bch2_dio_write_done(dio); 2378 - else 2379 - bch2_dio_write_continue(dio); 2380 - } 2381 - 2382 - static noinline 2383 - ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) 2384 - { 2385 - struct file *file = req->ki_filp; 2386 - struct address_space *mapping = file->f_mapping; 2387 - struct bch_inode_info *inode = file_bch_inode(file); 2388 - struct bch_fs *c = inode->v.i_sb->s_fs_info; 2389 - struct dio_write *dio; 2390 - struct bio *bio; 2391 - bool locked = true, extending; 2392 - ssize_t ret; 2393 - 2394 - prefetch(&c->opts); 2395 - prefetch((void *) &c->opts + 64); 2396 - prefetch(&inode->ei_inode); 2397 - prefetch((void *) &inode->ei_inode + 64); 2398 - 2399 - inode_lock(&inode->v); 2400 - 2401 - ret = generic_write_checks(req, iter); 2402 - if (unlikely(ret <= 0)) 2403 - goto err; 2404 - 2405 - ret = file_remove_privs(file); 2406 - if (unlikely(ret)) 2407 - goto err; 2408 - 2409 - ret = file_update_time(file); 2410 - if (unlikely(ret)) 2411 - goto err; 2412 - 2413 - if 
(unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) 2414 - goto err; 2415 - 2416 - inode_dio_begin(&inode->v); 2417 - bch2_pagecache_block_get(inode); 2418 - 2419 - extending = req->ki_pos + iter->count > inode->v.i_size; 2420 - if (!extending) { 2421 - inode_unlock(&inode->v); 2422 - locked = false; 2423 - } 2424 - 2425 - bio = bio_alloc_bioset(NULL, 2426 - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), 2427 - REQ_OP_WRITE, 2428 - GFP_KERNEL, 2429 - &c->dio_write_bioset); 2430 - dio = container_of(bio, struct dio_write, op.wbio.bio); 2431 - dio->req = req; 2432 - dio->mapping = mapping; 2433 - dio->inode = inode; 2434 - dio->mm = current->mm; 2435 - dio->loop = false; 2436 - dio->extending = extending; 2437 - dio->sync = is_sync_kiocb(req) || extending; 2438 - dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; 2439 - dio->free_iov = false; 2440 - dio->quota_res.sectors = 0; 2441 - dio->written = 0; 2442 - dio->iter = *iter; 2443 - dio->op.c = c; 2444 - 2445 - if (unlikely(mapping->nrpages)) { 2446 - ret = write_invalidate_inode_pages_range(mapping, 2447 - req->ki_pos, 2448 - req->ki_pos + iter->count - 1); 2449 - if (unlikely(ret)) 2450 - goto err_put_bio; 2451 - } 2452 - 2453 - ret = bch2_dio_write_loop(dio); 2454 - err: 2455 - if (locked) 2456 - inode_unlock(&inode->v); 2457 - return ret; 2458 - err_put_bio: 2459 - bch2_pagecache_block_put(inode); 2460 - bio_put(bio); 2461 - inode_dio_end(&inode->v); 2462 - goto err; 2463 - } 2464 - 2465 - ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 2466 - { 2467 - struct file *file = iocb->ki_filp; 2468 - struct bch_inode_info *inode = file_bch_inode(file); 2469 - ssize_t ret; 2470 - 2471 - if (iocb->ki_flags & IOCB_DIRECT) { 2472 - ret = bch2_direct_write(iocb, from); 2473 - goto out; 2474 - } 2475 - 2476 - inode_lock(&inode->v); 2477 - 2478 - ret = generic_write_checks(iocb, from); 2479 - if (ret <= 0) 2480 - goto unlock; 2481 - 2482 - ret = file_remove_privs(file); 2483 - if (ret) 
2484 - goto unlock; 2485 - 2486 - ret = file_update_time(file); 2487 - if (ret) 2488 - goto unlock; 2489 - 2490 - ret = bch2_buffered_write(iocb, from); 2491 - if (likely(ret > 0)) 2492 - iocb->ki_pos += ret; 2493 - unlock: 2494 - inode_unlock(&inode->v); 2495 - 2496 - if (ret > 0) 2497 - ret = generic_write_sync(iocb, ret); 2498 - out: 2499 - return bch2_err_class(ret); 2500 434 } 2501 435 2502 436 /* fsync: */ ··· 302 2908 s->s[i].nr_replicas = 0; 303 2909 304 2910 i_sectors_delta -= s->s[i].state == SECTOR_dirty; 305 - folio_sector_set(folio, s, i, SECTOR_unallocated); 2911 + bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); 306 2912 } 307 2913 308 - i_sectors_acct(c, inode, NULL, i_sectors_delta); 2914 + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 309 2915 310 2916 /* 311 2917 * Caller needs to know whether this folio will be written out by ··· 496 3102 ret = bch2_fpunch(c, inode_inum(inode), 497 3103 round_up(iattr->ia_size, block_bytes(c)) >> 9, 498 3104 U64_MAX, &i_sectors_delta); 499 - i_sectors_acct(c, inode, NULL, i_sectors_delta); 3105 + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 500 3106 501 3107 bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && 502 3108 !bch2_journal_error(&c->journal), c, ··· 550 3156 ret = bch2_fpunch(c, inode_inum(inode), 551 3157 block_start >> 9, block_end >> 9, 552 3158 &i_sectors_delta); 553 - i_sectors_acct(c, inode, NULL, i_sectors_delta); 3159 + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 554 3160 } 555 3161 556 3162 mutex_lock(&inode->ei_update_lock); ··· 601 3207 602 3208 new_size = inode->v.i_size + shift; 603 3209 604 - ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 3210 + ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); 605 3211 if (ret) 606 3212 return ret; 607 3213 ··· 617 3223 ret = bch2_fpunch(c, inode_inum(inode), 618 3224 offset >> 9, (offset + len) >> 9, 619 3225 &i_sectors_delta); 620 - i_sectors_acct(c, inode, 
NULL, i_sectors_delta); 3226 + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); 621 3227 622 3228 if (ret) 623 3229 return ret; ··· 838 3444 if (ret) 839 3445 goto bkey_err; 840 3446 841 - i_sectors_acct(c, inode, &quota_res, i_sectors_delta); 3447 + bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); 842 3448 843 3449 drop_locks_do(&trans, 844 - (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); 3450 + (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); 845 3451 bkey_err: 846 3452 bch2_quota_reservation_put(c, inode, &quota_res); 847 3453 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ··· 854 3460 855 3461 bch2_fpunch_at(&trans, &iter, inode_inum(inode), 856 3462 end_sector, &i_sectors_delta); 857 - i_sectors_acct(c, inode, &quota_res, i_sectors_delta); 3463 + bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); 858 3464 bch2_quota_reservation_put(c, inode, &quota_res); 859 3465 } 860 3466 ··· 1048 3654 1049 3655 aligned_len = round_up((u64) len, block_bytes(c)); 1050 3656 1051 - ret = write_invalidate_inode_pages_range(dst->v.i_mapping, 3657 + ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, 1052 3658 pos_dst, pos_dst + len - 1); 1053 3659 if (ret) 1054 3660 goto err; ··· 1060 3666 1061 3667 file_update_time(file_dst); 1062 3668 1063 - mark_pagecache_unallocated(src, pos_src >> 9, 3669 + bch2_mark_pagecache_unallocated(src, pos_src >> 9, 1064 3670 (pos_src + aligned_len) >> 9); 1065 3671 1066 3672 ret = bch2_remap_range(c, ··· 1076 3682 */ 1077 3683 ret = min((u64) ret << 9, (u64) len); 1078 3684 1079 - i_sectors_acct(c, dst, &quota_res, i_sectors_delta); 3685 + bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta); 1080 3686 1081 3687 spin_lock(&dst->v.i_lock); 1082 3688 if (pos_dst + ret > dst->v.i_size) ··· 1094 3700 } 1095 3701 1096 3702 /* fseek: */ 1097 - 1098 - static int folio_data_offset(struct folio *folio, loff_t pos, 1099 - unsigned min_replicas) 1100 - { 1101 - struct 
bch_folio *s = bch2_folio(folio); 1102 - unsigned i, sectors = folio_sectors(folio); 1103 - 1104 - if (s) 1105 - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) 1106 - if (s->s[i].state >= SECTOR_dirty && 1107 - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) 1108 - return i << SECTOR_SHIFT; 1109 - 1110 - return -1; 1111 - } 1112 - 1113 - static loff_t bch2_seek_pagecache_data(struct inode *vinode, 1114 - loff_t start_offset, 1115 - loff_t end_offset, 1116 - unsigned min_replicas, 1117 - bool nonblock) 1118 - { 1119 - struct folio_batch fbatch; 1120 - pgoff_t start_index = start_offset >> PAGE_SHIFT; 1121 - pgoff_t end_index = end_offset >> PAGE_SHIFT; 1122 - pgoff_t index = start_index; 1123 - unsigned i; 1124 - loff_t ret; 1125 - int offset; 1126 - 1127 - folio_batch_init(&fbatch); 1128 - 1129 - while (filemap_get_folios(vinode->i_mapping, 1130 - &index, end_index, &fbatch)) { 1131 - for (i = 0; i < folio_batch_count(&fbatch); i++) { 1132 - struct folio *folio = fbatch.folios[i]; 1133 - 1134 - if (!nonblock) { 1135 - folio_lock(folio); 1136 - } else if (!folio_trylock(folio)) { 1137 - folio_batch_release(&fbatch); 1138 - return -EAGAIN; 1139 - } 1140 - 1141 - offset = folio_data_offset(folio, 1142 - max(folio_pos(folio), start_offset), 1143 - min_replicas); 1144 - if (offset >= 0) { 1145 - ret = clamp(folio_pos(folio) + offset, 1146 - start_offset, end_offset); 1147 - folio_unlock(folio); 1148 - folio_batch_release(&fbatch); 1149 - return ret; 1150 - } 1151 - folio_unlock(folio); 1152 - } 1153 - folio_batch_release(&fbatch); 1154 - cond_resched(); 1155 - } 1156 - 1157 - return end_offset; 1158 - } 1159 3703 1160 3704 static loff_t bch2_seek_data(struct file *file, u64 offset) 1161 3705 { ··· 1146 3814 return -ENXIO; 1147 3815 1148 3816 return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); 1149 - } 1150 - 1151 - static int folio_hole_offset(struct address_space *mapping, loff_t *offset, 1152 - unsigned min_replicas, bool nonblock) 1153 - { 
1154 - struct folio *folio; 1155 - struct bch_folio *s; 1156 - unsigned i, sectors; 1157 - bool ret = true; 1158 - 1159 - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, 1160 - FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); 1161 - if (folio == ERR_PTR(-EAGAIN)) 1162 - return -EAGAIN; 1163 - if (IS_ERR_OR_NULL(folio)) 1164 - return true; 1165 - 1166 - s = bch2_folio(folio); 1167 - if (!s) 1168 - goto unlock; 1169 - 1170 - sectors = folio_sectors(folio); 1171 - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) 1172 - if (s->s[i].state < SECTOR_dirty || 1173 - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { 1174 - *offset = max(*offset, 1175 - folio_pos(folio) + (i << SECTOR_SHIFT)); 1176 - goto unlock; 1177 - } 1178 - 1179 - *offset = folio_end_pos(folio); 1180 - ret = false; 1181 - unlock: 1182 - folio_unlock(folio); 1183 - folio_put(folio); 1184 - return ret; 1185 - } 1186 - 1187 - static loff_t bch2_seek_pagecache_hole(struct inode *vinode, 1188 - loff_t start_offset, 1189 - loff_t end_offset, 1190 - unsigned min_replicas, 1191 - bool nonblock) 1192 - { 1193 - struct address_space *mapping = vinode->i_mapping; 1194 - loff_t offset = start_offset; 1195 - 1196 - while (offset < end_offset && 1197 - !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) 1198 - ; 1199 - 1200 - return min(offset, end_offset); 1201 - } 1202 - 1203 - static int bch2_clamp_data_hole(struct inode *inode, 1204 - u64 *hole_start, 1205 - u64 *hole_end, 1206 - unsigned min_replicas, 1207 - bool nonblock) 1208 - { 1209 - loff_t ret; 1210 - 1211 - ret = bch2_seek_pagecache_hole(inode, 1212 - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; 1213 - if (ret < 0) 1214 - return ret; 1215 - 1216 - *hole_start = ret; 1217 - 1218 - if (*hole_start == *hole_end) 1219 - return 0; 1220 - 1221 - ret = bch2_seek_pagecache_data(inode, 1222 - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; 1223 - if (ret < 0) 1224 - return ret; 1225 - 1226 
- *hole_end = ret; 1227 - return 0; 1228 3817 } 1229 3818 1230 3819 static loff_t bch2_seek_hole(struct file *file, u64 offset) ··· 1232 3979 void bch2_fs_fsio_exit(struct bch_fs *c) 1233 3980 { 1234 3981 bioset_exit(&c->nocow_flush_bioset); 1235 - bioset_exit(&c->dio_write_bioset); 1236 - bioset_exit(&c->dio_read_bioset); 1237 - bioset_exit(&c->writepage_bioset); 1238 3982 } 1239 3983 1240 3984 int bch2_fs_fsio_init(struct bch_fs *c) 1241 3985 { 1242 - if (bioset_init(&c->writepage_bioset, 1243 - 4, offsetof(struct bch_writepage_io, op.wbio.bio), 1244 - BIOSET_NEED_BVECS)) 1245 - return -BCH_ERR_ENOMEM_writepage_bioset_init; 1246 - 1247 - if (bioset_init(&c->dio_read_bioset, 1248 - 4, offsetof(struct dio_read, rbio.bio), 1249 - BIOSET_NEED_BVECS)) 1250 - return -BCH_ERR_ENOMEM_dio_read_bioset_init; 1251 - 1252 - if (bioset_init(&c->dio_write_bioset, 1253 - 4, offsetof(struct dio_write, op.wbio.bio), 1254 - BIOSET_NEED_BVECS)) 1255 - return -BCH_ERR_ENOMEM_dio_write_bioset_init; 1256 - 1257 3986 if (bioset_init(&c->nocow_flush_bioset, 1258 3987 1, offsetof(struct nocow_flush, bio), 0)) 1259 3988 return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
+149 -19
fs/bcachefs/fs-io.h
··· 5 5 #ifndef NO_BCACHEFS_FS 6 6 7 7 #include "buckets.h" 8 + #include "fs.h" 8 9 #include "io_types.h" 10 + #include "quota.h" 9 11 10 12 #include <linux/uio.h> 11 13 12 - struct quota_res; 14 + struct folio_vec { 15 + struct folio *fv_folio; 16 + size_t fv_offset; 17 + size_t fv_len; 18 + }; 19 + 20 + static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) 21 + { 22 + 23 + struct folio *folio = page_folio(bv.bv_page); 24 + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + 25 + bv.bv_offset; 26 + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); 27 + 28 + return (struct folio_vec) { 29 + .fv_folio = folio, 30 + .fv_offset = offset, 31 + .fv_len = len, 32 + }; 33 + } 34 + 35 + static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, 36 + struct bvec_iter iter) 37 + { 38 + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); 39 + } 40 + 41 + #define __bio_for_each_folio(bvl, bio, iter, start) \ 42 + for (iter = (start); \ 43 + (iter).bi_size && \ 44 + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ 45 + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) 46 + 47 + /** 48 + * bio_for_each_folio - iterate over folios within a bio 49 + * 50 + * Like other non-_all versions, this iterates over what bio->bi_iter currently 51 + * points to. This version is for drivers, where the bio may have previously 52 + * been split or cloned. 
53 + */ 54 + #define bio_for_each_folio(bvl, bio, iter) \ 55 + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) 56 + 57 + struct quota_res { 58 + u64 sectors; 59 + }; 60 + 61 + #ifdef CONFIG_BCACHEFS_QUOTA 62 + 63 + static inline void __bch2_quota_reservation_put(struct bch_fs *c, 64 + struct bch_inode_info *inode, 65 + struct quota_res *res) 66 + { 67 + BUG_ON(res->sectors > inode->ei_quota_reserved); 68 + 69 + bch2_quota_acct(c, inode->ei_qid, Q_SPC, 70 + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); 71 + inode->ei_quota_reserved -= res->sectors; 72 + res->sectors = 0; 73 + } 74 + 75 + static inline void bch2_quota_reservation_put(struct bch_fs *c, 76 + struct bch_inode_info *inode, 77 + struct quota_res *res) 78 + { 79 + if (res->sectors) { 80 + mutex_lock(&inode->ei_quota_lock); 81 + __bch2_quota_reservation_put(c, inode, res); 82 + mutex_unlock(&inode->ei_quota_lock); 83 + } 84 + } 85 + 86 + static inline int bch2_quota_reservation_add(struct bch_fs *c, 87 + struct bch_inode_info *inode, 88 + struct quota_res *res, 89 + u64 sectors, 90 + bool check_enospc) 91 + { 92 + int ret; 93 + 94 + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) 95 + return 0; 96 + 97 + mutex_lock(&inode->ei_quota_lock); 98 + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, 99 + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); 100 + if (likely(!ret)) { 101 + inode->ei_quota_reserved += sectors; 102 + res->sectors += sectors; 103 + } 104 + mutex_unlock(&inode->ei_quota_lock); 105 + 106 + return ret; 107 + } 108 + 109 + #else 110 + 111 + static inline void __bch2_quota_reservation_put(struct bch_fs *c, 112 + struct bch_inode_info *inode, 113 + struct quota_res *res) {} 114 + 115 + static inline void bch2_quota_reservation_put(struct bch_fs *c, 116 + struct bch_inode_info *inode, 117 + struct quota_res *res) {} 118 + 119 + static inline int bch2_quota_reservation_add(struct bch_fs *c, 120 + struct bch_inode_info *inode, 121 + struct quota_res *res, 122 + unsigned sectors, 123 + bool check_enospc) 124 + { 125 + return 0; 126 + } 127 + 128 + #endif 129 + 130 + void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, 131 + struct quota_res *, s64); 132 + 133 + static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, 134 + struct quota_res *quota_res, s64 sectors) 135 + { 136 + if (sectors) { 137 + mutex_lock(&inode->ei_quota_lock); 138 + __bch2_i_sectors_acct(c, inode, quota_res, sectors); 139 + mutex_unlock(&inode->ei_quota_lock); 140 + } 141 + } 142 + 143 + static inline struct address_space *faults_disabled_mapping(void) 144 + { 145 + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); 146 + } 147 + 148 + static inline void set_fdm_dropped_locks(void) 149 + { 150 + current->faults_disabled_mapping = 151 + (void *) (((unsigned long) current->faults_disabled_mapping)|1); 152 + } 153 + 154 + static inline bool fdm_dropped_locks(void) 155 + { 156 + return ((unsigned long) current->faults_disabled_mapping) & 1; 157 + } 158 + 159 + void bch2_inode_flush_nocow_writes_async(struct bch_fs *, 160 + struct bch_inode_info *, struct closure *); 13 161 14 162 int __must_check bch2_write_inode_size(struct bch_fs *, 15 163 struct bch_inode_info *, 16 164 loff_t, unsigned); 17 - 18 - int 
bch2_read_folio(struct file *, struct folio *); 19 - 20 - int bch2_writepages(struct address_space *, struct writeback_control *); 21 - void bch2_readahead(struct readahead_control *); 22 - 23 - int bch2_write_begin(struct file *, struct address_space *, loff_t, 24 - unsigned, struct page **, void **); 25 - int bch2_write_end(struct file *, struct address_space *, loff_t, 26 - unsigned, unsigned, struct page *, void *); 27 - 28 - ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); 29 - ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); 30 165 31 166 int bch2_fsync(struct file *, loff_t, loff_t, int); 32 167 ··· 173 38 loff_t, loff_t, unsigned); 174 39 175 40 loff_t bch2_llseek(struct file *, loff_t, int); 176 - 177 - vm_fault_t bch2_page_fault(struct vm_fault *); 178 - vm_fault_t bch2_page_mkwrite(struct vm_fault *); 179 - void bch2_invalidate_folio(struct folio *, size_t, size_t); 180 - bool bch2_release_folio(struct folio *, gfp_t); 181 41 182 42 void bch2_fs_fsio_exit(struct bch_fs *); 183 43 int bch2_fs_fsio_init(struct bch_fs *);
+3
fs/bcachefs/fs.c
··· 14 14 #include "fs-common.h" 15 15 #include "fs-io.h" 16 16 #include "fs-ioctl.h" 17 + #include "fs-io-buffered.h" 18 + #include "fs-io-direct.h" 19 + #include "fs-io-pagecache.h" 17 20 #include "fsck.h" 18 21 #include "inode.h" 19 22 #include "io.h"
+7 -1
fs/bcachefs/super.c
··· 30 30 #include "error.h" 31 31 #include "fs.h" 32 32 #include "fs-io.h" 33 + #include "fs-io-buffered.h" 34 + #include "fs-io-direct.h" 33 35 #include "fsck.h" 34 36 #include "inode.h" 35 37 #include "io.h" ··· 471 469 bch2_fs_counters_exit(c); 472 470 bch2_fs_snapshots_exit(c); 473 471 bch2_fs_quota_exit(c); 472 + bch2_fs_fs_io_direct_exit(c); 473 + bch2_fs_fs_io_buffered_exit(c); 474 474 bch2_fs_fsio_exit(c); 475 475 bch2_fs_ec_exit(c); 476 476 bch2_fs_encryption_exit(c); ··· 846 842 bch2_fs_encryption_init(c) ?: 847 843 bch2_fs_compress_init(c) ?: 848 844 bch2_fs_ec_init(c) ?: 849 - bch2_fs_fsio_init(c); 845 + bch2_fs_fsio_init(c) ?: 846 + bch2_fs_fs_io_buffered_init(c); 847 + bch2_fs_fs_io_direct_init(c); 850 848 if (ret) 851 849 goto err; 852 850