Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfs: Simplify the writeback code

Use the new folio_queue structures to simplify the writeback code. The
problem with referring to the i_pages xarray directly is that we may have
gaps in the sequence of folios we're writing from that we need to skip when
we're removing the writeback mark from the folios we're writing back from.

At the moment the code tries to deal with this by carefully tracking the
gaps in each writeback stream (e.g. write to server and write to cache) and
divining when there's a gap that spans folios (something that's not helped
by folios not being a consistent size).

Instead, the folio_queue buffer contains pointers only to the folios we're
dealing with, has them in ascending order and indicates a gap by placing
non-consecutive folios next to each other. This makes it possible to
track where we need to clean up to by just keeping track of where we've
processed to on each stream and taking the minimum.

Note that the I/O iterator is always rounded up to the end of the folio,
even if that is beyond the EOF position, so that the cache can do DIO from
the page. The excess space is cleared, though mmapped writes clobber it.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-18-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>

authored by

David Howells and committed by
Christian Brauner
983cdcf8 bfaa33b8

+45 -171
+24 -122
fs/netfs/write_collect.c
··· 15 15 16 16 /* Notes made in the collector */ 17 17 #define HIT_PENDING 0x01 /* A front op was still pending */ 18 - #define SOME_EMPTY 0x02 /* One of more streams are empty */ 19 - #define ALL_EMPTY 0x04 /* All streams are empty */ 20 - #define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ 21 - #define NEED_REASSESS 0x10 /* Need to loop round and reassess */ 22 - #define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ 23 - #define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ 24 - #define BUFFERED 0x80 /* The pagecache needs cleaning up */ 25 - #define NEED_RETRY 0x100 /* A front op requests retrying */ 26 - #define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */ 18 + #define NEED_REASSESS 0x02 /* Need to loop round and reassess */ 19 + #define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ 20 + #define BUFFERED 0x08 /* The pagecache needs cleaning up */ 21 + #define NEED_RETRY 0x10 /* A front op requests retrying */ 22 + #define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ 27 23 28 24 /* 29 25 * Successful completion of write of a folio to the server and/or cache. Note ··· 81 85 * Unlock any folios we've finished with. 
82 86 */ 83 87 static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, 84 - unsigned long long collected_to, 85 88 unsigned int *notes) 86 89 { 87 90 struct folio_queue *folioq = wreq->buffer; 91 + unsigned long long collected_to = wreq->collected_to; 88 92 unsigned int slot = wreq->buffer_head_slot; 89 93 90 94 if (slot >= folioq_nr_slots(folioq)) { ··· 112 116 fend = min_t(unsigned long long, fpos + flen, wreq->i_size); 113 117 114 118 trace_netfs_collect_folio(wreq, folio, fend, collected_to); 115 - 116 - if (fpos + fsize > wreq->contiguity) { 117 - trace_netfs_collect_contig(wreq, fpos + fsize, 118 - netfs_contig_trace_unlock); 119 - wreq->contiguity = fpos + fsize; 120 - } 121 119 122 120 /* Unlock any folio we've transferred all of. */ 123 121 if (collected_to < fend) ··· 370 380 { 371 381 struct netfs_io_subrequest *front, *remove; 372 382 struct netfs_io_stream *stream; 373 - unsigned long long collected_to; 383 + unsigned long long collected_to, issued_to; 374 384 unsigned int notes; 375 385 int s; 376 386 ··· 379 389 trace_netfs_rreq(wreq, netfs_rreq_trace_collect); 380 390 381 391 reassess_streams: 392 + issued_to = atomic64_read(&wreq->issued_to); 382 393 smp_rmb(); 383 394 collected_to = ULLONG_MAX; 384 - if (wreq->origin == NETFS_WRITEBACK) 385 - notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; 386 - else if (wreq->origin == NETFS_WRITETHROUGH) 387 - notes = ALL_EMPTY | BUFFERED; 395 + if (wreq->origin == NETFS_WRITEBACK || 396 + wreq->origin == NETFS_WRITETHROUGH) 397 + notes = BUFFERED; 388 398 else 389 - notes = ALL_EMPTY; 399 + notes = 0; 390 400 391 401 /* Remove completed subrequests from the front of the streams and 392 402 * advance the completion point on each stream. We stop when we hit 393 403 * something that's in progress. The issuer thread may be adding stuff 394 404 * to the tail whilst we're doing this. 
395 - * 396 - * We must not, however, merge in discontiguities that span whole 397 - * folios that aren't under writeback. This is made more complicated 398 - * by the folios in the gap being of unpredictable sizes - if they even 399 - * exist - but we don't want to look them up. 400 405 */ 401 406 for (s = 0; s < NR_IO_STREAMS; s++) { 402 - loff_t rstart, rend; 403 - 404 407 stream = &wreq->io_streams[s]; 405 408 /* Read active flag before list pointers */ 406 409 if (!smp_load_acquire(&stream->active)) ··· 405 422 //_debug("sreq [%x] %llx %zx/%zx", 406 423 // front->debug_index, front->start, front->transferred, front->len); 407 424 408 - /* Stall if there may be a discontinuity. */ 409 - rstart = round_down(front->start, PAGE_SIZE); 410 - if (rstart > wreq->contiguity) { 411 - if (wreq->contiguity > stream->collected_to) { 412 - trace_netfs_collect_gap(wreq, stream, 413 - wreq->contiguity, 'D'); 414 - stream->collected_to = wreq->contiguity; 415 - } 416 - notes |= REASSESS_DISCONTIG; 417 - break; 425 + if (stream->collected_to < front->start) { 426 + trace_netfs_collect_gap(wreq, stream, issued_to, 'F'); 427 + stream->collected_to = front->start; 418 428 } 419 - rend = round_up(front->start + front->len, PAGE_SIZE); 420 - if (rend > wreq->contiguity) { 421 - trace_netfs_collect_contig(wreq, rend, 422 - netfs_contig_trace_collect); 423 - wreq->contiguity = rend; 424 - if (notes & REASSESS_DISCONTIG) 425 - notes |= NEED_REASSESS; 426 - } 427 - notes &= ~MAYBE_DISCONTIG; 428 429 429 430 /* Stall if the front is still undergoing I/O. 
*/ 430 431 if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { ··· 450 483 front = list_first_entry_or_null(&stream->subrequests, 451 484 struct netfs_io_subrequest, rreq_link); 452 485 stream->front = front; 453 - if (!front) { 454 - unsigned long long jump_to = atomic64_read(&wreq->issued_to); 455 - 456 - if (stream->collected_to < jump_to) { 457 - trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); 458 - stream->collected_to = jump_to; 459 - } 460 - } 461 - 462 486 spin_unlock_bh(&wreq->lock); 463 487 netfs_put_subrequest(remove, false, 464 488 notes & SAW_FAILURE ? ··· 457 499 netfs_sreq_trace_put_done); 458 500 } 459 501 460 - if (front) 461 - notes &= ~ALL_EMPTY; 462 - else 463 - notes |= SOME_EMPTY; 502 + /* If we have an empty stream, we need to jump it forward 503 + * otherwise the collection point will never advance. 504 + */ 505 + if (!front && issued_to > stream->collected_to) { 506 + trace_netfs_collect_gap(wreq, stream, issued_to, 'E'); 507 + stream->collected_to = issued_to; 508 + } 464 509 465 510 if (stream->collected_to < collected_to) 466 511 collected_to = stream->collected_to; ··· 471 510 472 511 if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) 473 512 wreq->collected_to = collected_to; 474 - 475 - /* If we have an empty stream, we need to jump it forward over any gap 476 - * otherwise the collection point will never advance. 477 - * 478 - * Note that the issuer always adds to the stream with the lowest 479 - * so-far submitted start, so if we see two consecutive subreqs in one 480 - * stream with nothing between then in another stream, then the second 481 - * stream has a gap that can be jumped. 
482 - */ 483 - if (notes & SOME_EMPTY) { 484 - unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); 485 - 486 - for (s = 0; s < NR_IO_STREAMS; s++) { 487 - stream = &wreq->io_streams[s]; 488 - if (stream->active && 489 - stream->front && 490 - stream->front->start < jump_to) 491 - jump_to = stream->front->start; 492 - } 493 - 494 - for (s = 0; s < NR_IO_STREAMS; s++) { 495 - stream = &wreq->io_streams[s]; 496 - if (stream->active && 497 - !stream->front && 498 - stream->collected_to < jump_to) { 499 - trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); 500 - stream->collected_to = jump_to; 501 - } 502 - } 503 - } 504 513 505 514 for (s = 0; s < NR_IO_STREAMS; s++) { 506 515 stream = &wreq->io_streams[s]; ··· 482 551 483 552 /* Unlock any folios that we have now finished with. */ 484 553 if (notes & BUFFERED) { 485 - unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); 486 - 487 - if (wreq->cleaned_to < clean_to) 488 - netfs_writeback_unlock_folios(wreq, clean_to, &notes); 554 + if (wreq->cleaned_to < wreq->collected_to) 555 + netfs_writeback_unlock_folios(wreq, &notes); 489 556 } else { 490 557 wreq->cleaned_to = wreq->collected_to; 491 558 } 492 559 493 560 // TODO: Discard encryption buffers 494 - 495 - /* If all streams are discontiguous with the last folio we cleared, we 496 - * may need to skip a set of folios. 
497 - */ 498 - if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { 499 - unsigned long long jump_to = ULLONG_MAX; 500 - 501 - for (s = 0; s < NR_IO_STREAMS; s++) { 502 - stream = &wreq->io_streams[s]; 503 - if (stream->active && stream->front && 504 - stream->front->start < jump_to) 505 - jump_to = stream->front->start; 506 - } 507 - 508 - trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); 509 - wreq->contiguity = jump_to; 510 - wreq->cleaned_to = jump_to; 511 - wreq->collected_to = jump_to; 512 - for (s = 0; s < NR_IO_STREAMS; s++) { 513 - stream = &wreq->io_streams[s]; 514 - if (stream->collected_to < jump_to) 515 - stream->collected_to = jump_to; 516 - } 517 - //cond_resched(); 518 - notes |= MADE_PROGRESS; 519 - goto reassess_streams; 520 - } 521 561 522 562 if (notes & NEED_RETRY) 523 563 goto need_retry;
+19 -17
fs/netfs/write_issue.c
··· 107 107 if (is_buffered && netfs_is_cache_enabled(ictx)) 108 108 fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); 109 109 110 - wreq->contiguity = wreq->start; 111 110 wreq->cleaned_to = wreq->start; 112 111 113 112 wreq->io_streams[0].stream_nr = 0; ··· 157 158 subreq->source = stream->source; 158 159 subreq->start = start; 159 160 subreq->stream_nr = stream->stream_nr; 161 + subreq->io_iter = wreq->io_iter; 160 162 161 163 _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); 162 164 ··· 213 213 * netfs_write_subrequest_terminated() when complete. 214 214 */ 215 215 static void netfs_do_issue_write(struct netfs_io_stream *stream, 216 - struct netfs_io_subrequest *subreq, 217 - struct iov_iter *source) 216 + struct netfs_io_subrequest *subreq) 218 217 { 219 218 struct netfs_io_request *wreq = subreq->rreq; 220 - size_t size = subreq->len - subreq->transferred; 221 219 222 220 _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); 223 221 224 222 if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) 225 223 return netfs_write_subrequest_terminated(subreq, subreq->error, false); 226 - 227 - // TODO: Use encrypted buffer 228 - subreq->io_iter = *source; 229 - iov_iter_advance(source, size); 230 - iov_iter_truncate(&subreq->io_iter, size); 231 224 232 225 trace_netfs_sreq(subreq, netfs_sreq_trace_submit); 233 226 stream->issue_write(subreq); ··· 230 237 struct netfs_io_subrequest *subreq, 231 238 struct iov_iter *source) 232 239 { 240 + size_t size = subreq->len - subreq->transferred; 241 + 242 + // TODO: Use encrypted buffer 243 + subreq->io_iter = *source; 244 + iov_iter_advance(source, size); 245 + iov_iter_truncate(&subreq->io_iter, size); 246 + 233 247 __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); 234 - netfs_do_issue_write(stream, subreq, source); 248 + netfs_do_issue_write(stream, subreq); 235 249 } 236 250 237 251 static void netfs_issue_write(struct netfs_io_request *wreq, ··· 249 249 if (!subreq) 250 250 
return; 251 251 stream->construct = NULL; 252 - 253 - if (subreq->start + subreq->len > wreq->start + wreq->submitted) 254 - WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); 255 - netfs_do_issue_write(stream, subreq, &wreq->io_iter); 252 + subreq->io_iter.count = subreq->len; 253 + netfs_do_issue_write(stream, subreq); 256 254 } 257 255 258 256 /* ··· 462 464 if (choose_s < 0) 463 465 break; 464 466 stream = &wreq->io_streams[choose_s]; 467 + wreq->io_iter.iov_offset = stream->submit_off; 465 468 469 + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); 466 470 part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, 467 471 stream->submit_len, to_eof); 468 - atomic64_set(&wreq->issued_to, fpos + stream->submit_off); 469 472 stream->submit_off += part; 470 473 stream->submit_max_len -= part; 471 474 if (part > stream->submit_len) ··· 477 478 debug = true; 478 479 } 479 480 481 + wreq->io_iter.iov_offset = 0; 482 + iov_iter_advance(&wreq->io_iter, fsize); 480 483 atomic64_set(&wreq->issued_to, fpos + fsize); 481 484 482 485 if (!debug) ··· 527 526 netfs_stat(&netfs_n_wh_writepages); 528 527 529 528 do { 530 - _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); 529 + _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); 531 530 532 531 /* It appears we don't have to handle cyclic writeback wrapping. 
*/ 533 - WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); 532 + WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to)); 534 533 535 534 if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && 536 535 unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { ··· 674 673 part = netfs_advance_write(wreq, upload, start, len, false); 675 674 start += part; 676 675 len -= part; 676 + iov_iter_advance(&wreq->io_iter, part); 677 677 if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { 678 678 trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); 679 679 wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
-1
include/linux/netfs.h
··· 257 257 unsigned long long i_size; /* Size of the file */ 258 258 unsigned long long start; /* Start position */ 259 259 atomic64_t issued_to; /* Write issuer folio cursor */ 260 - unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ 261 260 unsigned long long collected_to; /* Point we've collected to */ 262 261 unsigned long long cleaned_to; /* Position we've cleaned folios to */ 263 262 pgoff_t no_unlock_folio; /* Don't unlock this folio after read */
+2 -31
include/trace/events/netfs.h
··· 512 512 __entry->start + __entry->len) 513 513 ); 514 514 515 - TRACE_EVENT(netfs_collect_contig, 516 - TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to, 517 - enum netfs_collect_contig_trace type), 518 - 519 - TP_ARGS(wreq, to, type), 520 - 521 - TP_STRUCT__entry( 522 - __field(unsigned int, wreq) 523 - __field(enum netfs_collect_contig_trace, type) 524 - __field(unsigned long long, contiguity) 525 - __field(unsigned long long, to) 526 - ), 527 - 528 - TP_fast_assign( 529 - __entry->wreq = wreq->debug_id; 530 - __entry->type = type; 531 - __entry->contiguity = wreq->contiguity; 532 - __entry->to = to; 533 - ), 534 - 535 - TP_printk("R=%08x %llx -> %llx %s", 536 - __entry->wreq, 537 - __entry->contiguity, 538 - __entry->to, 539 - __print_symbolic(__entry->type, netfs_collect_contig_traces)) 540 - ); 541 - 542 515 TRACE_EVENT(netfs_collect_sreq, 543 516 TP_PROTO(const struct netfs_io_request *wreq, 544 517 const struct netfs_io_subrequest *subreq), ··· 583 610 __field(unsigned int, notes ) 584 611 __field(unsigned long long, collected_to ) 585 612 __field(unsigned long long, cleaned_to ) 586 - __field(unsigned long long, contiguity ) 587 613 ), 588 614 589 615 TP_fast_assign( ··· 590 618 __entry->notes = notes; 591 619 __entry->collected_to = collected_to; 592 620 __entry->cleaned_to = wreq->cleaned_to; 593 - __entry->contiguity = wreq->contiguity; 594 621 ), 595 622 596 - TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x", 623 + TP_printk("R=%08x col=%llx cln=%llx n=%x", 597 624 __entry->wreq, __entry->collected_to, 598 - __entry->cleaned_to, __entry->contiguity, 625 + __entry->cleaned_to, 599 626 __entry->notes) 600 627 ); 601 628