Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: add PSI accounting around ->read_folio and ->readahead calls

PSI tries to account for the cost of bringing back in pages discarded by
the MM LRU management. Currently the prime place for that is hooked into
the bio submission path, which is a rather bad place:

- it does not actually account for I/O on non-block file systems, of which
we have many
- it adds overhead and a layering violation to the block layer

Add the accounting into the two places in the core MM code that read
pages into an address space by calling into ->read_folio and ->readahead,
so that the entire file system operation is covered. This broadens
the coverage and allows removing the accounting in the block layer going
forward.

As psi_memstall_enter can deal with nested calls this will not lead to
double accounting even while the bio annotations are still present.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Christoph Hellwig and committed by
Jens Axboe
17604240 e8848087

+27 -4
+2
include/linux/pagemap.h
··· 1173 1173 pgoff_t _index; 1174 1174 unsigned int _nr_pages; 1175 1175 unsigned int _batch_count; 1176 + bool _workingset; 1177 + unsigned long _pflags; 1176 1178 }; 1177 1179 1178 1180 #define DEFINE_READAHEAD(ractl, f, r, m, i) \
+7
mm/filemap.c
··· 2382 2382 static int filemap_read_folio(struct file *file, filler_t filler, 2383 2383 struct folio *folio) 2384 2384 { 2385 + bool workingset = folio_test_workingset(folio); 2386 + unsigned long pflags; 2385 2387 int error; 2386 2388 2387 2389 /* ··· 2392 2390 * fails. 2393 2391 */ 2394 2392 folio_clear_error(folio); 2393 + 2395 2394 /* Start the actual read. The read will unlock the page. */ 2395 + if (unlikely(workingset)) 2396 + psi_memstall_enter(&pflags); 2396 2397 error = filler(file, folio); 2398 + if (unlikely(workingset)) 2399 + psi_memstall_leave(&pflags); 2397 2400 if (error) 2398 2401 return error; 2399 2402
+18 -4
mm/readahead.c
··· 122 122 #include <linux/task_io_accounting_ops.h> 123 123 #include <linux/pagevec.h> 124 124 #include <linux/pagemap.h> 125 + #include <linux/psi.h> 125 126 #include <linux/syscalls.h> 126 127 #include <linux/file.h> 127 128 #include <linux/mm_inline.h> ··· 153 152 if (!readahead_count(rac)) 154 153 return; 155 154 155 + if (unlikely(rac->_workingset)) 156 + psi_memstall_enter(&rac->_pflags); 156 157 blk_start_plug(&plug); 157 158 158 159 if (aops->readahead) { ··· 182 179 } 183 180 184 181 blk_finish_plug(&plug); 182 + if (unlikely(rac->_workingset)) 183 + psi_memstall_leave(&rac->_pflags); 184 + rac->_workingset = false; 185 185 186 186 BUG_ON(readahead_count(rac)); 187 187 } ··· 258 252 } 259 253 if (i == nr_to_read - lookahead_size) 260 254 folio_set_readahead(folio); 255 + ractl->_workingset |= folio_test_workingset(folio); 261 256 ractl->_nr_pages++; 262 257 } 263 258 ··· 487 480 if (index == mark) 488 481 folio_set_readahead(folio); 489 482 err = filemap_add_folio(ractl->mapping, folio, index, gfp); 490 - if (err) 483 + if (err) { 491 484 folio_put(folio); 492 - else 493 - ractl->_nr_pages += 1UL << order; 494 - return err; 485 + return err; 486 + } 487 + 488 + ractl->_nr_pages += 1UL << order; 489 + ractl->_workingset |= folio_test_workingset(folio); 490 + return 0; 495 491 } 496 492 497 493 void page_cache_ra_order(struct readahead_control *ractl, ··· 835 825 if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { 836 826 put_page(page); 837 827 return; 828 + } 829 + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { 830 + ractl->_workingset = true; 831 + psi_memstall_enter(&ractl->_pflags); 838 832 } 839 833 ractl->_nr_pages++; 840 834 if (ra) {