Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

writeback: update writeback tracepoints to report cgroup

The following tracepoints are updated to report the cgroup used during
cgroup writeback.

* writeback_write_inode[_start]
* writeback_queue
* writeback_exec
* writeback_start
* writeback_written
* writeback_wait
* writeback_nowork
* writeback_wake_background
* wbc_writepage
* writeback_queue_io
* bdi_dirty_ratelimit
* balance_dirty_pages
* writeback_sb_inodes_requeue
* writeback_single_inode[_start]

Note that writeback_bdi_register is separated out from writeback_class,
as reporting the cgroup doesn't make sense for it. Tracepoints which
take a bdi are updated to take a bdi_writeback instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>

Authored by Tejun Heo and committed by Jens Axboe.
5634cc2a 9acee9c5

+152 -50
+7 -7
fs/fs-writeback.c
··· 176 176 static void wb_queue_work(struct bdi_writeback *wb, 177 177 struct wb_writeback_work *work) 178 178 { 179 - trace_writeback_queue(wb->bdi, work); 179 + trace_writeback_queue(wb, work); 180 180 181 181 spin_lock_bh(&wb->work_lock); 182 182 if (!test_bit(WB_registered, &wb->state)) ··· 883 883 */ 884 884 work = kzalloc(sizeof(*work), GFP_ATOMIC); 885 885 if (!work) { 886 - trace_writeback_nowork(wb->bdi); 886 + trace_writeback_nowork(wb); 887 887 wb_wakeup(wb); 888 888 return; 889 889 } ··· 913 913 * We just wake up the flusher thread. It will perform background 914 914 * writeback as soon as there is no other work to do. 915 915 */ 916 - trace_writeback_wake_background(wb->bdi); 916 + trace_writeback_wake_background(wb); 917 917 wb_wakeup(wb); 918 918 } 919 919 ··· 1616 1616 } else if (work->for_background) 1617 1617 oldest_jif = jiffies; 1618 1618 1619 - trace_writeback_start(wb->bdi, work); 1619 + trace_writeback_start(wb, work); 1620 1620 if (list_empty(&wb->b_io)) 1621 1621 queue_io(wb, work); 1622 1622 if (work->sb) 1623 1623 progress = writeback_sb_inodes(work->sb, wb, work); 1624 1624 else 1625 1625 progress = __writeback_inodes_wb(wb, work); 1626 - trace_writeback_written(wb->bdi, work); 1626 + trace_writeback_written(wb, work); 1627 1627 1628 1628 wb_update_bandwidth(wb, wb_start); 1629 1629 ··· 1648 1648 * we'll just busyloop. 1649 1649 */ 1650 1650 if (!list_empty(&wb->b_more_io)) { 1651 - trace_writeback_wait(wb->bdi, work); 1651 + trace_writeback_wait(wb, work); 1652 1652 inode = wb_inode(wb->b_more_io.prev); 1653 1653 spin_lock(&inode->i_lock); 1654 1654 spin_unlock(&wb->list_lock); ··· 1754 1754 while ((work = get_next_work_item(wb)) != NULL) { 1755 1755 struct wb_completion *done = work->done; 1756 1756 1757 - trace_writeback_exec(wb->bdi, work); 1757 + trace_writeback_exec(wb, work); 1758 1758 1759 1759 wrote += wb_writeback(wb, work); 1760 1760
+142 -40
include/trace/events/writeback.h
··· 131 131 TP_ARGS(inode, flags) 132 132 ); 133 133 134 + #ifdef CREATE_TRACE_POINTS 135 + #ifdef CONFIG_CGROUP_WRITEBACK 136 + 137 + static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) 138 + { 139 + return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1; 140 + } 141 + 142 + static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) 143 + { 144 + struct cgroup *cgrp = wb->memcg_css->cgroup; 145 + char *path; 146 + 147 + path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1); 148 + WARN_ON_ONCE(path != buf); 149 + } 150 + 151 + static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) 152 + { 153 + if (wbc->wb) 154 + return __trace_wb_cgroup_size(wbc->wb); 155 + else 156 + return 2; 157 + } 158 + 159 + static inline void __trace_wbc_assign_cgroup(char *buf, 160 + struct writeback_control *wbc) 161 + { 162 + if (wbc->wb) 163 + __trace_wb_assign_cgroup(buf, wbc->wb); 164 + else 165 + strcpy(buf, "/"); 166 + } 167 + 168 + #else /* CONFIG_CGROUP_WRITEBACK */ 169 + 170 + static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) 171 + { 172 + return 2; 173 + } 174 + 175 + static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) 176 + { 177 + strcpy(buf, "/"); 178 + } 179 + 180 + static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) 181 + { 182 + return 2; 183 + } 184 + 185 + static inline void __trace_wbc_assign_cgroup(char *buf, 186 + struct writeback_control *wbc) 187 + { 188 + strcpy(buf, "/"); 189 + } 190 + 191 + #endif /* CONFIG_CGROUP_WRITEBACK */ 192 + #endif /* CREATE_TRACE_POINTS */ 193 + 134 194 DECLARE_EVENT_CLASS(writeback_write_inode_template, 135 195 136 196 TP_PROTO(struct inode *inode, struct writeback_control *wbc), ··· 201 141 __array(char, name, 32) 202 142 __field(unsigned long, ino) 203 143 __field(int, sync_mode) 144 + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) 204 145 ), 205 146 206 147 TP_fast_assign( ··· 209 
148 dev_name(inode_to_bdi(inode)->dev), 32); 210 149 __entry->ino = inode->i_ino; 211 150 __entry->sync_mode = wbc->sync_mode; 151 + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); 212 152 ), 213 153 214 - TP_printk("bdi %s: ino=%lu sync_mode=%d", 154 + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s", 215 155 __entry->name, 216 156 __entry->ino, 217 - __entry->sync_mode 157 + __entry->sync_mode, 158 + __get_str(cgroup) 218 159 ) 219 160 ); 220 161 ··· 235 172 ); 236 173 237 174 DECLARE_EVENT_CLASS(writeback_work_class, 238 - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), 239 - TP_ARGS(bdi, work), 175 + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), 176 + TP_ARGS(wb, work), 240 177 TP_STRUCT__entry( 241 178 __array(char, name, 32) 242 179 __field(long, nr_pages) ··· 246 183 __field(int, range_cyclic) 247 184 __field(int, for_background) 248 185 __field(int, reason) 186 + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) 249 187 ), 250 188 TP_fast_assign( 251 189 strncpy(__entry->name, 252 - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); 190 + wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); 253 191 __entry->nr_pages = work->nr_pages; 254 192 __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; 255 193 __entry->sync_mode = work->sync_mode; ··· 258 194 __entry->range_cyclic = work->range_cyclic; 259 195 __entry->for_background = work->for_background; 260 196 __entry->reason = work->reason; 197 + __trace_wb_assign_cgroup(__get_str(cgroup), wb); 261 198 ), 262 199 TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " 263 - "kupdate=%d range_cyclic=%d background=%d reason=%s", 200 + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s", 264 201 __entry->name, 265 202 MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), 266 203 __entry->nr_pages, ··· 269 204 __entry->for_kupdate, 270 205 __entry->range_cyclic, 271 206 __entry->for_background, 272 - __print_symbolic(__entry->reason, WB_WORK_REASON) 207 + __print_symbolic(__entry->reason, WB_WORK_REASON), 208 + __get_str(cgroup) 273 209 ) 274 210 ); 275 211 #define DEFINE_WRITEBACK_WORK_EVENT(name) \ 276 212 DEFINE_EVENT(writeback_work_class, name, \ 277 - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 278 - TP_ARGS(bdi, work)) 213 + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ 214 + TP_ARGS(wb, work)) 279 215 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 280 216 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 281 217 DEFINE_WRITEBACK_WORK_EVENT(writeback_start); ··· 296 230 ); 297 231 298 232 DECLARE_EVENT_CLASS(writeback_class, 233 + TP_PROTO(struct bdi_writeback *wb), 234 + TP_ARGS(wb), 235 + TP_STRUCT__entry( 236 + __array(char, name, 32) 237 + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) 238 + ), 239 + TP_fast_assign( 240 + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); 241 + __trace_wb_assign_cgroup(__get_str(cgroup), wb); 242 + ), 243 + TP_printk("bdi %s: cgroup=%s", 244 + __entry->name, 245 + __get_str(cgroup) 246 + ) 247 + ); 248 + #define DEFINE_WRITEBACK_EVENT(name) \ 249 + DEFINE_EVENT(writeback_class, name, \ 250 + TP_PROTO(struct bdi_writeback *wb), \ 251 + TP_ARGS(wb)) 252 + 253 + 
DEFINE_WRITEBACK_EVENT(writeback_nowork); 254 + DEFINE_WRITEBACK_EVENT(writeback_wake_background); 255 + 256 + TRACE_EVENT(writeback_bdi_register, 299 257 TP_PROTO(struct backing_dev_info *bdi), 300 258 TP_ARGS(bdi), 301 259 TP_STRUCT__entry( ··· 329 239 strncpy(__entry->name, dev_name(bdi->dev), 32); 330 240 ), 331 241 TP_printk("bdi %s", 332 - __entry->name 242 + __entry->name 333 243 ) 334 244 ); 335 - #define DEFINE_WRITEBACK_EVENT(name) \ 336 - DEFINE_EVENT(writeback_class, name, \ 337 - TP_PROTO(struct backing_dev_info *bdi), \ 338 - TP_ARGS(bdi)) 339 - 340 - DEFINE_WRITEBACK_EVENT(writeback_nowork); 341 - DEFINE_WRITEBACK_EVENT(writeback_wake_background); 342 - DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 343 245 344 246 DECLARE_EVENT_CLASS(wbc_class, 345 247 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), ··· 347 265 __field(int, range_cyclic) 348 266 __field(long, range_start) 349 267 __field(long, range_end) 268 + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) 350 269 ), 351 270 352 271 TP_fast_assign( ··· 361 278 __entry->range_cyclic = wbc->range_cyclic; 362 279 __entry->range_start = (long)wbc->range_start; 363 280 __entry->range_end = (long)wbc->range_end; 281 + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); 364 282 ), 365 283 366 284 TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " 367 285 "bgrd=%d reclm=%d cyclic=%d " 368 - "start=0x%lx end=0x%lx", 286 + "start=0x%lx end=0x%lx cgroup=%s", 369 287 __entry->name, 370 288 __entry->nr_to_write, 371 289 __entry->pages_skipped, ··· 376 292 __entry->for_reclaim, 377 293 __entry->range_cyclic, 378 294 __entry->range_start, 379 - __entry->range_end) 295 + __entry->range_end, 296 + __get_str(cgroup) 297 + ) 380 298 ) 381 299 382 300 #define DEFINE_WBC_EVENT(name) \ ··· 398 312 __field(long, age) 399 313 __field(int, moved) 400 314 __field(int, reason) 315 + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) 401 316 ), 402 317 TP_fast_assign( 403 318 
unsigned long *older_than_this = work->older_than_this; ··· 408 321 (jiffies - *older_than_this) * 1000 / HZ : -1; 409 322 __entry->moved = moved; 410 323 __entry->reason = work->reason; 324 + __trace_wb_assign_cgroup(__get_str(cgroup), wb); 411 325 ), 412 - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", 326 + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", 413 327 __entry->name, 414 328 __entry->older, /* older_than_this in jiffies */ 415 329 __entry->age, /* older_than_this in relative milliseconds */ 416 330 __entry->moved, 417 - __print_symbolic(__entry->reason, WB_WORK_REASON) 331 + __print_symbolic(__entry->reason, WB_WORK_REASON), 332 + __get_str(cgroup) 418 333 ) 419 334 ); 420 335 ··· 470 381 471 382 TRACE_EVENT(bdi_dirty_ratelimit, 472 383 473 - TP_PROTO(struct backing_dev_info *bdi, 384 + TP_PROTO(struct bdi_writeback *wb, 474 385 unsigned long dirty_rate, 475 386 unsigned long task_ratelimit), 476 387 477 - TP_ARGS(bdi, dirty_rate, task_ratelimit), 388 + TP_ARGS(wb, dirty_rate, task_ratelimit), 478 389 479 390 TP_STRUCT__entry( 480 391 __array(char, bdi, 32) ··· 484 395 __field(unsigned long, dirty_ratelimit) 485 396 __field(unsigned long, task_ratelimit) 486 397 __field(unsigned long, balanced_dirty_ratelimit) 398 + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) 487 399 ), 488 400 489 401 TP_fast_assign( 490 - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); 491 - __entry->write_bw = KBps(bdi->wb.write_bandwidth); 492 - __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); 402 + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); 403 + __entry->write_bw = KBps(wb->write_bandwidth); 404 + __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); 493 405 __entry->dirty_rate = KBps(dirty_rate); 494 - __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); 406 + __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); 495 407 __entry->task_ratelimit = KBps(task_ratelimit); 496 408 
__entry->balanced_dirty_ratelimit = 497 - KBps(bdi->wb.balanced_dirty_ratelimit); 409 + KBps(wb->balanced_dirty_ratelimit); 410 + __trace_wb_assign_cgroup(__get_str(cgroup), wb); 498 411 ), 499 412 500 413 TP_printk("bdi %s: " 501 414 "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " 502 415 "dirty_ratelimit=%lu task_ratelimit=%lu " 503 - "balanced_dirty_ratelimit=%lu", 416 + "balanced_dirty_ratelimit=%lu cgroup=%s", 504 417 __entry->bdi, 505 418 __entry->write_bw, /* write bandwidth */ 506 419 __entry->avg_write_bw, /* avg write bandwidth */ 507 420 __entry->dirty_rate, /* bdi dirty rate */ 508 421 __entry->dirty_ratelimit, /* base ratelimit */ 509 422 __entry->task_ratelimit, /* ratelimit with position control */ 510 - __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ 423 + __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ 424 + __get_str(cgroup) 511 425 ) 512 426 ); 513 427 514 428 TRACE_EVENT(balance_dirty_pages, 515 429 516 - TP_PROTO(struct backing_dev_info *bdi, 430 + TP_PROTO(struct bdi_writeback *wb, 517 431 unsigned long thresh, 518 432 unsigned long bg_thresh, 519 433 unsigned long dirty, ··· 529 437 long pause, 530 438 unsigned long start_time), 531 439 532 - TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, 440 + TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, 533 441 dirty_ratelimit, task_ratelimit, 534 442 dirtied, period, pause, start_time), 535 443 ··· 548 456 __field( long, pause) 549 457 __field(unsigned long, period) 550 458 __field( long, think) 459 + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) 551 460 ), 552 461 553 462 TP_fast_assign( 554 463 unsigned long freerun = (thresh + bg_thresh) / 2; 555 - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); 464 + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); 556 465 557 466 __entry->limit = global_wb_domain.dirty_limit; 558 467 __entry->setpoint = (global_wb_domain.dirty_limit + ··· 571 478 __entry->period = period * 1000 / HZ; 572 479 
__entry->pause = pause * 1000 / HZ; 573 480 __entry->paused = (jiffies - start_time) * 1000 / HZ; 481 + __trace_wb_assign_cgroup(__get_str(cgroup), wb); 574 482 ), 575 483 576 484 ··· 580 486 "bdi_setpoint=%lu bdi_dirty=%lu " 581 487 "dirty_ratelimit=%lu task_ratelimit=%lu " 582 488 "dirtied=%u dirtied_pause=%u " 583 - "paused=%lu pause=%ld period=%lu think=%ld", 489 + "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s", 584 490 __entry->bdi, 585 491 __entry->limit, 586 492 __entry->setpoint, ··· 594 500 __entry->paused, /* ms */ 595 501 __entry->pause, /* ms */ 596 502 __entry->period, /* ms */ 597 - __entry->think /* ms */ 503 + __entry->think, /* ms */ 504 + __get_str(cgroup) 598 505 ) 599 506 ); 600 507 ··· 609 514 __field(unsigned long, ino) 610 515 __field(unsigned long, state) 611 516 __field(unsigned long, dirtied_when) 517 + __dynamic_array(char, cgroup, 518 + __trace_wb_cgroup_size(inode_to_wb(inode))) 612 519 ), 613 520 614 521 TP_fast_assign( ··· 619 522 __entry->ino = inode->i_ino; 620 523 __entry->state = inode->i_state; 621 524 __entry->dirtied_when = inode->dirtied_when; 525 + __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode)); 622 526 ), 623 527 624 - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", 528 + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s", 625 529 __entry->name, 626 530 __entry->ino, 627 531 show_inode_state(__entry->state), 628 532 __entry->dirtied_when, 629 - (jiffies - __entry->dirtied_when) / HZ 533 + (jiffies - __entry->dirtied_when) / HZ, 534 + __get_str(cgroup) 630 535 ) 631 536 ); 632 537 ··· 684 585 __field(unsigned long, writeback_index) 685 586 __field(long, nr_to_write) 686 587 __field(unsigned long, wrote) 588 + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) 687 589 ), 688 590 689 591 TP_fast_assign( ··· 696 596 __entry->writeback_index = inode->i_mapping->writeback_index; 697 597 __entry->nr_to_write = nr_to_write; 698 598 __entry->wrote = nr_to_write - 
wbc->nr_to_write; 599 + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); 699 600 ), 700 601 701 602 TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " 702 - "index=%lu to_write=%ld wrote=%lu", 603 + "index=%lu to_write=%ld wrote=%lu cgroup=%s", 703 604 __entry->name, 704 605 __entry->ino, 705 606 show_inode_state(__entry->state), ··· 708 607 (jiffies - __entry->dirtied_when) / HZ, 709 608 __entry->writeback_index, 710 609 __entry->nr_to_write, 711 - __entry->wrote 610 + __entry->wrote, 611 + __get_str(cgroup) 712 612 ) 713 613 ); 714 614
+3 -3
mm/page-writeback.c
··· 1289 1289 wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); 1290 1290 wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; 1291 1291 1292 - trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); 1292 + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); 1293 1293 } 1294 1294 1295 1295 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, ··· 1683 1683 * do a reset, as it may be a light dirtier. 1684 1684 */ 1685 1685 if (pause < min_pause) { 1686 - trace_balance_dirty_pages(bdi, 1686 + trace_balance_dirty_pages(wb, 1687 1687 sdtc->thresh, 1688 1688 sdtc->bg_thresh, 1689 1689 sdtc->dirty, ··· 1712 1712 } 1713 1713 1714 1714 pause: 1715 - trace_balance_dirty_pages(bdi, 1715 + trace_balance_dirty_pages(wb, 1716 1716 sdtc->thresh, 1717 1717 sdtc->bg_thresh, 1718 1718 sdtc->dirty,