Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: improve throttling decisions of background resynchronisation

Background resynchronisation does some "side-stepping", or throttles
itself, if it detects application IO activity, and the current resync
rate estimate is above the configured "c-min-rate".

What was not detected: the case where no application IO is visible,
because it is all blocked waiting on activity log transactions.

Introduce a new atomic_t ap_actlog_cnt, tracking such blocked requests,
and count non-zero as application IO activity.
This counter is exposed at proc_details level 2 and above.

Also make sure to release the currently locked resync extent
if we side-step due to such voluntary throttling.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

authored by

Lars Ellenberg and committed by
Philipp Reisner
ad3fee79 7753a4c1

+51 -18
+26 -3
drivers/block/drbd/drbd_actlog.c
··· 991 991 struct lc_element *e; 992 992 struct bm_extent *bm_ext; 993 993 int i; 994 + bool throttle = drbd_rs_should_slow_down(device, sector, true); 995 + 996 + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, 997 + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we 998 + * need to throttle. There is at most one such half-locked extent, 999 + * which is remembered in resync_wenr. */ 1000 + 1001 + if (throttle && device->resync_wenr != enr) 1002 + return -EAGAIN; 994 1003 995 1004 spin_lock_irq(&device->al_lock); 996 1005 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { ··· 1023 1014 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1024 1015 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1025 1016 device->resync_wenr = LC_FREE; 1026 - if (lc_put(device->resync, &bm_ext->lce) == 0) 1017 + if (lc_put(device->resync, &bm_ext->lce) == 0) { 1018 + bm_ext->flags = 0; 1027 1019 device->resync_locked--; 1020 + } 1028 1021 wake_up(&device->al_wait); 1029 1022 } else { 1030 1023 drbd_alert(device, "LOGIC BUG\n"); ··· 1088 1077 return 0; 1089 1078 1090 1079 try_again: 1091 - if (bm_ext) 1092 - device->resync_wenr = enr; 1080 + if (bm_ext) { 1081 + if (throttle) { 1082 + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1083 + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1084 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1085 + device->resync_wenr = LC_FREE; 1086 + if (lc_put(device->resync, &bm_ext->lce) == 0) { 1087 + bm_ext->flags = 0; 1088 + device->resync_locked--; 1089 + } 1090 + wake_up(&device->al_wait); 1091 + } else 1092 + device->resync_wenr = enr; 1093 + } 1093 1094 spin_unlock_irq(&device->al_lock); 1094 1095 return -EAGAIN; 1095 1096 }
+3 -1
drivers/block/drbd/drbd_int.h
··· 797 797 unsigned int al_writ_cnt; 798 798 unsigned int bm_writ_cnt; 799 799 atomic_t ap_bio_cnt; /* Requests we need to complete */ 800 + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ 800 801 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ 801 802 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 802 803 atomic_t unacked_cnt; /* Need to send replies for */ ··· 1455 1454 extern int drbd_receiver(struct drbd_thread *thi); 1456 1455 extern int drbd_asender(struct drbd_thread *thi); 1457 1456 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1458 - extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); 1457 + extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 1458 + bool throttle_if_app_is_waiting); 1459 1459 extern int drbd_submit_peer_request(struct drbd_device *, 1460 1460 struct drbd_peer_request *, const unsigned, 1461 1461 const int);
+1
drivers/block/drbd/drbd_main.c
··· 1909 1909 drbd_set_defaults(device); 1910 1910 1911 1911 atomic_set(&device->ap_bio_cnt, 0); 1912 + atomic_set(&device->ap_actlog_cnt, 0); 1912 1913 atomic_set(&device->ap_pending_cnt, 0); 1913 1914 atomic_set(&device->rs_pending_cnt, 0); 1914 1915 atomic_set(&device->unacked_cnt, 0);
+3
drivers/block/drbd/drbd_proc.c
··· 335 335 lc_seq_printf_stats(seq, device->act_log); 336 336 put_ldev(device); 337 337 } 338 + 339 + if (proc_details >= 2) 340 + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); 338 341 } 339 342 rcu_read_unlock(); 340 343
+12 -7
drivers/block/drbd/drbd_receiver.c
··· 2417 2417 * The current sync rate used here uses only the most recent two step marks, 2418 2418 * to have a short time average so we can react faster. 2419 2419 */ 2420 - bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) 2420 + bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 2421 + bool throttle_if_app_is_waiting) 2421 2422 { 2422 2423 struct lc_element *tmp; 2423 - bool throttle = true; 2424 + bool throttle = drbd_rs_c_min_rate_throttle(device); 2424 2425 2425 - if (!drbd_rs_c_min_rate_throttle(device)) 2426 - return false; 2426 + if (!throttle || throttle_if_app_is_waiting) 2427 + return throttle; 2427 2428 2428 2429 spin_lock_irq(&device->al_lock); 2429 2430 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); ··· 2432 2431 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2433 2432 if (test_bit(BME_PRIORITY, &bm_ext->flags)) 2434 2433 throttle = false; 2435 - /* Do not slow down if app IO is already waiting for this extent */ 2434 + /* Do not slow down if app IO is already waiting for this extent, 2435 + * and our progress is necessary for application IO to complete. */ 2436 2436 } 2437 2437 spin_unlock_irq(&device->al_lock); 2438 2438 ··· 2458 2456 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2459 2457 (int)part_stat_read(&disk->part0, sectors[1]) - 2460 2458 atomic_read(&device->rs_sect_ev); 2461 - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { 2459 + 2460 + if (atomic_read(&device->ap_actlog_cnt) 2461 + || !device->rs_last_events || curr_events - device->rs_last_events > 64) { 2462 2462 unsigned long rs_left; 2463 2463 int i; 2464 2464 ··· 2650 2646 * we would also throttle its application reads. 2651 2647 * In that case, throttling is done on the SyncTarget only. 
2652 2648 */ 2653 - if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector)) 2649 + if (device->state.peer != R_PRIMARY 2650 + && drbd_rs_should_slow_down(device, sector, false)) 2654 2651 schedule_timeout_uninterruptible(HZ/10); 2655 2652 if (drbd_rs_begin_io(device, sector)) 2656 2653 goto out_free_e;
+4
drivers/block/drbd/drbd_req.c
··· 1218 1218 if (rw == WRITE && req->private_bio && req->i.size 1219 1219 && !test_bit(AL_SUSPENDED, &device->flags)) { 1220 1220 if (!drbd_al_begin_io_fastpath(device, &req->i)) { 1221 + atomic_inc(&device->ap_actlog_cnt); 1221 1222 drbd_queue_write(device, req); 1222 1223 return NULL; 1223 1224 } ··· 1355 1354 1356 1355 req->rq_state |= RQ_IN_ACT_LOG; 1357 1356 req->in_actlog_jif = jiffies; 1357 + atomic_dec(&device->ap_actlog_cnt); 1358 1358 } 1359 1359 1360 1360 list_del_init(&req->tl_requests); ··· 1441 1439 list_for_each_entry_safe(req, tmp, &pending, tl_requests) { 1442 1440 req->rq_state |= RQ_IN_ACT_LOG; 1443 1441 req->in_actlog_jif = jiffies; 1442 + atomic_dec(&device->ap_actlog_cnt); 1444 1443 list_del_init(&req->tl_requests); 1445 1444 drbd_send_and_submit(device, req); 1446 1445 } ··· 1457 1454 if (!was_cold) { 1458 1455 req->rq_state |= RQ_IN_ACT_LOG; 1459 1456 req->in_actlog_jif = jiffies; 1457 + atomic_dec(&device->ap_actlog_cnt); 1460 1458 /* Corresponding extent was hot after all? */ 1461 1459 drbd_send_and_submit(device, req); 1462 1460 } else {
+2 -7
drivers/block/drbd/drbd_worker.c
··· 395 395 if (!get_ldev(device)) 396 396 return -EIO; 397 397 398 - if (drbd_rs_should_slow_down(device, sector)) 399 - goto defer; 400 - 401 398 /* GFP_TRY, because if there is no memory available right now, this may 402 399 * be rescheduled for later. It is "only" background resync, after all. */ 403 400 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, ··· 648 651 649 652 sector = BM_BIT_TO_SECT(bit); 650 653 651 - if (drbd_rs_should_slow_down(device, sector) || 652 - drbd_try_rs_begin_io(device, sector)) { 654 + if (drbd_try_rs_begin_io(device, sector)) { 653 655 device->bm_resync_fo = bit; 654 656 goto requeue; 655 657 } ··· 779 783 780 784 size = BM_BLOCK_SIZE; 781 785 782 - if (drbd_rs_should_slow_down(device, sector) || 783 - drbd_try_rs_begin_io(device, sector)) { 786 + if (drbd_try_rs_begin_io(device, sector)) { 784 787 device->ov_position = sector; 785 788 goto requeue; 786 789 }