Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: resync should only lock out specific ranges

During resync, if we need to block some specific incoming write because
of active resync requests to that same range, we potentially cause
*all* new application writes (to "cold" activity log extents) to block
until this one request has been processed.

Improve the do_submit() logic to
* grab all incoming requests to some "incoming" list
* process this list
- move aside requests that are blocked by resync
- prepare activity log transactions,
- commit transactions and submit corresponding requests
- if there are remaining requests that only wait for
activity log extents to become free, stop the fast path
(mark activity log as "starving")
- iterate until no more requests are waiting for the activity log,
but all potentially remaining requests are only blocked by resync
* only then grab new incoming requests

That way, very busy IO on currently "hot" activity log extents cannot
starve scattered IO to "cold" extents. And blocked-by-resync requests
are processed once resync traffic on the affected region has ceased,
without blocking anything else.

The only blocking mode left is when we cannot start requests to "cold"
extents because all currently "hot" extents are actually used.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

authored by

Lars Ellenberg and committed by
Philipp Reisner
f5b90b6b cc356f85

+98 -53
+13 -2
drivers/block/drbd/drbd_actlog.c
··· 357 357 /* We want all necessary updates for a given request within the same transaction 358 358 * We could first check how many updates are *actually* needed, 359 359 * and use that instead of the worst-case nr_al_extents */ 360 - if (available_update_slots < nr_al_extents) 361 - return -EWOULDBLOCK; 360 + if (available_update_slots < nr_al_extents) { 361 + /* Too many activity log extents are currently "hot". 362 + * 363 + * If we have accumulated pending changes already, 364 + * we made progress. 365 + * 366 + * If we cannot get even a single pending change through, 367 + * stop the fast path until we made some progress, 368 + * or requests to "cold" extents could be starved. */ 369 + if (!al->pending_changes) 370 + __set_bit(__LC_STARVING, &device->act_log->flags); 371 + return -ENOBUFS; 372 + } 362 373 363 374 /* Is resync active in this area? */ 364 375 for (enr = first; enr <= last; enr++) {
+85 -51
drivers/block/drbd/drbd_req.c
··· 1182 1182 &device->pending_master_completion[1 /* WRITE */]); 1183 1183 spin_unlock_irq(&device->resource->req_lock); 1184 1184 queue_work(device->submit.wq, &device->submit.worker); 1185 + /* do_submit() may sleep internally on al_wait, too */ 1186 + wake_up(&device->al_wait); 1185 1187 } 1186 1188 1187 1189 /* returns the new drbd_request pointer, if the caller is expected to ··· 1367 1365 1368 1366 static bool prepare_al_transaction_nonblock(struct drbd_device *device, 1369 1367 struct list_head *incoming, 1370 - struct list_head *pending) 1368 + struct list_head *pending, 1369 + struct list_head *later) 1371 1370 { 1372 1371 struct drbd_request *req, *tmp; 1373 1372 int wake = 0; ··· 1377 1374 spin_lock_irq(&device->al_lock); 1378 1375 list_for_each_entry_safe(req, tmp, incoming, tl_requests) { 1379 1376 err = drbd_al_begin_io_nonblock(device, &req->i); 1377 + if (err == -ENOBUFS) 1378 + break; 1380 1379 if (err == -EBUSY) 1381 1380 wake = 1; 1382 1381 if (err) 1383 - continue; 1384 - list_move_tail(&req->tl_requests, pending); 1382 + list_move_tail(&req->tl_requests, later); 1383 + else 1384 + list_move_tail(&req->tl_requests, pending); 1385 1385 } 1386 1386 spin_unlock_irq(&device->al_lock); 1387 1387 if (wake) 1388 1388 wake_up(&device->al_wait); 1389 - 1390 1389 return !list_empty(pending); 1390 + } 1391 + 1392 + void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) 1393 + { 1394 + struct drbd_request *req, *tmp; 1395 + 1396 + list_for_each_entry_safe(req, tmp, pending, tl_requests) { 1397 + req->rq_state |= RQ_IN_ACT_LOG; 1398 + req->in_actlog_jif = jiffies; 1399 + atomic_dec(&device->ap_actlog_cnt); 1400 + list_del_init(&req->tl_requests); 1401 + drbd_send_and_submit(device, req); 1402 + } 1391 1403 } 1392 1404 1393 1405 void do_submit(struct work_struct *ws) 1394 1406 { 1395 1407 struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); 1396 - LIST_HEAD(incoming); 1397 - LIST_HEAD(pending); 1398 - 
struct drbd_request *req, *tmp; 1408 + LIST_HEAD(incoming); /* from drbd_make_request() */ 1409 + LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ 1410 + LIST_HEAD(busy); /* blocked by resync requests */ 1411 + 1412 + /* grab new incoming requests */ 1413 + spin_lock_irq(&device->resource->req_lock); 1414 + list_splice_tail_init(&device->submit.writes, &incoming); 1415 + spin_unlock_irq(&device->resource->req_lock); 1399 1416 1400 1417 for (;;) { 1401 - spin_lock_irq(&device->resource->req_lock); 1402 - list_splice_tail_init(&device->submit.writes, &incoming); 1403 - spin_unlock_irq(&device->resource->req_lock); 1418 + DEFINE_WAIT(wait); 1404 1419 1420 + /* move used-to-be-busy back to front of incoming */ 1421 + list_splice_init(&busy, &incoming); 1405 1422 submit_fast_path(device, &incoming); 1406 1423 if (list_empty(&incoming)) 1407 1424 break; 1408 1425 1409 - skip_fast_path: 1410 - wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); 1411 - /* Maybe more was queued, while we prepared the transaction? 1412 - * Try to stuff them into this transaction as well. 1413 - * Be strictly non-blocking here, no wait_event, we already 1414 - * have something to commit. 1415 - * Stop if we don't make any more progres. 1416 - */ 1417 1426 for (;;) { 1427 + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); 1428 + 1429 + list_splice_init(&busy, &incoming); 1430 + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); 1431 + if (!list_empty(&pending)) 1432 + break; 1433 + 1434 + schedule(); 1435 + 1436 + /* If all currently "hot" activity log extents are kept busy by 1437 + * incoming requests, we still must not totally starve new 1438 + * requests to "cold" extents. 1439 + * Something left on &incoming means there had not been 1440 + * enough update slots available, and the activity log 1441 + * has been marked as "starving". 
1442 + * 1443 + * Try again now, without looking for new requests, 1444 + * effectively blocking all new requests until we made 1445 + * at least _some_ progress with what we currently have. 1446 + */ 1447 + if (!list_empty(&incoming)) 1448 + continue; 1449 + 1450 + /* Nothing moved to pending, but nothing left 1451 + * on incoming: all moved to busy! 1452 + * Grab new and iterate. */ 1453 + spin_lock_irq(&device->resource->req_lock); 1454 + list_splice_tail_init(&device->submit.writes, &incoming); 1455 + spin_unlock_irq(&device->resource->req_lock); 1456 + } 1457 + finish_wait(&device->al_wait, &wait); 1458 + 1459 + /* If the transaction was full, before all incoming requests 1460 + * had been processed, skip ahead to commit, and iterate 1461 + * without splicing in more incoming requests from upper layers. 1462 + * 1463 + * Else, if all incoming have been processed, 1464 + * they have become either "pending" (to be submitted after 1465 + * next transaction commit) or "busy" (blocked by resync). 1466 + * 1467 + * Maybe more was queued, while we prepared the transaction? 1468 + * Try to stuff those into this transaction as well. 1469 + * Be strictly non-blocking here, 1470 + * we already have something to commit. 1471 + * 1472 + * Commit if we don't make any more progres. 
1473 + */ 1474 + 1475 + while (list_empty(&incoming)) { 1418 1476 LIST_HEAD(more_pending); 1419 1477 LIST_HEAD(more_incoming); 1420 1478 bool made_progress; ··· 1492 1428 if (list_empty(&more_incoming)) 1493 1429 break; 1494 1430 1495 - made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending); 1431 + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); 1496 1432 1497 1433 list_splice_tail_init(&more_pending, &pending); 1498 1434 list_splice_tail_init(&more_incoming, &incoming); 1499 - 1500 1435 if (!made_progress) 1501 1436 break; 1502 1437 } 1438 + 1503 1439 drbd_al_begin_io_commit(device); 1504 - 1505 - list_for_each_entry_safe(req, tmp, &pending, tl_requests) { 1506 - req->rq_state |= RQ_IN_ACT_LOG; 1507 - req->in_actlog_jif = jiffies; 1508 - atomic_dec(&device->ap_actlog_cnt); 1509 - list_del_init(&req->tl_requests); 1510 - drbd_send_and_submit(device, req); 1511 - } 1512 - 1513 - /* If all currently hot activity log extents are kept busy by 1514 - * incoming requests, we still must not totally starve new 1515 - * requests to cold extents. In that case, prepare one request 1516 - * in blocking mode. */ 1517 - list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { 1518 - bool was_cold; 1519 - list_del_init(&req->tl_requests); 1520 - was_cold = drbd_al_begin_io_prepare(device, &req->i); 1521 - if (!was_cold) { 1522 - req->rq_state |= RQ_IN_ACT_LOG; 1523 - req->in_actlog_jif = jiffies; 1524 - atomic_dec(&device->ap_actlog_cnt); 1525 - /* Corresponding extent was hot after all? */ 1526 - drbd_send_and_submit(device, req); 1527 - } else { 1528 - /* Found a request to a cold extent. 1529 - * Put on "pending" list, 1530 - * and try to cumulate with more. */ 1531 - list_add(&req->tl_requests, &pending); 1532 - goto skip_fast_path; 1533 - } 1534 - } 1440 + send_and_submit_pending(device, &pending); 1535 1441 } 1536 1442 } 1537 1443