Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: introduce P_ZEROES (REQ_OP_WRITE_ZEROES on the "wire")

This also re-enables the "zero-out unaligned partial chunks + discard aligned chunks" strategy.

With the introduction of REQ_OP_WRITE_ZEROES,
we started to use that for both WRITE_ZEROES and DISCARDS,
hoping that WRITE_ZEROES would "do what we want",
UNMAP if possible, zero-out the rest.

The example scenario is some LVM "thin" backend.

While an un-allocated block on dm-thin reads as zeroes, on a dm-thin
with "skip_block_zeroing=true", after a partial block write allocated
that block, that same block may well map "undefined old garbage" from
the backends on LBAs that have not yet been written to.

If we cannot distinguish between zero-out and discard on the receiving
side, to avoid "undefined old garbage" to pop up randomly at later times
on supposedly zero-initialized blocks, we'd need to map all discards to
zero-out on the receiving side. But that would potentially do a full
alloc on thinly provisioned backends, even when the expectation was to
unmap/trim/discard/de-allocate.

We need to distinguish on the protocol level, whether we need to guarantee
zeroes (and thus use zero-out, potentially doing the mentioned full-alloc),
or if we want to put the emphasis on discard, and only do a "best effort
zeroing" (by "discarding" blocks aligned to discard-granularity, and zeroing
only potential unaligned head and tail clippings to at least *try* to
avoid "false positives" in an online-verify later), hoping that someone
set skip_block_zeroing=false.

For some discussion regarding this on dm-devel, see also
https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html

For backward compatibility, P_TRIM means zero-out, unless the
DRBD_FF_WZEROES feature flag is agreed upon during handshake.

To have upper layers even try to submit WRITE ZEROES requests,
we need to announce "efficient zeroout" independently.

We need to fixup max_write_zeroes_sectors after blk_queue_stack_limits():
if we can handle "zeroes" efficiently on the protocol,
we want to do that, even if our backend does not announce
max_write_zeroes_sectors itself.

Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Lars Ellenberg; committed by Jens Axboe.
f31e583a 9848b6dd

+252 -31
+2
drivers/block/drbd/drbd_debugfs.c
··· 237 237 seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); 238 238 seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); 239 239 seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); 240 + seq_print_rq_state_bit(m, f & EE_TRIM, &sep, "trim"); 241 + seq_print_rq_state_bit(m, f & EE_ZEROOUT, &sep, "zero-out"); 240 242 seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same"); 241 243 seq_putc(m, '\n'); 242 244 }
+9 -2
drivers/block/drbd/drbd_int.h
··· 430 430 __EE_MAY_SET_IN_SYNC, 431 431 432 432 /* is this a TRIM aka REQ_OP_DISCARD? */ 433 - __EE_IS_TRIM, 433 + __EE_TRIM, 434 + /* explicit zero-out requested, or 435 + * our lower level cannot handle trim, 436 + * and we want to fall back to zeroout instead */ 437 + __EE_ZEROOUT, 434 438 435 439 /* In case a barrier failed, 436 440 * we need to resubmit without the barrier flag. */ ··· 476 472 }; 477 473 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 478 474 #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 479 - #define EE_IS_TRIM (1<<__EE_IS_TRIM) 475 + #define EE_TRIM (1<<__EE_TRIM) 476 + #define EE_ZEROOUT (1<<__EE_ZEROOUT) 480 477 #define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 481 478 #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 482 479 #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) ··· 1561 1556 extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); 1562 1557 1563 1558 /* drbd_receiver.c */ 1559 + extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, 1560 + sector_t start, unsigned int nr_sectors, int flags); 1564 1561 extern int drbd_receiver(struct drbd_thread *thi); 1565 1562 extern int drbd_ack_receiver(struct drbd_thread *thi); 1566 1563 extern void drbd_send_ping_wf(struct work_struct *ws);
+8 -3
drivers/block/drbd/drbd_main.c
··· 1668 1668 (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) | 1669 1669 (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | 1670 1670 (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) | 1671 - (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0); 1671 + (bio_op(bio) == REQ_OP_WRITE_ZEROES ? 1672 + ((connection->agreed_features & DRBD_FF_WZEROES) ? 1673 + (DP_ZEROES |(!(bio->bi_opf & REQ_NOUNMAP) ? DP_DISCARD : 0)) 1674 + : DP_DISCARD) 1675 + : 0); 1672 1676 else 1673 1677 return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0; 1674 1678 } ··· 1716 1712 } 1717 1713 p->dp_flags = cpu_to_be32(dp_flags); 1718 1714 1719 - if (dp_flags & DP_DISCARD) { 1715 + if (dp_flags & (DP_DISCARD|DP_ZEROES)) { 1716 + enum drbd_packet cmd = (dp_flags & DP_ZEROES) ? P_ZEROES : P_TRIM; 1720 1717 struct p_trim *t = (struct p_trim*)p; 1721 1718 t->size = cpu_to_be32(req->i.size); 1722 - err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); 1719 + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*t), NULL, 0); 1723 1720 goto out; 1724 1721 } 1725 1722 if (dp_flags & DP_WSAME) {
+16
drivers/block/drbd/drbd_nl.c
··· 1261 1261 } 1262 1262 } 1263 1263 1264 + static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) 1265 + { 1266 + /* Fixup max_write_zeroes_sectors after blk_queue_stack_limits(): 1267 + * if we can handle "zeroes" efficiently on the protocol, 1268 + * we want to do that, even if our backend does not announce 1269 + * max_write_zeroes_sectors itself. */ 1270 + struct drbd_connection *connection = first_peer_device(device)->connection; 1271 + /* If the peer announces WZEROES support, use it. Otherwise, rather 1272 + * send explicit zeroes than rely on some discard-zeroes-data magic. */ 1273 + if (connection->agreed_features & DRBD_FF_WZEROES) 1274 + q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; 1275 + else 1276 + q->limits.max_write_zeroes_sectors = 0; 1277 + } 1278 + 1264 1279 static void decide_on_write_same_support(struct drbd_device *device, 1265 1280 struct request_queue *q, 1266 1281 struct request_queue *b, struct o_qlim *o, ··· 1386 1371 } 1387 1372 } 1388 1373 fixup_discard_if_not_supported(q); 1374 + fixup_write_zeroes(device, q); 1389 1375 } 1390 1376 1391 1377 void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
+47
drivers/block/drbd/drbd_protocol.h
··· 70 70 * we may fall back to an opencoded loop instead. */ 71 71 P_WSAME = 0x34, 72 72 73 + /* 0x35 already claimed in DRBD 9 */ 74 + P_ZEROES = 0x36, /* data sock: zero-out, WRITE_ZEROES */ 75 + 76 + /* 0x40 .. 0x48 already claimed in DRBD 9 */ 77 + 73 78 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 74 79 P_MAX_OPT_CMD = 0x101, 75 80 ··· 135 130 #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ 136 131 #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ 137 132 #define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */ 133 + #define DP_ZEROES 1024 /* equiv. REQ_OP_WRITE_ZEROES */ 134 + 135 + /* possible combinations: 136 + * REQ_OP_WRITE_ZEROES: DP_DISCARD | DP_ZEROES 137 + * REQ_OP_WRITE_ZEROES + REQ_NOUNMAP: DP_ZEROES 138 + */ 138 139 139 140 struct p_data { 140 141 u64 sector; /* 64 bits sector number */ ··· 207 196 * drbd_send_sizes()/receive_sizes() 208 197 */ 209 198 #define DRBD_FF_WSAME 4 199 + 200 + /* supports REQ_OP_WRITE_ZEROES on the "wire" protocol. 201 + * 202 + * We used to map that to "discard" on the sending side, and if we cannot 203 + * guarantee that discard zeroes data, the receiving side would map discard 204 + * back to zero-out. 205 + * 206 + * With the introduction of REQ_OP_WRITE_ZEROES, 207 + * we started to use that for both WRITE_ZEROES and DISCARDS, 208 + * hoping that WRITE_ZEROES would "do what we want", 209 + * UNMAP if possible, zero-out the rest. 210 + * 211 + * The example scenario is some LVM "thin" backend. 212 + * 213 + * While an un-allocated block on dm-thin reads as zeroes, on a dm-thin 214 + * with "skip_block_zeroing=true", after a partial block write allocated 215 + * that block, that same block may well map "undefined old garbage" from 216 + * the backends on LBAs that have not yet been written to. 
217 + * 218 + * If we cannot distinguish between zero-out and discard on the receiving 219 + * side, to avoid "undefined old garbage" to pop up randomly at later times 220 + * on supposedly zero-initialized blocks, we'd need to map all discards to 221 + * zero-out on the receiving side. But that would potentially do a full 222 + * alloc on thinly provisioned backends, even when the expectation was to 223 + * unmap/trim/discard/de-allocate. 224 + * 225 + * We need to distinguish on the protocol level, whether we need to guarantee 226 + * zeroes (and thus use zero-out, potentially doing the mentioned full-alloc), 227 + * or if we want to put the emphasis on discard, and only do a "best effort 228 + * zeroing" (by "discarding" blocks aligned to discard-granularity, and zeroing 229 + * only potential unaligned head and tail clippings), to at least *try* to 230 + * avoid "false positives" in an online-verify later, hoping that someone 231 + * set skip_block_zeroing=false. 232 + */ 233 + #define DRBD_FF_WZEROES 8 234 + 210 235 211 236 struct p_connection_features { 212 237 u32 protocol_min;
+156 -15
drivers/block/drbd/drbd_receiver.c
··· 50 50 #include "drbd_req.h" 51 51 #include "drbd_vli.h" 52 52 53 - #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) 53 + #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) 54 54 55 55 struct packet_info { 56 56 enum drbd_packet cmd; ··· 1490 1490 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1491 1491 } 1492 1492 1493 - static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) 1493 + /* 1494 + * Mapping "discard" to ZEROOUT with UNMAP does not work for us: 1495 + * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it 1496 + * will directly go to fallback mode, submitting normal writes, and 1497 + * never even try to UNMAP. 1498 + * 1499 + * And dm-thin does not do this (yet), mostly because in general it has 1500 + * to assume that "skip_block_zeroing" is set. See also: 1501 + * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html 1502 + * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html 1503 + * 1504 + * We *may* ignore the discard-zeroes-data setting, if so configured. 1505 + * 1506 + * Assumption is that this "discard_zeroes_data=0" is only because the backend 1507 + * may ignore partial unaligned discards. 1508 + * 1509 + * LVM/DM thin as of at least 1510 + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) 1511 + * Library version: 1.02.93-RHEL7 (2015-01-28) 1512 + * Driver version: 4.29.0 1513 + * still behaves this way. 1514 + * 1515 + * For unaligned (wrt. alignment and granularity) or too small discards, 1516 + * we zero-out the initial (and/or) trailing unaligned partial chunks, 1517 + * but discard all the aligned full chunks. 1518 + * 1519 + * At least for LVM/DM thin, with skip_block_zeroing=false, 1520 + * the result is effectively "discard_zeroes_data=1". 
1521 + */ 1522 + /* flags: EE_TRIM|EE_ZEROOUT */ 1523 + int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) 1494 1524 { 1495 1525 struct block_device *bdev = device->ldev->backing_bdev; 1526 + struct request_queue *q = bdev_get_queue(bdev); 1527 + sector_t tmp, nr; 1528 + unsigned int max_discard_sectors, granularity; 1529 + int alignment; 1530 + int err = 0; 1496 1531 1497 - if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, 1498 - GFP_NOIO, 0)) 1532 + if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) 1533 + goto zero_out; 1534 + 1535 + /* Zero-sector (unknown) and one-sector granularities are the same. */ 1536 + granularity = max(q->limits.discard_granularity >> 9, 1U); 1537 + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; 1538 + 1539 + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); 1540 + max_discard_sectors -= max_discard_sectors % granularity; 1541 + if (unlikely(!max_discard_sectors)) 1542 + goto zero_out; 1543 + 1544 + if (nr_sectors < granularity) 1545 + goto zero_out; 1546 + 1547 + tmp = start; 1548 + if (sector_div(tmp, granularity) != alignment) { 1549 + if (nr_sectors < 2*granularity) 1550 + goto zero_out; 1551 + /* start + gran - (start + gran - align) % gran */ 1552 + tmp = start + granularity - alignment; 1553 + tmp = start + granularity - sector_div(tmp, granularity); 1554 + 1555 + nr = tmp - start; 1556 + /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many 1557 + * layers are below us, some may have smaller granularity */ 1558 + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); 1559 + nr_sectors -= nr; 1560 + start = tmp; 1561 + } 1562 + while (nr_sectors >= max_discard_sectors) { 1563 + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); 1564 + nr_sectors -= max_discard_sectors; 1565 + start += max_discard_sectors; 1566 + } 1567 + if (nr_sectors) { 1568 + /* max_discard_sectors is 
unsigned int (and a multiple of 1569 + * granularity, we made sure of that above already); 1570 + * nr is < max_discard_sectors; 1571 + * I don't need sector_div here, even though nr is sector_t */ 1572 + nr = nr_sectors; 1573 + nr -= (unsigned int)nr % granularity; 1574 + if (nr) { 1575 + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); 1576 + nr_sectors -= nr; 1577 + start += nr; 1578 + } 1579 + } 1580 + zero_out: 1581 + if (nr_sectors) { 1582 + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 1583 + (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); 1584 + } 1585 + return err != 0; 1586 + } 1587 + 1588 + static bool can_do_reliable_discards(struct drbd_device *device) 1589 + { 1590 + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); 1591 + struct disk_conf *dc; 1592 + bool can_do; 1593 + 1594 + if (!blk_queue_discard(q)) 1595 + return false; 1596 + 1597 + rcu_read_lock(); 1598 + dc = rcu_dereference(device->ldev->disk_conf); 1599 + can_do = dc->discard_zeroes_if_aligned; 1600 + rcu_read_unlock(); 1601 + return can_do; 1602 + } 1603 + 1604 + static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) 1605 + { 1606 + /* If the backend cannot discard, or does not guarantee 1607 + * read-back zeroes in discarded ranges, we fall back to 1608 + * zero-out. Unless configuration specifically requested 1609 + * otherwise. */ 1610 + if (!can_do_reliable_discards(device)) 1611 + peer_req->flags |= EE_ZEROOUT; 1612 + 1613 + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, 1614 + peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) 1499 1615 peer_req->flags |= EE_WAS_ERROR; 1500 - 1501 1616 drbd_endio_write_sec_final(peer_req); 1502 1617 } 1503 1618 ··· 1665 1550 * Correctness first, performance later. Next step is to code an 1666 1551 * asynchronous variant of the same. 
1667 1552 */ 1668 - if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { 1553 + if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { 1669 1554 /* wait for all pending IO completions, before we start 1670 1555 * zeroing things out. */ 1671 1556 conn_wait_active_ee_empty(peer_req->peer_device->connection); ··· 1682 1567 spin_unlock_irq(&device->resource->req_lock); 1683 1568 } 1684 1569 1685 - if (peer_req->flags & EE_IS_TRIM) 1686 - drbd_issue_peer_discard(device, peer_req); 1570 + if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) 1571 + drbd_issue_peer_discard_or_zero_out(device, peer_req); 1687 1572 else /* EE_WRITE_SAME */ 1688 1573 drbd_issue_peer_wsame(device, peer_req); 1689 1574 return 0; ··· 1880 1765 void *dig_vv = peer_device->connection->int_dig_vv; 1881 1766 unsigned long *data; 1882 1767 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; 1768 + struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; 1883 1769 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; 1884 1770 1885 1771 digest_size = 0; ··· 1902 1786 if (!expect(data_size == 0)) 1903 1787 return NULL; 1904 1788 ds = be32_to_cpu(trim->size); 1789 + } else if (zeroes) { 1790 + if (!expect(data_size == 0)) 1791 + return NULL; 1792 + ds = be32_to_cpu(zeroes->size); 1905 1793 } else if (wsame) { 1906 1794 if (data_size != queue_logical_block_size(device->rq_queue)) { 1907 1795 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", ··· 1922 1802 1923 1803 if (!expect(IS_ALIGNED(ds, 512))) 1924 1804 return NULL; 1925 - if (trim || wsame) { 1805 + if (trim || wsame || zeroes) { 1926 1806 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) 1927 1807 return NULL; 1928 1808 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) ··· 1947 1827 1948 1828 peer_req->flags |= EE_WRITE; 1949 1829 if (trim) { 1950 - peer_req->flags |= EE_IS_TRIM; 1830 + peer_req->flags |= EE_TRIM; 1831 + return peer_req; 1832 + } 1833 + if (zeroes) { 1834 + peer_req->flags |= EE_ZEROOUT; 1951 
1835 return peer_req; 1952 1836 } 1953 1837 if (wsame) ··· 2450 2326 2451 2327 static unsigned long wire_flags_to_bio_op(u32 dpf) 2452 2328 { 2453 - if (dpf & DP_DISCARD) 2329 + if (dpf & DP_ZEROES) 2454 2330 return REQ_OP_WRITE_ZEROES; 2331 + if (dpf & DP_DISCARD) 2332 + return REQ_OP_DISCARD; 2333 + if (dpf & DP_WSAME) 2334 + return REQ_OP_WRITE_SAME; 2455 2335 else 2456 2336 return REQ_OP_WRITE; 2457 2337 } ··· 2646 2518 op_flags = wire_flags_to_bio_flags(dp_flags); 2647 2519 if (pi->cmd == P_TRIM) { 2648 2520 D_ASSERT(peer_device, peer_req->i.size > 0); 2521 + D_ASSERT(peer_device, op == REQ_OP_DISCARD); 2522 + D_ASSERT(peer_device, peer_req->pages == NULL); 2523 + /* need to play safe: an older DRBD sender 2524 + * may mean zero-out while sending P_TRIM. */ 2525 + if (0 == (connection->agreed_features & DRBD_FF_WZEROES)) 2526 + peer_req->flags |= EE_ZEROOUT; 2527 + } else if (pi->cmd == P_ZEROES) { 2528 + D_ASSERT(peer_device, peer_req->i.size > 0); 2649 2529 D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); 2650 2530 D_ASSERT(peer_device, peer_req->pages == NULL); 2531 + /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ 2532 + if (dp_flags & DP_DISCARD) 2533 + peer_req->flags |= EE_TRIM; 2651 2534 } else if (peer_req->pages == NULL) { 2652 2535 D_ASSERT(device, peer_req->i.size == 0); 2653 2536 D_ASSERT(device, dp_flags & DP_FLUSH); ··· 2726 2587 * we wait for all pending requests, respectively wait for 2727 2588 * active_ee to become empty in drbd_submit_peer_request(); 2728 2589 * better not add ourselves here. 
*/ 2729 - if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) 2590 + if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0) 2730 2591 list_add_tail(&peer_req->w.list, &device->active_ee); 2731 2592 spin_unlock_irq(&device->resource->req_lock); 2732 2593 ··· 5032 4893 5033 4894 peer_req->w.cb = e_end_resync_block; 5034 4895 peer_req->submit_jif = jiffies; 5035 - peer_req->flags |= EE_IS_TRIM; 4896 + peer_req->flags |= EE_TRIM; 5036 4897 5037 4898 spin_lock_irq(&device->resource->req_lock); 5038 4899 list_add_tail(&peer_req->w.list, &device->sync_ee); ··· 5100 4961 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 5101 4962 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 5102 4963 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, 4964 + [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, 5103 4965 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, 5104 4966 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, 5105 4967 }; ··· 5385 5245 drbd_info(connection, "Handshake successful: " 5386 5246 "Agreed network protocol version %d\n", connection->agreed_pro_version); 5387 5247 5388 - drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", 5248 + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", 5389 5249 connection->agreed_features, 5390 5250 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", 5391 5251 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", 5392 - connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : 5252 + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", 5253 + connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : 5393 5254 connection->agreed_features ? "" : " none"); 5394 5255 5395 5256 return 1;
+10 -9
drivers/block/drbd/drbd_req.c
··· 63 63 drbd_req_make_private_bio(req, bio_src); 64 64 req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) 65 65 | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) 66 - | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0) 66 + | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) 67 67 | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); 68 68 req->device = device; 69 69 req->master_bio = bio_src; ··· 1155 1155 return remote; 1156 1156 } 1157 1157 1158 - static void drbd_process_discard_req(struct drbd_request *req) 1158 + static void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags) 1159 1159 { 1160 - struct block_device *bdev = req->device->ldev->backing_bdev; 1161 - 1162 - if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9, 1163 - GFP_NOIO, 0)) 1160 + int err = drbd_issue_discard_or_zero_out(req->device, 1161 + req->i.sector, req->i.size >> 9, flags); 1162 + if (err) 1164 1163 req->private_bio->bi_status = BLK_STS_IOERR; 1165 1164 bio_endio(req->private_bio); 1166 1165 } ··· 1188 1189 if (get_ldev(device)) { 1189 1190 if (drbd_insert_fault(device, type)) 1190 1191 bio_io_error(bio); 1191 - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES || 1192 - bio_op(bio) == REQ_OP_DISCARD) 1193 - drbd_process_discard_req(req); 1192 + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) 1193 + drbd_process_discard_or_zeroes_req(req, EE_ZEROOUT | 1194 + ((bio->bi_opf & REQ_NOUNMAP) ? 0 : EE_TRIM)); 1195 + else if (bio_op(bio) == REQ_OP_DISCARD) 1196 + drbd_process_discard_or_zeroes_req(req, EE_TRIM); 1194 1197 else 1195 1198 generic_make_request(bio); 1196 1199 put_ldev(device);
+2
drivers/block/drbd/drbd_req.h
··· 208 208 __RQ_WRITE, 209 209 __RQ_WSAME, 210 210 __RQ_UNMAP, 211 + __RQ_ZEROES, 211 212 212 213 /* Should call drbd_al_complete_io() for this request... */ 213 214 __RQ_IN_ACT_LOG, ··· 254 253 #define RQ_WRITE (1UL << __RQ_WRITE) 255 254 #define RQ_WSAME (1UL << __RQ_WSAME) 256 255 #define RQ_UNMAP (1UL << __RQ_UNMAP) 256 + #define RQ_ZEROES (1UL << __RQ_ZEROES) 257 257 #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) 258 258 #define RQ_UNPLUG (1UL << __RQ_UNPLUG) 259 259 #define RQ_POSTPONED (1UL << __RQ_POSTPONED)
+1 -1
drivers/block/drbd/drbd_worker.c
··· 153 153 do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); 154 154 155 155 /* FIXME do we want to detach for failed REQ_OP_DISCARD? 156 - * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ 156 + * ((peer_req->flags & (EE_WAS_ERROR|EE_TRIM)) == EE_WAS_ERROR) */ 157 157 if (peer_req->flags & EE_WAS_ERROR) 158 158 __drbd_chk_io_error(device, DRBD_WRITE_ERROR); 159 159
+1 -1
include/linux/drbd.h
··· 51 51 #endif 52 52 53 53 extern const char *drbd_buildtag(void); 54 - #define REL_VERSION "8.4.10" 54 + #define REL_VERSION "8.4.11" 55 55 #define API_VERSION 1 56 56 #define PRO_VERSION_MIN 86 57 57 #define PRO_VERSION_MAX 101