Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drbd: application writes may set-in-sync in protocol != C

If "dirty" blocks are written to during resync,
those writes bring them in sync.

By explicitly requesting write-acks during resync, even in protocol != C,
we can now actually respect this.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

authored by

Lars Ellenberg and committed by
Philipp Reisner
08d0dabf 5d0b17f1

+49 -31
+3 -1
drivers/block/drbd/drbd_interval.h
··· 10 10 unsigned int size; /* size in bytes */ 11 11 sector_t end; /* highest interval end in subtree */ 12 12 int local:1 /* local or remote request? */; 13 - int waiting:1; 13 + int waiting:1; /* someone is waiting for this to complete */ 14 + int completed:1; /* this has been completed already; 15 + * ignore for conflict detection */ 14 16 }; 15 17 16 18 static inline void drbd_clear_interval(struct drbd_interval *i)
+4 -1
drivers/block/drbd/drbd_main.c
··· 1639 1639 if (peer_device->connection->agreed_pro_version >= 100) { 1640 1640 if (req->rq_state & RQ_EXP_RECEIVE_ACK) 1641 1641 dp_flags |= DP_SEND_RECEIVE_ACK; 1642 - if (req->rq_state & RQ_EXP_WRITE_ACK) 1642 + /* During resync, request an explicit write ack, 1643 + * even in protocol != C */ 1644 + if (req->rq_state & RQ_EXP_WRITE_ACK 1645 + || (dp_flags & DP_MAY_SET_IN_SYNC)) 1643 1646 dp_flags |= DP_SEND_WRITE_ACK; 1644 1647 } 1645 1648 p->dp_flags = cpu_to_be32(dp_flags);
+3
drivers/block/drbd/drbd_receiver.c
··· 1930 1930 } 1931 1931 dec_unacked(device); 1932 1932 } 1933 + 1933 1934 /* we delete from the conflict detection hash _after_ we sent out the 1934 1935 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 1935 1936 if (peer_req->flags & EE_IN_INTERVAL_TREE) { ··· 2156 2155 repeat: 2157 2156 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2158 2157 if (i == &peer_req->i) 2158 + continue; 2159 + if (i->completed) 2159 2160 continue; 2160 2161 2161 2162 if (!i->local) {
+39 -29
drivers/block/drbd/drbd_req.c
··· 92 92 return req; 93 93 } 94 94 95 + static void drbd_remove_request_interval(struct rb_root *root, 96 + struct drbd_request *req) 97 + { 98 + struct drbd_device *device = req->device; 99 + struct drbd_interval *i = &req->i; 100 + 101 + drbd_remove_interval(root, i); 102 + 103 + /* Wake up any processes waiting for this request to complete. */ 104 + if (i->waiting) 105 + wake_up(&device->misc_wait); 106 + } 107 + 95 108 void drbd_req_destroy(struct kref *kref) 96 109 { 97 110 struct drbd_request *req = container_of(kref, struct drbd_request, kref); ··· 127 114 * initialized in drbd_req_new(), so we can list_del() it 128 115 * here unconditionally */ 129 116 list_del_init(&req->tl_requests); 117 + 118 + /* finally remove the request from the conflict detection 119 + * respective block_id verification interval tree. */ 120 + if (!drbd_interval_empty(&req->i)) { 121 + struct rb_root *root; 122 + 123 + if (s & RQ_WRITE) 124 + root = &device->write_requests; 125 + else 126 + root = &device->read_requests; 127 + drbd_remove_request_interval(root, req); 128 + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) 129 + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", 130 + s, (unsigned long long)req->i.sector, req->i.size); 130 131 131 132 /* if it was a write, we may have to set the corresponding 132 133 * bit(s) out-of-sync first. If it had a local part, we need to ··· 215 188 } 216 189 217 190 218 - static void drbd_remove_request_interval(struct rb_root *root, 219 - struct drbd_request *req) 220 - { 221 - struct drbd_device *device = req->device; 222 - struct drbd_interval *i = &req->i; 223 - 224 - drbd_remove_interval(root, i); 225 - 226 - /* Wake up any processes waiting for this request to complete. */ 227 - if (i->waiting) 228 - wake_up(&device->misc_wait); 229 - } 230 - 231 191 /* Helper for __req_mod(). 
232 192 * Set m->bio to the master bio, if it is fit to be completed, 233 193 * or leave it alone (it is initialized to NULL in __req_mod), ··· 268 254 ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); 269 255 error = PTR_ERR(req->private_bio); 270 256 271 - /* remove the request from the conflict detection 272 - * respective block_id verification hash */ 273 - if (!drbd_interval_empty(&req->i)) { 274 - struct rb_root *root; 275 - 276 - if (rw == WRITE) 277 - root = &device->write_requests; 278 - else 279 - root = &device->read_requests; 280 - drbd_remove_request_interval(root, req); 281 - } 282 - 283 257 /* Before we can signal completion to the upper layers, 284 258 * we may need to close the current transfer log epoch. 285 259 * We are within the request lock, so we can simply compare ··· 303 301 m->error = ok ? 0 : (error ?: -EIO); 304 302 m->bio = req->master_bio; 305 303 req->master_bio = NULL; 304 + /* We leave it in the tree, to be able to verify later 305 + * write-acks in protocol != C during resync. 306 + * But we mark it as "complete", so it won't be counted as 307 + * conflict in a multi-primary setup. */ 308 + req->i.completed = true; 306 309 } 310 + 311 + if (req->i.waiting) 312 + wake_up(&device->misc_wait); 307 313 } 308 314 309 315 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) ··· 670 660 case WRITE_ACKED_BY_PEER_AND_SIS: 671 661 req->rq_state |= RQ_NET_SIS; 672 662 case WRITE_ACKED_BY_PEER: 673 - D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); 674 - /* protocol C; successfully written on peer. 663 + /* Normal operation protocol C: successfully written on peer. 664 + * During resync, even in protocol != C, 665 + * we requested an explicit write ack anyways. 666 + * Which means we cannot even assert anything here. 675 667 * Nothing more to do here. 676 668 * We want to keep the tl in place for all protocols, to cater 677 669 * for volatile write-back caches on lower level devices. 
*/ 678 - 679 670 goto ack_common; 680 671 case RECV_ACKED_BY_PEER: 681 672 D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); ··· 684 673 * see also notes above in HANDED_OVER_TO_NETWORK about 685 674 * protocol != C */ 686 675 ack_common: 687 - D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 688 676 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 689 677 break; 690 678