drivers/block/drbd/drbd_receiver.c at v6.2-rc7

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / block / drbd / drbd_receiver.c
at v6.2-rc7 6125 lines 181 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3   drbd_receiver.c
   4
   5   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   6
   7   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   8   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   9   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  10
  11 */
  12
  13
  14#include <linux/module.h>
  15
  16#include <linux/uaccess.h>
  17#include <net/sock.h>
  18
  19#include <linux/drbd.h>
  20#include <linux/fs.h>
  21#include <linux/file.h>
  22#include <linux/in.h>
  23#include <linux/mm.h>
  24#include <linux/memcontrol.h>
  25#include <linux/mm_inline.h>
  26#include <linux/slab.h>
  27#include <uapi/linux/sched/types.h>
  28#include <linux/sched/signal.h>
  29#include <linux/pkt_sched.h>
  30#define __KERNEL_SYSCALLS__
  31#include <linux/unistd.h>
  32#include <linux/vmalloc.h>
  33#include <linux/random.h>
  34#include <linux/string.h>
  35#include <linux/scatterlist.h>
  36#include <linux/part_stat.h>
  37#include "drbd_int.h"
  38#include "drbd_protocol.h"
  39#include "drbd_req.h"
  40#include "drbd_vli.h"
  41
  42#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
  43
  44struct packet_info {
  45	enum drbd_packet cmd;
  46	unsigned int size;
  47	unsigned int vnr;
  48	void *data;
  49};
  50
  51enum finish_epoch {
  52	FE_STILL_LIVE,
  53	FE_DESTROYED,
  54	FE_RECYCLED,
  55};
  56
  57static int drbd_do_features(struct drbd_connection *connection);
  58static int drbd_do_auth(struct drbd_connection *connection);
  59static int drbd_disconnected(struct drbd_peer_device *);
  60static void conn_wait_active_ee_empty(struct drbd_connection *connection);
  61static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
  62static int e_end_block(struct drbd_work *, int);
  63
  64
  65#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
  66
  67/*
  68 * some helper functions to deal with single linked page lists,
  69 * page->private being our "next" pointer.
  70 */
  71
  72/* If at least n pages are linked at head, get n pages off.
  73 * Otherwise, don't modify head, and return NULL.
  74 * Locking is the responsibility of the caller.
  75 */
  76static struct page *page_chain_del(struct page **head, int n)
  77{
  78	struct page *page;
  79	struct page *tmp;
  80
  81	BUG_ON(!n);
  82	BUG_ON(!head);
  83
  84	page = *head;
  85
  86	if (!page)
  87		return NULL;
  88
  89	while (page) {
  90		tmp = page_chain_next(page);
  91		if (--n == 0)
  92			break; /* found sufficient pages */
  93		if (tmp == NULL)
  94			/* insufficient pages, don't use any of them. */
  95			return NULL;
  96		page = tmp;
  97	}
  98
  99	/* add end of list marker for the returned list */
 100	set_page_private(page, 0);
 101	/* actual return value, and adjustment of head */
 102	page = *head;
 103	*head = tmp;
 104	return page;
 105}
 106
 107/* may be used outside of locks to find the tail of a (usually short)
 108 * "private" page chain, before adding it back to a global chain head
 109 * with page_chain_add() under a spinlock. */
 110static struct page *page_chain_tail(struct page *page, int *len)
 111{
 112	struct page *tmp;
 113	int i = 1;
 114	while ((tmp = page_chain_next(page))) {
 115		++i;
 116		page = tmp;
 117	}
 118	if (len)
 119		*len = i;
 120	return page;
 121}
 122
 123static int page_chain_free(struct page *page)
 124{
 125	struct page *tmp;
 126	int i = 0;
 127	page_chain_for_each_safe(page, tmp) {
 128		put_page(page);
 129		++i;
 130	}
 131	return i;
 132}
 133
 134static void page_chain_add(struct page **head,
 135		struct page *chain_first, struct page *chain_last)
 136{
 137#if 1
 138	struct page *tmp;
 139	tmp = page_chain_tail(chain_first, NULL);
 140	BUG_ON(tmp != chain_last);
 141#endif
 142
 143	/* add chain to head */
 144	set_page_private(chain_last, (unsigned long)*head);
 145	*head = chain_first;
 146}
 147
 148static struct page *__drbd_alloc_pages(struct drbd_device *device,
 149				       unsigned int number)
 150{
 151	struct page *page = NULL;
 152	struct page *tmp = NULL;
 153	unsigned int i = 0;
 154
 155	/* Yes, testing drbd_pp_vacant outside the lock is racy.
 156	 * So what. It saves a spin_lock. */
 157	if (drbd_pp_vacant >= number) {
 158		spin_lock(&drbd_pp_lock);
 159		page = page_chain_del(&drbd_pp_pool, number);
 160		if (page)
 161			drbd_pp_vacant -= number;
 162		spin_unlock(&drbd_pp_lock);
 163		if (page)
 164			return page;
 165	}
 166
 167	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 168	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 169	 * which in turn might block on the other node at this very place.  */
 170	for (i = 0; i < number; i++) {
 171		tmp = alloc_page(GFP_TRY);
 172		if (!tmp)
 173			break;
 174		set_page_private(tmp, (unsigned long)page);
 175		page = tmp;
 176	}
 177
 178	if (i == number)
 179		return page;
 180
 181	/* Not enough pages immediately available this time.
 182	 * No need to jump around here, drbd_alloc_pages will retry this
 183	 * function "soon". */
 184	if (page) {
 185		tmp = page_chain_tail(page, NULL);
 186		spin_lock(&drbd_pp_lock);
 187		page_chain_add(&drbd_pp_pool, page, tmp);
 188		drbd_pp_vacant += i;
 189		spin_unlock(&drbd_pp_lock);
 190	}
 191	return NULL;
 192}
 193
 194static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
 195					   struct list_head *to_be_freed)
 196{
 197	struct drbd_peer_request *peer_req, *tmp;
 198
 199	/* The EEs are always appended to the end of the list. Since
 200	   they are sent in order over the wire, they have to finish
 201	   in order. As soon as we see the first not finished we can
 202	   stop to examine the list... */
 203
 204	list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
 205		if (drbd_peer_req_has_active_page(peer_req))
 206			break;
 207		list_move(&peer_req->w.list, to_be_freed);
 208	}
 209}
 210
 211static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
 212{
 213	LIST_HEAD(reclaimed);
 214	struct drbd_peer_request *peer_req, *t;
 215
 216	spin_lock_irq(&device->resource->req_lock);
 217	reclaim_finished_net_peer_reqs(device, &reclaimed);
 218	spin_unlock_irq(&device->resource->req_lock);
 219	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 220		drbd_free_net_peer_req(device, peer_req);
 221}
 222
 223static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
 224{
 225	struct drbd_peer_device *peer_device;
 226	int vnr;
 227
 228	rcu_read_lock();
 229	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 230		struct drbd_device *device = peer_device->device;
 231		if (!atomic_read(&device->pp_in_use_by_net))
 232			continue;
 233
 234		kref_get(&device->kref);
 235		rcu_read_unlock();
 236		drbd_reclaim_net_peer_reqs(device);
 237		kref_put(&device->kref, drbd_destroy_device);
 238		rcu_read_lock();
 239	}
 240	rcu_read_unlock();
 241}
 242
 243/**
 244 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 245 * @peer_device:	DRBD device.
 246 * @number:		number of pages requested
 247 * @retry:		whether to retry, if not enough pages are available right now
 248 *
 249 * Tries to allocate number pages, first from our own page pool, then from
 250 * the kernel.
 251 * Possibly retry until DRBD frees sufficient pages somewhere else.
 252 *
 253 * If this allocation would exceed the max_buffers setting, we throttle
 254 * allocation (schedule_timeout) to give the system some room to breathe.
 255 *
 256 * We do not use max-buffers as hard limit, because it could lead to
 257 * congestion and further to a distributed deadlock during online-verify or
 258 * (checksum based) resync, if the max-buffers, socket buffer sizes and
 259 * resync-rate settings are mis-configured.
 260 *
 261 * Returns a page chain linked via page->private.
 262 */
 263struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
 264			      bool retry)
 265{
 266	struct drbd_device *device = peer_device->device;
 267	struct page *page = NULL;
 268	struct net_conf *nc;
 269	DEFINE_WAIT(wait);
 270	unsigned int mxb;
 271
 272	rcu_read_lock();
 273	nc = rcu_dereference(peer_device->connection->net_conf);
 274	mxb = nc ? nc->max_buffers : 1000000;
 275	rcu_read_unlock();
 276
 277	if (atomic_read(&device->pp_in_use) < mxb)
 278		page = __drbd_alloc_pages(device, number);
 279
 280	/* Try to keep the fast path fast, but occasionally we need
 281	 * to reclaim the pages we lended to the network stack. */
 282	if (page && atomic_read(&device->pp_in_use_by_net) > 512)
 283		drbd_reclaim_net_peer_reqs(device);
 284
 285	while (page == NULL) {
 286		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 287
 288		drbd_reclaim_net_peer_reqs(device);
 289
 290		if (atomic_read(&device->pp_in_use) < mxb) {
 291			page = __drbd_alloc_pages(device, number);
 292			if (page)
 293				break;
 294		}
 295
 296		if (!retry)
 297			break;
 298
 299		if (signal_pending(current)) {
 300			drbd_warn(device, "drbd_alloc_pages interrupted!\n");
 301			break;
 302		}
 303
 304		if (schedule_timeout(HZ/10) == 0)
 305			mxb = UINT_MAX;
 306	}
 307	finish_wait(&drbd_pp_wait, &wait);
 308
 309	if (page)
 310		atomic_add(number, &device->pp_in_use);
 311	return page;
 312}
 313
 314/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 315 * Is also used from inside an other spin_lock_irq(&resource->req_lock);
 316 * Either links the page chain back to the global pool,
 317 * or returns all pages to the system. */
 318static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
 319{
 320	atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
 321	int i;
 322
 323	if (page == NULL)
 324		return;
 325
 326	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
 327		i = page_chain_free(page);
 328	else {
 329		struct page *tmp;
 330		tmp = page_chain_tail(page, &i);
 331		spin_lock(&drbd_pp_lock);
 332		page_chain_add(&drbd_pp_pool, page, tmp);
 333		drbd_pp_vacant += i;
 334		spin_unlock(&drbd_pp_lock);
 335	}
 336	i = atomic_sub_return(i, a);
 337	if (i < 0)
 338		drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
 339			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
 340	wake_up(&drbd_pp_wait);
 341}
 342
 343/*
 344You need to hold the req_lock:
 345 _drbd_wait_ee_list_empty()
 346
 347You must not have the req_lock:
 348 drbd_free_peer_req()
 349 drbd_alloc_peer_req()
 350 drbd_free_peer_reqs()
 351 drbd_ee_fix_bhs()
 352 drbd_finish_peer_reqs()
 353 drbd_clear_done_ee()
 354 drbd_wait_ee_list_empty()
 355*/
 356
 357/* normal: payload_size == request size (bi_size)
 358 * w_same: payload_size == logical_block_size
 359 * trim: payload_size == 0 */
 360struct drbd_peer_request *
 361drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 362		    unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
 363{
 364	struct drbd_device *device = peer_device->device;
 365	struct drbd_peer_request *peer_req;
 366	struct page *page = NULL;
 367	unsigned int nr_pages = PFN_UP(payload_size);
 368
 369	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 370		return NULL;
 371
 372	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 373	if (!peer_req) {
 374		if (!(gfp_mask & __GFP_NOWARN))
 375			drbd_err(device, "%s: allocation failed\n", __func__);
 376		return NULL;
 377	}
 378
 379	if (nr_pages) {
 380		page = drbd_alloc_pages(peer_device, nr_pages,
 381					gfpflags_allow_blocking(gfp_mask));
 382		if (!page)
 383			goto fail;
 384	}
 385
 386	memset(peer_req, 0, sizeof(*peer_req));
 387	INIT_LIST_HEAD(&peer_req->w.list);
 388	drbd_clear_interval(&peer_req->i);
 389	peer_req->i.size = request_size;
 390	peer_req->i.sector = sector;
 391	peer_req->submit_jif = jiffies;
 392	peer_req->peer_device = peer_device;
 393	peer_req->pages = page;
 394	/*
 395	 * The block_id is opaque to the receiver.  It is not endianness
 396	 * converted, and sent back to the sender unchanged.
 397	 */
 398	peer_req->block_id = id;
 399
 400	return peer_req;
 401
 402 fail:
 403	mempool_free(peer_req, &drbd_ee_mempool);
 404	return NULL;
 405}
 406
 407void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 408		       int is_net)
 409{
 410	might_sleep();
 411	if (peer_req->flags & EE_HAS_DIGEST)
 412		kfree(peer_req->digest);
 413	drbd_free_pages(device, peer_req->pages, is_net);
 414	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 415	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 416	if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
 417		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 418		drbd_al_complete_io(device, &peer_req->i);
 419	}
 420	mempool_free(peer_req, &drbd_ee_mempool);
 421}
 422
 423int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
 424{
 425	LIST_HEAD(work_list);
 426	struct drbd_peer_request *peer_req, *t;
 427	int count = 0;
 428	int is_net = list == &device->net_ee;
 429
 430	spin_lock_irq(&device->resource->req_lock);
 431	list_splice_init(list, &work_list);
 432	spin_unlock_irq(&device->resource->req_lock);
 433
 434	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 435		__drbd_free_peer_req(device, peer_req, is_net);
 436		count++;
 437	}
 438	return count;
 439}
 440
 441/*
 442 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 443 */
 444static int drbd_finish_peer_reqs(struct drbd_device *device)
 445{
 446	LIST_HEAD(work_list);
 447	LIST_HEAD(reclaimed);
 448	struct drbd_peer_request *peer_req, *t;
 449	int err = 0;
 450
 451	spin_lock_irq(&device->resource->req_lock);
 452	reclaim_finished_net_peer_reqs(device, &reclaimed);
 453	list_splice_init(&device->done_ee, &work_list);
 454	spin_unlock_irq(&device->resource->req_lock);
 455
 456	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 457		drbd_free_net_peer_req(device, peer_req);
 458
 459	/* possible callbacks here:
 460	 * e_end_block, and e_end_resync_block, e_send_superseded.
 461	 * all ignore the last argument.
 462	 */
 463	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 464		int err2;
 465
 466		/* list_del not necessary, next/prev members not touched */
 467		err2 = peer_req->w.cb(&peer_req->w, !!err);
 468		if (!err)
 469			err = err2;
 470		drbd_free_peer_req(device, peer_req);
 471	}
 472	wake_up(&device->ee_wait);
 473
 474	return err;
 475}
 476
 477static void _drbd_wait_ee_list_empty(struct drbd_device *device,
 478				     struct list_head *head)
 479{
 480	DEFINE_WAIT(wait);
 481
 482	/* avoids spin_lock/unlock
 483	 * and calling prepare_to_wait in the fast path */
 484	while (!list_empty(head)) {
 485		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 486		spin_unlock_irq(&device->resource->req_lock);
 487		io_schedule();
 488		finish_wait(&device->ee_wait, &wait);
 489		spin_lock_irq(&device->resource->req_lock);
 490	}
 491}
 492
 493static void drbd_wait_ee_list_empty(struct drbd_device *device,
 494				    struct list_head *head)
 495{
 496	spin_lock_irq(&device->resource->req_lock);
 497	_drbd_wait_ee_list_empty(device, head);
 498	spin_unlock_irq(&device->resource->req_lock);
 499}
 500
 501static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
 502{
 503	struct kvec iov = {
 504		.iov_base = buf,
 505		.iov_len = size,
 506	};
 507	struct msghdr msg = {
 508		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 509	};
 510	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size);
 511	return sock_recvmsg(sock, &msg, msg.msg_flags);
 512}
 513
 514static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
 515{
 516	int rv;
 517
 518	rv = drbd_recv_short(connection->data.socket, buf, size, 0);
 519
 520	if (rv < 0) {
 521		if (rv == -ECONNRESET)
 522			drbd_info(connection, "sock was reset by peer\n");
 523		else if (rv != -ERESTARTSYS)
 524			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
 525	} else if (rv == 0) {
 526		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
 527			long t;
 528			rcu_read_lock();
 529			t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
 530			rcu_read_unlock();
 531
 532			t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
 533
 534			if (t)
 535				goto out;
 536		}
 537		drbd_info(connection, "sock was shut down by peer\n");
 538	}
 539
 540	if (rv != size)
 541		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
 542
 543out:
 544	return rv;
 545}
 546
 547static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
 548{
 549	int err;
 550
 551	err = drbd_recv(connection, buf, size);
 552	if (err != size) {
 553		if (err >= 0)
 554			err = -EIO;
 555	} else
 556		err = 0;
 557	return err;
 558}
 559
 560static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
 561{
 562	int err;
 563
 564	err = drbd_recv_all(connection, buf, size);
 565	if (err && !signal_pending(current))
 566		drbd_warn(connection, "short read (expected size %d)\n", (int)size);
 567	return err;
 568}
 569
 570/* quoting tcp(7):
 571 *   On individual connections, the socket buffer size must be set prior to the
 572 *   listen(2) or connect(2) calls in order to have it take effect.
 573 * This is our wrapper to do so.
 574 */
 575static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 576		unsigned int rcv)
 577{
 578	/* open coded SO_SNDBUF, SO_RCVBUF */
 579	if (snd) {
 580		sock->sk->sk_sndbuf = snd;
 581		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 582	}
 583	if (rcv) {
 584		sock->sk->sk_rcvbuf = rcv;
 585		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 586	}
 587}
 588
 589static struct socket *drbd_try_connect(struct drbd_connection *connection)
 590{
 591	const char *what;
 592	struct socket *sock;
 593	struct sockaddr_in6 src_in6;
 594	struct sockaddr_in6 peer_in6;
 595	struct net_conf *nc;
 596	int err, peer_addr_len, my_addr_len;
 597	int sndbuf_size, rcvbuf_size, connect_int;
 598	int disconnect_on_error = 1;
 599
 600	rcu_read_lock();
 601	nc = rcu_dereference(connection->net_conf);
 602	if (!nc) {
 603		rcu_read_unlock();
 604		return NULL;
 605	}
 606	sndbuf_size = nc->sndbuf_size;
 607	rcvbuf_size = nc->rcvbuf_size;
 608	connect_int = nc->connect_int;
 609	rcu_read_unlock();
 610
 611	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
 612	memcpy(&src_in6, &connection->my_addr, my_addr_len);
 613
 614	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
 615		src_in6.sin6_port = 0;
 616	else
 617		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 618
 619	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
 620	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
 621
 622	what = "sock_create_kern";
 623	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
 624			       SOCK_STREAM, IPPROTO_TCP, &sock);
 625	if (err < 0) {
 626		sock = NULL;
 627		goto out;
 628	}
 629
 630	sock->sk->sk_rcvtimeo =
 631	sock->sk->sk_sndtimeo = connect_int * HZ;
 632	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
 633
 634       /* explicitly bind to the configured IP as source IP
 635	*  for the outgoing connections.
 636	*  This is needed for multihomed hosts and to be
 637	*  able to use lo: interfaces for drbd.
 638	* Make sure to use 0 as port number, so linux selects
 639	*  a free one dynamically.
 640	*/
 641	what = "bind before connect";
 642	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
 643	if (err < 0)
 644		goto out;
 645
 646	/* connect may fail, peer not yet available.
 647	 * stay C_WF_CONNECTION, don't go Disconnecting! */
 648	disconnect_on_error = 0;
 649	what = "connect";
 650	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
 651
 652out:
 653	if (err < 0) {
 654		if (sock) {
 655			sock_release(sock);
 656			sock = NULL;
 657		}
 658		switch (-err) {
 659			/* timeout, busy, signal pending */
 660		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
 661		case EINTR: case ERESTARTSYS:
 662			/* peer not (yet) available, network problem */
 663		case ECONNREFUSED: case ENETUNREACH:
 664		case EHOSTDOWN:    case EHOSTUNREACH:
 665			disconnect_on_error = 0;
 666			break;
 667		default:
 668			drbd_err(connection, "%s failed, err = %d\n", what, err);
 669		}
 670		if (disconnect_on_error)
 671			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 672	}
 673
 674	return sock;
 675}
 676
 677struct accept_wait_data {
 678	struct drbd_connection *connection;
 679	struct socket *s_listen;
 680	struct completion door_bell;
 681	void (*original_sk_state_change)(struct sock *sk);
 682
 683};
 684
 685static void drbd_incoming_connection(struct sock *sk)
 686{
 687	struct accept_wait_data *ad = sk->sk_user_data;
 688	void (*state_change)(struct sock *sk);
 689
 690	state_change = ad->original_sk_state_change;
 691	if (sk->sk_state == TCP_ESTABLISHED)
 692		complete(&ad->door_bell);
 693	state_change(sk);
 694}
 695
 696static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
 697{
 698	int err, sndbuf_size, rcvbuf_size, my_addr_len;
 699	struct sockaddr_in6 my_addr;
 700	struct socket *s_listen;
 701	struct net_conf *nc;
 702	const char *what;
 703
 704	rcu_read_lock();
 705	nc = rcu_dereference(connection->net_conf);
 706	if (!nc) {
 707		rcu_read_unlock();
 708		return -EIO;
 709	}
 710	sndbuf_size = nc->sndbuf_size;
 711	rcvbuf_size = nc->rcvbuf_size;
 712	rcu_read_unlock();
 713
 714	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
 715	memcpy(&my_addr, &connection->my_addr, my_addr_len);
 716
 717	what = "sock_create_kern";
 718	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
 719			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
 720	if (err) {
 721		s_listen = NULL;
 722		goto out;
 723	}
 724
 725	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 726	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 727
 728	what = "bind before listen";
 729	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
 730	if (err < 0)
 731		goto out;
 732
 733	ad->s_listen = s_listen;
 734	write_lock_bh(&s_listen->sk->sk_callback_lock);
 735	ad->original_sk_state_change = s_listen->sk->sk_state_change;
 736	s_listen->sk->sk_state_change = drbd_incoming_connection;
 737	s_listen->sk->sk_user_data = ad;
 738	write_unlock_bh(&s_listen->sk->sk_callback_lock);
 739
 740	what = "listen";
 741	err = s_listen->ops->listen(s_listen, 5);
 742	if (err < 0)
 743		goto out;
 744
 745	return 0;
 746out:
 747	if (s_listen)
 748		sock_release(s_listen);
 749	if (err < 0) {
 750		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 751			drbd_err(connection, "%s failed, err = %d\n", what, err);
 752			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 753		}
 754	}
 755
 756	return -EIO;
 757}
 758
 759static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
 760{
 761	write_lock_bh(&sk->sk_callback_lock);
 762	sk->sk_state_change = ad->original_sk_state_change;
 763	sk->sk_user_data = NULL;
 764	write_unlock_bh(&sk->sk_callback_lock);
 765}
 766
 767static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
 768{
 769	int timeo, connect_int, err = 0;
 770	struct socket *s_estab = NULL;
 771	struct net_conf *nc;
 772
 773	rcu_read_lock();
 774	nc = rcu_dereference(connection->net_conf);
 775	if (!nc) {
 776		rcu_read_unlock();
 777		return NULL;
 778	}
 779	connect_int = nc->connect_int;
 780	rcu_read_unlock();
 781
 782	timeo = connect_int * HZ;
 783	/* 28.5% random jitter */
 784	timeo += get_random_u32_below(2) ? timeo / 7 : -timeo / 7;
 785
 786	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
 787	if (err <= 0)
 788		return NULL;
 789
 790	err = kernel_accept(ad->s_listen, &s_estab, 0);
 791	if (err < 0) {
 792		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 793			drbd_err(connection, "accept failed, err = %d\n", err);
 794			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 795		}
 796	}
 797
 798	if (s_estab)
 799		unregister_state_change(s_estab->sk, ad);
 800
 801	return s_estab;
 802}
 803
 804static int decode_header(struct drbd_connection *, void *, struct packet_info *);
 805
 806static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
 807			     enum drbd_packet cmd)
 808{
 809	if (!conn_prepare_command(connection, sock))
 810		return -EIO;
 811	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
 812}
 813
 814static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
 815{
 816	unsigned int header_size = drbd_header_size(connection);
 817	struct packet_info pi;
 818	struct net_conf *nc;
 819	int err;
 820
 821	rcu_read_lock();
 822	nc = rcu_dereference(connection->net_conf);
 823	if (!nc) {
 824		rcu_read_unlock();
 825		return -EIO;
 826	}
 827	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
 828	rcu_read_unlock();
 829
 830	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 831	if (err != header_size) {
 832		if (err >= 0)
 833			err = -EIO;
 834		return err;
 835	}
 836	err = decode_header(connection, connection->data.rbuf, &pi);
 837	if (err)
 838		return err;
 839	return pi.cmd;
 840}
 841
 842/**
 843 * drbd_socket_okay() - Free the socket if its connection is not okay
 844 * @sock:	pointer to the pointer to the socket.
 845 */
 846static bool drbd_socket_okay(struct socket **sock)
 847{
 848	int rr;
 849	char tb[4];
 850
 851	if (!*sock)
 852		return false;
 853
 854	rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 855
 856	if (rr > 0 || rr == -EAGAIN) {
 857		return true;
 858	} else {
 859		sock_release(*sock);
 860		*sock = NULL;
 861		return false;
 862	}
 863}
 864
 865static bool connection_established(struct drbd_connection *connection,
 866				   struct socket **sock1,
 867				   struct socket **sock2)
 868{
 869	struct net_conf *nc;
 870	int timeout;
 871	bool ok;
 872
 873	if (!*sock1 || !*sock2)
 874		return false;
 875
 876	rcu_read_lock();
 877	nc = rcu_dereference(connection->net_conf);
 878	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
 879	rcu_read_unlock();
 880	schedule_timeout_interruptible(timeout);
 881
 882	ok = drbd_socket_okay(sock1);
 883	ok = drbd_socket_okay(sock2) && ok;
 884
 885	return ok;
 886}
 887
 888/* Gets called if a connection is established, or if a new minor gets created
 889   in a connection */
 890int drbd_connected(struct drbd_peer_device *peer_device)
 891{
 892	struct drbd_device *device = peer_device->device;
 893	int err;
 894
 895	atomic_set(&device->packet_seq, 0);
 896	device->peer_seq = 0;
 897
 898	device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
 899		&peer_device->connection->cstate_mutex :
 900		&device->own_state_mutex;
 901
 902	err = drbd_send_sync_param(peer_device);
 903	if (!err)
 904		err = drbd_send_sizes(peer_device, 0, 0);
 905	if (!err)
 906		err = drbd_send_uuids(peer_device);
 907	if (!err)
 908		err = drbd_send_current_state(peer_device);
 909	clear_bit(USE_DEGR_WFC_T, &device->flags);
 910	clear_bit(RESIZE_PENDING, &device->flags);
 911	atomic_set(&device->ap_in_flight, 0);
 912	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
 913	return err;
 914}
 915
 916/*
 917 * return values:
 918 *   1 yes, we have a valid connection
 919 *   0 oops, did not work out, please try again
 920 *  -1 peer talks different language,
 921 *     no point in trying again, please go standalone.
 922 *  -2 We do not have a network config...
 923 */
 924static int conn_connect(struct drbd_connection *connection)
 925{
 926	struct drbd_socket sock, msock;
 927	struct drbd_peer_device *peer_device;
 928	struct net_conf *nc;
 929	int vnr, timeout, h;
 930	bool discard_my_data, ok;
 931	enum drbd_state_rv rv;
 932	struct accept_wait_data ad = {
 933		.connection = connection,
 934		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
 935	};
 936
 937	clear_bit(DISCONNECT_SENT, &connection->flags);
 938	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
 939		return -2;
 940
 941	mutex_init(&sock.mutex);
 942	sock.sbuf = connection->data.sbuf;
 943	sock.rbuf = connection->data.rbuf;
 944	sock.socket = NULL;
 945	mutex_init(&msock.mutex);
 946	msock.sbuf = connection->meta.sbuf;
 947	msock.rbuf = connection->meta.rbuf;
 948	msock.socket = NULL;
 949
 950	/* Assume that the peer only understands protocol 80 until we know better.  */
 951	connection->agreed_pro_version = 80;
 952
 953	if (prepare_listen_socket(connection, &ad))
 954		return 0;
 955
 956	do {
 957		struct socket *s;
 958
 959		s = drbd_try_connect(connection);
 960		if (s) {
 961			if (!sock.socket) {
 962				sock.socket = s;
 963				send_first_packet(connection, &sock, P_INITIAL_DATA);
 964			} else if (!msock.socket) {
 965				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
 966				msock.socket = s;
 967				send_first_packet(connection, &msock, P_INITIAL_META);
 968			} else {
 969				drbd_err(connection, "Logic error in conn_connect()\n");
 970				goto out_release_sockets;
 971			}
 972		}
 973
 974		if (connection_established(connection, &sock.socket, &msock.socket))
 975			break;
 976
 977retry:
 978		s = drbd_wait_for_connect(connection, &ad);
 979		if (s) {
 980			int fp = receive_first_packet(connection, s);
 981			drbd_socket_okay(&sock.socket);
 982			drbd_socket_okay(&msock.socket);
 983			switch (fp) {
 984			case P_INITIAL_DATA:
 985				if (sock.socket) {
 986					drbd_warn(connection, "initial packet S crossed\n");
 987					sock_release(sock.socket);
 988					sock.socket = s;
 989					goto randomize;
 990				}
 991				sock.socket = s;
 992				break;
 993			case P_INITIAL_META:
 994				set_bit(RESOLVE_CONFLICTS, &connection->flags);
 995				if (msock.socket) {
 996					drbd_warn(connection, "initial packet M crossed\n");
 997					sock_release(msock.socket);
 998					msock.socket = s;
 999					goto randomize;
1000				}
1001				msock.socket = s;
1002				break;
1003			default:
1004				drbd_warn(connection, "Error receiving initial packet\n");
1005				sock_release(s);
1006randomize:
1007				if (get_random_u32_below(2))
1008					goto retry;
1009			}
1010		}
1011
1012		if (connection->cstate <= C_DISCONNECTING)
1013			goto out_release_sockets;
1014		if (signal_pending(current)) {
1015			flush_signals(current);
1016			smp_rmb();
1017			if (get_t_state(&connection->receiver) == EXITING)
1018				goto out_release_sockets;
1019		}
1020
1021		ok = connection_established(connection, &sock.socket, &msock.socket);
1022	} while (!ok);
1023
1024	if (ad.s_listen)
1025		sock_release(ad.s_listen);
1026
1027	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1028	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1029
1030	sock.socket->sk->sk_allocation = GFP_NOIO;
1031	msock.socket->sk->sk_allocation = GFP_NOIO;
1032
1033	sock.socket->sk->sk_use_task_frag = false;
1034	msock.socket->sk->sk_use_task_frag = false;
1035
1036	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1037	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
1038
1039	/* NOT YET ...
1040	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
1041	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1042	 * first set it to the P_CONNECTION_FEATURES timeout,
1043	 * which we set to 4x the configured ping_timeout. */
1044	rcu_read_lock();
1045	nc = rcu_dereference(connection->net_conf);
1046
1047	sock.socket->sk->sk_sndtimeo =
1048	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
1049
1050	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
1051	timeout = nc->timeout * HZ / 10;
1052	discard_my_data = nc->discard_my_data;
1053	rcu_read_unlock();
1054
1055	msock.socket->sk->sk_sndtimeo = timeout;
1056
1057	/* we don't want delays.
1058	 * we use TCP_CORK where appropriate, though */
1059	tcp_sock_set_nodelay(sock.socket->sk);
1060	tcp_sock_set_nodelay(msock.socket->sk);
1061
1062	connection->data.socket = sock.socket;
1063	connection->meta.socket = msock.socket;
1064	connection->last_received = jiffies;
1065
1066	h = drbd_do_features(connection);
1067	if (h <= 0)
1068		return h;
1069
1070	if (connection->cram_hmac_tfm) {
1071		/* drbd_request_state(device, NS(conn, WFAuth)); */
1072		switch (drbd_do_auth(connection)) {
1073		case -1:
1074			drbd_err(connection, "Authentication of peer failed\n");
1075			return -1;
1076		case 0:
1077			drbd_err(connection, "Authentication of peer failed, trying again.\n");
1078			return 0;
1079		}
1080	}
1081
1082	connection->data.socket->sk->sk_sndtimeo = timeout;
1083	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1084
1085	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
1086		return -1;
1087
1088	/* Prevent a race between resync-handshake and
1089	 * being promoted to Primary.
1090	 *
1091	 * Grab and release the state mutex, so we know that any current
1092	 * drbd_set_role() is finished, and any incoming drbd_set_role
1093	 * will see the STATE_SENT flag, and wait for it to be cleared.
1094	 */
1095	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1096		mutex_lock(peer_device->device->state_mutex);
1097
1098	/* avoid a race with conn_request_state( C_DISCONNECTING ) */
1099	spin_lock_irq(&connection->resource->req_lock);
1100	set_bit(STATE_SENT, &connection->flags);
1101	spin_unlock_irq(&connection->resource->req_lock);
1102
1103	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1104		mutex_unlock(peer_device->device->state_mutex);
1105
1106	rcu_read_lock();
1107	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1108		struct drbd_device *device = peer_device->device;
1109		kref_get(&device->kref);
1110		rcu_read_unlock();
1111
1112		if (discard_my_data)
1113			set_bit(DISCARD_MY_DATA, &device->flags);
1114		else
1115			clear_bit(DISCARD_MY_DATA, &device->flags);
1116
1117		drbd_connected(peer_device);
1118		kref_put(&device->kref, drbd_destroy_device);
1119		rcu_read_lock();
1120	}
1121	rcu_read_unlock();
1122
1123	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1124	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
1125		clear_bit(STATE_SENT, &connection->flags);
1126		return 0;
1127	}
1128
1129	drbd_thread_start(&connection->ack_receiver);
1130	/* opencoded create_singlethread_workqueue(),
1131	 * to be able to use format string arguments */
1132	connection->ack_sender =
1133		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
1134	if (!connection->ack_sender) {
1135		drbd_err(connection, "Failed to create workqueue ack_sender\n");
1136		return 0;
1137	}
1138
1139	mutex_lock(&connection->resource->conf_update);
1140	/* The discard_my_data flag is a single-shot modifier to the next
1141	 * connection attempt, the handshake of which is now well underway.
1142	 * No need for rcu style copying of the whole struct
1143	 * just to clear a single value. */
1144	connection->net_conf->discard_my_data = 0;
1145	mutex_unlock(&connection->resource->conf_update);
1146
1147	return h;
1148
1149out_release_sockets:
1150	if (ad.s_listen)
1151		sock_release(ad.s_listen);
1152	if (sock.socket)
1153		sock_release(sock.socket);
1154	if (msock.socket)
1155		sock_release(msock.socket);
1156	return -1;
1157}
1158
1159static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
1160{
1161	unsigned int header_size = drbd_header_size(connection);
1162
1163	if (header_size == sizeof(struct p_header100) &&
1164	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1165		struct p_header100 *h = header;
1166		if (h->pad != 0) {
1167			drbd_err(connection, "Header padding is not zero\n");
1168			return -EINVAL;
1169		}
1170		pi->vnr = be16_to_cpu(h->volume);
1171		pi->cmd = be16_to_cpu(h->command);
1172		pi->size = be32_to_cpu(h->length);
1173	} else if (header_size == sizeof(struct p_header95) &&
1174		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
1175		struct p_header95 *h = header;
1176		pi->cmd = be16_to_cpu(h->command);
1177		pi->size = be32_to_cpu(h->length);
1178		pi->vnr = 0;
1179	} else if (header_size == sizeof(struct p_header80) &&
1180		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1181		struct p_header80 *h = header;
1182		pi->cmd = be16_to_cpu(h->command);
1183		pi->size = be16_to_cpu(h->length);
1184		pi->vnr = 0;
1185	} else {
1186		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
1187			 be32_to_cpu(*(__be32 *)header),
1188			 connection->agreed_pro_version);
1189		return -EINVAL;
1190	}
1191	pi->data = header + header_size;
1192	return 0;
1193}
1194
1195static void drbd_unplug_all_devices(struct drbd_connection *connection)
1196{
1197	if (current->plug == &connection->receiver_plug) {
1198		blk_finish_plug(&connection->receiver_plug);
1199		blk_start_plug(&connection->receiver_plug);
1200	} /* else: maybe just schedule() ?? */
1201}
1202
1203static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
1204{
1205	void *buffer = connection->data.rbuf;
1206	int err;
1207
1208	err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
1209	if (err)
1210		return err;
1211
1212	err = decode_header(connection, buffer, pi);
1213	connection->last_received = jiffies;
1214
1215	return err;
1216}
1217
1218static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
1219{
1220	void *buffer = connection->data.rbuf;
1221	unsigned int size = drbd_header_size(connection);
1222	int err;
1223
1224	err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
1225	if (err != size) {
1226		/* If we have nothing in the receive buffer now, to reduce
1227		 * application latency, try to drain the backend queues as
1228		 * quickly as possible, and let remote TCP know what we have
1229		 * received so far. */
1230		if (err == -EAGAIN) {
1231			tcp_sock_set_quickack(connection->data.socket->sk, 2);
1232			drbd_unplug_all_devices(connection);
1233		}
1234		if (err > 0) {
1235			buffer += err;
1236			size -= err;
1237		}
1238		err = drbd_recv_all_warn(connection, buffer, size);
1239		if (err)
1240			return err;
1241	}
1242
1243	err = decode_header(connection, connection->data.rbuf, pi);
1244	connection->last_received = jiffies;
1245
1246	return err;
1247}
1248/* This is blkdev_issue_flush, but asynchronous.
1249 * We want to submit to all component volumes in parallel,
1250 * then wait for all completions.
1251 */
1252struct issue_flush_context {
1253	atomic_t pending;
1254	int error;
1255	struct completion done;
1256};
/* Per-bio context, hung off bio->bi_private of a single flush bio. */
struct one_flush_context {
	/* device whose backing device is being flushed */
	struct drbd_device *device;
	/* context shared by all flushes submitted together */
	struct issue_flush_context *ctx;
};
1261
1262static void one_flush_endio(struct bio *bio)
1263{
1264	struct one_flush_context *octx = bio->bi_private;
1265	struct drbd_device *device = octx->device;
1266	struct issue_flush_context *ctx = octx->ctx;
1267
1268	if (bio->bi_status) {
1269		ctx->error = blk_status_to_errno(bio->bi_status);
1270		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
1271	}
1272	kfree(octx);
1273	bio_put(bio);
1274
1275	clear_bit(FLUSH_PENDING, &device->flags);
1276	put_ldev(device);
1277	kref_put(&device->kref, drbd_destroy_device);
1278
1279	if (atomic_dec_and_test(&ctx->pending))
1280		complete(&ctx->done);
1281}
1282
1283static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1284{
1285	struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
1286				    REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
1287	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1288
1289	if (!octx) {
1290		drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n");
1291		/* FIXME: what else can I do now?  disconnecting or detaching
1292		 * really does not help to improve the state of the world, either.
1293		 */
1294		bio_put(bio);
1295
1296		ctx->error = -ENOMEM;
1297		put_ldev(device);
1298		kref_put(&device->kref, drbd_destroy_device);
1299		return;
1300	}
1301
1302	octx->device = device;
1303	octx->ctx = ctx;
1304	bio->bi_private = octx;
1305	bio->bi_end_io = one_flush_endio;
1306
1307	device->flush_jif = jiffies;
1308	set_bit(FLUSH_PENDING, &device->flags);
1309	atomic_inc(&ctx->pending);
1310	submit_bio(bio);
1311}
1312
1313static void drbd_flush(struct drbd_connection *connection)
1314{
1315	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1316		struct drbd_peer_device *peer_device;
1317		struct issue_flush_context ctx;
1318		int vnr;
1319
1320		atomic_set(&ctx.pending, 1);
1321		ctx.error = 0;
1322		init_completion(&ctx.done);
1323
1324		rcu_read_lock();
1325		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1326			struct drbd_device *device = peer_device->device;
1327
1328			if (!get_ldev(device))
1329				continue;
1330			kref_get(&device->kref);
1331			rcu_read_unlock();
1332
1333			submit_one_flush(device, &ctx);
1334
1335			rcu_read_lock();
1336		}
1337		rcu_read_unlock();
1338
1339		/* Do we want to add a timeout,
1340		 * if disk-timeout is set? */
1341		if (!atomic_dec_and_test(&ctx.pending))
1342			wait_for_completion(&ctx.done);
1343
1344		if (ctx.error) {
1345			/* would rather check on EOPNOTSUPP, but that is not reliable.
1346			 * don't try again for ANY return value != 0
1347			 * if (rv == -EOPNOTSUPP) */
1348			/* Any error is already reported by bio_endio callback. */
1349			drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1350		}
1351	}
1352}
1353
1354/**
1355 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1356 * @connection:	DRBD connection.
1357 * @epoch:	Epoch object.
1358 * @ev:		Epoch event.
1359 */
1360static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
1361					       struct drbd_epoch *epoch,
1362					       enum epoch_event ev)
1363{
1364	int epoch_size;
1365	struct drbd_epoch *next_epoch;
1366	enum finish_epoch rv = FE_STILL_LIVE;
1367
1368	spin_lock(&connection->epoch_lock);
1369	do {
1370		next_epoch = NULL;
1371
1372		epoch_size = atomic_read(&epoch->epoch_size);
1373
1374		switch (ev & ~EV_CLEANUP) {
1375		case EV_PUT:
1376			atomic_dec(&epoch->active);
1377			break;
1378		case EV_GOT_BARRIER_NR:
1379			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1380			break;
1381		case EV_BECAME_LAST:
1382			/* nothing to do*/
1383			break;
1384		}
1385
1386		if (epoch_size != 0 &&
1387		    atomic_read(&epoch->active) == 0 &&
1388		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1389			if (!(ev & EV_CLEANUP)) {
1390				spin_unlock(&connection->epoch_lock);
1391				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1392				spin_lock(&connection->epoch_lock);
1393			}
1394#if 0
1395			/* FIXME: dec unacked on connection, once we have
1396			 * something to count pending connection packets in. */
1397			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1398				dec_unacked(epoch->connection);
1399#endif
1400
1401			if (connection->current_epoch != epoch) {
1402				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1403				list_del(&epoch->list);
1404				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1405				connection->epochs--;
1406				kfree(epoch);
1407
1408				if (rv == FE_STILL_LIVE)
1409					rv = FE_DESTROYED;
1410			} else {
1411				epoch->flags = 0;
1412				atomic_set(&epoch->epoch_size, 0);
1413				/* atomic_set(&epoch->active, 0); is already zero */
1414				if (rv == FE_STILL_LIVE)
1415					rv = FE_RECYCLED;
1416			}
1417		}
1418
1419		if (!next_epoch)
1420			break;
1421
1422		epoch = next_epoch;
1423	} while (1);
1424
1425	spin_unlock(&connection->epoch_lock);
1426
1427	return rv;
1428}
1429
1430static enum write_ordering_e
1431max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1432{
1433	struct disk_conf *dc;
1434
1435	dc = rcu_dereference(bdev->disk_conf);
1436
1437	if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1438		wo = WO_DRAIN_IO;
1439	if (wo == WO_DRAIN_IO && !dc->disk_drain)
1440		wo = WO_NONE;
1441
1442	return wo;
1443}
1444
1445/*
1446 * drbd_bump_write_ordering() - Fall back to an other write ordering method
1447 * @wo:		Write ordering method to try.
1448 */
1449void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1450			      enum write_ordering_e wo)
1451{
1452	struct drbd_device *device;
1453	enum write_ordering_e pwo;
1454	int vnr;
1455	static char *write_ordering_str[] = {
1456		[WO_NONE] = "none",
1457		[WO_DRAIN_IO] = "drain",
1458		[WO_BDEV_FLUSH] = "flush",
1459	};
1460
1461	pwo = resource->write_ordering;
1462	if (wo != WO_BDEV_FLUSH)
1463		wo = min(pwo, wo);
1464	rcu_read_lock();
1465	idr_for_each_entry(&resource->devices, device, vnr) {
1466		if (get_ldev(device)) {
1467			wo = max_allowed_wo(device->ldev, wo);
1468			if (device->ldev == bdev)
1469				bdev = NULL;
1470			put_ldev(device);
1471		}
1472	}
1473
1474	if (bdev)
1475		wo = max_allowed_wo(bdev, wo);
1476
1477	rcu_read_unlock();
1478
1479	resource->write_ordering = wo;
1480	if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1481		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1482}
1483
1484/*
1485 * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
1486 * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
1487 * will directly go to fallback mode, submitting normal writes, and
1488 * never even try to UNMAP.
1489 *
1490 * And dm-thin does not do this (yet), mostly because in general it has
1491 * to assume that "skip_block_zeroing" is set.  See also:
1492 * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
1493 * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
1494 *
1495 * We *may* ignore the discard-zeroes-data setting, if so configured.
1496 *
1497 * Assumption is that this "discard_zeroes_data=0" is only because the backend
1498 * may ignore partial unaligned discards.
1499 *
1500 * LVM/DM thin as of at least
1501 *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
1502 *   Library version: 1.02.93-RHEL7 (2015-01-28)
1503 *   Driver version:  4.29.0
1504 * still behaves this way.
1505 *
1506 * For unaligned (wrt. alignment and granularity) or too small discards,
1507 * we zero-out the initial (and/or) trailing unaligned partial chunks,
1508 * but discard all the aligned full chunks.
1509 *
1510 * At least for LVM/DM thin, with skip_block_zeroing=false,
1511 * the result is effectively "discard_zeroes_data=1".
1512 */
1513/* flags: EE_TRIM|EE_ZEROOUT */
1514int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
1515{
1516	struct block_device *bdev = device->ldev->backing_bdev;
1517	sector_t tmp, nr;
1518	unsigned int max_discard_sectors, granularity;
1519	int alignment;
1520	int err = 0;
1521
1522	if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
1523		goto zero_out;
1524
1525	/* Zero-sector (unknown) and one-sector granularities are the same.  */
1526	granularity = max(bdev_discard_granularity(bdev) >> 9, 1U);
1527	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1528
1529	max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22));
1530	max_discard_sectors -= max_discard_sectors % granularity;
1531	if (unlikely(!max_discard_sectors))
1532		goto zero_out;
1533
1534	if (nr_sectors < granularity)
1535		goto zero_out;
1536
1537	tmp = start;
1538	if (sector_div(tmp, granularity) != alignment) {
1539		if (nr_sectors < 2*granularity)
1540			goto zero_out;
1541		/* start + gran - (start + gran - align) % gran */
1542		tmp = start + granularity - alignment;
1543		tmp = start + granularity - sector_div(tmp, granularity);
1544
1545		nr = tmp - start;
1546		/* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
1547		 * layers are below us, some may have smaller granularity */
1548		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
1549		nr_sectors -= nr;
1550		start = tmp;
1551	}
1552	while (nr_sectors >= max_discard_sectors) {
1553		err |= blkdev_issue_discard(bdev, start, max_discard_sectors,
1554					    GFP_NOIO);
1555		nr_sectors -= max_discard_sectors;
1556		start += max_discard_sectors;
1557	}
1558	if (nr_sectors) {
1559		/* max_discard_sectors is unsigned int (and a multiple of
1560		 * granularity, we made sure of that above already);
1561		 * nr is < max_discard_sectors;
1562		 * I don't need sector_div here, even though nr is sector_t */
1563		nr = nr_sectors;
1564		nr -= (unsigned int)nr % granularity;
1565		if (nr) {
1566			err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO);
1567			nr_sectors -= nr;
1568			start += nr;
1569		}
1570	}
1571 zero_out:
1572	if (nr_sectors) {
1573		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
1574				(flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
1575	}
1576	return err != 0;
1577}
1578
1579static bool can_do_reliable_discards(struct drbd_device *device)
1580{
1581	struct disk_conf *dc;
1582	bool can_do;
1583
1584	if (!bdev_max_discard_sectors(device->ldev->backing_bdev))
1585		return false;
1586
1587	rcu_read_lock();
1588	dc = rcu_dereference(device->ldev->disk_conf);
1589	can_do = dc->discard_zeroes_if_aligned;
1590	rcu_read_unlock();
1591	return can_do;
1592}
1593
1594static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
1595{
1596	/* If the backend cannot discard, or does not guarantee
1597	 * read-back zeroes in discarded ranges, we fall back to
1598	 * zero-out.  Unless configuration specifically requested
1599	 * otherwise. */
1600	if (!can_do_reliable_discards(device))
1601		peer_req->flags |= EE_ZEROOUT;
1602
1603	if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1604	    peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
1605		peer_req->flags |= EE_WAS_ERROR;
1606	drbd_endio_write_sec_final(peer_req);
1607}
1608
1609static int peer_request_fault_type(struct drbd_peer_request *peer_req)
1610{
1611	if (peer_req_op(peer_req) == REQ_OP_READ) {
1612		return peer_req->flags & EE_APPLICATION ?
1613			DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
1614	} else {
1615		return peer_req->flags & EE_APPLICATION ?
1616			DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
1617	}
1618}
1619
1620/**
1621 * drbd_submit_peer_request()
1622 * @peer_req:	peer request
1623 *
1624 * May spread the pages to multiple bios,
1625 * depending on bio_add_page restrictions.
1626 *
1627 * Returns 0 if all bios have been submitted,
1628 * -ENOMEM if we could not allocate enough bios,
1629 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1630 *  single page to an empty bio (which should never happen and likely indicates
1631 *  that the lower level IO stack is in some way broken). This has been observed
1632 *  on certain Xen deployments.
1633 */
1634/* TODO allocate from our own bio_set. */
1635int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
1636{
1637	struct drbd_device *device = peer_req->peer_device->device;
1638	struct bio *bios = NULL;
1639	struct bio *bio;
1640	struct page *page = peer_req->pages;
1641	sector_t sector = peer_req->i.sector;
1642	unsigned int data_size = peer_req->i.size;
1643	unsigned int n_bios = 0;
1644	unsigned int nr_pages = PFN_UP(data_size);
1645
1646	/* TRIM/DISCARD: for now, always use the helper function
1647	 * blkdev_issue_zeroout(..., discard=true).
1648	 * It's synchronous, but it does the right thing wrt. bio splitting.
1649	 * Correctness first, performance later.  Next step is to code an
1650	 * asynchronous variant of the same.
1651	 */
1652	if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) {
1653		/* wait for all pending IO completions, before we start
1654		 * zeroing things out. */
1655		conn_wait_active_ee_empty(peer_req->peer_device->connection);
1656		/* add it to the active list now,
1657		 * so we can find it to present it in debugfs */
1658		peer_req->submit_jif = jiffies;
1659		peer_req->flags |= EE_SUBMITTED;
1660
1661		/* If this was a resync request from receive_rs_deallocated(),
1662		 * it is already on the sync_ee list */
1663		if (list_empty(&peer_req->w.list)) {
1664			spin_lock_irq(&device->resource->req_lock);
1665			list_add_tail(&peer_req->w.list, &device->active_ee);
1666			spin_unlock_irq(&device->resource->req_lock);
1667		}
1668
1669		drbd_issue_peer_discard_or_zero_out(device, peer_req);
1670		return 0;
1671	}
1672
1673	/* In most cases, we will only need one bio.  But in case the lower
1674	 * level restrictions happen to be different at this offset on this
1675	 * side than those of the sending peer, we may need to submit the
1676	 * request in more than one bio.
1677	 *
1678	 * Plain bio_alloc is good enough here, this is no DRBD internally
1679	 * generated bio, but a bio allocated on behalf of the peer.
1680	 */
1681next_bio:
1682	/* _DISCARD, _WRITE_ZEROES handled above.
1683	 * REQ_OP_FLUSH (empty flush) not expected,
1684	 * should have been mapped to a "drbd protocol barrier".
1685	 * REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
1686	 */
1687	if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
1688				peer_req_op(peer_req) == REQ_OP_READ)) {
1689		drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
1690		return -EINVAL;
1691	}
1692
1693	bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
1694	/* > peer_req->i.sector, unless this is the first bio */
1695	bio->bi_iter.bi_sector = sector;
1696	bio->bi_private = peer_req;
1697	bio->bi_end_io = drbd_peer_request_endio;
1698
1699	bio->bi_next = bios;
1700	bios = bio;
1701	++n_bios;
1702
1703	page_chain_for_each(page) {
1704		unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1705		if (!bio_add_page(bio, page, len, 0))
1706			goto next_bio;
1707		data_size -= len;
1708		sector += len >> 9;
1709		--nr_pages;
1710	}
1711	D_ASSERT(device, data_size == 0);
1712	D_ASSERT(device, page == NULL);
1713
1714	atomic_set(&peer_req->pending_bios, n_bios);
1715	/* for debugfs: update timestamp, mark as submitted */
1716	peer_req->submit_jif = jiffies;
1717	peer_req->flags |= EE_SUBMITTED;
1718	do {
1719		bio = bios;
1720		bios = bios->bi_next;
1721		bio->bi_next = NULL;
1722
1723		drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
1724	} while (bios);
1725	return 0;
1726}
1727
1728static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1729					     struct drbd_peer_request *peer_req)
1730{
1731	struct drbd_interval *i = &peer_req->i;
1732
1733	drbd_remove_interval(&device->write_requests, i);
1734	drbd_clear_interval(i);
1735
1736	/* Wake up any processes waiting for this peer request to complete.  */
1737	if (i->waiting)
1738		wake_up(&device->misc_wait);
1739}
1740
1741static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1742{
1743	struct drbd_peer_device *peer_device;
1744	int vnr;
1745
1746	rcu_read_lock();
1747	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1748		struct drbd_device *device = peer_device->device;
1749
1750		kref_get(&device->kref);
1751		rcu_read_unlock();
1752		drbd_wait_ee_list_empty(device, &device->active_ee);
1753		kref_put(&device->kref, drbd_destroy_device);
1754		rcu_read_lock();
1755	}
1756	rcu_read_unlock();
1757}
1758
1759static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1760{
1761	int rv;
1762	struct p_barrier *p = pi->data;
1763	struct drbd_epoch *epoch;
1764
1765	/* FIXME these are unacked on connection,
1766	 * not a specific (peer)device.
1767	 */
1768	connection->current_epoch->barrier_nr = p->barrier;
1769	connection->current_epoch->connection = connection;
1770	rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1771
1772	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1773	 * the activity log, which means it would not be resynced in case the
1774	 * R_PRIMARY crashes now.
1775	 * Therefore we must send the barrier_ack after the barrier request was
1776	 * completed. */
1777	switch (connection->resource->write_ordering) {
1778	case WO_NONE:
1779		if (rv == FE_RECYCLED)
1780			return 0;
1781
1782		/* receiver context, in the writeout path of the other node.
1783		 * avoid potential distributed deadlock */
1784		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1785		if (epoch)
1786			break;
1787		else
1788			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1789		fallthrough;
1790
1791	case WO_BDEV_FLUSH:
1792	case WO_DRAIN_IO:
1793		conn_wait_active_ee_empty(connection);
1794		drbd_flush(connection);
1795
1796		if (atomic_read(&connection->current_epoch->epoch_size)) {
1797			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1798			if (epoch)
1799				break;
1800		}
1801
1802		return 0;
1803	default:
1804		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1805			 connection->resource->write_ordering);
1806		return -EIO;
1807	}
1808
1809	epoch->flags = 0;
1810	atomic_set(&epoch->epoch_size, 0);
1811	atomic_set(&epoch->active, 0);
1812
1813	spin_lock(&connection->epoch_lock);
1814	if (atomic_read(&connection->current_epoch->epoch_size)) {
1815		list_add(&epoch->list, &connection->current_epoch->list);
1816		connection->current_epoch = epoch;
1817		connection->epochs++;
1818	} else {
1819		/* The current_epoch got recycled while we allocated this one... */
1820		kfree(epoch);
1821	}
1822	spin_unlock(&connection->epoch_lock);
1823
1824	return 0;
1825}
1826
1827/* quick wrapper in case payload size != request_size (write same) */
1828static void drbd_csum_ee_size(struct crypto_shash *h,
1829			      struct drbd_peer_request *r, void *d,
1830			      unsigned int payload_size)
1831{
1832	unsigned int tmp = r->i.size;
1833	r->i.size = payload_size;
1834	drbd_csum_ee(h, r, d);
1835	r->i.size = tmp;
1836}
1837
1838/* used from receive_RSDataReply (recv_resync_read)
1839 * and from receive_Data.
1840 * data_size: actual payload ("data in")
1841 * 	for normal writes that is bi_size.
1842 * 	for discards, that is zero.
1843 * 	for write same, it is logical_block_size.
1844 * both trim and write same have the bi_size ("data len to be affected")
1845 * as extra argument in the packet header.
1846 */
1847static struct drbd_peer_request *
1848read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1849	      struct packet_info *pi) __must_hold(local)
1850{
1851	struct drbd_device *device = peer_device->device;
1852	const sector_t capacity = get_capacity(device->vdisk);
1853	struct drbd_peer_request *peer_req;
1854	struct page *page;
1855	int digest_size, err;
1856	unsigned int data_size = pi->size, ds;
1857	void *dig_in = peer_device->connection->int_dig_in;
1858	void *dig_vv = peer_device->connection->int_dig_vv;
1859	unsigned long *data;
1860	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1861	struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
1862
1863	digest_size = 0;
1864	if (!trim && peer_device->connection->peer_integrity_tfm) {
1865		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
1866		/*
1867		 * FIXME: Receive the incoming digest into the receive buffer
1868		 *	  here, together with its struct p_data?
1869		 */
1870		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1871		if (err)
1872			return NULL;
1873		data_size -= digest_size;
1874	}
1875
1876	/* assume request_size == data_size, but special case trim. */
1877	ds = data_size;
1878	if (trim) {
1879		if (!expect(peer_device, data_size == 0))
1880			return NULL;
1881		ds = be32_to_cpu(trim->size);
1882	} else if (zeroes) {
1883		if (!expect(peer_device, data_size == 0))
1884			return NULL;
1885		ds = be32_to_cpu(zeroes->size);
1886	}
1887
1888	if (!expect(peer_device, IS_ALIGNED(ds, 512)))
1889		return NULL;
1890	if (trim || zeroes) {
1891		if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1892			return NULL;
1893	} else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
1894		return NULL;
1895
1896	/* even though we trust out peer,
1897	 * we sometimes have to double check. */
1898	if (sector + (ds>>9) > capacity) {
1899		drbd_err(device, "request from peer beyond end of local disk: "
1900			"capacity: %llus < sector: %llus + size: %u\n",
1901			(unsigned long long)capacity,
1902			(unsigned long long)sector, ds);
1903		return NULL;
1904	}
1905
1906	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1907	 * "criss-cross" setup, that might cause write-out on some other DRBD,
1908	 * which in turn might block on the other node at this very place.  */
1909	peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1910	if (!peer_req)
1911		return NULL;
1912
1913	peer_req->flags |= EE_WRITE;
1914	if (trim) {
1915		peer_req->flags |= EE_TRIM;
1916		return peer_req;
1917	}
1918	if (zeroes) {
1919		peer_req->flags |= EE_ZEROOUT;
1920		return peer_req;
1921	}
1922
1923	/* receive payload size bytes into page chain */
1924	ds = data_size;
1925	page = peer_req->pages;
1926	page_chain_for_each(page) {
1927		unsigned len = min_t(int, ds, PAGE_SIZE);
1928		data = kmap(page);
1929		err = drbd_recv_all_warn(peer_device->connection, data, len);
1930		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1931			drbd_err(device, "Fault injection: Corrupting data on receive\n");
1932			data[0] = data[0] ^ (unsigned long)-1;
1933		}
1934		kunmap(page);
1935		if (err) {
1936			drbd_free_peer_req(device, peer_req);
1937			return NULL;
1938		}
1939		ds -= len;
1940	}
1941
1942	if (digest_size) {
1943		drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1944		if (memcmp(dig_in, dig_vv, digest_size)) {
1945			drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1946				(unsigned long long)sector, data_size);
1947			drbd_free_peer_req(device, peer_req);
1948			return NULL;
1949		}
1950	}
1951	device->recv_cnt += data_size >> 9;
1952	return peer_req;
1953}
1954
1955/* drbd_drain_block() just takes a data block
1956 * out of the socket input buffer, and discards it.
1957 */
1958static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1959{
1960	struct page *page;
1961	int err = 0;
1962	void *data;
1963
1964	if (!data_size)
1965		return 0;
1966
1967	page = drbd_alloc_pages(peer_device, 1, 1);
1968
1969	data = kmap(page);
1970	while (data_size) {
1971		unsigned int len = min_t(int, data_size, PAGE_SIZE);
1972
1973		err = drbd_recv_all_warn(peer_device->connection, data, len);
1974		if (err)
1975			break;
1976		data_size -= len;
1977	}
1978	kunmap(page);
1979	drbd_free_pages(peer_device->device, page, 0);
1980	return err;
1981}
1982
1983static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1984			   sector_t sector, int data_size)
1985{
1986	struct bio_vec bvec;
1987	struct bvec_iter iter;
1988	struct bio *bio;
1989	int digest_size, err, expect;
1990	void *dig_in = peer_device->connection->int_dig_in;
1991	void *dig_vv = peer_device->connection->int_dig_vv;
1992
1993	digest_size = 0;
1994	if (peer_device->connection->peer_integrity_tfm) {
1995		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
1996		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1997		if (err)
1998			return err;
1999		data_size -= digest_size;
2000	}
2001
2002	/* optimistically update recv_cnt.  if receiving fails below,
2003	 * we disconnect anyways, and counters will be reset. */
2004	peer_device->device->recv_cnt += data_size>>9;
2005
2006	bio = req->master_bio;
2007	D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
2008
2009	bio_for_each_segment(bvec, bio, iter) {
2010		void *mapped = bvec_kmap_local(&bvec);
2011		expect = min_t(int, data_size, bvec.bv_len);
2012		err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
2013		kunmap_local(mapped);
2014		if (err)
2015			return err;
2016		data_size -= expect;
2017	}
2018
2019	if (digest_size) {
2020		drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
2021		if (memcmp(dig_in, dig_vv, digest_size)) {
2022			drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
2023			return -EINVAL;
2024		}
2025	}
2026
2027	D_ASSERT(peer_device->device, data_size == 0);
2028	return 0;
2029}
2030
2031/*
2032 * e_end_resync_block() is called in ack_sender context via
2033 * drbd_finish_peer_reqs().
2034 */
2035static int e_end_resync_block(struct drbd_work *w, int unused)
2036{
2037	struct drbd_peer_request *peer_req =
2038		container_of(w, struct drbd_peer_request, w);
2039	struct drbd_peer_device *peer_device = peer_req->peer_device;
2040	struct drbd_device *device = peer_device->device;
2041	sector_t sector = peer_req->i.sector;
2042	int err;
2043
2044	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2045
2046	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2047		drbd_set_in_sync(device, sector, peer_req->i.size);
2048		err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
2049	} else {
2050		/* Record failure to sync */
2051		drbd_rs_failed_io(device, sector, peer_req->i.size);
2052
2053		err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2054	}
2055	dec_unacked(device);
2056
2057	return err;
2058}
2059
2060static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
2061			    struct packet_info *pi) __releases(local)
2062{
2063	struct drbd_device *device = peer_device->device;
2064	struct drbd_peer_request *peer_req;
2065
2066	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
2067	if (!peer_req)
2068		goto fail;
2069
2070	dec_rs_pending(device);
2071
2072	inc_unacked(device);
2073	/* corresponding dec_unacked() in e_end_resync_block()
2074	 * respective _drbd_clear_done_ee */
2075
2076	peer_req->w.cb = e_end_resync_block;
2077	peer_req->opf = REQ_OP_WRITE;
2078	peer_req->submit_jif = jiffies;
2079
2080	spin_lock_irq(&device->resource->req_lock);
2081	list_add_tail(&peer_req->w.list, &device->sync_ee);
2082	spin_unlock_irq(&device->resource->req_lock);
2083
2084	atomic_add(pi->size >> 9, &device->rs_sect_ev);
2085	if (drbd_submit_peer_request(peer_req) == 0)
2086		return 0;
2087
2088	/* don't care for the reason here */
2089	drbd_err(device, "submit failed, triggering re-connect\n");
2090	spin_lock_irq(&device->resource->req_lock);
2091	list_del(&peer_req->w.list);
2092	spin_unlock_irq(&device->resource->req_lock);
2093
2094	drbd_free_peer_req(device, peer_req);
2095fail:
2096	put_ldev(device);
2097	return -EIO;
2098}
2099
2100static struct drbd_request *
2101find_request(struct drbd_device *device, struct rb_root *root, u64 id,
2102	     sector_t sector, bool missing_ok, const char *func)
2103{
2104	struct drbd_request *req;
2105
2106	/* Request object according to our peer */
2107	req = (struct drbd_request *)(unsigned long)id;
2108	if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
2109		return req;
2110	if (!missing_ok) {
2111		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
2112			(unsigned long)id, (unsigned long long)sector);
2113	}
2114	return NULL;
2115}
2116
2117static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
2118{
2119	struct drbd_peer_device *peer_device;
2120	struct drbd_device *device;
2121	struct drbd_request *req;
2122	sector_t sector;
2123	int err;
2124	struct p_data *p = pi->data;
2125
2126	peer_device = conn_peer_device(connection, pi->vnr);
2127	if (!peer_device)
2128		return -EIO;
2129	device = peer_device->device;
2130
2131	sector = be64_to_cpu(p->sector);
2132
2133	spin_lock_irq(&device->resource->req_lock);
2134	req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
2135	spin_unlock_irq(&device->resource->req_lock);
2136	if (unlikely(!req))
2137		return -EIO;
2138
2139	err = recv_dless_read(peer_device, req, sector, pi->size);
2140	if (!err)
2141		req_mod(req, DATA_RECEIVED);
2142	/* else: nothing. handled from drbd_disconnect...
2143	 * I don't think we may complete this just yet
2144	 * in case we are "on-disconnect: freeze" */
2145
2146	return err;
2147}
2148
2149static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
2150{
2151	struct drbd_peer_device *peer_device;
2152	struct drbd_device *device;
2153	sector_t sector;
2154	int err;
2155	struct p_data *p = pi->data;
2156
2157	peer_device = conn_peer_device(connection, pi->vnr);
2158	if (!peer_device)
2159		return -EIO;
2160	device = peer_device->device;
2161
2162	sector = be64_to_cpu(p->sector);
2163	D_ASSERT(device, p->block_id == ID_SYNCER);
2164
2165	if (get_ldev(device)) {
2166		/* data is submitted to disk within recv_resync_read.
2167		 * corresponding put_ldev done below on error,
2168		 * or in drbd_peer_request_endio. */
2169		err = recv_resync_read(peer_device, sector, pi);
2170	} else {
2171		if (drbd_ratelimit())
2172			drbd_err(device, "Can not write resync data to local disk.\n");
2173
2174		err = drbd_drain_block(peer_device, pi->size);
2175
2176		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2177	}
2178
2179	atomic_add(pi->size >> 9, &device->rs_sect_in);
2180
2181	return err;
2182}
2183
2184static void restart_conflicting_writes(struct drbd_device *device,
2185				       sector_t sector, int size)
2186{
2187	struct drbd_interval *i;
2188	struct drbd_request *req;
2189
2190	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2191		if (!i->local)
2192			continue;
2193		req = container_of(i, struct drbd_request, i);
2194		if (req->rq_state & RQ_LOCAL_PENDING ||
2195		    !(req->rq_state & RQ_POSTPONED))
2196			continue;
2197		/* as it is RQ_POSTPONED, this will cause it to
2198		 * be queued on the retry workqueue. */
2199		__req_mod(req, CONFLICT_RESOLVED, NULL);
2200	}
2201}
2202
2203/*
2204 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
2205 */
2206static int e_end_block(struct drbd_work *w, int cancel)
2207{
2208	struct drbd_peer_request *peer_req =
2209		container_of(w, struct drbd_peer_request, w);
2210	struct drbd_peer_device *peer_device = peer_req->peer_device;
2211	struct drbd_device *device = peer_device->device;
2212	sector_t sector = peer_req->i.sector;
2213	int err = 0, pcmd;
2214
2215	if (peer_req->flags & EE_SEND_WRITE_ACK) {
2216		if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2217			pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2218				device->state.conn <= C_PAUSED_SYNC_T &&
2219				peer_req->flags & EE_MAY_SET_IN_SYNC) ?
2220				P_RS_WRITE_ACK : P_WRITE_ACK;
2221			err = drbd_send_ack(peer_device, pcmd, peer_req);
2222			if (pcmd == P_RS_WRITE_ACK)
2223				drbd_set_in_sync(device, sector, peer_req->i.size);
2224		} else {
2225			err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2226			/* we expect it to be marked out of sync anyways...
2227			 * maybe assert this?  */
2228		}
2229		dec_unacked(device);
2230	}
2231
2232	/* we delete from the conflict detection hash _after_ we sent out the
2233	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
2234	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
2235		spin_lock_irq(&device->resource->req_lock);
2236		D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
2237		drbd_remove_epoch_entry_interval(device, peer_req);
2238		if (peer_req->flags & EE_RESTART_REQUESTS)
2239			restart_conflicting_writes(device, sector, peer_req->i.size);
2240		spin_unlock_irq(&device->resource->req_lock);
2241	} else
2242		D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2243
2244	drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
2245
2246	return err;
2247}
2248
2249static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
2250{
2251	struct drbd_peer_request *peer_req =
2252		container_of(w, struct drbd_peer_request, w);
2253	struct drbd_peer_device *peer_device = peer_req->peer_device;
2254	int err;
2255
2256	err = drbd_send_ack(peer_device, ack, peer_req);
2257	dec_unacked(peer_device->device);
2258
2259	return err;
2260}
2261
2262static int e_send_superseded(struct drbd_work *w, int unused)
2263{
2264	return e_send_ack(w, P_SUPERSEDED);
2265}
2266
2267static int e_send_retry_write(struct drbd_work *w, int unused)
2268{
2269	struct drbd_peer_request *peer_req =
2270		container_of(w, struct drbd_peer_request, w);
2271	struct drbd_connection *connection = peer_req->peer_device->connection;
2272
2273	return e_send_ack(w, connection->agreed_pro_version >= 100 ?
2274			     P_RETRY_WRITE : P_SUPERSEDED);
2275}
2276
2277static bool seq_greater(u32 a, u32 b)
2278{
2279	/*
2280	 * We assume 32-bit wrap-around here.
2281	 * For 24-bit wrap-around, we would have to shift:
2282	 *  a <<= 8; b <<= 8;
2283	 */
2284	return (s32)a - (s32)b > 0;
2285}
2286
2287static u32 seq_max(u32 a, u32 b)
2288{
2289	return seq_greater(a, b) ? a : b;
2290}
2291
2292static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
2293{
2294	struct drbd_device *device = peer_device->device;
2295	unsigned int newest_peer_seq;
2296
2297	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
2298		spin_lock(&device->peer_seq_lock);
2299		newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2300		device->peer_seq = newest_peer_seq;
2301		spin_unlock(&device->peer_seq_lock);
2302		/* wake up only if we actually changed device->peer_seq */
2303		if (peer_seq == newest_peer_seq)
2304			wake_up(&device->seq_wait);
2305	}
2306}
2307
2308static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2309{
2310	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2311}
2312
2313/* maybe change sync_ee into interval trees as well? */
2314static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2315{
2316	struct drbd_peer_request *rs_req;
2317	bool rv = false;
2318
2319	spin_lock_irq(&device->resource->req_lock);
2320	list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2321		if (overlaps(peer_req->i.sector, peer_req->i.size,
2322			     rs_req->i.sector, rs_req->i.size)) {
2323			rv = true;
2324			break;
2325		}
2326	}
2327	spin_unlock_irq(&device->resource->req_lock);
2328
2329	return rv;
2330}
2331
/* Called from receive_Data.
 * Synchronize packets on sock with packets on msock.
 *
 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
 * packet traveling on msock, they are still processed in the order they have
 * been sent.
 *
 * Note: we don't care for Ack packets overtaking P_DATA packets.
 *
 * In case packet_seq is larger than device->peer_seq number, there are
 * outstanding packets on the msock. We wait for them to arrive.
 * In case we are the logically next packet, we update device->peer_seq
 * ourselves. Correctly handles 32bit wrap around.
 *
 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
 *
 * returns 0 if we may process the packet,
 * -ERESTARTSYS if we were interrupted (by disconnect signal),
 * -ETIMEDOUT if the missing packets did not show up within ping_timeo. */
static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
{
	struct drbd_device *device = peer_device->device;
	DEFINE_WAIT(wait);
	long timeout;
	int ret = 0, tp;

	/* sequence numbers are only tracked on the conflict resolving side */
	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
		return 0;

	spin_lock(&device->peer_seq_lock);
	for (;;) {
		/* peer_seq is at most one ahead of what we have seen:
		 * nothing is missing, take it over and go on. */
		if (!seq_greater(peer_seq - 1, device->peer_seq)) {
			device->peer_seq = seq_max(device->peer_seq, peer_seq);
			break;
		}

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		rcu_read_lock();
		tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
		rcu_read_unlock();

		/* without two primaries we do not wait; device->peer_seq is
		 * left as it is and 0 is returned */
		if (!tp)
			break;

		/* Only need to wait if two_primaries is enabled */
		prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
		/* the lock is not held while sleeping; it is re-taken
		 * before device->peer_seq is looked at again */
		spin_unlock(&device->peer_seq_lock);
		rcu_read_lock();
		timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
		rcu_read_unlock();
		timeout = schedule_timeout(timeout);
		spin_lock(&device->peer_seq_lock);
		/* the whole timeout elapsed without a wake up */
		if (!timeout) {
			ret = -ETIMEDOUT;
			drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
			break;
		}
	}
	spin_unlock(&device->peer_seq_lock);
	finish_wait(&device->seq_wait, &wait);
	return ret;
}
2400
2401static enum req_op wire_flags_to_bio_op(u32 dpf)
2402{
2403	if (dpf & DP_ZEROES)
2404		return REQ_OP_WRITE_ZEROES;
2405	if (dpf & DP_DISCARD)
2406		return REQ_OP_DISCARD;
2407	else
2408		return REQ_OP_WRITE;
2409}
2410
2411/* see also bio_flags_to_wire() */
2412static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
2413{
2414	return wire_flags_to_bio_op(dpf) |
2415		(dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2416		(dpf & DP_FUA ? REQ_FUA : 0) |
2417		(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
2418}
2419
/*
 * fail_postponed_requests() - NEG_ACK all postponed local writes that
 * overlap the given range.
 *
 * For every local request in device->write_requests overlapping @sector /
 * @size that has RQ_POSTPONED set, the flag is cleared and NEG_ACKED is
 * signalled; a master bio handed back by __req_mod() is completed.
 *
 * req_lock is dropped and re-taken around the completion, so this has to be
 * entered with req_lock held (and interrupts disabled), and it returns the
 * same way.
 */
static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
				    unsigned int size)
{
	struct drbd_interval *i;

    repeat:
	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
		struct drbd_request *req;
		struct bio_and_error m;

		if (!i->local)
			continue;
		req = container_of(i, struct drbd_request, i);
		if (!(req->rq_state & RQ_POSTPONED))
			continue;
		req->rq_state &= ~RQ_POSTPONED;
		__req_mod(req, NEG_ACKED, &m);
		spin_unlock_irq(&device->resource->req_lock);
		if (m.bio)
			complete_master_bio(device, &m);
		spin_lock_irq(&device->resource->req_lock);
		/* the lock was dropped, so start the walk over from the
		 * beginning instead of continuing from i */
		goto repeat;
	}
}
2444
/*
 * handle_write_conflicts() - resolve conflicts between an incoming peer
 * write and overlapping requests in device->write_requests.
 *
 * The caller (receive_Data) holds req_lock.  The peer request is inserted
 * into the write_requests tree first, then all overlapping, not yet
 * completed intervals are looked at:
 *  - another peer request: wait for it, then start over;
 *  - a local request, and we have RESOLVE_CONFLICTS: the peer request is not
 *    submitted; it is queued on done_ee to be answered by
 *    e_send_superseded() or e_send_retry_write(), and -ENOENT is returned;
 *  - a local request, and we do not have RESOLVE_CONFLICTS: wait for it
 *    unless it is postponed and has no local IO pending; in that case
 *    remember to restart it (EE_RESTART_REQUESTS).
 *
 * Returns 0 if the peer request may be submitted, -ENOENT as described
 * above, or the error of drbd_wait_misc().  On any non-zero return the peer
 * request has been removed from the interval tree again.
 */
static int handle_write_conflicts(struct drbd_device *device,
				  struct drbd_peer_request *peer_req)
{
	struct drbd_connection *connection = peer_req->peer_device->connection;
	bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
	sector_t sector = peer_req->i.sector;
	const unsigned int size = peer_req->i.size;
	struct drbd_interval *i;
	bool equal;
	int err;

	/*
	 * Inserting the peer request into the write_requests tree will prevent
	 * new conflicting local requests from being added.
	 */
	drbd_insert_interval(&device->write_requests, &peer_req->i);

    repeat:
	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
		/* skip ourselves and everything that is already done */
		if (i == &peer_req->i)
			continue;
		if (i->completed)
			continue;

		if (!i->local) {
			/*
			 * Our peer has sent a conflicting remote request; this
			 * should not happen in a two-node setup.  Wait for the
			 * earlier peer request to complete.
			 */
			err = drbd_wait_misc(device, i);
			if (err)
				goto out;
			goto repeat;
		}

		/* identical ranges are not reported as concurrent writes below */
		equal = i->sector == sector && i->size == size;
		if (resolve_conflicts) {
			/*
			 * If the peer request is fully contained within the
			 * overlapping request, it can be considered overwritten
			 * and thus superseded; otherwise, it will be retried
			 * once all overlapping requests have completed.
			 */
			bool superseded = i->sector <= sector && i->sector +
				       (i->size >> 9) >= sector + (size >> 9);

			if (!equal)
				drbd_alert(device, "Concurrent writes detected: "
					       "local=%llus +%u, remote=%llus +%u, "
					       "assuming %s came first\n",
					  (unsigned long long)i->sector, i->size,
					  (unsigned long long)sector, size,
					  superseded ? "local" : "remote");

			/* hand the peer request to the ack sender instead of
			 * submitting it */
			peer_req->w.cb = superseded ? e_send_superseded :
						   e_send_retry_write;
			list_add_tail(&peer_req->w.list, &device->done_ee);
			queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);

			err = -ENOENT;
			goto out;
		} else {
			struct drbd_request *req =
				container_of(i, struct drbd_request, i);

			if (!equal)
				drbd_alert(device, "Concurrent writes detected: "
					       "local=%llus +%u, remote=%llus +%u\n",
					  (unsigned long long)i->sector, i->size,
					  (unsigned long long)sector, size);

			if (req->rq_state & RQ_LOCAL_PENDING ||
			    !(req->rq_state & RQ_POSTPONED)) {
				/*
				 * Wait for the node with the discard flag to
				 * decide if this request has been superseded
				 * or needs to be retried.
				 * Requests that have been superseded will
				 * disappear from the write_requests tree.
				 *
				 * In addition, wait for the conflicting
				 * request to finish locally before submitting
				 * the conflicting peer request.
				 */
				err = drbd_wait_misc(device, &req->i);
				if (err) {
					/* give up on the connection and fail
					 * what we postponed in this range */
					_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
					fail_postponed_requests(device, sector, size);
					goto out;
				}
				goto repeat;
			}
			/*
			 * Remember to restart the conflicting requests after
			 * the new peer request has completed.
			 */
			peer_req->flags |= EE_RESTART_REQUESTS;
		}
	}
	err = 0;

    out:
	/* on any error the peer request does not stay in the tree */
	if (err)
		drbd_remove_epoch_entry_interval(device, peer_req);
	return err;
}
2552
/* mirrored write
 *
 * Handles a data packet carrying a write of the peer (this function also
 * looks at pi->cmd for P_TRIM and P_ZEROES): receives the payload into a
 * peer request, accounts it in the current epoch, sends the acks that can be
 * sent right away, resolves write conflicts if two primaries are allowed,
 * and submits the request to the local disk.
 *
 * Without a local disk the payload is drained and answered with P_NEG_ACK.
 *
 * Returns 0 if the request was submitted, or was taken care of by the
 * conflict handling; a negative error code otherwise.
 */
static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct net_conf *nc;
	sector_t sector;
	struct drbd_peer_request *peer_req;
	struct p_data *p = pi->data;
	u32 peer_seq = be32_to_cpu(p->seq_num);
	u32 dp_flags;
	int err, tp;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	if (!get_ldev(device)) {
		int err2;

		/* no local disk: keep the sequence number and epoch
		 * accounting going, refuse the write, throw away the data */
		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
		atomic_inc(&connection->current_epoch->epoch_size);
		err2 = drbd_drain_block(peer_device, pi->size);
		/* report the first error that occurred */
		if (!err)
			err = err2;
		return err;
	}

	/*
	 * Corresponding put_ldev done either below (on various errors), or in
	 * drbd_peer_request_endio, if we successfully submit the data at the
	 * end of this function.
	 */

	sector = be64_to_cpu(p->sector);
	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
	if (!peer_req) {
		put_ldev(device);
		return -EIO;
	}

	peer_req->w.cb = e_end_block;
	peer_req->submit_jif = jiffies;
	peer_req->flags |= EE_APPLICATION;

	dp_flags = be32_to_cpu(p->dp_flags);
	peer_req->opf = wire_flags_to_bio(connection, dp_flags);
	if (pi->cmd == P_TRIM) {
		D_ASSERT(peer_device, peer_req->i.size > 0);
		D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
		D_ASSERT(peer_device, peer_req->pages == NULL);
		/* need to play safe: an older DRBD sender
		 * may mean zero-out while sending P_TRIM. */
		if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
			peer_req->flags |= EE_ZEROOUT;
	} else if (pi->cmd == P_ZEROES) {
		D_ASSERT(peer_device, peer_req->i.size > 0);
		D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
		D_ASSERT(peer_device, peer_req->pages == NULL);
		/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
		if (dp_flags & DP_DISCARD)
			peer_req->flags |= EE_TRIM;
	} else if (peer_req->pages == NULL) {
		/* a data packet without payload is expected to be an empty flush */
		D_ASSERT(device, peer_req->i.size == 0);
		D_ASSERT(device, dp_flags & DP_FLUSH);
	}

	if (dp_flags & DP_MAY_SET_IN_SYNC)
		peer_req->flags |= EE_MAY_SET_IN_SYNC;

	/* account the request in the current epoch */
	spin_lock(&connection->epoch_lock);
	peer_req->epoch = connection->current_epoch;
	atomic_inc(&peer_req->epoch->epoch_size);
	atomic_inc(&peer_req->epoch->active);
	spin_unlock(&connection->epoch_lock);

	rcu_read_lock();
	nc = rcu_dereference(peer_device->connection->net_conf);
	tp = nc->two_primaries;
	/* with an agreed protocol version below 100, which acks to send is
	 * derived from the configured wire protocol here */
	if (peer_device->connection->agreed_pro_version < 100) {
		switch (nc->wire_protocol) {
		case DRBD_PROT_C:
			dp_flags |= DP_SEND_WRITE_ACK;
			break;
		case DRBD_PROT_B:
			dp_flags |= DP_SEND_RECEIVE_ACK;
			break;
		}
	}
	rcu_read_unlock();

	if (dp_flags & DP_SEND_WRITE_ACK) {
		peer_req->flags |= EE_SEND_WRITE_ACK;
		inc_unacked(device);
		/* corresponding dec_unacked() in e_end_block()
		 * respective _drbd_clear_done_ee */
	}

	if (dp_flags & DP_SEND_RECEIVE_ACK) {
		/* I really don't like it that the receiver thread
		 * sends on the msock, but anyways */
		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
	}

	if (tp) {
		/* two primaries implies protocol C */
		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
		peer_req->flags |= EE_IN_INTERVAL_TREE;
		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		if (err)
			goto out_interrupted;
		spin_lock_irq(&device->resource->req_lock);
		err = handle_write_conflicts(device, peer_req);
		if (err) {
			spin_unlock_irq(&device->resource->req_lock);
			/* -ENOENT: the peer request was queued on done_ee by
			 * handle_write_conflicts() and is answered from
			 * there; it is not submitted, and not an error */
			if (err == -ENOENT) {
				put_ldev(device);
				return 0;
			}
			goto out_interrupted;
		}
	} else {
		update_peer_seq(peer_device, peer_seq);
		spin_lock_irq(&device->resource->req_lock);
	}
	/* TRIM and ZEROOUT requests are processed synchronously,
	 * we wait for all pending requests, respectively wait for
	 * active_ee to become empty in drbd_submit_peer_request();
	 * better not add ourselves here. */
	if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0)
		list_add_tail(&peer_req->w.list, &device->active_ee);
	spin_unlock_irq(&device->resource->req_lock);

	/* as sync target, do not submit while an overlapping resync write
	 * is still on sync_ee */
	if (device->state.conn == C_SYNC_TARGET)
		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));

	if (device->state.pdsk < D_INCONSISTENT) {
		/* In case we have the only disk of the cluster, */
		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
		drbd_al_begin_io(device, &peer_req->i);
		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
	}

	err = drbd_submit_peer_request(peer_req);
	if (!err)
		return 0;

	/* don't care for the reason here */
	drbd_err(device, "submit failed, triggering re-connect\n");
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	drbd_remove_epoch_entry_interval(device, peer_req);
	spin_unlock_irq(&device->resource->req_lock);
	/* undo the drbd_al_begin_io() from above, if we did it */
	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
		drbd_al_complete_io(device, &peer_req->i);
	}

out_interrupted:
	/* common error exit: release the epoch reference, the local disk
	 * reference and the peer request */
	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
	put_ldev(device);
	drbd_free_peer_req(device, peer_req);
	return err;
}
2720
2721/* We may throttle resync, if the lower device seems to be busy,
2722 * and current sync rate is above c_min_rate.
2723 *
2724 * To decide whether or not the lower device is busy, we use a scheme similar
2725 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2726 * (more than 64 sectors) of activity we cannot account for with our own resync
2727 * activity, it obviously is "busy".
2728 *
2729 * The current sync rate used here uses only the most recent two step marks,
2730 * to have a short time average so we can react faster.
2731 */
2732bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2733		bool throttle_if_app_is_waiting)
2734{
2735	struct lc_element *tmp;
2736	bool throttle = drbd_rs_c_min_rate_throttle(device);
2737
2738	if (!throttle || throttle_if_app_is_waiting)
2739		return throttle;
2740
2741	spin_lock_irq(&device->al_lock);
2742	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2743	if (tmp) {
2744		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2745		if (test_bit(BME_PRIORITY, &bm_ext->flags))
2746			throttle = false;
2747		/* Do not slow down if app IO is already waiting for this extent,
2748		 * and our progress is necessary for application IO to complete. */
2749	}
2750	spin_unlock_irq(&device->al_lock);
2751
2752	return throttle;
2753}
2754
2755bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2756{
2757	struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
2758	unsigned long db, dt, dbdt;
2759	unsigned int c_min_rate;
2760	int curr_events;
2761
2762	rcu_read_lock();
2763	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2764	rcu_read_unlock();
2765
2766	/* feature disabled? */
2767	if (c_min_rate == 0)
2768		return false;
2769
2770	curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
2771			atomic_read(&device->rs_sect_ev);
2772
2773	if (atomic_read(&device->ap_actlog_cnt)
2774	    || curr_events - device->rs_last_events > 64) {
2775		unsigned long rs_left;
2776		int i;
2777
2778		device->rs_last_events = curr_events;
2779
2780		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2781		 * approx. */
2782		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2783
2784		if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2785			rs_left = device->ov_left;
2786		else
2787			rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2788
2789		dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
2790		if (!dt)
2791			dt++;
2792		db = device->rs_mark_left[i] - rs_left;
2793		dbdt = Bit2KB(db/dt);
2794
2795		if (dbdt > c_min_rate)
2796			return true;
2797	}
2798	return false;
2799}
2800
2801static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2802{
2803	struct drbd_peer_device *peer_device;
2804	struct drbd_device *device;
2805	sector_t sector;
2806	sector_t capacity;
2807	struct drbd_peer_request *peer_req;
2808	struct digest_info *di = NULL;
2809	int size, verb;
2810	struct p_block_req *p =	pi->data;
2811
2812	peer_device = conn_peer_device(connection, pi->vnr);
2813	if (!peer_device)
2814		return -EIO;
2815	device = peer_device->device;
2816	capacity = get_capacity(device->vdisk);
2817
2818	sector = be64_to_cpu(p->sector);
2819	size   = be32_to_cpu(p->blksize);
2820
2821	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
2822		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2823				(unsigned long long)sector, size);
2824		return -EINVAL;
2825	}
2826	if (sector + (size>>9) > capacity) {
2827		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2828				(unsigned long long)sector, size);
2829		return -EINVAL;
2830	}
2831
2832	if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
2833		verb = 1;
2834		switch (pi->cmd) {
2835		case P_DATA_REQUEST:
2836			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2837			break;
2838		case P_RS_THIN_REQ:
2839		case P_RS_DATA_REQUEST:
2840		case P_CSUM_RS_REQUEST:
2841		case P_OV_REQUEST:
2842			drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
2843			break;
2844		case P_OV_REPLY:
2845			verb = 0;
2846			dec_rs_pending(device);
2847			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2848			break;
2849		default:
2850			BUG();
2851		}
2852		if (verb && drbd_ratelimit())
2853			drbd_err(device, "Can not satisfy peer's read request, "
2854			    "no local data.\n");
2855
2856		/* drain possibly payload */
2857		return drbd_drain_block(peer_device, pi->size);
2858	}
2859
2860	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2861	 * "criss-cross" setup, that might cause write-out on some other DRBD,
2862	 * which in turn might block on the other node at this very place.  */
2863	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2864			size, GFP_NOIO);
2865	if (!peer_req) {
2866		put_ldev(device);
2867		return -ENOMEM;
2868	}
2869	peer_req->opf = REQ_OP_READ;
2870
2871	switch (pi->cmd) {
2872	case P_DATA_REQUEST:
2873		peer_req->w.cb = w_e_end_data_req;
2874		/* application IO, don't drbd_rs_begin_io */
2875		peer_req->flags |= EE_APPLICATION;
2876		goto submit;
2877
2878	case P_RS_THIN_REQ:
2879		/* If at some point in the future we have a smart way to
2880		   find out if this data block is completely deallocated,
2881		   then we would do something smarter here than reading
2882		   the block... */
2883		peer_req->flags |= EE_RS_THIN_REQ;
2884		fallthrough;
2885	case P_RS_DATA_REQUEST:
2886		peer_req->w.cb = w_e_end_rsdata_req;
2887		/* used in the sector offset progress display */
2888		device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2889		break;
2890
2891	case P_OV_REPLY:
2892	case P_CSUM_RS_REQUEST:
2893		di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
2894		if (!di)
2895			goto out_free_e;
2896
2897		di->digest_size = pi->size;
2898		di->digest = (((char *)di)+sizeof(struct digest_info));
2899
2900		peer_req->digest = di;
2901		peer_req->flags |= EE_HAS_DIGEST;
2902
2903		if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2904			goto out_free_e;
2905
2906		if (pi->cmd == P_CSUM_RS_REQUEST) {
2907			D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2908			peer_req->w.cb = w_e_end_csum_rs_req;
2909			/* used in the sector offset progress display */
2910			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2911			/* remember to report stats in drbd_resync_finished */
2912			device->use_csums = true;
2913		} else if (pi->cmd == P_OV_REPLY) {
2914			/* track progress, we may need to throttle */
2915			atomic_add(size >> 9, &device->rs_sect_in);
2916			peer_req->w.cb = w_e_end_ov_reply;
2917			dec_rs_pending(device);
2918			/* drbd_rs_begin_io done when we sent this request,
2919			 * but accounting still needs to be done. */
2920			goto submit_for_resync;
2921		}
2922		break;
2923
2924	case P_OV_REQUEST:
2925		if (device->ov_start_sector == ~(sector_t)0 &&
2926		    peer_device->connection->agreed_pro_version >= 90) {
2927			unsigned long now = jiffies;
2928			int i;
2929			device->ov_start_sector = sector;
2930			device->ov_position = sector;
2931			device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2932			device->rs_total = device->ov_left;
2933			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2934				device->rs_mark_left[i] = device->ov_left;
2935				device->rs_mark_time[i] = now;
2936			}
2937			drbd_info(device, "Online Verify start sector: %llu\n",
2938					(unsigned long long)sector);
2939		}
2940		peer_req->w.cb = w_e_end_ov_req;
2941		break;
2942
2943	default:
2944		BUG();
2945	}
2946
2947	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
2948	 * wrt the receiver, but it is not as straightforward as it may seem.
2949	 * Various places in the resync start and stop logic assume resync
2950	 * requests are processed in order, requeuing this on the worker thread
2951	 * introduces a bunch of new code for synchronization between threads.
2952	 *
2953	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2954	 * "forever", throttling after drbd_rs_begin_io will lock that extent
2955	 * for application writes for the same time.  For now, just throttle
2956	 * here, where the rest of the code expects the receiver to sleep for
2957	 * a while, anyways.
2958	 */
2959
2960	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
2961	 * this defers syncer requests for some time, before letting at least
2962	 * on request through.  The resync controller on the receiving side
2963	 * will adapt to the incoming rate accordingly.
2964	 *
2965	 * We cannot throttle here if remote is Primary/SyncTarget:
2966	 * we would also throttle its application reads.
2967	 * In that case, throttling is done on the SyncTarget only.
2968	 */
2969
2970	/* Even though this may be a resync request, we do add to "read_ee";
2971	 * "sync_ee" is only used for resync WRITEs.
2972	 * Add to list early, so debugfs can find this request
2973	 * even if we have to sleep below. */
2974	spin_lock_irq(&device->resource->req_lock);
2975	list_add_tail(&peer_req->w.list, &device->read_ee);
2976	spin_unlock_irq(&device->resource->req_lock);
2977
2978	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
2979	if (device->state.peer != R_PRIMARY
2980	&& drbd_rs_should_slow_down(device, sector, false))
2981		schedule_timeout_uninterruptible(HZ/10);
2982	update_receiver_timing_details(connection, drbd_rs_begin_io);
2983	if (drbd_rs_begin_io(device, sector))
2984		goto out_free_e;
2985
2986submit_for_resync:
2987	atomic_add(size >> 9, &device->rs_sect_ev);
2988
2989submit:
2990	update_receiver_timing_details(connection, drbd_submit_peer_request);
2991	inc_unacked(device);
2992	if (drbd_submit_peer_request(peer_req) == 0)
2993		return 0;
2994
2995	/* don't care for the reason here */
2996	drbd_err(device, "submit failed, triggering re-connect\n");
2997
2998out_free_e:
2999	spin_lock_irq(&device->resource->req_lock);
3000	list_del(&peer_req->w.list);
3001	spin_unlock_irq(&device->resource->req_lock);
3002	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
3003
3004	put_ldev(device);
3005	drbd_free_peer_req(device, peer_req);
3006	return -EIO;
3007}
3008
3009/*
3010 * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
3011 */
3012static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
3013{
3014	struct drbd_device *device = peer_device->device;
3015	int self, peer, rv = -100;
3016	unsigned long ch_self, ch_peer;
3017	enum drbd_after_sb_p after_sb_0p;
3018
3019	self = device->ldev->md.uuid[UI_BITMAP] & 1;
3020	peer = device->p_uuid[UI_BITMAP] & 1;
3021
3022	ch_peer = device->p_uuid[UI_SIZE];
3023	ch_self = device->comm_bm_set;
3024
3025	rcu_read_lock();
3026	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
3027	rcu_read_unlock();
3028	switch (after_sb_0p) {
3029	case ASB_CONSENSUS:
3030	case ASB_DISCARD_SECONDARY:
3031	case ASB_CALL_HELPER:
3032	case ASB_VIOLENTLY:
3033		drbd_err(device, "Configuration error.\n");
3034		break;
3035	case ASB_DISCONNECT:
3036		break;
3037	case ASB_DISCARD_YOUNGER_PRI:
3038		if (self == 0 && peer == 1) {
3039			rv = -1;
3040			break;
3041		}
3042		if (self == 1 && peer == 0) {
3043			rv =  1;
3044			break;
3045		}
3046		fallthrough;	/* to one of the other strategies */
3047	case ASB_DISCARD_OLDER_PRI:
3048		if (self == 0 && peer == 1) {
3049			rv = 1;
3050			break;
3051		}
3052		if (self == 1 && peer == 0) {
3053			rv = -1;
3054			break;
3055		}
3056		/* Else fall through to one of the other strategies... */
3057		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
3058		     "Using discard-least-changes instead\n");
3059		fallthrough;
3060	case ASB_DISCARD_ZERO_CHG:
3061		if (ch_peer == 0 && ch_self == 0) {
3062			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3063				? -1 : 1;
3064			break;
3065		} else {
3066			if (ch_peer == 0) { rv =  1; break; }
3067			if (ch_self == 0) { rv = -1; break; }
3068		}
3069		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
3070			break;
3071		fallthrough;
3072	case ASB_DISCARD_LEAST_CHG:
3073		if	(ch_self < ch_peer)
3074			rv = -1;
3075		else if (ch_self > ch_peer)
3076			rv =  1;
3077		else /* ( ch_self == ch_peer ) */
3078		     /* Well, then use something else. */
3079			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3080				? -1 : 1;
3081		break;
3082	case ASB_DISCARD_LOCAL:
3083		rv = -1;
3084		break;
3085	case ASB_DISCARD_REMOTE:
3086		rv =  1;
3087	}
3088
3089	return rv;
3090}
3091
3092/*
3093 * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
3094 */
3095static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
3096{
3097	struct drbd_device *device = peer_device->device;
3098	int hg, rv = -100;
3099	enum drbd_after_sb_p after_sb_1p;
3100
3101	rcu_read_lock();
3102	after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
3103	rcu_read_unlock();
3104	switch (after_sb_1p) {
3105	case ASB_DISCARD_YOUNGER_PRI:
3106	case ASB_DISCARD_OLDER_PRI:
3107	case ASB_DISCARD_LEAST_CHG:
3108	case ASB_DISCARD_LOCAL:
3109	case ASB_DISCARD_REMOTE:
3110	case ASB_DISCARD_ZERO_CHG:
3111		drbd_err(device, "Configuration error.\n");
3112		break;
3113	case ASB_DISCONNECT:
3114		break;
3115	case ASB_CONSENSUS:
3116		hg = drbd_asb_recover_0p(peer_device);
3117		if (hg == -1 && device->state.role == R_SECONDARY)
3118			rv = hg;
3119		if (hg == 1  && device->state.role == R_PRIMARY)
3120			rv = hg;
3121		break;
3122	case ASB_VIOLENTLY:
3123		rv = drbd_asb_recover_0p(peer_device);
3124		break;
3125	case ASB_DISCARD_SECONDARY:
3126		return device->state.role == R_PRIMARY ? 1 : -1;
3127	case ASB_CALL_HELPER:
3128		hg = drbd_asb_recover_0p(peer_device);
3129		if (hg == -1 && device->state.role == R_PRIMARY) {
3130			enum drbd_state_rv rv2;
3131
3132			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3133			  * we might be here in C_WF_REPORT_PARAMS which is transient.
3134			  * we do not need to wait for the after state change work either. */
3135			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3136			if (rv2 != SS_SUCCESS) {
3137				drbd_khelper(device, "pri-lost-after-sb");
3138			} else {
3139				drbd_warn(device, "Successfully gave up primary role.\n");
3140				rv = hg;
3141			}
3142		} else
3143			rv = hg;
3144	}
3145
3146	return rv;
3147}
3148
3149/*
3150 * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
3151 */
3152static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
3153{
3154	struct drbd_device *device = peer_device->device;
3155	int hg, rv = -100;
3156	enum drbd_after_sb_p after_sb_2p;
3157
3158	rcu_read_lock();
3159	after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
3160	rcu_read_unlock();
3161	switch (after_sb_2p) {
3162	case ASB_DISCARD_YOUNGER_PRI:
3163	case ASB_DISCARD_OLDER_PRI:
3164	case ASB_DISCARD_LEAST_CHG:
3165	case ASB_DISCARD_LOCAL:
3166	case ASB_DISCARD_REMOTE:
3167	case ASB_CONSENSUS:
3168	case ASB_DISCARD_SECONDARY:
3169	case ASB_DISCARD_ZERO_CHG:
3170		drbd_err(device, "Configuration error.\n");
3171		break;
3172	case ASB_VIOLENTLY:
3173		rv = drbd_asb_recover_0p(peer_device);
3174		break;
3175	case ASB_DISCONNECT:
3176		break;
3177	case ASB_CALL_HELPER:
3178		hg = drbd_asb_recover_0p(peer_device);
3179		if (hg == -1) {
3180			enum drbd_state_rv rv2;
3181
3182			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3183			  * we might be here in C_WF_REPORT_PARAMS which is transient.
3184			  * we do not need to wait for the after state change work either. */
3185			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3186			if (rv2 != SS_SUCCESS) {
3187				drbd_khelper(device, "pri-lost-after-sb");
3188			} else {
3189				drbd_warn(device, "Successfully gave up primary role.\n");
3190				rv = hg;
3191			}
3192		} else
3193			rv = hg;
3194	}
3195
3196	return rv;
3197}
3198
3199static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
3200			   u64 bits, u64 flags)
3201{
3202	if (!uuid) {
3203		drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
3204		return;
3205	}
3206	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
3207	     text,
3208	     (unsigned long long)uuid[UI_CURRENT],
3209	     (unsigned long long)uuid[UI_BITMAP],
3210	     (unsigned long long)uuid[UI_HISTORY_START],
3211	     (unsigned long long)uuid[UI_HISTORY_END],
3212	     (unsigned long long)bits,
3213	     (unsigned long long)flags);
3214}
3215
/*
 * Result codes of drbd_uuid_compare():
 *
 *   100	after split brain try auto recover
 *     2	C_SYNC_SOURCE set BitMap
 *     1	C_SYNC_SOURCE use BitMap
 *     0	no Sync
 *    -1	C_SYNC_TARGET use BitMap
 *    -2	C_SYNC_TARGET set BitMap
 *  -100	after split brain, disconnect
 * -1000	unrelated data
 * -1091	requires proto 91
 * -1096	requires proto 96
 *
 * Additionally, rule 41 may return -(0x10000 | protocol | feature_flags << 8)
 * to report that a newer protocol version and feature flag are required.
 */

/*
 * drbd_uuid_compare  -  compare our UUID set with the peer's
 * @device:    device whose ldev->md.uuid[] (ours) and p_uuid[] (peer's) are compared
 * @peer_role: current role of the peer, used by rule 41
 * @rule_nr:   out parameter, set to the number of the rule that was evaluated
 *             last (i.e. the one that produced the result)
 *
 * All comparisons ignore the lowest bit of the UUIDs.
 *
 * Note that some rules do not only compare but also correct the local
 * (md.uuid[]) or the cached peer (p_uuid[]) UUIDs when they detect that a
 * resync start/finish event was missed by one side.
 *
 * Returns one of the result codes listed above.
 */
static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	/* NOTE(review): connection is dereferenced unconditionally below, so
	 * peer_device is presumably never NULL here -- confirm with callers. */
	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
	u64 self, peer;
	int i, j;

	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);

	/* Rule 10: both sides just created: nothing to sync. */
	*rule_nr = 10;
	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
		return 0;

	/* Rule 20: we are just created (or have no current UUID), the peer is not. */
	*rule_nr = 20;
	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
	     peer != UUID_JUST_CREATED)
		return -2;

	/* Rule 30: the mirror image of rule 20. */
	*rule_nr = 30;
	if (self != UUID_JUST_CREATED &&
	    (peer == UUID_JUST_CREATED || peer == (u64)0))
		return 2;

	/* Current UUIDs are equal: rules 34..41. */
	if (self == peer) {
		int rct, dc; /* roles at crash time */

		/* Only we still have a bitmap UUID. */
		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {

			if (connection->agreed_pro_version < 91)
				return -1091;

			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
			    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
				drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
				/* rotate our bitmap UUID into the history */
				drbd_uuid_move_history(device);
				device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
				device->ldev->md.uuid[UI_BITMAP] = 0;

				drbd_uuid_dump(device, "self", device->ldev->md.uuid,
					       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
				*rule_nr = 34;
			} else {
				drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
				*rule_nr = 36;
			}

			return 1;
		}

		/* Only the peer still has a bitmap UUID. */
		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {

			if (connection->agreed_pro_version < 91)
				return -1091;

			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
			    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
				drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");

				/* rotate the peer's bitmap UUID into its history
				 * (only in our cached copy of the peer's UUIDs) */
				device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
				device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
				device->p_uuid[UI_BITMAP] = 0UL;

				drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
				*rule_nr = 35;
			} else {
				drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
				*rule_nr = 37;
			}

			return -1;
		}

		/* Common power [off|failure] */
		rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
			(device->p_uuid[UI_FLAGS] & 2);
		/* lowest bit is set when we were primary,
		 * next bit (weight 2) is set when peer was primary */
		*rule_nr = 40;

		/* Neither has the "crashed primary" flag set,
		 * only a replication link hiccup. */
		if (rct == 0)
			return 0;

		/* Current UUID equal and no bitmap uuid; does not necessarily
		 * mean this was a "simultaneous hard crash", maybe IO was
		 * frozen, so no UUID-bump happened.
		 * This is a protocol change, overload DRBD_FF_WSAME as flag
		 * for "new-enough" peer DRBD version. */
		if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
			*rule_nr = 41;
			if (!(connection->agreed_features & DRBD_FF_WSAME)) {
				drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
				/* encodes required protocol version (low byte)
				 * and feature flag (next byte) for the caller */
				return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
			}
			if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
				/* At least one has the "crashed primary" bit set,
				 * both are primary now, but neither has rotated its UUIDs?
				 * "Can not happen." */
				drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
				return -100;
			}
			/* exactly one side is primary now: it becomes the source */
			if (device->state.role == R_PRIMARY)
				return 1;
			return -1;
		}

		/* Both are secondary.
		 * Really looks like recovery from simultaneous hard crash.
		 * Check which had been primary before, and arbitrate. */
		switch (rct) {
		case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
		case 1: /*  self_pri && !peer_pri */ return 1;
		case 2: /* !self_pri &&  peer_pri */ return -1;
		case 3: /*  self_pri &&  peer_pri */
			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
			return dc ? -1 : 1;
		}
	}

	/* Rule 50: our current UUID equals the peer's bitmap UUID. */
	*rule_nr = 50;
	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
	if (self == peer)
		return -1;

	/* Rule 51: our current UUID equals the peer's youngest history UUID. */
	*rule_nr = 51;
	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
	if (self == peer) {
		if (connection->agreed_pro_version < 96 ?
		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
			/* The last P_SYNC_UUID did not get through. Undo the last start of
			   resync as sync source modifications of the peer's UUIDs. */

			if (connection->agreed_pro_version < 91)
				return -1091;

			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
			device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];

			drbd_info(device, "Lost last syncUUID packet, corrected:\n");
			drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);

			return -1;
		}
	}

	/* Rule 60: our current UUID appears somewhere in the peer's history. */
	*rule_nr = 60;
	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		peer = device->p_uuid[i] & ~((u64)1);
		if (self == peer)
			return -2;
	}

	/* Rule 70: our bitmap UUID equals the peer's current UUID. */
	*rule_nr = 70;
	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
	if (self == peer)
		return 1;

	/* Rule 71: our youngest history UUID equals the peer's current UUID
	 * (mirror image of rule 51). */
	*rule_nr = 71;
	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
	if (self == peer) {
		if (connection->agreed_pro_version < 96 ?
		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
			/* The last P_SYNC_UUID did not get through. Undo the last start of
			   resync as sync source modifications of our UUIDs. */

			if (connection->agreed_pro_version < 91)
				return -1091;

			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
			__drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);

			drbd_info(device, "Last syncUUID did not get through, corrected:\n");
			drbd_uuid_dump(device, "self", device->ldev->md.uuid,
				       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);

			return 1;
		}
	}


	/* Rule 80: the peer's current UUID appears somewhere in our history. */
	*rule_nr = 80;
	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		self = device->ldev->md.uuid[i] & ~((u64)1);
		if (self == peer)
			return 2;
	}

	/* Rule 90: both bitmap UUIDs are equal and set. */
	*rule_nr = 90;
	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
	if (self == peer && self != ((u64)0))
		return 100;

	/* Rule 100: any of our history UUIDs matches any of the peer's. */
	*rule_nr = 100;
	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
		self = device->ldev->md.uuid[i] & ~((u64)1);
		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
			peer = device->p_uuid[j] & ~((u64)1);
			if (self == peer)
				return -100;
		}
	}

	/* No relation between the two UUID sets found. */
	return -1000;
}
3443
3444/* drbd_sync_handshake() returns the new conn state on success, or
3445   CONN_MASK (-1) on failure.
3446 */
3447static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3448					   enum drbd_role peer_role,
3449					   enum drbd_disk_state peer_disk) __must_hold(local)
3450{
3451	struct drbd_device *device = peer_device->device;
3452	enum drbd_conns rv = C_MASK;
3453	enum drbd_disk_state mydisk;
3454	struct net_conf *nc;
3455	int hg, rule_nr, rr_conflict, tentative, always_asbp;
3456
3457	mydisk = device->state.disk;
3458	if (mydisk == D_NEGOTIATING)
3459		mydisk = device->new_state_tmp.disk;
3460
3461	drbd_info(device, "drbd_sync_handshake:\n");
3462
3463	spin_lock_irq(&device->ldev->md.uuid_lock);
3464	drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3465	drbd_uuid_dump(device, "peer", device->p_uuid,
3466		       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3467
3468	hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3469	spin_unlock_irq(&device->ldev->md.uuid_lock);
3470
3471	drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
3472
3473	if (hg == -1000) {
3474		drbd_alert(device, "Unrelated data, aborting!\n");
3475		return C_MASK;
3476	}
3477	if (hg < -0x10000) {
3478		int proto, fflags;
3479		hg = -hg;
3480		proto = hg & 0xff;
3481		fflags = (hg >> 8) & 0xff;
3482		drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3483					proto, fflags);
3484		return C_MASK;
3485	}
3486	if (hg < -1000) {
3487		drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3488		return C_MASK;
3489	}
3490
3491	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3492	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3493		int f = (hg == -100) || abs(hg) == 2;
3494		hg = mydisk > D_INCONSISTENT ? 1 : -1;
3495		if (f)
3496			hg = hg*2;
3497		drbd_info(device, "Becoming sync %s due to disk states.\n",
3498		     hg > 0 ? "source" : "target");
3499	}
3500
3501	if (abs(hg) == 100)
3502		drbd_khelper(device, "initial-split-brain");
3503
3504	rcu_read_lock();
3505	nc = rcu_dereference(peer_device->connection->net_conf);
3506	always_asbp = nc->always_asbp;
3507	rr_conflict = nc->rr_conflict;
3508	tentative = nc->tentative;
3509	rcu_read_unlock();
3510
3511	if (hg == 100 || (hg == -100 && always_asbp)) {
3512		int pcount = (device->state.role == R_PRIMARY)
3513			   + (peer_role == R_PRIMARY);
3514		int forced = (hg == -100);
3515
3516		switch (pcount) {
3517		case 0:
3518			hg = drbd_asb_recover_0p(peer_device);
3519			break;
3520		case 1:
3521			hg = drbd_asb_recover_1p(peer_device);
3522			break;
3523		case 2:
3524			hg = drbd_asb_recover_2p(peer_device);
3525			break;
3526		}
3527		if (abs(hg) < 100) {
3528			drbd_warn(device, "Split-Brain detected, %d primaries, "
3529			     "automatically solved. Sync from %s node\n",
3530			     pcount, (hg < 0) ? "peer" : "this");
3531			if (forced) {
3532				drbd_warn(device, "Doing a full sync, since"
3533				     " UUIDs where ambiguous.\n");
3534				hg = hg*2;
3535			}
3536		}
3537	}
3538
3539	if (hg == -100) {
3540		if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3541			hg = -1;
3542		if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3543			hg = 1;
3544
3545		if (abs(hg) < 100)
3546			drbd_warn(device, "Split-Brain detected, manually solved. "
3547			     "Sync from %s node\n",
3548			     (hg < 0) ? "peer" : "this");
3549	}
3550
3551	if (hg == -100) {
3552		/* FIXME this log message is not correct if we end up here
3553		 * after an attempted attach on a diskless node.
3554		 * We just refuse to attach -- well, we drop the "connection"
3555		 * to that disk, in a way... */
3556		drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3557		drbd_khelper(device, "split-brain");
3558		return C_MASK;
3559	}
3560
3561	if (hg > 0 && mydisk <= D_INCONSISTENT) {
3562		drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3563		return C_MASK;
3564	}
3565
3566	if (hg < 0 && /* by intention we do not use mydisk here. */
3567	    device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3568		switch (rr_conflict) {
3569		case ASB_CALL_HELPER:
3570			drbd_khelper(device, "pri-lost");
3571			fallthrough;
3572		case ASB_DISCONNECT:
3573			drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3574			return C_MASK;
3575		case ASB_VIOLENTLY:
3576			drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3577			     "assumption\n");
3578		}
3579	}
3580
3581	if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3582		if (hg == 0)
3583			drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3584		else
3585			drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
3586				 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3587				 abs(hg) >= 2 ? "full" : "bit-map based");
3588		return C_MASK;
3589	}
3590
3591	if (abs(hg) >= 2) {
3592		drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3593		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3594					BM_LOCKED_SET_ALLOWED))
3595			return C_MASK;
3596	}
3597
3598	if (hg > 0) { /* become sync source. */
3599		rv = C_WF_BITMAP_S;
3600	} else if (hg < 0) { /* become sync target */
3601		rv = C_WF_BITMAP_T;
3602	} else {
3603		rv = C_CONNECTED;
3604		if (drbd_bm_total_weight(device)) {
3605			drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3606			     drbd_bm_total_weight(device));
3607		}
3608	}
3609
3610	return rv;
3611}
3612
3613static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3614{
3615	/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
3616	if (peer == ASB_DISCARD_REMOTE)
3617		return ASB_DISCARD_LOCAL;
3618
3619	/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
3620	if (peer == ASB_DISCARD_LOCAL)
3621		return ASB_DISCARD_REMOTE;
3622
3623	/* everything else is valid if they are equal on both sides. */
3624	return peer;
3625}
3626
/*
 * receive_protocol  -  handle the peer's protocol settings packet
 * @connection: connection the packet was received on
 * @pi:         packet info; pi->data holds the fixed struct p_protocol part,
 *              pi->size the length of the trailing integrity algorithm name
 *
 * Unless the packet is a P_PROTOCOL_UPDATE, the peer's settings are first
 * checked for compatibility with our own net_conf.  Then the peer's data
 * integrity hash (if any) is allocated and a copy of net_conf carrying the
 * peer's values is installed.
 *
 * Returns 0 on success.  On incompatible settings or allocation failure the
 * connection is asked to go to C_DISCONNECTING and -EIO is returned; a
 * failure to receive the algorithm name returns the receive error (or -EIO
 * if the name is too long) without requesting a state change.
 */
static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
{
	struct p_protocol *p = pi->data;
	enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
	int p_proto, p_discard_my_data, p_two_primaries, cf;
	struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
	char integrity_alg[SHARED_SECRET_MAX] = "";
	struct crypto_shash *peer_integrity_tfm = NULL;
	void *int_dig_in = NULL, *int_dig_vv = NULL;

	/* all fields arrive in big endian byte order */
	p_proto		= be32_to_cpu(p->protocol);
	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
	p_two_primaries = be32_to_cpu(p->two_primaries);
	cf		= be32_to_cpu(p->conn_flags);
	p_discard_my_data = cf & CF_DISCARD_MY_DATA;

	if (connection->agreed_pro_version >= 87) {
		int err;

		/* the payload is the integrity algorithm name; it must fit
		 * our buffer, and we force NUL termination after receiving */
		if (pi->size > sizeof(integrity_alg))
			return -EIO;
		err = drbd_recv_all(connection, integrity_alg, pi->size);
		if (err)
			return err;
		integrity_alg[SHARED_SECRET_MAX - 1] = 0;
	}

	if (pi->cmd != P_PROTOCOL_UPDATE) {
		/* CONN_DRY_RUN mirrors the CF_DRY_RUN flag of this packet */
		clear_bit(CONN_DRY_RUN, &connection->flags);

		if (cf & CF_DRY_RUN)
			set_bit(CONN_DRY_RUN, &connection->flags);

		rcu_read_lock();
		nc = rcu_dereference(connection->net_conf);

		if (p_proto != nc->wire_protocol) {
			drbd_err(connection, "incompatible %s settings\n", "protocol");
			goto disconnect_rcu_unlock;
		}

		/* after-sb policies are compared after translating the
		 * peer's value into our point of view */
		if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
			drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
			goto disconnect_rcu_unlock;
		}

		if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
			drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
			goto disconnect_rcu_unlock;
		}

		if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
			drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
			goto disconnect_rcu_unlock;
		}

		/* it is only an error if both sides want to discard their data */
		if (p_discard_my_data && nc->discard_my_data) {
			drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
			goto disconnect_rcu_unlock;
		}

		if (p_two_primaries != nc->two_primaries) {
			drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
			goto disconnect_rcu_unlock;
		}

		if (strcmp(integrity_alg, nc->integrity_alg)) {
			drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
			goto disconnect_rcu_unlock;
		}

		rcu_read_unlock();
	}

	if (integrity_alg[0]) {
		int hash_size;

		/*
		 * We can only change the peer data integrity algorithm
		 * here.  Changing our own data integrity algorithm
		 * requires that we send a P_PROTOCOL_UPDATE packet at
		 * the same time; otherwise, the peer has no way to
		 * tell between which packets the algorithm should
		 * change.
		 */

		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
		if (IS_ERR(peer_integrity_tfm)) {
			/* reset to NULL so the cleanup path does not free an ERR_PTR */
			peer_integrity_tfm = NULL;
			drbd_err(connection, "peer data-integrity-alg %s not supported\n",
				 integrity_alg);
			goto disconnect;
		}

		/* two buffers, each sized for one digest of the peer's algorithm */
		hash_size = crypto_shash_digestsize(peer_integrity_tfm);
		int_dig_in = kmalloc(hash_size, GFP_KERNEL);
		int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
		if (!(int_dig_in && int_dig_vv)) {
			drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
			goto disconnect;
		}
	}

	new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
	if (!new_net_conf)
		goto disconnect;

	/* Publish a copy of the current net_conf that carries the peer's
	 * values, while holding both data.mutex and conf_update. */
	mutex_lock(&connection->data.mutex);
	mutex_lock(&connection->resource->conf_update);
	old_net_conf = connection->net_conf;
	*new_net_conf = *old_net_conf;

	new_net_conf->wire_protocol = p_proto;
	new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
	new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
	new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
	new_net_conf->two_primaries = p_two_primaries;

	rcu_assign_pointer(connection->net_conf, new_net_conf);
	mutex_unlock(&connection->resource->conf_update);
	mutex_unlock(&connection->data.mutex);

	/* replace the previous peer integrity state with the new one
	 * (all three are NULL if the peer uses no integrity algorithm) */
	crypto_free_shash(connection->peer_integrity_tfm);
	kfree(connection->int_dig_in);
	kfree(connection->int_dig_vv);
	connection->peer_integrity_tfm = peer_integrity_tfm;
	connection->int_dig_in = int_dig_in;
	connection->int_dig_vv = int_dig_vv;

	if (strcmp(old_net_conf->integrity_alg, integrity_alg))
		drbd_info(connection, "peer data-integrity-alg: %s\n",
			  integrity_alg[0] ? integrity_alg : "(none)");

	/* RCU readers may still use the old net_conf; free it after a grace period */
	kvfree_rcu(old_net_conf);
	return 0;

disconnect_rcu_unlock:
	rcu_read_unlock();
disconnect:
	/* release whatever was allocated so far, then drop the connection */
	crypto_free_shash(peer_integrity_tfm);
	kfree(int_dig_in);
	kfree(int_dig_vv);
	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
	return -EIO;
}
3774
3775/* helper function
3776 * input: alg name, feature name
3777 * return: NULL (alg name was "")
3778 *         ERR_PTR(error) if something goes wrong
3779 *         or the crypto hash ptr, if it worked out ok. */
3780static struct crypto_shash *drbd_crypto_alloc_digest_safe(
3781		const struct drbd_device *device,
3782		const char *alg, const char *name)
3783{
3784	struct crypto_shash *tfm;
3785
3786	if (!alg[0])
3787		return NULL;
3788
3789	tfm = crypto_alloc_shash(alg, 0, 0);
3790	if (IS_ERR(tfm)) {
3791		drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3792			alg, name, PTR_ERR(tfm));
3793		return tfm;
3794	}
3795	return tfm;
3796}
3797
3798static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3799{
3800	void *buffer = connection->data.rbuf;
3801	int size = pi->size;
3802
3803	while (size) {
3804		int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3805		s = drbd_recv(connection, buffer, s);
3806		if (s <= 0) {
3807			if (s < 0)
3808				return s;
3809			break;
3810		}
3811		size -= s;
3812	}
3813	if (size)
3814		return -EIO;
3815	return 0;
3816}
3817
3818/*
3819 * config_unknown_volume  -  device configuration command for unknown volume
3820 *
3821 * When a device is added to an existing connection, the node on which the
3822 * device is added first will send configuration commands to its peer but the
3823 * peer will not know about the device yet.  It will warn and ignore these
3824 * commands.  Once the device is added on the second node, the second node will
3825 * send the same device configuration commands, but in the other direction.
3826 *
3827 * (We can also end up here if drbd is misconfigured.)
3828 */
3829static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3830{
3831	drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3832		  cmdname(pi->cmd), pi->vnr);
3833	return ignore_remaining_packet(connection, pi);
3834}
3835
3836static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3837{
3838	struct drbd_peer_device *peer_device;
3839	struct drbd_device *device;
3840	struct p_rs_param_95 *p;
3841	unsigned int header_size, data_size, exp_max_sz;
3842	struct crypto_shash *verify_tfm = NULL;
3843	struct crypto_shash *csums_tfm = NULL;
3844	struct net_conf *old_net_conf, *new_net_conf = NULL;
3845	struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3846	const int apv = connection->agreed_pro_version;
3847	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3848	unsigned int fifo_size = 0;
3849	int err;
3850
3851	peer_device = conn_peer_device(connection, pi->vnr);
3852	if (!peer_device)
3853		return config_unknown_volume(connection, pi);
3854	device = peer_device->device;
3855
3856	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3857		    : apv == 88 ? sizeof(struct p_rs_param)
3858					+ SHARED_SECRET_MAX
3859		    : apv <= 94 ? sizeof(struct p_rs_param_89)
3860		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3861
3862	if (pi->size > exp_max_sz) {
3863		drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3864		    pi->size, exp_max_sz);
3865		return -EIO;
3866	}
3867
3868	if (apv <= 88) {
3869		header_size = sizeof(struct p_rs_param);
3870		data_size = pi->size - header_size;
3871	} else if (apv <= 94) {
3872		header_size = sizeof(struct p_rs_param_89);
3873		data_size = pi->size - header_size;
3874		D_ASSERT(device, data_size == 0);
3875	} else {
3876		header_size = sizeof(struct p_rs_param_95);
3877		data_size = pi->size - header_size;
3878		D_ASSERT(device, data_size == 0);
3879	}
3880
3881	/* initialize verify_alg and csums_alg */
3882	p = pi->data;
3883	BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
3884	memset(&p->algs, 0, sizeof(p->algs));
3885
3886	err = drbd_recv_all(peer_device->connection, p, header_size);
3887	if (err)
3888		return err;
3889
3890	mutex_lock(&connection->resource->conf_update);
3891	old_net_conf = peer_device->connection->net_conf;
3892	if (get_ldev(device)) {
3893		new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3894		if (!new_disk_conf) {
3895			put_ldev(device);
3896			mutex_unlock(&connection->resource->conf_update);
3897			drbd_err(device, "Allocation of new disk_conf failed\n");
3898			return -ENOMEM;
3899		}
3900
3901		old_disk_conf = device->ldev->disk_conf;
3902		*new_disk_conf = *old_disk_conf;
3903
3904		new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3905	}
3906
3907	if (apv >= 88) {
3908		if (apv == 88) {
3909			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3910				drbd_err(device, "verify-alg of wrong size, "
3911					"peer wants %u, accepting only up to %u byte\n",
3912					data_size, SHARED_SECRET_MAX);
3913				goto reconnect;
3914			}
3915
3916			err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3917			if (err)
3918				goto reconnect;
3919			/* we expect NUL terminated string */
3920			/* but just in case someone tries to be evil */
3921			D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3922			p->verify_alg[data_size-1] = 0;
3923
3924		} else /* apv >= 89 */ {
3925			/* we still expect NUL terminated strings */
3926			/* but just in case someone tries to be evil */
3927			D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3928			D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3929			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3930			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3931		}
3932
3933		if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3934			if (device->state.conn == C_WF_REPORT_PARAMS) {
3935				drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3936				    old_net_conf->verify_alg, p->verify_alg);
3937				goto disconnect;
3938			}
3939			verify_tfm = drbd_crypto_alloc_digest_safe(device,
3940					p->verify_alg, "verify-alg");
3941			if (IS_ERR(verify_tfm)) {
3942				verify_tfm = NULL;
3943				goto disconnect;
3944			}
3945		}
3946
3947		if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3948			if (device->state.conn == C_WF_REPORT_PARAMS) {
3949				drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3950				    old_net_conf->csums_alg, p->csums_alg);
3951				goto disconnect;
3952			}
3953			csums_tfm = drbd_crypto_alloc_digest_safe(device,
3954					p->csums_alg, "csums-alg");
3955			if (IS_ERR(csums_tfm)) {
3956				csums_tfm = NULL;
3957				goto disconnect;
3958			}
3959		}
3960
3961		if (apv > 94 && new_disk_conf) {
3962			new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3963			new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3964			new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3965			new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3966
3967			fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
3968			if (fifo_size != device->rs_plan_s->size) {
3969				new_plan = fifo_alloc(fifo_size);
3970				if (!new_plan) {
3971					drbd_err(device, "kmalloc of fifo_buffer failed");
3972					put_ldev(device);
3973					goto disconnect;
3974				}
3975			}
3976		}
3977
3978		if (verify_tfm || csums_tfm) {
3979			new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3980			if (!new_net_conf)
3981				goto disconnect;
3982
3983			*new_net_conf = *old_net_conf;
3984
3985			if (verify_tfm) {
3986				strcpy(new_net_conf->verify_alg, p->verify_alg);
3987				new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3988				crypto_free_shash(peer_device->connection->verify_tfm);
3989				peer_device->connection->verify_tfm = verify_tfm;
3990				drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
3991			}
3992			if (csums_tfm) {
3993				strcpy(new_net_conf->csums_alg, p->csums_alg);
3994				new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3995				crypto_free_shash(peer_device->connection->csums_tfm);
3996				peer_device->connection->csums_tfm = csums_tfm;
3997				drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
3998			}
3999			rcu_assign_pointer(connection->net_conf, new_net_conf);
4000		}
4001	}
4002
4003	if (new_disk_conf) {
4004		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4005		put_ldev(device);
4006	}
4007
4008	if (new_plan) {
4009		old_plan = device->rs_plan_s;
4010		rcu_assign_pointer(device->rs_plan_s, new_plan);
4011	}
4012
4013	mutex_unlock(&connection->resource->conf_update);
4014	synchronize_rcu();
4015	if (new_net_conf)
4016		kfree(old_net_conf);
4017	kfree(old_disk_conf);
4018	kfree(old_plan);
4019
4020	return 0;
4021
4022reconnect:
4023	if (new_disk_conf) {
4024		put_ldev(device);
4025		kfree(new_disk_conf);
4026	}
4027	mutex_unlock(&connection->resource->conf_update);
4028	return -EIO;
4029
4030disconnect:
4031	kfree(new_plan);
4032	if (new_disk_conf) {
4033		put_ldev(device);
4034		kfree(new_disk_conf);
4035	}
4036	mutex_unlock(&connection->resource->conf_update);
4037	/* just for completeness: actually not needed,
4038	 * as this is not reached if csums_tfm was ok. */
4039	crypto_free_shash(csums_tfm);
4040	/* but free the verify_tfm again, if csums_tfm did not work out */
4041	crypto_free_shash(verify_tfm);
4042	conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4043	return -EIO;
4044}
4045
4046/* warn if the arguments differ by more than 12.5% */
4047static void warn_if_differ_considerably(struct drbd_device *device,
4048	const char *s, sector_t a, sector_t b)
4049{
4050	sector_t d;
4051	if (a == 0 || b == 0)
4052		return;
4053	d = (a > b) ? (a - b) : (b - a);
4054	if (d > (a>>3) || d > (b>>3))
4055		drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
4056		     (unsigned long long)a, (unsigned long long)b);
4057}
4058
4059static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
4060{
4061	struct drbd_peer_device *peer_device;
4062	struct drbd_device *device;
4063	struct p_sizes *p = pi->data;
4064	struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
4065	enum determine_dev_size dd = DS_UNCHANGED;
4066	sector_t p_size, p_usize, p_csize, my_usize;
4067	sector_t new_size, cur_size;
4068	int ldsc = 0; /* local disk size changed */
4069	enum dds_flags ddsf;
4070
4071	peer_device = conn_peer_device(connection, pi->vnr);
4072	if (!peer_device)
4073		return config_unknown_volume(connection, pi);
4074	device = peer_device->device;
4075	cur_size = get_capacity(device->vdisk);
4076
4077	p_size = be64_to_cpu(p->d_size);
4078	p_usize = be64_to_cpu(p->u_size);
4079	p_csize = be64_to_cpu(p->c_size);
4080
4081	/* just store the peer's disk size for now.
4082	 * we still need to figure out whether we accept that. */
4083	device->p_size = p_size;
4084
4085	if (get_ldev(device)) {
4086		rcu_read_lock();
4087		my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
4088		rcu_read_unlock();
4089
4090		warn_if_differ_considerably(device, "lower level device sizes",
4091			   p_size, drbd_get_max_capacity(device->ldev));
4092		warn_if_differ_considerably(device, "user requested size",
4093					    p_usize, my_usize);
4094
4095		/* if this is the first connect, or an otherwise expected
4096		 * param exchange, choose the minimum */
4097		if (device->state.conn == C_WF_REPORT_PARAMS)
4098			p_usize = min_not_zero(my_usize, p_usize);
4099
4100		/* Never shrink a device with usable data during connect,
4101		 * or "attach" on the peer.
4102		 * But allow online shrinking if we are connected. */
4103		new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
4104		if (new_size < cur_size &&
4105		    device->state.disk >= D_OUTDATED &&
4106		    (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
4107			drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4108					(unsigned long long)new_size, (unsigned long long)cur_size);
4109			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4110			put_ldev(device);
4111			return -EIO;
4112		}
4113
4114		if (my_usize != p_usize) {
4115			struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
4116
4117			new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
4118			if (!new_disk_conf) {
4119				put_ldev(device);
4120				return -ENOMEM;
4121			}
4122
4123			mutex_lock(&connection->resource->conf_update);
4124			old_disk_conf = device->ldev->disk_conf;
4125			*new_disk_conf = *old_disk_conf;
4126			new_disk_conf->disk_size = p_usize;
4127
4128			rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4129			mutex_unlock(&connection->resource->conf_update);
4130			kvfree_rcu(old_disk_conf);
4131
4132			drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
4133				 (unsigned long)p_usize, (unsigned long)my_usize);
4134		}
4135
4136		put_ldev(device);
4137	}
4138
4139	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
4140	/* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
4141	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
4142	   drbd_reconsider_queue_parameters(), we can be sure that after
4143	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4144
4145	ddsf = be16_to_cpu(p->dds_flags);
4146	if (get_ldev(device)) {
4147		drbd_reconsider_queue_parameters(device, device->ldev, o);
4148		dd = drbd_determine_dev_size(device, ddsf, NULL);
4149		put_ldev(device);
4150		if (dd == DS_ERROR)
4151			return -EIO;
4152		drbd_md_sync(device);
4153	} else {
4154		/*
4155		 * I am diskless, need to accept the peer's *current* size.
4156		 * I must NOT accept the peers backing disk size,
4157		 * it may have been larger than mine all along...
4158		 *
4159		 * At this point, the peer knows more about my disk, or at
4160		 * least about what we last agreed upon, than myself.
4161		 * So if his c_size is less than his d_size, the most likely
4162		 * reason is that *my* d_size was smaller last time we checked.
4163		 *
4164		 * However, if he sends a zero current size,
4165		 * take his (user-capped or) backing disk size anyways.
4166		 *
4167		 * Unless of course he does not have a disk himself.
4168		 * In which case we ignore this completely.
4169		 */
4170		sector_t new_size = p_csize ?: p_usize ?: p_size;
4171		drbd_reconsider_queue_parameters(device, NULL, o);
4172		if (new_size == 0) {
4173			/* Ignore, peer does not know nothing. */
4174		} else if (new_size == cur_size) {
4175			/* nothing to do */
4176		} else if (cur_size != 0 && p_size == 0) {
4177			drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
4178					(unsigned long long)new_size, (unsigned long long)cur_size);
4179		} else if (new_size < cur_size && device->state.role == R_PRIMARY) {
4180			drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
4181					(unsigned long long)new_size, (unsigned long long)cur_size);
4182			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4183			return -EIO;
4184		} else {
4185			/* I believe the peer, if
4186			 *  - I don't have a current size myself
4187			 *  - we agree on the size anyways
4188			 *  - I do have a current size, am Secondary,
4189			 *    and he has the only disk
4190			 *  - I do have a current size, am Primary,
4191			 *    and he has the only disk,
4192			 *    which is larger than my current size
4193			 */
4194			drbd_set_my_capacity(device, new_size);
4195		}
4196	}
4197
4198	if (get_ldev(device)) {
4199		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4200			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
4201			ldsc = 1;
4202		}
4203
4204		put_ldev(device);
4205	}
4206
4207	if (device->state.conn > C_WF_REPORT_PARAMS) {
4208		if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
4209		    ldsc) {
4210			/* we have different sizes, probably peer
4211			 * needs to know my new size... */
4212			drbd_send_sizes(peer_device, 0, ddsf);
4213		}
4214		if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4215		    (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4216			if (device->state.pdsk >= D_INCONSISTENT &&
4217			    device->state.disk >= D_INCONSISTENT) {
4218				if (ddsf & DDSF_NO_RESYNC)
4219					drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
4220				else
4221					resync_after_online_grow(device);
4222			} else
4223				set_bit(RESYNC_AFTER_NEG, &device->flags);
4224		}
4225	}
4226
4227	return 0;
4228}
4229
4230static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
4231{
4232	struct drbd_peer_device *peer_device;
4233	struct drbd_device *device;
4234	struct p_uuids *p = pi->data;
4235	u64 *p_uuid;
4236	int i, updated_uuids = 0;
4237
4238	peer_device = conn_peer_device(connection, pi->vnr);
4239	if (!peer_device)
4240		return config_unknown_volume(connection, pi);
4241	device = peer_device->device;
4242
4243	p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
4244	if (!p_uuid)
4245		return false;
4246
4247	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4248		p_uuid[i] = be64_to_cpu(p->uuid[i]);
4249
4250	kfree(device->p_uuid);
4251	device->p_uuid = p_uuid;
4252
4253	if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
4254	    device->state.disk < D_INCONSISTENT &&
4255	    device->state.role == R_PRIMARY &&
4256	    (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
4257		drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
4258		    (unsigned long long)device->ed_uuid);
4259		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4260		return -EIO;
4261	}
4262
4263	if (get_ldev(device)) {
4264		int skip_initial_sync =
4265			device->state.conn == C_CONNECTED &&
4266			peer_device->connection->agreed_pro_version >= 90 &&
4267			device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
4268			(p_uuid[UI_FLAGS] & 8);
4269		if (skip_initial_sync) {
4270			drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
4271			drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
4272					"clear_n_write from receive_uuids",
4273					BM_LOCKED_TEST_ALLOWED);
4274			_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4275			_drbd_uuid_set(device, UI_BITMAP, 0);
4276			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
4277					CS_VERBOSE, NULL);
4278			drbd_md_sync(device);
4279			updated_uuids = 1;
4280		}
4281		put_ldev(device);
4282	} else if (device->state.disk < D_INCONSISTENT &&
4283		   device->state.role == R_PRIMARY) {
4284		/* I am a diskless primary, the peer just created a new current UUID
4285		   for me. */
4286		updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4287	}
4288
4289	/* Before we test for the disk state, we should wait until an eventually
4290	   ongoing cluster wide state change is finished. That is important if
4291	   we are primary and are detaching from our disk. We need to see the
4292	   new disk state... */
4293	mutex_lock(device->state_mutex);
4294	mutex_unlock(device->state_mutex);
4295	if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4296		updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4297
4298	if (updated_uuids)
4299		drbd_print_uuids(device, "receiver updated UUIDs to");
4300
4301	return 0;
4302}
4303
4304/**
4305 * convert_state() - Converts the peer's view of the cluster state to our point of view
4306 * @ps:		The state as seen by the peer.
4307 */
4308static union drbd_state convert_state(union drbd_state ps)
4309{
4310	union drbd_state ms;
4311
4312	static enum drbd_conns c_tab[] = {
4313		[C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
4314		[C_CONNECTED] = C_CONNECTED,
4315
4316		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4317		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4318		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4319		[C_VERIFY_S]       = C_VERIFY_T,
4320		[C_MASK]   = C_MASK,
4321	};
4322
4323	ms.i = ps.i;
4324
4325	ms.conn = c_tab[ps.conn];
4326	ms.peer = ps.role;
4327	ms.role = ps.peer;
4328	ms.pdsk = ps.disk;
4329	ms.disk = ps.pdsk;
4330	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4331
4332	return ms;
4333}
4334
4335static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
4336{
4337	struct drbd_peer_device *peer_device;
4338	struct drbd_device *device;
4339	struct p_req_state *p = pi->data;
4340	union drbd_state mask, val;
4341	enum drbd_state_rv rv;
4342
4343	peer_device = conn_peer_device(connection, pi->vnr);
4344	if (!peer_device)
4345		return -EIO;
4346	device = peer_device->device;
4347
4348	mask.i = be32_to_cpu(p->mask);
4349	val.i = be32_to_cpu(p->val);
4350
4351	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
4352	    mutex_is_locked(device->state_mutex)) {
4353		drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
4354		return 0;
4355	}
4356
4357	mask = convert_state(mask);
4358	val = convert_state(val);
4359
4360	rv = drbd_change_state(device, CS_VERBOSE, mask, val);
4361	drbd_send_sr_reply(peer_device, rv);
4362
4363	drbd_md_sync(device);
4364
4365	return 0;
4366}
4367
4368static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
4369{
4370	struct p_req_state *p = pi->data;
4371	union drbd_state mask, val;
4372	enum drbd_state_rv rv;
4373
4374	mask.i = be32_to_cpu(p->mask);
4375	val.i = be32_to_cpu(p->val);
4376
4377	if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4378	    mutex_is_locked(&connection->cstate_mutex)) {
4379		conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
4380		return 0;
4381	}
4382
4383	mask = convert_state(mask);
4384	val = convert_state(val);
4385
4386	rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4387	conn_send_sr_reply(connection, rv);
4388
4389	return 0;
4390}
4391
/*
 * receive_state  -  process the state packet sent by the peer
 *
 * Decodes the peer's state word and derives our new view of the peer (role,
 * disk state, sync-pause flags) and, where applicable, a new connection
 * state, which is then applied with _drbd_set_state().  The local state is
 * sampled under req_lock; if it changed by the time the new state is about
 * to be applied, the whole evaluation is redone (see the retry label).
 *
 * Returns 0 on success, -ECONNRESET if the local connection state is already
 * C_TEAR_DOWN or below, and -EIO if the handshake failed or the new state
 * was rejected.
 */
static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_state *p = pi->data;
	union drbd_state os, ns, peer_state;
	enum drbd_disk_state real_peer_disk;
	enum chg_state_flags cs_flags;
	int rv;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return config_unknown_volume(connection, pi);
	device = peer_device->device;

	peer_state.i = be32_to_cpu(p->state);

	real_peer_disk = peer_state.disk;
	if (peer_state.disk == D_NEGOTIATING) {
		/* NOTE(review): device->p_uuid is dereferenced here without a
		 * NULL check, while the handshake branch further down tests
		 * device->p_uuid first - confirm that the peer's UUIDs are
		 * guaranteed to be stored before a D_NEGOTIATING state can
		 * arrive. */
		real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
		drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
	}

	spin_lock_irq(&device->resource->req_lock);
	/* the "goto retry" below jumps here with req_lock held */
 retry:
	os = ns = drbd_read_state(device);
	spin_unlock_irq(&device->resource->req_lock);

	/* If some other part of the code (ack_receiver thread, timeout)
	 * already decided to close the connection again,
	 * we must not "re-establish" it here. */
	if (os.conn <= C_TEAR_DOWN)
		return -ECONNRESET;

	/* If this is the "end of sync" confirmation, usually the peer disk
	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
	 * set) resync started in PausedSyncT, or if the timing of pause-/
	 * unpause-sync events has been "just right", the peer disk may
	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
	 */
	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
	    real_peer_disk == D_UP_TO_DATE &&
	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
		/* If we are (becoming) SyncSource, but peer is still in sync
		 * preparation, ignore its uptodate-ness to avoid flapping, it
		 * will change to inconsistent once the peer reaches active
		 * syncing states.
		 * It may have changed syncer-paused flags, however, so we
		 * cannot ignore this completely. */
		if (peer_state.conn > C_CONNECTED &&
		    peer_state.conn < C_SYNC_SOURCE)
			real_peer_disk = D_INCONSISTENT;

		/* if peer_state changes to connected at the same time,
		 * it explicitly notifies us that it finished resync.
		 * Maybe we should finish it up, too? */
		else if (os.conn >= C_SYNC_SOURCE &&
			 peer_state.conn == C_CONNECTED) {
			if (drbd_bm_total_weight(device) <= device->rs_failed)
				drbd_resync_finished(device);
			return 0;
		}
	}

	/* explicit verify finished notification, stop sector reached. */
	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
		ov_out_of_sync_print(device);
		drbd_resync_finished(device);
		return 0;
	}

	/* peer says his disk is inconsistent, while we think it is uptodate,
	 * and this happens while the peer still thinks we have a sync going on,
	 * but we think we are already done with the sync.
	 * We ignore this to avoid flapping pdsk.
	 * This should not happen, if the peer is a recent version of drbd. */
	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
		real_peer_disk = D_UP_TO_DATE;

	if (ns.conn == C_WF_REPORT_PARAMS)
		ns.conn = C_CONNECTED;

	if (peer_state.conn == C_AHEAD)
		ns.conn = C_BEHIND;

	/* TODO:
	 * if (primary and diskless and peer uuid != effective uuid)
	 *     abort attach on peer;
	 *
	 * If this node does not have good data, was already connected, but
	 * the peer did a late attach only now, trying to "negotiate" with me,
	 * AND I am currently Primary, possibly frozen, with some specific
	 * "effective" uuid, this should never be reached, really, because
	 * we first send the uuids, then the current state.
	 *
	 * In this scenario, we already dropped the connection hard
	 * when we received the unsuitable uuids (receive_uuids().
	 *
	 * Should we want to change this, that is: not drop the connection in
	 * receive_uuids() already, then we would need to add a branch here
	 * that aborts the attach of "unsuitable uuids" on the peer in case
	 * this node is currently Diskless Primary.
	 */

	if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
	    get_ldev_if_state(device, D_NEGOTIATING)) {
		int cr; /* consider resync */

		/* if we established a new connection */
		cr  = (os.conn < C_CONNECTED);
		/* if we had an established connection
		 * and one of the nodes newly attaches a disk */
		cr |= (os.conn == C_CONNECTED &&
		       (peer_state.disk == D_NEGOTIATING ||
			os.disk == D_NEGOTIATING));
		/* if we have both been inconsistent, and the peer has been
		 * forced to be UpToDate with --force */
		cr |= test_bit(CONSIDER_RESYNC, &device->flags);
		/* if we had been plain connected, and the admin requested to
		 * start a sync by "invalidate" or "invalidate-remote" */
		cr |= (os.conn == C_CONNECTED &&
				(peer_state.conn >= C_STARTING_SYNC_S &&
				 peer_state.conn <= C_WF_BITMAP_T));

		if (cr)
			ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);

		put_ldev(device);
		/* ns.conn == C_MASK is handled as "no usable sync decision" */
		if (ns.conn == C_MASK) {
			ns.conn = C_CONNECTED;
			if (device->state.disk == D_NEGOTIATING) {
				drbd_force_state(device, NS(disk, D_FAILED));
			} else if (peer_state.disk == D_NEGOTIATING) {
				drbd_err(device, "Disk attach process on the peer node was aborted.\n");
				peer_state.disk = D_DISKLESS;
				real_peer_disk = D_DISKLESS;
			} else {
				if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
					return -EIO;
				D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
				conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
				return -EIO;
			}
		}
	}

	spin_lock_irq(&device->resource->req_lock);
	/* the state changed while we were not holding the lock: start over */
	if (os.i != drbd_read_state(device).i)
		goto retry;
	clear_bit(CONSIDER_RESYNC, &device->flags);
	ns.peer = peer_state.role;
	ns.pdsk = real_peer_disk;
	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
		ns.disk = device->new_state_tmp.disk;
	/* CS_HARD is added unless this change establishes the connection */
	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
	if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
	    test_bit(NEW_CUR_UUID, &device->flags)) {
		/* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
		   for temporal network outages! */
		spin_unlock_irq(&device->resource->req_lock);
		drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
		tl_clear(peer_device->connection);
		drbd_uuid_new_current(device);
		clear_bit(NEW_CUR_UUID, &device->flags);
		conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
		return -EIO;
	}
	rv = _drbd_set_state(device, ns, cs_flags, NULL);
	/* re-read: the state that is now in effect */
	ns = drbd_read_state(device);
	spin_unlock_irq(&device->resource->req_lock);

	if (rv < SS_SUCCESS) {
		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
		return -EIO;
	}

	if (os.conn > C_WF_REPORT_PARAMS) {
		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
		    peer_state.disk != D_NEGOTIATING ) {
			/* we want resync, peer has not yet decided to sync... */
			/* Nowadays only used when forcing a node into primary role and
			   setting its disk to UpToDate with that */
			drbd_send_uuids(peer_device);
			drbd_send_current_state(peer_device);
		}
	}

	clear_bit(DISCARD_MY_DATA, &device->flags);

	drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */

	return 0;
}
4588
4589static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4590{
4591	struct drbd_peer_device *peer_device;
4592	struct drbd_device *device;
4593	struct p_rs_uuid *p = pi->data;
4594
4595	peer_device = conn_peer_device(connection, pi->vnr);
4596	if (!peer_device)
4597		return -EIO;
4598	device = peer_device->device;
4599
4600	wait_event(device->misc_wait,
4601		   device->state.conn == C_WF_SYNC_UUID ||
4602		   device->state.conn == C_BEHIND ||
4603		   device->state.conn < C_CONNECTED ||
4604		   device->state.disk < D_NEGOTIATING);
4605
4606	/* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4607
4608	/* Here the _drbd_uuid_ functions are right, current should
4609	   _not_ be rotated into the history */
4610	if (get_ldev_if_state(device, D_NEGOTIATING)) {
4611		_drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4612		_drbd_uuid_set(device, UI_BITMAP, 0UL);
4613
4614		drbd_print_uuids(device, "updated sync uuid");
4615		drbd_start_resync(device, C_SYNC_TARGET);
4616
4617		put_ldev(device);
4618	} else
4619		drbd_err(device, "Ignoring SyncUUID packet!\n");
4620
4621	return 0;
4622}
4623
4624/*
4625 * receive_bitmap_plain
4626 *
4627 * Return 0 when done, 1 when another iteration is needed, and a negative error
4628 * code upon failure.
4629 */
4630static int
4631receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4632		     unsigned long *p, struct bm_xfer_ctx *c)
4633{
4634	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4635				 drbd_header_size(peer_device->connection);
4636	unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4637				       c->bm_words - c->word_offset);
4638	unsigned int want = num_words * sizeof(*p);
4639	int err;
4640
4641	if (want != size) {
4642		drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4643		return -EIO;
4644	}
4645	if (want == 0)
4646		return 0;
4647	err = drbd_recv_all(peer_device->connection, p, want);
4648	if (err)
4649		return err;
4650
4651	drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4652
4653	c->word_offset += num_words;
4654	c->bit_offset = c->word_offset * BITS_PER_LONG;
4655	if (c->bit_offset > c->bm_bits)
4656		c->bit_offset = c->bm_bits;
4657
4658	return 1;
4659}
4660
4661static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4662{
4663	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4664}
4665
4666static int dcbp_get_start(struct p_compressed_bm *p)
4667{
4668	return (p->encoding & 0x80) != 0;
4669}
4670
4671static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4672{
4673	return (p->encoding >> 4) & 0x7;
4674}
4675
/*
 * recv_bm_rle_bits
 *
 * Decode a run-length encoded chunk of the bitmap.  The payload p->code
 * (@len bytes) is read as a bit stream of variable length integers, each
 * one being the length of a run of bits.  Runs alternate between "leave
 * alone" and "set"; dcbp_get_start() tells whether the first run is one to
 * be set.  Decoding starts at c->bit_offset, and c is advanced to the
 * position reached.
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
 * code upon failure.
 */
static int
recv_bm_rle_bits(struct drbd_peer_device *peer_device,
		struct p_compressed_bm *p,
		 struct bm_xfer_ctx *c,
		 unsigned int len)
{
	struct bitstream bs;
	u64 look_ahead;	/* bits fetched from the stream, not yet decoded */
	u64 rl;		/* length of the run just decoded */
	u64 tmp;
	unsigned long s = c->bit_offset;	/* first bit of the current run */
	unsigned long e;			/* last bit of the current run */
	int toggle = dcbp_get_start(p);	/* non-zero: current run gets set */
	int have;	/* number of valid bits in look_ahead */
	int bits;

	bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));

	bits = bitstream_get_bits(&bs, &look_ahead, 64);
	if (bits < 0)
		return -EIO;

	/* one run per iteration, until the look ahead buffer is drained */
	for (have = bits; have > 0; s += rl, toggle = !toggle) {
		bits = vli_decode_bits(&rl, look_ahead);
		if (bits <= 0)
			return -EIO;

		if (toggle) {
			/* NOTE(review): s + rl - 1 is computed without an
			 * overflow check; confirm that vli_decode_bits()
			 * cannot yield a run length that wraps e around. */
			e = s + rl -1;
			if (e >= c->bm_bits) {
				drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
				return -EIO;
			}
			_drbd_bm_set_bits(peer_device->device, s, e);
		}

		/* the code word claims more bits than we had available */
		if (have < bits) {
			drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
				have, bits, look_ahead,
				(unsigned int)(bs.cur.b - p->code),
				(unsigned int)bs.buf_len);
			return -EIO;
		}
		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
		if (likely(bits < 64))
			look_ahead >>= bits;
		else
			look_ahead = 0;
		have -= bits;

		/* top up the look ahead buffer to 64 bits again, if possible */
		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
		if (bits < 0)
			return -EIO;
		look_ahead |= tmp << have;
		have += bits;
	}

	c->bit_offset = s;
	bm_xfer_ctx_bit_to_word_offset(c);

	/* more to come unless we reached the end of the bitmap exactly */
	return (s != c->bm_bits);
}
4744
4745/*
4746 * decode_bitmap_c
4747 *
4748 * Return 0 when done, 1 when another iteration is needed, and a negative error
4749 * code upon failure.
4750 */
4751static int
4752decode_bitmap_c(struct drbd_peer_device *peer_device,
4753		struct p_compressed_bm *p,
4754		struct bm_xfer_ctx *c,
4755		unsigned int len)
4756{
4757	if (dcbp_get_code(p) == RLE_VLI_Bits)
4758		return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4759
4760	/* other variants had been implemented for evaluation,
4761	 * but have been dropped as this one turned out to be "best"
4762	 * during all our tests. */
4763
4764	drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4765	conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4766	return -EIO;
4767}
4768
4769void INFO_bm_xfer_stats(struct drbd_device *device,
4770		const char *direction, struct bm_xfer_ctx *c)
4771{
4772	/* what would it take to transfer it "plaintext" */
4773	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4774	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4775	unsigned int plain =
4776		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4777		c->bm_words * sizeof(unsigned long);
4778	unsigned int total = c->bytes[0] + c->bytes[1];
4779	unsigned int r;
4780
4781	/* total can not be zero. but just in case: */
4782	if (total == 0)
4783		return;
4784
4785	/* don't report if not compressed */
4786	if (total >= plain)
4787		return;
4788
4789	/* total < plain. check for overflow, still */
4790	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4791		                    : (1000 * total / plain);
4792
4793	if (r > 1000)
4794		r = 1000;
4795
4796	r = 1000 - r;
4797	drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4798	     "total %u; compression: %u.%u%%\n",
4799			direction,
4800			c->bytes[1], c->packets[1],
4801			c->bytes[0], c->packets[0],
4802			total, r/10, r % 10);
4803}
4804
4805/* Since we are processing the bitfield from lower addresses to higher,
4806   it does not matter if the process it in 32 bit chunks or 64 bit
4807   chunks as long as it is little endian. (Understand it as byte stream,
4808   beginning with the lowest byte...) If we would use big endian
4809   we would need to process it from the highest address to the lowest,
4810   in order to be agnostic to the 32 vs 64 bits issue.
4811
4812   returns 0 on failure, 1 if we successfully received it. */
4813static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4814{
4815	struct drbd_peer_device *peer_device;
4816	struct drbd_device *device;
4817	struct bm_xfer_ctx c;
4818	int err;
4819
4820	peer_device = conn_peer_device(connection, pi->vnr);
4821	if (!peer_device)
4822		return -EIO;
4823	device = peer_device->device;
4824
4825	drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4826	/* you are supposed to send additional out-of-sync information
4827	 * if you actually set bits during this phase */
4828
4829	c = (struct bm_xfer_ctx) {
4830		.bm_bits = drbd_bm_bits(device),
4831		.bm_words = drbd_bm_words(device),
4832	};
4833
4834	for(;;) {
4835		if (pi->cmd == P_BITMAP)
4836			err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4837		else if (pi->cmd == P_COMPRESSED_BITMAP) {
4838			/* MAYBE: sanity check that we speak proto >= 90,
4839			 * and the feature is enabled! */
4840			struct p_compressed_bm *p = pi->data;
4841
4842			if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4843				drbd_err(device, "ReportCBitmap packet too large\n");
4844				err = -EIO;
4845				goto out;
4846			}
4847			if (pi->size <= sizeof(*p)) {
4848				drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4849				err = -EIO;
4850				goto out;
4851			}
4852			err = drbd_recv_all(peer_device->connection, p, pi->size);
4853			if (err)
4854			       goto out;
4855			err = decode_bitmap_c(peer_device, p, &c, pi->size);
4856		} else {
4857			drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
4858			err = -EIO;
4859			goto out;
4860		}
4861
4862		c.packets[pi->cmd == P_BITMAP]++;
4863		c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4864
4865		if (err <= 0) {
4866			if (err < 0)
4867				goto out;
4868			break;
4869		}
4870		err = drbd_recv_header(peer_device->connection, pi);
4871		if (err)
4872			goto out;
4873	}
4874
4875	INFO_bm_xfer_stats(device, "receive", &c);
4876
4877	if (device->state.conn == C_WF_BITMAP_T) {
4878		enum drbd_state_rv rv;
4879
4880		err = drbd_send_bitmap(device);
4881		if (err)
4882			goto out;
4883		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4884		rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4885		D_ASSERT(device, rv == SS_SUCCESS);
4886	} else if (device->state.conn != C_WF_BITMAP_S) {
4887		/* admin may have requested C_DISCONNECTING,
4888		 * other threads may have noticed network errors */
4889		drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4890		    drbd_conn_str(device->state.conn));
4891	}
4892	err = 0;
4893
4894 out:
4895	drbd_bm_unlock(device);
4896	if (!err && device->state.conn == C_WF_BITMAP_S)
4897		drbd_start_resync(device, C_SYNC_SOURCE);
4898	return err;
4899}
4900
4901static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4902{
4903	drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4904		 pi->cmd, pi->size);
4905
4906	return ignore_remaining_packet(connection, pi);
4907}
4908
4909static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
4910{
4911	/* Make sure we've acked all the TCP data associated
4912	 * with the data requests being unplugged */
4913	tcp_sock_set_quickack(connection->data.socket->sk, 2);
4914	return 0;
4915}
4916
4917static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4918{
4919	struct drbd_peer_device *peer_device;
4920	struct drbd_device *device;
4921	struct p_block_desc *p = pi->data;
4922
4923	peer_device = conn_peer_device(connection, pi->vnr);
4924	if (!peer_device)
4925		return -EIO;
4926	device = peer_device->device;
4927
4928	switch (device->state.conn) {
4929	case C_WF_SYNC_UUID:
4930	case C_WF_BITMAP_T:
4931	case C_BEHIND:
4932			break;
4933	default:
4934		drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4935				drbd_conn_str(device->state.conn));
4936	}
4937
4938	drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4939
4940	return 0;
4941}
4942
4943static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4944{
4945	struct drbd_peer_device *peer_device;
4946	struct p_block_desc *p = pi->data;
4947	struct drbd_device *device;
4948	sector_t sector;
4949	int size, err = 0;
4950
4951	peer_device = conn_peer_device(connection, pi->vnr);
4952	if (!peer_device)
4953		return -EIO;
4954	device = peer_device->device;
4955
4956	sector = be64_to_cpu(p->sector);
4957	size = be32_to_cpu(p->blksize);
4958
4959	dec_rs_pending(device);
4960
4961	if (get_ldev(device)) {
4962		struct drbd_peer_request *peer_req;
4963
4964		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4965					       size, 0, GFP_NOIO);
4966		if (!peer_req) {
4967			put_ldev(device);
4968			return -ENOMEM;
4969		}
4970
4971		peer_req->w.cb = e_end_resync_block;
4972		peer_req->opf = REQ_OP_DISCARD;
4973		peer_req->submit_jif = jiffies;
4974		peer_req->flags |= EE_TRIM;
4975
4976		spin_lock_irq(&device->resource->req_lock);
4977		list_add_tail(&peer_req->w.list, &device->sync_ee);
4978		spin_unlock_irq(&device->resource->req_lock);
4979
4980		atomic_add(pi->size >> 9, &device->rs_sect_ev);
4981		err = drbd_submit_peer_request(peer_req);
4982
4983		if (err) {
4984			spin_lock_irq(&device->resource->req_lock);
4985			list_del(&peer_req->w.list);
4986			spin_unlock_irq(&device->resource->req_lock);
4987
4988			drbd_free_peer_req(device, peer_req);
4989			put_ldev(device);
4990			err = 0;
4991			goto fail;
4992		}
4993
4994		inc_unacked(device);
4995
4996		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4997		   as well as drbd_rs_complete_io() */
4998	} else {
4999	fail:
5000		drbd_rs_complete_io(device, sector);
5001		drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
5002	}
5003
5004	atomic_add(size >> 9, &device->rs_sect_in);
5005
5006	return err;
5007}
5008
5009struct data_cmd {
5010	int expect_payload;
5011	unsigned int pkt_size;
5012	int (*fn)(struct drbd_connection *, struct packet_info *);
5013};
5014
5015static struct data_cmd drbd_cmd_handler[] = {
5016	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
5017	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
5018	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
5019	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
5020	[P_BITMAP]	    = { 1, 0, receive_bitmap } ,
5021	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
5022	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
5023	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
5024	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5025	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
5026	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
5027	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
5028	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
5029	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
5030	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
5031	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
5032	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
5033	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
5034	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
5035	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
5036	[P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
5037	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
5038	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
5039	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
5040	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
5041	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
5042	[P_ZEROES]	    = { 0, sizeof(struct p_trim), receive_Data },
5043	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
5044};
5045
5046static void drbdd(struct drbd_connection *connection)
5047{
5048	struct packet_info pi;
5049	size_t shs; /* sub header size */
5050	int err;
5051
5052	while (get_t_state(&connection->receiver) == RUNNING) {
5053		struct data_cmd const *cmd;
5054
5055		drbd_thread_current_set_cpu(&connection->receiver);
5056		update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
5057		if (drbd_recv_header_maybe_unplug(connection, &pi))
5058			goto err_out;
5059
5060		cmd = &drbd_cmd_handler[pi.cmd];
5061		if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
5062			drbd_err(connection, "Unexpected data packet %s (0x%04x)",
5063				 cmdname(pi.cmd), pi.cmd);
5064			goto err_out;
5065		}
5066
5067		shs = cmd->pkt_size;
5068		if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
5069			shs += sizeof(struct o_qlim);
5070		if (pi.size > shs && !cmd->expect_payload) {
5071			drbd_err(connection, "No payload expected %s l:%d\n",
5072				 cmdname(pi.cmd), pi.size);
5073			goto err_out;
5074		}
5075		if (pi.size < shs) {
5076			drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5077				 cmdname(pi.cmd), (int)shs, pi.size);
5078			goto err_out;
5079		}
5080
5081		if (shs) {
5082			update_receiver_timing_details(connection, drbd_recv_all_warn);
5083			err = drbd_recv_all_warn(connection, pi.data, shs);
5084			if (err)
5085				goto err_out;
5086			pi.size -= shs;
5087		}
5088
5089		update_receiver_timing_details(connection, cmd->fn);
5090		err = cmd->fn(connection, &pi);
5091		if (err) {
5092			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
5093				 cmdname(pi.cmd), err, pi.size);
5094			goto err_out;
5095		}
5096	}
5097	return;
5098
5099    err_out:
5100	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
5101}
5102
5103static void conn_disconnect(struct drbd_connection *connection)
5104{
5105	struct drbd_peer_device *peer_device;
5106	enum drbd_conns oc;
5107	int vnr;
5108
5109	if (connection->cstate == C_STANDALONE)
5110		return;
5111
5112	/* We are about to start the cleanup after connection loss.
5113	 * Make sure drbd_make_request knows about that.
5114	 * Usually we should be in some network failure state already,
5115	 * but just in case we are not, we fix it up here.
5116	 */
5117	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5118
5119	/* ack_receiver does not clean up anything. it must not interfere, either */
5120	drbd_thread_stop(&connection->ack_receiver);
5121	if (connection->ack_sender) {
5122		destroy_workqueue(connection->ack_sender);
5123		connection->ack_sender = NULL;
5124	}
5125	drbd_free_sock(connection);
5126
5127	rcu_read_lock();
5128	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5129		struct drbd_device *device = peer_device->device;
5130		kref_get(&device->kref);
5131		rcu_read_unlock();
5132		drbd_disconnected(peer_device);
5133		kref_put(&device->kref, drbd_destroy_device);
5134		rcu_read_lock();
5135	}
5136	rcu_read_unlock();
5137
5138	if (!list_empty(&connection->current_epoch->list))
5139		drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
5140	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
5141	atomic_set(&connection->current_epoch->epoch_size, 0);
5142	connection->send.seen_any_write_yet = false;
5143
5144	drbd_info(connection, "Connection closed\n");
5145
5146	if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
5147		conn_try_outdate_peer_async(connection);
5148
5149	spin_lock_irq(&connection->resource->req_lock);
5150	oc = connection->cstate;
5151	if (oc >= C_UNCONNECTED)
5152		_conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
5153
5154	spin_unlock_irq(&connection->resource->req_lock);
5155
5156	if (oc == C_DISCONNECTING)
5157		conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
5158}
5159
5160static int drbd_disconnected(struct drbd_peer_device *peer_device)
5161{
5162	struct drbd_device *device = peer_device->device;
5163	unsigned int i;
5164
5165	/* wait for current activity to cease. */
5166	spin_lock_irq(&device->resource->req_lock);
5167	_drbd_wait_ee_list_empty(device, &device->active_ee);
5168	_drbd_wait_ee_list_empty(device, &device->sync_ee);
5169	_drbd_wait_ee_list_empty(device, &device->read_ee);
5170	spin_unlock_irq(&device->resource->req_lock);
5171
5172	/* We do not have data structures that would allow us to
5173	 * get the rs_pending_cnt down to 0 again.
5174	 *  * On C_SYNC_TARGET we do not have any data structures describing
5175	 *    the pending RSDataRequest's we have sent.
5176	 *  * On C_SYNC_SOURCE there is no data structure that tracks
5177	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5178	 *  And no, it is not the sum of the reference counts in the
5179	 *  resync_LRU. The resync_LRU tracks the whole operation including
5180	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
5181	 *  on the fly. */
5182	drbd_rs_cancel_all(device);
5183	device->rs_total = 0;
5184	device->rs_failed = 0;
5185	atomic_set(&device->rs_pending_cnt, 0);
5186	wake_up(&device->misc_wait);
5187
5188	del_timer_sync(&device->resync_timer);
5189	resync_timer_fn(&device->resync_timer);
5190
5191	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5192	 * w_make_resync_request etc. which may still be on the worker queue
5193	 * to be "canceled" */
5194	drbd_flush_workqueue(&peer_device->connection->sender_work);
5195
5196	drbd_finish_peer_reqs(device);
5197
5198	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5199	   might have issued a work again. The one before drbd_finish_peer_reqs() is
5200	   necessary to reclain net_ee in drbd_finish_peer_reqs(). */
5201	drbd_flush_workqueue(&peer_device->connection->sender_work);
5202
5203	/* need to do it again, drbd_finish_peer_reqs() may have populated it
5204	 * again via drbd_try_clear_on_disk_bm(). */
5205	drbd_rs_cancel_all(device);
5206
5207	kfree(device->p_uuid);
5208	device->p_uuid = NULL;
5209
5210	if (!drbd_suspended(device))
5211		tl_clear(peer_device->connection);
5212
5213	drbd_md_sync(device);
5214
5215	if (get_ldev(device)) {
5216		drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5217				"write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5218		put_ldev(device);
5219	}
5220
5221	/* tcp_close and release of sendpage pages can be deferred.  I don't
5222	 * want to use SO_LINGER, because apparently it can be deferred for
5223	 * more than 20 seconds (longest time I checked).
5224	 *
5225	 * Actually we don't care for exactly when the network stack does its
5226	 * put_page(), but release our reference on these pages right here.
5227	 */
5228	i = drbd_free_peer_reqs(device, &device->net_ee);
5229	if (i)
5230		drbd_info(device, "net_ee not empty, killed %u entries\n", i);
5231	i = atomic_read(&device->pp_in_use_by_net);
5232	if (i)
5233		drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
5234	i = atomic_read(&device->pp_in_use);
5235	if (i)
5236		drbd_info(device, "pp_in_use = %d, expected 0\n", i);
5237
5238	D_ASSERT(device, list_empty(&device->read_ee));
5239	D_ASSERT(device, list_empty(&device->active_ee));
5240	D_ASSERT(device, list_empty(&device->sync_ee));
5241	D_ASSERT(device, list_empty(&device->done_ee));
5242
5243	return 0;
5244}
5245
5246/*
5247 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5248 * we can agree on is stored in agreed_pro_version.
5249 *
5250 * feature flags and the reserved array should be enough room for future
5251 * enhancements of the handshake protocol, and possible plugins...
5252 *
5253 * for now, they are expected to be zero, but ignored.
5254 */
5255static int drbd_send_features(struct drbd_connection *connection)
5256{
5257	struct drbd_socket *sock;
5258	struct p_connection_features *p;
5259
5260	sock = &connection->data;
5261	p = conn_prepare_command(connection, sock);
5262	if (!p)
5263		return -EIO;
5264	memset(p, 0, sizeof(*p));
5265	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5266	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
5267	p->feature_flags = cpu_to_be32(PRO_FEATURES);
5268	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
5269}
5270
5271/*
5272 * return values:
5273 *   1 yes, we have a valid connection
5274 *   0 oops, did not work out, please try again
5275 *  -1 peer talks different language,
5276 *     no point in trying again, please go standalone.
5277 */
5278static int drbd_do_features(struct drbd_connection *connection)
5279{
5280	/* ASSERT current == connection->receiver ... */
5281	struct p_connection_features *p;
5282	const int expect = sizeof(struct p_connection_features);
5283	struct packet_info pi;
5284	int err;
5285
5286	err = drbd_send_features(connection);
5287	if (err)
5288		return 0;
5289
5290	err = drbd_recv_header(connection, &pi);
5291	if (err)
5292		return 0;
5293
5294	if (pi.cmd != P_CONNECTION_FEATURES) {
5295		drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
5296			 cmdname(pi.cmd), pi.cmd);
5297		return -1;
5298	}
5299
5300	if (pi.size != expect) {
5301		drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
5302		     expect, pi.size);
5303		return -1;
5304	}
5305
5306	p = pi.data;
5307	err = drbd_recv_all_warn(connection, p, expect);
5308	if (err)
5309		return 0;
5310
5311	p->protocol_min = be32_to_cpu(p->protocol_min);
5312	p->protocol_max = be32_to_cpu(p->protocol_max);
5313	if (p->protocol_max == 0)
5314		p->protocol_max = p->protocol_min;
5315
5316	if (PRO_VERSION_MAX < p->protocol_min ||
5317	    PRO_VERSION_MIN > p->protocol_max)
5318		goto incompat;
5319
5320	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
5321	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
5322
5323	drbd_info(connection, "Handshake successful: "
5324	     "Agreed network protocol version %d\n", connection->agreed_pro_version);
5325
5326	drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
5327		  connection->agreed_features,
5328		  connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5329		  connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5330		  connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
5331		  connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
5332		  connection->agreed_features ? "" : " none");
5333
5334	return 1;
5335
5336 incompat:
5337	drbd_err(connection, "incompatible DRBD dialects: "
5338	    "I support %d-%d, peer supports %d-%d\n",
5339	    PRO_VERSION_MIN, PRO_VERSION_MAX,
5340	    p->protocol_min, p->protocol_max);
5341	return -1;
5342}
5343
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
/* Without HMAC support in the kernel, authentication can only fail. */
static int drbd_do_auth(struct drbd_connection *connection)
{
	drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
	return -1;
}
#else
#define CHALLENGE_LEN 64

/*
 * Mutual challenge-response authentication keyed with the shared secret:
 * send a random challenge, receive the peer's challenge, answer it with the
 * keyed digest of the peer's challenge, then receive the peer's answer and
 * compare it with the digest of our own challenge.
 *
 * Return value:
	1 - auth succeeded,
	0 - failed, try again (network error),
	-1 - auth failed, don't try again.
*/

static int drbd_do_auth(struct drbd_connection *connection)
{
	struct drbd_socket *sock;
	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
	char *response = NULL;
	char *right_response = NULL;
	char *peers_ch = NULL;
	unsigned int key_len;
	char secret[SHARED_SECRET_MAX]; /* 64 byte */
	unsigned int resp_size;
	struct shash_desc *desc;
	struct packet_info pi;
	struct net_conf *nc;
	int err, rv;

	/* FIXME: Put the challenge/response into the preallocated socket buffer.  */

	/* take a private copy of the secret, net_conf may only be
	 * dereferenced inside the RCU read side section */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	key_len = strlen(nc->shared_secret);
	memcpy(secret, nc->shared_secret, key_len);
	rcu_read_unlock();

	desc = kmalloc(sizeof(struct shash_desc) +
		       crypto_shash_descsize(connection->cram_hmac_tfm),
		       GFP_KERNEL);
	if (!desc) {
		rv = -1;
		goto fail;
	}
	desc->tfm = connection->cram_hmac_tfm;

	rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
	if (rv) {
		drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	get_random_bytes(my_challenge, CHALLENGE_LEN);

	sock = &connection->data;
	if (!conn_prepare_command(connection, sock)) {
		rv = 0;
		goto fail;
	}
	/* rv becomes 0 ("try again") if sending failed */
	rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
				my_challenge, CHALLENGE_LEN);
	if (!rv)
		goto fail;

	err = drbd_recv_header(connection, &pi);
	if (err) {
		rv = 0;
		goto fail;
	}

	if (pi.cmd != P_AUTH_CHALLENGE) {
		drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
			 cmdname(pi.cmd), pi.cmd);
		rv = -1;
		goto fail;
	}

	/* the peer's challenge must be CHALLENGE_LEN .. 2 * CHALLENGE_LEN bytes */
	if (pi.size > CHALLENGE_LEN * 2) {
		drbd_err(connection, "expected AuthChallenge payload too big.\n");
		rv = -1;
		goto fail;
	}

	if (pi.size < CHALLENGE_LEN) {
		drbd_err(connection, "AuthChallenge payload too small.\n");
		rv = -1;
		goto fail;
	}

	peers_ch = kmalloc(pi.size, GFP_NOIO);
	if (!peers_ch) {
		rv = -1;
		goto fail;
	}

	err = drbd_recv_all_warn(connection, peers_ch, pi.size);
	if (err) {
		rv = 0;
		goto fail;
	}

	/* a peer echoing our own challenge would get our answer for free */
	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
		drbd_err(connection, "Peer presented the same challenge!\n");
		rv = -1;
		goto fail;
	}

	resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
	response = kmalloc(resp_size, GFP_NOIO);
	if (!response) {
		rv = -1;
		goto fail;
	}

	rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
	if (rv) {
		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	if (!conn_prepare_command(connection, sock)) {
		rv = 0;
		goto fail;
	}
	rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
				response, resp_size);
	if (!rv)
		goto fail;

	err = drbd_recv_header(connection, &pi);
	if (err) {
		rv = 0;
		goto fail;
	}

	if (pi.cmd != P_AUTH_RESPONSE) {
		drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
			 cmdname(pi.cmd), pi.cmd);
		rv = 0;
		goto fail;
	}

	if (pi.size != resp_size) {
		drbd_err(connection, "expected AuthResponse payload of wrong size\n");
		rv = 0;
		goto fail;
	}

	/* the response buffer is reused for the peer's answer */
	err = drbd_recv_all_warn(connection, response , resp_size);
	if (err) {
		rv = 0;
		goto fail;
	}

	right_response = kmalloc(resp_size, GFP_NOIO);
	if (!right_response) {
		rv = -1;
		goto fail;
	}

	rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
				 right_response);
	if (rv) {
		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	rv = !memcmp(response, right_response, resp_size);

	if (rv)
		drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
		     resp_size);
	else
		rv = -1;

 fail:
	/* do not leave a copy of the shared secret behind on the stack */
	memzero_explicit(secret, sizeof(secret));
	kfree(peers_ch);
	kfree(response);
	kfree(right_response);
	if (desc) {
		shash_desc_zero(desc);
		kfree(desc);
	}

	return rv;
}
#endif
5536
5537int drbd_receiver(struct drbd_thread *thi)
5538{
5539	struct drbd_connection *connection = thi->connection;
5540	int h;
5541
5542	drbd_info(connection, "receiver (re)started\n");
5543
5544	do {
5545		h = conn_connect(connection);
5546		if (h == 0) {
5547			conn_disconnect(connection);
5548			schedule_timeout_interruptible(HZ);
5549		}
5550		if (h == -1) {
5551			drbd_warn(connection, "Discarding network configuration.\n");
5552			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5553		}
5554	} while (h == 0);
5555
5556	if (h > 0) {
5557		blk_start_plug(&connection->receiver_plug);
5558		drbdd(connection);
5559		blk_finish_plug(&connection->receiver_plug);
5560	}
5561
5562	conn_disconnect(connection);
5563
5564	drbd_info(connection, "receiver terminated\n");
5565	return 0;
5566}
5567
5568/* ********* acknowledge sender ******** */
5569
5570static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5571{
5572	struct p_req_state_reply *p = pi->data;
5573	int retcode = be32_to_cpu(p->retcode);
5574
5575	if (retcode >= SS_SUCCESS) {
5576		set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
5577	} else {
5578		set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
5579		drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
5580			 drbd_set_st_err_str(retcode), retcode);
5581	}
5582	wake_up(&connection->ping_wait);
5583
5584	return 0;
5585}
5586
5587static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5588{
5589	struct drbd_peer_device *peer_device;
5590	struct drbd_device *device;
5591	struct p_req_state_reply *p = pi->data;
5592	int retcode = be32_to_cpu(p->retcode);
5593
5594	peer_device = conn_peer_device(connection, pi->vnr);
5595	if (!peer_device)
5596		return -EIO;
5597	device = peer_device->device;
5598
5599	if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
5600		D_ASSERT(device, connection->agreed_pro_version < 100);
5601		return got_conn_RqSReply(connection, pi);
5602	}
5603
5604	if (retcode >= SS_SUCCESS) {
5605		set_bit(CL_ST_CHG_SUCCESS, &device->flags);
5606	} else {
5607		set_bit(CL_ST_CHG_FAIL, &device->flags);
5608		drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
5609			drbd_set_st_err_str(retcode), retcode);
5610	}
5611	wake_up(&device->state_wait);
5612
5613	return 0;
5614}
5615
/* Answer a ping from the peer; returns the result of drbd_send_ping_ack(). */
static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
{
	int err = drbd_send_ping_ack(connection);

	return err;
}
5621
5622static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
5623{
5624	/* restore idle timeout */
5625	connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5626	if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5627		wake_up(&connection->ping_wait);
5628
5629	return 0;
5630}
5631
5632static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
5633{
5634	struct drbd_peer_device *peer_device;
5635	struct drbd_device *device;
5636	struct p_block_ack *p = pi->data;
5637	sector_t sector = be64_to_cpu(p->sector);
5638	int blksize = be32_to_cpu(p->blksize);
5639
5640	peer_device = conn_peer_device(connection, pi->vnr);
5641	if (!peer_device)
5642		return -EIO;
5643	device = peer_device->device;
5644
5645	D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
5646
5647	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5648
5649	if (get_ldev(device)) {
5650		drbd_rs_complete_io(device, sector);
5651		drbd_set_in_sync(device, sector, blksize);
5652		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
5653		device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5654		put_ldev(device);
5655	}
5656	dec_rs_pending(device);
5657	atomic_add(blksize >> 9, &device->rs_sect_in);
5658
5659	return 0;
5660}
5661
5662static int
5663validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5664			      struct rb_root *root, const char *func,
5665			      enum drbd_req_event what, bool missing_ok)
5666{
5667	struct drbd_request *req;
5668	struct bio_and_error m;
5669
5670	spin_lock_irq(&device->resource->req_lock);
5671	req = find_request(device, root, id, sector, missing_ok, func);
5672	if (unlikely(!req)) {
5673		spin_unlock_irq(&device->resource->req_lock);
5674		return -EIO;
5675	}
5676	__req_mod(req, what, &m);
5677	spin_unlock_irq(&device->resource->req_lock);
5678
5679	if (m.bio)
5680		complete_master_bio(device, &m);
5681	return 0;
5682}
5683
5684static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5685{
5686	struct drbd_peer_device *peer_device;
5687	struct drbd_device *device;
5688	struct p_block_ack *p = pi->data;
5689	sector_t sector = be64_to_cpu(p->sector);
5690	int blksize = be32_to_cpu(p->blksize);
5691	enum drbd_req_event what;
5692
5693	peer_device = conn_peer_device(connection, pi->vnr);
5694	if (!peer_device)
5695		return -EIO;
5696	device = peer_device->device;
5697
5698	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5699
5700	if (p->block_id == ID_SYNCER) {
5701		drbd_set_in_sync(device, sector, blksize);
5702		dec_rs_pending(device);
5703		return 0;
5704	}
5705	switch (pi->cmd) {
5706	case P_RS_WRITE_ACK:
5707		what = WRITE_ACKED_BY_PEER_AND_SIS;
5708		break;
5709	case P_WRITE_ACK:
5710		what = WRITE_ACKED_BY_PEER;
5711		break;
5712	case P_RECV_ACK:
5713		what = RECV_ACKED_BY_PEER;
5714		break;
5715	case P_SUPERSEDED:
5716		what = CONFLICT_RESOLVED;
5717		break;
5718	case P_RETRY_WRITE:
5719		what = POSTPONE_WRITE;
5720		break;
5721	default:
5722		BUG();
5723	}
5724
5725	return validate_req_change_req_state(device, p->block_id, sector,
5726					     &device->write_requests, __func__,
5727					     what, false);
5728}
5729
5730static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5731{
5732	struct drbd_peer_device *peer_device;
5733	struct drbd_device *device;
5734	struct p_block_ack *p = pi->data;
5735	sector_t sector = be64_to_cpu(p->sector);
5736	int size = be32_to_cpu(p->blksize);
5737	int err;
5738
5739	peer_device = conn_peer_device(connection, pi->vnr);
5740	if (!peer_device)
5741		return -EIO;
5742	device = peer_device->device;
5743
5744	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5745
5746	if (p->block_id == ID_SYNCER) {
5747		dec_rs_pending(device);
5748		drbd_rs_failed_io(device, sector, size);
5749		return 0;
5750	}
5751
5752	err = validate_req_change_req_state(device, p->block_id, sector,
5753					    &device->write_requests, __func__,
5754					    NEG_ACKED, true);
5755	if (err) {
5756		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5757		   The master bio might already be completed, therefore the
5758		   request is no longer in the collision hash. */
5759		/* In Protocol B we might already have got a P_RECV_ACK
5760		   but then get a P_NEG_ACK afterwards. */
5761		drbd_set_out_of_sync(device, sector, size);
5762	}
5763	return 0;
5764}
5765
5766static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5767{
5768	struct drbd_peer_device *peer_device;
5769	struct drbd_device *device;
5770	struct p_block_ack *p = pi->data;
5771	sector_t sector = be64_to_cpu(p->sector);
5772
5773	peer_device = conn_peer_device(connection, pi->vnr);
5774	if (!peer_device)
5775		return -EIO;
5776	device = peer_device->device;
5777
5778	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5779
5780	drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5781	    (unsigned long long)sector, be32_to_cpu(p->blksize));
5782
5783	return validate_req_change_req_state(device, p->block_id, sector,
5784					     &device->read_requests, __func__,
5785					     NEG_ACKED, false);
5786}
5787
5788static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5789{
5790	struct drbd_peer_device *peer_device;
5791	struct drbd_device *device;
5792	sector_t sector;
5793	int size;
5794	struct p_block_ack *p = pi->data;
5795
5796	peer_device = conn_peer_device(connection, pi->vnr);
5797	if (!peer_device)
5798		return -EIO;
5799	device = peer_device->device;
5800
5801	sector = be64_to_cpu(p->sector);
5802	size = be32_to_cpu(p->blksize);
5803
5804	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5805
5806	dec_rs_pending(device);
5807
5808	if (get_ldev_if_state(device, D_FAILED)) {
5809		drbd_rs_complete_io(device, sector);
5810		switch (pi->cmd) {
5811		case P_NEG_RS_DREPLY:
5812			drbd_rs_failed_io(device, sector, size);
5813			break;
5814		case P_RS_CANCEL:
5815			break;
5816		default:
5817			BUG();
5818		}
5819		put_ldev(device);
5820	}
5821
5822	return 0;
5823}
5824
5825static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5826{
5827	struct p_barrier_ack *p = pi->data;
5828	struct drbd_peer_device *peer_device;
5829	int vnr;
5830
5831	tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5832
5833	rcu_read_lock();
5834	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5835		struct drbd_device *device = peer_device->device;
5836
5837		if (device->state.conn == C_AHEAD &&
5838		    atomic_read(&device->ap_in_flight) == 0 &&
5839		    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5840			device->start_resync_timer.expires = jiffies + HZ;
5841			add_timer(&device->start_resync_timer);
5842		}
5843	}
5844	rcu_read_unlock();
5845
5846	return 0;
5847}
5848
5849static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5850{
5851	struct drbd_peer_device *peer_device;
5852	struct drbd_device *device;
5853	struct p_block_ack *p = pi->data;
5854	struct drbd_device_work *dw;
5855	sector_t sector;
5856	int size;
5857
5858	peer_device = conn_peer_device(connection, pi->vnr);
5859	if (!peer_device)
5860		return -EIO;
5861	device = peer_device->device;
5862
5863	sector = be64_to_cpu(p->sector);
5864	size = be32_to_cpu(p->blksize);
5865
5866	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5867
5868	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5869		drbd_ov_out_of_sync_found(device, sector, size);
5870	else
5871		ov_out_of_sync_print(device);
5872
5873	if (!get_ldev(device))
5874		return 0;
5875
5876	drbd_rs_complete_io(device, sector);
5877	dec_rs_pending(device);
5878
5879	--device->ov_left;
5880
5881	/* let's advance progress step marks only for every other megabyte */
5882	if ((device->ov_left & 0x200) == 0x200)
5883		drbd_advance_rs_marks(device, device->ov_left);
5884
5885	if (device->ov_left == 0) {
5886		dw = kmalloc(sizeof(*dw), GFP_NOIO);
5887		if (dw) {
5888			dw->w.cb = w_ov_finished;
5889			dw->device = device;
5890			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5891		} else {
5892			drbd_err(device, "kmalloc(dw) failed.");
5893			ov_out_of_sync_print(device);
5894			drbd_resync_finished(device);
5895		}
5896	}
5897	put_ldev(device);
5898	return 0;
5899}
5900
/*
 * Meta-socket handler that deliberately ignores the packet: it does
 * nothing with @connection or @pi and always reports success (0).
 */
static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
{
	return 0;
}
5905
5906struct meta_sock_cmd {
5907	size_t pkt_size;
5908	int (*fn)(struct drbd_connection *connection, struct packet_info *);
5909};
5910
5911static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5912{
5913	long t;
5914	struct net_conf *nc;
5915
5916	rcu_read_lock();
5917	nc = rcu_dereference(connection->net_conf);
5918	t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5919	rcu_read_unlock();
5920
5921	t *= HZ;
5922	if (ping_timeout)
5923		t /= 10;
5924
5925	connection->meta.socket->sk->sk_rcvtimeo = t;
5926}
5927
5928static void set_ping_timeout(struct drbd_connection *connection)
5929{
5930	set_rcvtimeo(connection, 1);
5931}
5932
5933static void set_idle_timeout(struct drbd_connection *connection)
5934{
5935	set_rcvtimeo(connection, 0);
5936}
5937
5938static struct meta_sock_cmd ack_receiver_tbl[] = {
5939	[P_PING]	    = { 0, got_Ping },
5940	[P_PING_ACK]	    = { 0, got_PingAck },
5941	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
5942	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
5943	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5944	[P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
5945	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
5946	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
5947	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5948	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
5949	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
5950	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5951	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5952	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5953	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5954	[P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5955	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
5956};
5957
5958int drbd_ack_receiver(struct drbd_thread *thi)
5959{
5960	struct drbd_connection *connection = thi->connection;
5961	struct meta_sock_cmd *cmd = NULL;
5962	struct packet_info pi;
5963	unsigned long pre_recv_jif;
5964	int rv;
5965	void *buf    = connection->meta.rbuf;
5966	int received = 0;
5967	unsigned int header_size = drbd_header_size(connection);
5968	int expect   = header_size;
5969	bool ping_timeout_active = false;
5970
5971	sched_set_fifo_low(current);
5972
5973	while (get_t_state(thi) == RUNNING) {
5974		drbd_thread_current_set_cpu(thi);
5975
5976		conn_reclaim_net_peer_reqs(connection);
5977
5978		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5979			if (drbd_send_ping(connection)) {
5980				drbd_err(connection, "drbd_send_ping has failed\n");
5981				goto reconnect;
5982			}
5983			set_ping_timeout(connection);
5984			ping_timeout_active = true;
5985		}
5986
5987		pre_recv_jif = jiffies;
5988		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5989
5990		/* Note:
5991		 * -EINTR	 (on meta) we got a signal
5992		 * -EAGAIN	 (on meta) rcvtimeo expired
5993		 * -ECONNRESET	 other side closed the connection
5994		 * -ERESTARTSYS  (on data) we got a signal
5995		 * rv <  0	 other than above: unexpected error!
5996		 * rv == expected: full header or command
5997		 * rv <  expected: "woken" by signal during receive
5998		 * rv == 0	 : "connection shut down by peer"
5999		 */
6000		if (likely(rv > 0)) {
6001			received += rv;
6002			buf	 += rv;
6003		} else if (rv == 0) {
6004			if (test_bit(DISCONNECT_SENT, &connection->flags)) {
6005				long t;
6006				rcu_read_lock();
6007				t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
6008				rcu_read_unlock();
6009
6010				t = wait_event_timeout(connection->ping_wait,
6011						       connection->cstate < C_WF_REPORT_PARAMS,
6012						       t);
6013				if (t)
6014					break;
6015			}
6016			drbd_err(connection, "meta connection shut down by peer.\n");
6017			goto reconnect;
6018		} else if (rv == -EAGAIN) {
6019			/* If the data socket received something meanwhile,
6020			 * that is good enough: peer is still alive. */
6021			if (time_after(connection->last_received, pre_recv_jif))
6022				continue;
6023			if (ping_timeout_active) {
6024				drbd_err(connection, "PingAck did not arrive in time.\n");
6025				goto reconnect;
6026			}
6027			set_bit(SEND_PING, &connection->flags);
6028			continue;
6029		} else if (rv == -EINTR) {
6030			/* maybe drbd_thread_stop(): the while condition will notice.
6031			 * maybe woken for send_ping: we'll send a ping above,
6032			 * and change the rcvtimeo */
6033			flush_signals(current);
6034			continue;
6035		} else {
6036			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
6037			goto reconnect;
6038		}
6039
6040		if (received == expect && cmd == NULL) {
6041			if (decode_header(connection, connection->meta.rbuf, &pi))
6042				goto reconnect;
6043			cmd = &ack_receiver_tbl[pi.cmd];
6044			if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
6045				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
6046					 cmdname(pi.cmd), pi.cmd);
6047				goto disconnect;
6048			}
6049			expect = header_size + cmd->pkt_size;
6050			if (pi.size != expect - header_size) {
6051				drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
6052					pi.cmd, pi.size);
6053				goto reconnect;
6054			}
6055		}
6056		if (received == expect) {
6057			bool err;
6058
6059			err = cmd->fn(connection, &pi);
6060			if (err) {
6061				drbd_err(connection, "%ps failed\n", cmd->fn);
6062				goto reconnect;
6063			}
6064
6065			connection->last_received = jiffies;
6066
6067			if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
6068				set_idle_timeout(connection);
6069				ping_timeout_active = false;
6070			}
6071
6072			buf	 = connection->meta.rbuf;
6073			received = 0;
6074			expect	 = header_size;
6075			cmd	 = NULL;
6076		}
6077	}
6078
6079	if (0) {
6080reconnect:
6081		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6082		conn_md_sync(connection);
6083	}
6084	if (0) {
6085disconnect:
6086		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
6087	}
6088
6089	drbd_info(connection, "ack_receiver terminated\n");
6090
6091	return 0;
6092}
6093
6094void drbd_send_acks_wf(struct work_struct *ws)
6095{
6096	struct drbd_peer_device *peer_device =
6097		container_of(ws, struct drbd_peer_device, send_acks_work);
6098	struct drbd_connection *connection = peer_device->connection;
6099	struct drbd_device *device = peer_device->device;
6100	struct net_conf *nc;
6101	int tcp_cork, err;
6102
6103	rcu_read_lock();
6104	nc = rcu_dereference(connection->net_conf);
6105	tcp_cork = nc->tcp_cork;
6106	rcu_read_unlock();
6107
6108	if (tcp_cork)
6109		tcp_sock_set_cork(connection->meta.socket->sk, true);
6110
6111	err = drbd_finish_peer_reqs(device);
6112	kref_put(&device->kref, drbd_destroy_device);
6113	/* get is in drbd_endio_write_sec_final(). That is necessary to keep the
6114	   struct work_struct send_acks_work alive, which is in the peer_device object */
6115
6116	if (err) {
6117		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6118		return;
6119	}
6120
6121	if (tcp_cork)
6122		tcp_sock_set_cork(connection->meta.socket->sk, false);
6123
6124	return;
6125}
Configure Feed

Configure Feed