drivers/block/drbd/drbd_nl.c at v3.15-rc8

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / block / drbd / drbd_nl.c
at v3.15-rc8 3549 lines 103 kB view raw
wrap content
   1/*
   2   drbd_nl.c
   3
   4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10   drbd is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation; either version 2, or (at your option)
  13   any later version.
  14
  15   drbd is distributed in the hope that it will be useful,
  16   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18   GNU General Public License for more details.
  19
  20   You should have received a copy of the GNU General Public License
  21   along with drbd; see the file COPYING.  If not, write to
  22   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24 */
  25
  26#include <linux/module.h>
  27#include <linux/drbd.h>
  28#include <linux/in.h>
  29#include <linux/fs.h>
  30#include <linux/file.h>
  31#include <linux/slab.h>
  32#include <linux/blkpg.h>
  33#include <linux/cpumask.h>
  34#include "drbd_int.h"
  35#include "drbd_protocol.h"
  36#include "drbd_req.h"
  37#include "drbd_wrappers.h"
  38#include <asm/unaligned.h>
  39#include <linux/drbd_limits.h>
  40#include <linux/kthread.h>
  41
  42#include <net/genetlink.h>
  43
  44/* .doit */
  45// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
  46// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
  47
  48int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
  49int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
  50
  51int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
  52int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
  53int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
  54
  55int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
  56int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
  57int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
  58int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
  59int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
  60int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
  61int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
  62int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
  63int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
  64int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
  65int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
  66int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
  67int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
  68int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
  69int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
  70int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
  71int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
  72int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
  73int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
  74int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
  75/* .dumpit */
  76int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
  77
  78#include <linux/drbd_genl_api.h>
  79#include "drbd_nla.h"
  80#include <linux/genl_magic_func.h>
  81
   82/* Used with blkdev_get_by_path() to claim our meta data device(s). */
   83static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  84
   85/* Configuration is strictly serialized, because generic netlink message
   86 * processing is strictly serialized by the genl_lock().
   87 * Which means we can use one static global drbd_config_context struct.
      * It is zeroed and filled in again by drbd_adm_prepare() for every request.
   88 */
   89static struct drbd_config_context {
   90	/* assigned from drbd_genlmsghdr */
   91	unsigned int minor;
   92	/* assigned from request attributes, if present;
     	 * stays VOLUME_UNSPECIFIED if the request carried no volume number */
   93	unsigned int volume;
   94#define VOLUME_UNSPECIFIED		(-1U)
   95	/* pointer into the request skb,
   96	 * limited lifetime! */
   97	char *resource_name;
     	/* address attributes of the request context, NULL if not given */
   98	struct nlattr *my_addr;
   99	struct nlattr *peer_addr;
  100
  101	/* reply buffer */
  102	struct sk_buff *reply_skb;
  103	/* pointer into reply buffer */
  104	struct drbd_genlmsghdr *reply_dh;
  105	/* resolved from attributes, if possible;
     	 * drbd_adm_finish() does a kref_put() on resource and connection */
  106	struct drbd_device *device;
  107	struct drbd_resource *resource;
  108	struct drbd_connection *connection;
  109} adm_ctx;
 110
 111static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 112{
 113	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 114	if (genlmsg_reply(skb, info))
 115		printk(KERN_ERR "drbd: error sending genl reply\n");
 116}
 117
 118/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
 119 * reason it could fail was no space in skb, and there are 4k available. */
 120int drbd_msg_put_info(const char *info)
 121{
 122	struct sk_buff *skb = adm_ctx.reply_skb;
 123	struct nlattr *nla;
 124	int err = -EMSGSIZE;
 125
 126	if (!info || !info[0])
 127		return 0;
 128
 129	nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
 130	if (!nla)
 131		return err;
 132
 133	err = nla_put_string(skb, T_info_text, info);
 134	if (err) {
 135		nla_nest_cancel(skb, nla);
 136		return err;
 137	} else
 138		nla_nest_end(skb, nla);
 139	return 0;
 140}
 141
  142/* This would be a good candidate for a "pre_doit" hook,
  143 * and per-family private info->pointers.
  144 * But we need to stay compatible with older kernels.
  145 * If it returns successfully, adm_ctx members are valid.
      *
      * Common first step of the admin request handlers: resets the global
      * adm_ctx, allocates the reply skb, copies the request context into it and
      * resolves minor / resource name / address pair into objects.
      *
      * @flags is a bit mask of DRBD_ADM_NEED_*, naming what the request must
      * resolve to.
      *
      * Three kinds of return values:
      *  - NO_ERROR: everything requested by @flags was found.
      *  - an ERR_* code with adm_ctx.reply_skb still allocated; an info text has
      *    been put into the reply.  drbd_adm_set_role() hands such a code on to
      *    drbd_adm_finish(), which sends the reply.
      *  - a negative errno with adm_ctx.reply_skb == NULL: no reply can be sent.
  146 */
  147#define DRBD_ADM_NEED_MINOR	1
  148#define DRBD_ADM_NEED_RESOURCE	2
  149#define DRBD_ADM_NEED_CONNECTION 4
  150static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
  151		unsigned flags)
  152{
  153	struct drbd_genlmsghdr *d_in = info->userhdr;
  154	const u8 cmd = info->genlhdr->cmd;
  155	int err;
  156
     	/* start from a clean slate; in particular reply_skb is NULL from here on */
  157	memset(&adm_ctx, 0, sizeof(adm_ctx));
  158
  159	/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
  160	if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
  161	       return -EPERM;
  162
  163	adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
  164	if (!adm_ctx.reply_skb) {
  165		err = -ENOMEM;
  166		goto fail;
  167	}
  168
  169	adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
  170					info, &drbd_genl_family, 0, cmd);
  171	/* put of a few bytes into a fresh skb of >= 4k will always succeed.
  172	 * but anyways */
  173	if (!adm_ctx.reply_dh) {
  174		err = -ENOMEM;
  175		goto fail;
  176	}
  177
     	/* echo the minor; ret_code is overwritten later by drbd_adm_finish() */
  178	adm_ctx.reply_dh->minor = d_in->minor;
  179	adm_ctx.reply_dh->ret_code = NO_ERROR;
  180
  181	adm_ctx.volume = VOLUME_UNSPECIFIED;
  182	if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
  183		struct nlattr *nla;
  184		/* parse and validate only */
  185		err = drbd_cfg_context_from_attrs(NULL, info);
  186		if (err)
  187			goto fail;
  188
  189		/* It was present, and valid,
  190		 * copy it over to the reply skb. */
  191		err = nla_put_nohdr(adm_ctx.reply_skb,
  192				info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
  193				info->attrs[DRBD_NLA_CFG_CONTEXT]);
  194		if (err)
  195			goto fail;
  196
  197		/* and assign stuff to the global adm_ctx */
     		/* NOTE(review): nested_attr_tb is presumably filled by the
     		 * drbd_cfg_context_from_attrs() call above - confirm */
  198		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
  199		if (nla)
  200			adm_ctx.volume = nla_get_u32(nla);
  201		nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
  202		if (nla)
  203			adm_ctx.resource_name = nla_data(nla);
  204		adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
  205		adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
     		/* reject addresses that would not fit into a connection's
     		 * my_addr / peer_addr storage */
  206		if ((adm_ctx.my_addr &&
  207		     nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) ||
  208		    (adm_ctx.peer_addr &&
  209		     nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) {
  210			err = -EINVAL;
  211			goto fail;
  212		}
  213	}
  214
  215	adm_ctx.minor = d_in->minor;
  216	adm_ctx.device = minor_to_device(d_in->minor);
  217	if (adm_ctx.resource_name) {
  218		adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name);
  219	}
  220
     	/* From here on errors are reported with reply_skb kept allocated. */
  221	if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) {
  222		drbd_msg_put_info("unknown minor");
  223		return ERR_MINOR_INVALID;
  224	}
  225	if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
  226		drbd_msg_put_info("unknown resource");
  227		if (adm_ctx.resource_name)
  228			return ERR_RES_NOT_KNOWN;
  229		return ERR_INVALID_REQUEST;
  230	}
  231
     	/* A connection is looked up by its address pair only; the request is
     	 * rejected if it also resolved to a resource or a device. */
  232	if (flags & DRBD_ADM_NEED_CONNECTION) {
  233		if (adm_ctx.resource) {
  234			drbd_msg_put_info("no resource name expected");
  235			return ERR_INVALID_REQUEST;
  236		}
  237		if (adm_ctx.device) {
  238			drbd_msg_put_info("no minor number expected");
  239			return ERR_INVALID_REQUEST;
  240		}
  241		if (adm_ctx.my_addr && adm_ctx.peer_addr)
  242			adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
  243							  nla_len(adm_ctx.my_addr),
  244							  nla_data(adm_ctx.peer_addr),
  245							  nla_len(adm_ctx.peer_addr));
  246		if (!adm_ctx.connection) {
  247			drbd_msg_put_info("unknown connection");
  248			return ERR_INVALID_REQUEST;
  249		}
  250	}
  251
  252	/* some more paranoia, if the request was over-determined */
  253	if (adm_ctx.device && adm_ctx.resource &&
  254	    adm_ctx.device->resource != adm_ctx.resource) {
  255		pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
  256				adm_ctx.minor, adm_ctx.resource->name,
  257				adm_ctx.device->resource->name);
  258		drbd_msg_put_info("minor exists in different resource");
  259		return ERR_INVALID_REQUEST;
  260	}
  261	if (adm_ctx.device &&
  262	    adm_ctx.volume != VOLUME_UNSPECIFIED &&
  263	    adm_ctx.volume != adm_ctx.device->vnr) {
  264		pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
  265				adm_ctx.minor, adm_ctx.volume,
  266				adm_ctx.device->vnr,
  267				adm_ctx.device->resource->name);
  268		drbd_msg_put_info("minor exists as different volume");
  269		return ERR_INVALID_REQUEST;
  270	}
  271
  272	return NO_ERROR;
  273
     	/* Early failure: drop the reply skb (if any), so callers can tell
     	 * from reply_skb == NULL that there is nothing to send. */
  274fail:
  275	nlmsg_free(adm_ctx.reply_skb);
  276	adm_ctx.reply_skb = NULL;
  277	return err;
  278}
 279
 280static int drbd_adm_finish(struct genl_info *info, int retcode)
 281{
 282	if (adm_ctx.connection) {
 283		kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
 284		adm_ctx.connection = NULL;
 285	}
 286	if (adm_ctx.resource) {
 287		kref_put(&adm_ctx.resource->kref, drbd_destroy_resource);
 288		adm_ctx.resource = NULL;
 289	}
 290
 291	if (!adm_ctx.reply_skb)
 292		return -ENOMEM;
 293
 294	adm_ctx.reply_dh->ret_code = retcode;
 295	drbd_adm_send_reply(adm_ctx.reply_skb, info);
 296	return 0;
 297}
 298
 299static void setup_khelper_env(struct drbd_connection *connection, char **envp)
 300{
 301	char *afs;
 302
 303	/* FIXME: A future version will not allow this case. */
 304	if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
 305		return;
 306
 307	switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
 308	case AF_INET6:
 309		afs = "ipv6";
 310		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
 311			 &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
 312		break;
 313	case AF_INET:
 314		afs = "ipv4";
 315		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 316			 &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 317		break;
 318	default:
 319		afs = "ssocks";
 320		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 321			 &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 322	}
 323	snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
 324}
 325
 326int drbd_khelper(struct drbd_device *device, char *cmd)
 327{
 328	char *envp[] = { "HOME=/",
 329			"TERM=linux",
 330			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 331			 (char[20]) { }, /* address family */
 332			 (char[60]) { }, /* address */
 333			NULL };
 334	char mb[12];
 335	char *argv[] = {usermode_helper, cmd, mb, NULL };
 336	struct drbd_connection *connection = first_peer_device(device)->connection;
 337	struct sib_info sib;
 338	int ret;
 339
 340	if (current == connection->worker.task)
 341		set_bit(CALLBACK_PENDING, &connection->flags);
 342
 343	snprintf(mb, 12, "minor-%d", device_to_minor(device));
 344	setup_khelper_env(connection, envp);
 345
 346	/* The helper may take some time.
 347	 * write out any unsynced meta data changes now */
 348	drbd_md_sync(device);
 349
 350	drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 351	sib.sib_reason = SIB_HELPER_PRE;
 352	sib.helper_name = cmd;
 353	drbd_bcast_event(device, &sib);
 354	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 355	if (ret)
 356		drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
 357				usermode_helper, cmd, mb,
 358				(ret >> 8) & 0xff, ret);
 359	else
 360		drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
 361				usermode_helper, cmd, mb,
 362				(ret >> 8) & 0xff, ret);
 363	sib.sib_reason = SIB_HELPER_POST;
 364	sib.helper_exit_code = ret;
 365	drbd_bcast_event(device, &sib);
 366
 367	if (current == connection->worker.task)
 368		clear_bit(CALLBACK_PENDING, &connection->flags);
 369
 370	if (ret < 0) /* Ignore any ERRNOs we got. */
 371		ret = 0;
 372
 373	return ret;
 374}
 375
 376static int conn_khelper(struct drbd_connection *connection, char *cmd)
 377{
 378	char *envp[] = { "HOME=/",
 379			"TERM=linux",
 380			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 381			 (char[20]) { }, /* address family */
 382			 (char[60]) { }, /* address */
 383			NULL };
 384	char *resource_name = connection->resource->name;
 385	char *argv[] = {usermode_helper, cmd, resource_name, NULL };
 386	int ret;
 387
 388	setup_khelper_env(connection, envp);
 389	conn_md_sync(connection);
 390
 391	drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
 392	/* TODO: conn_bcast_event() ?? */
 393
 394	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 395	if (ret)
 396		drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 397			  usermode_helper, cmd, resource_name,
 398			  (ret >> 8) & 0xff, ret);
 399	else
 400		drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 401			  usermode_helper, cmd, resource_name,
 402			  (ret >> 8) & 0xff, ret);
 403	/* TODO: conn_bcast_event() ?? */
 404
 405	if (ret < 0) /* Ignore any ERRNOs we got. */
 406		ret = 0;
 407
 408	return ret;
 409}
 410
 411static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
 412{
 413	enum drbd_fencing_p fp = FP_NOT_AVAIL;
 414	struct drbd_peer_device *peer_device;
 415	int vnr;
 416
 417	rcu_read_lock();
 418	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 419		struct drbd_device *device = peer_device->device;
 420		if (get_ldev_if_state(device, D_CONSISTENT)) {
 421			struct disk_conf *disk_conf =
 422				rcu_dereference(peer_device->device->ldev->disk_conf);
 423			fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing);
 424			put_ldev(device);
 425		}
 426	}
 427	rcu_read_unlock();
 428
 429	return fp;
 430}
 431
     /**
      * conn_try_outdate_peer() - run the "fence-peer" helper and apply its verdict
      * @connection:	connection to the peer that should be fenced.
      *
      * Only acts while the connection is below C_WF_REPORT_PARAMS.  Depending on
      * the highest fencing policy of the connection's volumes, the helper is
      * invoked, and its exit code (bits 8..15 of the helper return value) is
      * translated into a disk / peer disk state change request.
      *
      * Returns false if the connection state was unexpected or the helper
      * returned an unknown exit code, true if the fencing policy is
      * FP_DONT_CARE, and otherwise whether conn_highest_pdsk() is D_OUTDATED or
      * less after the state change was requested.
      */
  432bool conn_try_outdate_peer(struct drbd_connection *connection)
  433{
  434	unsigned int connect_cnt;
  435	union drbd_state mask = { };
  436	union drbd_state val = { };
  437	enum drbd_fencing_p fp;
  438	char *ex_to_string;
  439	int r;
  440
  441	if (connection->cstate >= C_WF_REPORT_PARAMS) {
  442		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
  443		return false;
  444	}
  445
     	/* remember connect_cnt, to notice below if it changed
     	 * while the helper was running */
  446	spin_lock_irq(&connection->resource->req_lock);
  447	connect_cnt = connection->connect_cnt;
  448	spin_unlock_irq(&connection->resource->req_lock);
  449
  450	fp = highest_fencing_policy(connection);
  451	switch (fp) {
  452	case FP_NOT_AVAIL:
  453		drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
     		/* mask and val are still all zero here */
  454		goto out;
  455	case FP_DONT_CARE:
  456		return true;
  457	default: ;
  458	}
  459
  460	r = conn_khelper(connection, "fence-peer");
  461
     	/* dispatch on the exit code of the helper */
  462	switch ((r>>8) & 0xff) {
  463	case 3: /* peer is inconsistent */
  464		ex_to_string = "peer is inconsistent or worse";
  465		mask.pdsk = D_MASK;
  466		val.pdsk = D_INCONSISTENT;
  467		break;
  468	case 4: /* peer got outdated, or was already outdated */
  469		ex_to_string = "peer was fenced";
  470		mask.pdsk = D_MASK;
  471		val.pdsk = D_OUTDATED;
  472		break;
  473	case 5: /* peer was down */
  474		if (conn_highest_disk(connection) == D_UP_TO_DATE) {
  475			/* we will(have) create(d) a new UUID anyways... */
  476			ex_to_string = "peer is unreachable, assumed to be dead";
  477			mask.pdsk = D_MASK;
  478			val.pdsk = D_OUTDATED;
  479		} else {
  480			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
  481		}
  482		break;
  483	case 6: /* Peer is primary, voluntarily outdate myself.
  484		 * This is useful when an unconnected R_SECONDARY is asked to
  485		 * become R_PRIMARY, but finds the other peer being active. */
  486		ex_to_string = "peer is active";
  487		drbd_warn(connection, "Peer is primary, outdating myself.\n");
  488		mask.disk = D_MASK;
  489		val.disk = D_OUTDATED;
  490		break;
  491	case 7:
  492		if (fp != FP_STONITH)
  493			drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
  494		ex_to_string = "peer was stonithed";
  495		mask.pdsk = D_MASK;
  496		val.pdsk = D_OUTDATED;
  497		break;
  498	default:
  499		/* The script is broken ... */
  500		drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
  501		return false; /* Eventually leave IO frozen */
  502	}
  503
  504	drbd_info(connection, "fence-peer helper returned %d (%s)\n",
  505		  (r>>8) & 0xff, ex_to_string);
  506
  507 out:
  508
  509	/* Not using
  510	   conn_request_state(connection, mask, val, CS_VERBOSE);
  511	   here, because we might have been able to re-establish the connection
  512	   in the meantime. */
  513	spin_lock_irq(&connection->resource->req_lock);
  514	if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
  515		if (connection->connect_cnt != connect_cnt)
  516			/* In case the connection was established and dropped
  517			   while the fence-peer handler was running, ignore it */
  518			drbd_info(connection, "Ignoring fence-peer exit code\n");
  519		else
  520			_conn_request_state(connection, mask, val, CS_VERBOSE);
  521	}
  522	spin_unlock_irq(&connection->resource->req_lock);
  523
  524	return conn_highest_pdsk(connection) <= D_OUTDATED;
  525}
 526
 527static int _try_outdate_peer_async(void *data)
 528{
 529	struct drbd_connection *connection = (struct drbd_connection *)data;
 530
 531	conn_try_outdate_peer(connection);
 532
 533	kref_put(&connection->kref, drbd_destroy_connection);
 534	return 0;
 535}
 536
 537void conn_try_outdate_peer_async(struct drbd_connection *connection)
 538{
 539	struct task_struct *opa;
 540
 541	kref_get(&connection->kref);
 542	opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
 543	if (IS_ERR(opa)) {
 544		drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
 545		kref_put(&connection->kref, drbd_destroy_connection);
 546	}
 547}
 548
     /**
      * drbd_set_role() - change the role of a device
      * @device:	DRBD device.
      * @new_role:	role to switch to.
      * @force:	if non-zero, a not-UpToDate local disk (D_INCONSISTENT or
      *		better) may be declared UpToDate, and the peer disk may be
      *		declared Outdated even though fencing did not succeed.
      *
      * Requests the role change up to max_tries times.  After a failed attempt,
      * mask/val may be adjusted (each "continue" below) before trying again.
      * On success, in-flight requests are waited for, and the read-only flag of
      * the disk, the UUIDs and the meta data are updated.
      *
      * Runs with device->state_mutex held.
      * Returns the result of the last state change request.
      */
  549enum drbd_state_rv
  550drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
  551{
  552	const int max_tries = 4;
  553	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
  554	struct net_conf *nc;
  555	int try = 0;
  556	int forced = 0;
  557	union drbd_state mask, val;
  558
  559	if (new_role == R_PRIMARY) {
  560		struct drbd_connection *connection;
  561
  562		/* Detect dead peers as soon as possible.  */
  563
  564		rcu_read_lock();
  565		for_each_connection(connection, device->resource)
  566			request_ping(connection);
  567		rcu_read_unlock();
  568	}
  569
  570	mutex_lock(device->state_mutex);
  571
     	/* initially, request a change of the role only */
  572	mask.i = 0; mask.role = R_MASK;
  573	val.i  = 0; val.role  = new_role;
  574
  575	while (try++ < max_tries) {
  576		rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE);
  577
  578		/* in case we first succeeded to outdate,
  579		 * but now suddenly could establish a connection */
  580		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
  581			val.pdsk = 0;
  582			mask.pdsk = 0;
  583			continue;
  584		}
  585
     		/* forced: additionally request our own disk to become UpToDate */
  586		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
  587		    (device->state.disk < D_UP_TO_DATE &&
  588		     device->state.disk >= D_INCONSISTENT)) {
  589			mask.disk = D_MASK;
  590			val.disk  = D_UP_TO_DATE;
  591			forced = 1;
  592			continue;
  593		}
  594
     		/* only Consistent: try to fence the peer; if that worked,
     		 * additionally request our disk to become UpToDate */
  595		if (rv == SS_NO_UP_TO_DATE_DISK &&
  596		    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
  597			D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
  598
  599			if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
  600				val.disk = D_UP_TO_DATE;
  601				mask.disk = D_MASK;
  602			}
  603			continue;
  604		}
  605
  606		if (rv == SS_NOTHING_TO_DO)
  607			goto out;
  608		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
  609			if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
  610				drbd_warn(device, "Forced into split brain situation!\n");
  611				mask.pdsk = D_MASK;
  612				val.pdsk  = D_OUTDATED;
  613
  614			}
  615			continue;
  616		}
  617		if (rv == SS_TWO_PRIMARIES) {
  618			/* Maybe the peer is detected as dead very soon...
  619			   retry at most once more in this case. */
  620			int timeo;
  621			rcu_read_lock();
  622			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
  623			timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
  624			rcu_read_unlock();
  625			schedule_timeout_interruptible(timeo);
     			/* leave room for exactly one more iteration */
  626			if (try < max_tries)
  627				try = max_tries - 1;
  628			continue;
  629		}
     		/* any other failure: repeat the request once with CS_VERBOSE */
  630		if (rv < SS_SUCCESS) {
  631			rv = _drbd_request_state(device, mask, val,
  632						CS_VERBOSE + CS_WAIT_COMPLETE);
  633			if (rv < SS_SUCCESS)
  634				goto out;
  635		}
  636		break;
  637	}
  638
     	/* also reached with a failure when the loop ran out of tries */
  639	if (rv < SS_SUCCESS)
  640		goto out;
  641
  642	if (forced)
  643		drbd_warn(device, "Forced to consider local data as UpToDate!\n");
  644
  645	/* Wait until nothing is on the fly :) */
  646	wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
  647
  648	/* FIXME also wait for all pending P_BARRIER_ACK? */
  649
  650	if (new_role == R_SECONDARY) {
  651		set_disk_ro(device->vdisk, true);
  652		if (get_ldev(device)) {
     			/* clear the lowest bit of the current UUID;
     			 * the R_PRIMARY branch below sets it */
  653			device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
  654			put_ldev(device);
  655		}
  656	} else {
  657		mutex_lock(&device->resource->conf_update);
  658		nc = first_peer_device(device)->connection->net_conf;
  659		if (nc)
  660			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
  661		mutex_unlock(&device->resource->conf_update);
  662
  663		set_disk_ro(device->vdisk, false);
  664		if (get_ldev(device)) {
  665			if (((device->state.conn < C_CONNECTED ||
  666			       device->state.pdsk <= D_FAILED)
  667			      && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
  668				drbd_uuid_new_current(device);
  669
  670			device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
  671			put_ldev(device);
  672		}
  673	}
  674
  675	/* writeout of activity log covered areas of the bitmap
  676	 * to stable storage done in after state change already */
  677
  678	if (device->state.conn >= C_WF_REPORT_PARAMS) {
  679		/* if this was forced, we should consider sync */
  680		if (forced)
  681			drbd_send_uuids(first_peer_device(device));
  682		drbd_send_current_state(first_peer_device(device));
  683	}
  684
  685	drbd_md_sync(device);
  686
  687	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
  688out:
  689	mutex_unlock(device->state_mutex);
  690	return rv;
  691}
 692
 693static const char *from_attrs_err_to_txt(int err)
 694{
 695	return	err == -ENOMSG ? "required attribute missing" :
 696		err == -EOPNOTSUPP ? "unknown mandatory attribute" :
 697		err == -EEXIST ? "can not change invariant setting" :
 698		"invalid attribute value";
 699}
 700
 701int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 702{
 703	struct set_role_parms parms;
 704	int err;
 705	enum drbd_ret_code retcode;
 706
 707	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
 708	if (!adm_ctx.reply_skb)
 709		return retcode;
 710	if (retcode != NO_ERROR)
 711		goto out;
 712
 713	memset(&parms, 0, sizeof(parms));
 714	if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
 715		err = set_role_parms_from_attrs(&parms, info);
 716		if (err) {
 717			retcode = ERR_MANDATORY_TAG;
 718			drbd_msg_put_info(from_attrs_err_to_txt(err));
 719			goto out;
 720		}
 721	}
 722
 723	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
 724		retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
 725	else
 726		retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
 727out:
 728	drbd_adm_finish(info, retcode);
 729	return 0;
 730}
 731
 732/* Initializes the md.*_offset members, so we are able to find
 733 * the on disk meta data.
 734 *
 735 * We currently have two possible layouts:
 736 * external:
 737 *   |----------- md_size_sect ------------------|
 738 *   [ 4k superblock ][ activity log ][  Bitmap  ]
 739 *   | al_offset == 8 |
 740 *   | bm_offset = al_offset + X      |
 741 *  ==> bitmap sectors = md_size_sect - bm_offset
 742 *
 743 * internal:
 744 *            |----------- md_size_sect ------------------|
 745 * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
 746 *                        | al_offset < 0 |
 747 *            | bm_offset = al_offset - Y |
 748 *  ==> bitmap sectors = Y = al_offset - bm_offset
 749 *
 750 *  Activity log size used to be fixed 32kB,
 751 *  but is about to become configurable.
 752 */
 753static void drbd_md_set_sector_offsets(struct drbd_device *device,
 754				       struct drbd_backing_dev *bdev)
 755{
 756	sector_t md_size_sect = 0;
 757	unsigned int al_size_sect = bdev->md.al_size_4k * 8;
 758
 759	bdev->md.md_offset = drbd_md_ss(bdev);
 760
 761	switch (bdev->md.meta_dev_idx) {
 762	default:
 763		/* v07 style fixed size indexed meta data */
 764		bdev->md.md_size_sect = MD_128MB_SECT;
 765		bdev->md.al_offset = MD_4kB_SECT;
 766		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 767		break;
 768	case DRBD_MD_INDEX_FLEX_EXT:
 769		/* just occupy the full device; unit: sectors */
 770		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
 771		bdev->md.al_offset = MD_4kB_SECT;
 772		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 773		break;
 774	case DRBD_MD_INDEX_INTERNAL:
 775	case DRBD_MD_INDEX_FLEX_INT:
 776		/* al size is still fixed */
 777		bdev->md.al_offset = -al_size_sect;
 778		/* we need (slightly less than) ~ this much bitmap sectors: */
 779		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
 780		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
 781		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
 782		md_size_sect = ALIGN(md_size_sect, 8);
 783
 784		/* plus the "drbd meta data super block",
 785		 * and the activity log; */
 786		md_size_sect += MD_4kB_SECT + al_size_sect;
 787
 788		bdev->md.md_size_sect = md_size_sect;
 789		/* bitmap offset is adjusted by 'super' block size */
 790		bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
 791		break;
 792	}
 793}
 794
 795/* input size is expected to be in KB */
 796char *ppsize(char *buf, unsigned long long size)
 797{
 798	/* Needs 9 bytes at max including trailing NUL:
 799	 * -1ULL ==> "16384 EB" */
 800	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 801	int base = 0;
 802	while (size >= 10000 && base < sizeof(units)-1) {
 803		/* shift + round */
 804		size = (size >> 10) + !!(size & (1<<9));
 805		base++;
 806	}
 807	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
 808
 809	return buf;
 810}
 811
 812/* there is still a theoretical deadlock when called from receiver
 813 * on an D_INCONSISTENT R_PRIMARY:
 814 *  remote READ does inc_ap_bio, receiver would need to receive answer
 815 *  packet from remote to dec_ap_bio again.
 816 *  receiver receive_sizes(), comes here,
 817 *  waits for ap_bio_cnt == 0. -> deadlock.
 818 * but this cannot happen, actually, because:
 819 *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 820 *  (not connected, or bad/no disk on peer):
 821 *  see drbd_fail_request_early, ap_bio_cnt is zero.
 822 *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 823 *  peer may not initiate a resize.
 824 */
 825/* Note these are not to be confused with
 826 * drbd_adm_suspend_io/drbd_adm_resume_io,
 827 * which are (sub) state changes triggered by admin (drbdsetup),
 828 * and can be long lived.
 829 * This changes an device->flag, is triggered by drbd internals,
 830 * and should be short-lived. */
 831void drbd_suspend_io(struct drbd_device *device)
 832{
 833	set_bit(SUSPEND_IO, &device->flags);
 834	if (drbd_suspended(device))
 835		return;
 836	wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
 837}
 838
     /* Undo drbd_suspend_io(): clear SUSPEND_IO again,
      * and wake up whoever sleeps on device->misc_wait. */
  839void drbd_resume_io(struct drbd_device *device)
  840{
  841	clear_bit(SUSPEND_IO, &device->flags);
  842	wake_up(&device->misc_wait);
  843}
 844
 845/**
 846 * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
 847 * @device:	DRBD device.
 848 *
 849 * Returns 0 on success, negative return values indicate errors.
 850 * You should call drbd_md_sync() after calling this function.
 851 */
 852enum determine_dev_size
 853drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 854{
 855	sector_t prev_first_sect, prev_size; /* previous meta location */
 856	sector_t la_size_sect, u_size;
 857	struct drbd_md *md = &device->ldev->md;
 858	u32 prev_al_stripe_size_4k;
 859	u32 prev_al_stripes;
 860	sector_t size;
 861	char ppb[10];
 862	void *buffer;
 863
 864	int md_moved, la_size_changed;
 865	enum determine_dev_size rv = DS_UNCHANGED;
 866
 867	/* race:
 868	 * application request passes inc_ap_bio,
 869	 * but then cannot get an AL-reference.
 870	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
 871	 *
 872	 * to avoid that:
 873	 * Suspend IO right here.
 874	 * still lock the act_log to not trigger ASSERTs there.
 875	 */
 876	drbd_suspend_io(device);
 877	buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
 878	if (!buffer) {
 879		drbd_resume_io(device);
 880		return DS_ERROR;
 881	}
 882
 883	/* no wait necessary anymore, actually we could assert that */
 884	wait_event(device->al_wait, lc_try_lock(device->act_log));
 885
 886	prev_first_sect = drbd_md_first_sector(device->ldev);
 887	prev_size = device->ldev->md.md_size_sect;
 888	la_size_sect = device->ldev->md.la_size_sect;
 889
 890	if (rs) {
 891		/* rs is non NULL if we should change the AL layout only */
 892
 893		prev_al_stripes = md->al_stripes;
 894		prev_al_stripe_size_4k = md->al_stripe_size_4k;
 895
 896		md->al_stripes = rs->al_stripes;
 897		md->al_stripe_size_4k = rs->al_stripe_size / 4;
 898		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
 899	}
 900
 901	drbd_md_set_sector_offsets(device, device->ldev);
 902
 903	rcu_read_lock();
 904	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 905	rcu_read_unlock();
 906	size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
 907
 908	if (size < la_size_sect) {
 909		if (rs && u_size == 0) {
 910			/* Remove "rs &&" later. This check should always be active, but
 911			   right now the receiver expects the permissive behavior */
 912			drbd_warn(device, "Implicit shrink not allowed. "
 913				 "Use --size=%llus for explicit shrink.\n",
 914				 (unsigned long long)size);
 915			rv = DS_ERROR_SHRINK;
 916		}
 917		if (u_size > size)
 918			rv = DS_ERROR_SPACE_MD;
 919		if (rv != DS_UNCHANGED)
 920			goto err_out;
 921	}
 922
 923	if (drbd_get_capacity(device->this_bdev) != size ||
 924	    drbd_bm_capacity(device) != size) {
 925		int err;
 926		err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
 927		if (unlikely(err)) {
 928			/* currently there is only one error: ENOMEM! */
 929			size = drbd_bm_capacity(device)>>1;
 930			if (size == 0) {
 931				drbd_err(device, "OUT OF MEMORY! "
 932				    "Could not allocate bitmap!\n");
 933			} else {
 934				drbd_err(device, "BM resizing failed. "
 935				    "Leaving size unchanged at size = %lu KB\n",
 936				    (unsigned long)size);
 937			}
 938			rv = DS_ERROR;
 939		}
 940		/* racy, see comments above. */
 941		drbd_set_my_capacity(device, size);
 942		device->ldev->md.la_size_sect = size;
 943		drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 944		     (unsigned long long)size>>1);
 945	}
 946	if (rv <= DS_ERROR)
 947		goto err_out;
 948
 949	la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
 950
 951	md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
 952		|| prev_size	   != device->ldev->md.md_size_sect;
 953
 954	if (la_size_changed || md_moved || rs) {
 955		u32 prev_flags;
 956
 957		drbd_al_shrink(device); /* All extents inactive. */
 958
 959		prev_flags = md->flags;
 960		md->flags &= ~MDF_PRIMARY_IND;
 961		drbd_md_write(device, buffer);
 962
 963		drbd_info(device, "Writing the whole bitmap, %s\n",
 964			 la_size_changed && md_moved ? "size changed and md moved" :
 965			 la_size_changed ? "size changed" : "md moved");
 966		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 967		drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 968			       "size changed", BM_LOCKED_MASK);
 969		drbd_initialize_al(device, buffer);
 970
 971		md->flags = prev_flags;
 972		drbd_md_write(device, buffer);
 973
 974		if (rs)
 975			drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
 976				  md->al_stripes, md->al_stripe_size_4k * 4);
 977	}
 978
 979	if (size > la_size_sect)
 980		rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
 981	if (size < la_size_sect)
 982		rv = DS_SHRUNK;
 983
 984	if (0) {
 985	err_out:
 986		if (rs) {
 987			md->al_stripes = prev_al_stripes;
 988			md->al_stripe_size_4k = prev_al_stripe_size_4k;
 989			md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
 990
 991			drbd_md_set_sector_offsets(device, device->ldev);
 992		}
 993	}
 994	lc_unlock(device->act_log);
 995	wake_up(&device->al_wait);
 996	drbd_md_put_buffer(device);
 997	drbd_resume_io(device);
 998
 999	return rv;
1000}
1001
1002sector_t
1003drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
1004		  sector_t u_size, int assume_peer_has_space)
1005{
1006	sector_t p_size = device->p_size;   /* partner's disk size. */
1007	sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
1008	sector_t m_size; /* my size */
1009	sector_t size = 0;
1010
1011	m_size = drbd_get_max_capacity(bdev);
1012
1013	if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
1014		drbd_warn(device, "Resize while not connected was forced by the user!\n");
1015		p_size = m_size;
1016	}
1017
1018	if (p_size && m_size) {
1019		size = min_t(sector_t, p_size, m_size);
1020	} else {
1021		if (la_size_sect) {
1022			size = la_size_sect;
1023			if (m_size && m_size < size)
1024				size = m_size;
1025			if (p_size && p_size < size)
1026				size = p_size;
1027		} else {
1028			if (m_size)
1029				size = m_size;
1030			if (p_size)
1031				size = p_size;
1032		}
1033	}
1034
1035	if (size == 0)
1036		drbd_err(device, "Both nodes diskless!\n");
1037
1038	if (u_size) {
1039		if (u_size > size)
1040			drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
1041			    (unsigned long)u_size>>1, (unsigned long)size>>1);
1042		else
1043			size = u_size;
1044	}
1045
1046	return size;
1047}
1048
1049/**
1050 * drbd_check_al_size() - Ensures that the AL is of the right size
1051 * @device:	DRBD device.
1052 *
1053 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
1054 * failed, and 0 on success. You should call drbd_md_sync() after you called
1055 * this function.
1056 */
1057static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1058{
1059	struct lru_cache *n, *t;
1060	struct lc_element *e;
1061	unsigned int in_use;
1062	int i;
1063
1064	if (device->act_log &&
1065	    device->act_log->nr_elements == dc->al_extents)
1066		return 0;
1067
1068	in_use = 0;
1069	t = device->act_log;
1070	n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
1071		dc->al_extents, sizeof(struct lc_element), 0);
1072
1073	if (n == NULL) {
1074		drbd_err(device, "Cannot allocate act_log lru!\n");
1075		return -ENOMEM;
1076	}
1077	spin_lock_irq(&device->al_lock);
1078	if (t) {
1079		for (i = 0; i < t->nr_elements; i++) {
1080			e = lc_element_by_index(t, i);
1081			if (e->refcnt)
1082				drbd_err(device, "refcnt(%d)==%d\n",
1083				    e->lc_number, e->refcnt);
1084			in_use += e->refcnt;
1085		}
1086	}
1087	if (!in_use)
1088		device->act_log = n;
1089	spin_unlock_irq(&device->al_lock);
1090	if (in_use) {
1091		drbd_err(device, "Activity log still in use!\n");
1092		lc_destroy(n);
1093		return -EBUSY;
1094	} else {
1095		if (t)
1096			lc_destroy(t);
1097	}
1098	drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
1099	return 0;
1100}
1101
1102static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
1103{
1104	struct request_queue * const q = device->rq_queue;
1105	unsigned int max_hw_sectors = max_bio_size >> 9;
1106	unsigned int max_segments = 0;
1107
1108	if (get_ldev_if_state(device, D_ATTACHING)) {
1109		struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1110
1111		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1112		rcu_read_lock();
1113		max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
1114		rcu_read_unlock();
1115		put_ldev(device);
1116	}
1117
1118	blk_queue_logical_block_size(q, 512);
1119	blk_queue_max_hw_sectors(q, max_hw_sectors);
1120	/* This is the workaround for "bio would need to, but cannot, be split" */
1121	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1122	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
1123
1124	if (get_ldev_if_state(device, D_ATTACHING)) {
1125		struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1126
1127		blk_queue_stack_limits(q, b);
1128
1129		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
1130			drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
1131				 q->backing_dev_info.ra_pages,
1132				 b->backing_dev_info.ra_pages);
1133			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1134		}
1135		put_ldev(device);
1136	}
1137}
1138
1139void drbd_reconsider_max_bio_size(struct drbd_device *device)
1140{
1141	unsigned int now, new, local, peer;
1142
1143	now = queue_max_hw_sectors(device->rq_queue) << 9;
1144	local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
1145	peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
1146
1147	if (get_ldev_if_state(device, D_ATTACHING)) {
1148		local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
1149		device->local_max_bio_size = local;
1150		put_ldev(device);
1151	}
1152	local = min(local, DRBD_MAX_BIO_SIZE);
1153
1154	/* We may ignore peer limits if the peer is modern enough.
1155	   Because new from 8.3.8 onwards the peer can use multiple
1156	   BIOs for a single peer_request */
1157	if (device->state.conn >= C_WF_REPORT_PARAMS) {
1158		if (first_peer_device(device)->connection->agreed_pro_version < 94)
1159			peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1160			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
1161		else if (first_peer_device(device)->connection->agreed_pro_version == 94)
1162			peer = DRBD_MAX_SIZE_H80_PACKET;
1163		else if (first_peer_device(device)->connection->agreed_pro_version < 100)
1164			peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
1165		else
1166			peer = DRBD_MAX_BIO_SIZE;
1167	}
1168
1169	new = min(local, peer);
1170
1171	if (device->state.role == R_PRIMARY && new < now)
1172		drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
1173
1174	if (new != now)
1175		drbd_info(device, "max BIO size = %u\n", new);
1176
1177	drbd_setup_queue_param(device, new);
1178}
1179
1180/* Starts the worker thread */
1181static void conn_reconfig_start(struct drbd_connection *connection)
1182{
1183	drbd_thread_start(&connection->worker);
1184	drbd_flush_workqueue(&connection->sender_work);
1185}
1186
1187/* if still unconfigured, stops worker again. */
1188static void conn_reconfig_done(struct drbd_connection *connection)
1189{
1190	bool stop_threads;
1191	spin_lock_irq(&connection->resource->req_lock);
1192	stop_threads = conn_all_vols_unconf(connection) &&
1193		connection->cstate == C_STANDALONE;
1194	spin_unlock_irq(&connection->resource->req_lock);
1195	if (stop_threads) {
1196		/* asender is implicitly stopped by receiver
1197		 * in conn_disconnect() */
1198		drbd_thread_stop(&connection->receiver);
1199		drbd_thread_stop(&connection->worker);
1200	}
1201}
1202
1203/* Make sure IO is suspended before calling this function(). */
1204static void drbd_suspend_al(struct drbd_device *device)
1205{
1206	int s = 0;
1207
1208	if (!lc_try_lock(device->act_log)) {
1209		drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
1210		return;
1211	}
1212
1213	drbd_al_shrink(device);
1214	spin_lock_irq(&device->resource->req_lock);
1215	if (device->state.conn < C_CONNECTED)
1216		s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
1217	spin_unlock_irq(&device->resource->req_lock);
1218	lc_unlock(device->act_log);
1219
1220	if (s)
1221		drbd_info(device, "Suspended AL updates\n");
1222}
1223
1224
1225static bool should_set_defaults(struct genl_info *info)
1226{
1227	unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
1228	return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1229}
1230
1231static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1232{
1233	/* This is limited by 16 bit "slot" numbers,
1234	 * and by available on-disk context storage.
1235	 *
1236	 * Also (u16)~0 is special (denotes a "free" extent).
1237	 *
1238	 * One transaction occupies one 4kB on-disk block,
1239	 * we have n such blocks in the on disk ring buffer,
1240	 * the "current" transaction may fail (n-1),
1241	 * and there is 919 slot numbers context information per transaction.
1242	 *
1243	 * 72 transaction blocks amounts to more than 2**16 context slots,
1244	 * so cap there first.
1245	 */
1246	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1247	const unsigned int sufficient_on_disk =
1248		(max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
1249		/AL_CONTEXT_PER_TRANSACTION;
1250
1251	unsigned int al_size_4k = bdev->md.al_size_4k;
1252
1253	if (al_size_4k > sufficient_on_disk)
1254		return max_al_nr;
1255
1256	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1257}
1258
1259int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1260{
1261	enum drbd_ret_code retcode;
1262	struct drbd_device *device;
1263	struct disk_conf *new_disk_conf, *old_disk_conf;
1264	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
1265	int err, fifo_size;
1266
1267	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1268	if (!adm_ctx.reply_skb)
1269		return retcode;
1270	if (retcode != NO_ERROR)
1271		goto out;
1272
1273	device = adm_ctx.device;
1274
1275	/* we also need a disk
1276	 * to change the options on */
1277	if (!get_ldev(device)) {
1278		retcode = ERR_NO_DISK;
1279		goto out;
1280	}
1281
1282	new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
1283	if (!new_disk_conf) {
1284		retcode = ERR_NOMEM;
1285		goto fail;
1286	}
1287
1288	mutex_lock(&device->resource->conf_update);
1289	old_disk_conf = device->ldev->disk_conf;
1290	*new_disk_conf = *old_disk_conf;
1291	if (should_set_defaults(info))
1292		set_disk_conf_defaults(new_disk_conf);
1293
1294	err = disk_conf_from_attrs_for_change(new_disk_conf, info);
1295	if (err && err != -ENOMSG) {
1296		retcode = ERR_MANDATORY_TAG;
1297		drbd_msg_put_info(from_attrs_err_to_txt(err));
1298		goto fail_unlock;
1299	}
1300
1301	if (!expect(new_disk_conf->resync_rate >= 1))
1302		new_disk_conf->resync_rate = 1;
1303
1304	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1305		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1306	if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
1307		new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
1308
1309	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1310		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1311
1312	fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1313	if (fifo_size != device->rs_plan_s->size) {
1314		new_plan = fifo_alloc(fifo_size);
1315		if (!new_plan) {
1316			drbd_err(device, "kmalloc of fifo_buffer failed");
1317			retcode = ERR_NOMEM;
1318			goto fail_unlock;
1319		}
1320	}
1321
1322	drbd_suspend_io(device);
1323	wait_event(device->al_wait, lc_try_lock(device->act_log));
1324	drbd_al_shrink(device);
1325	err = drbd_check_al_size(device, new_disk_conf);
1326	lc_unlock(device->act_log);
1327	wake_up(&device->al_wait);
1328	drbd_resume_io(device);
1329
1330	if (err) {
1331		retcode = ERR_NOMEM;
1332		goto fail_unlock;
1333	}
1334
1335	write_lock_irq(&global_state_lock);
1336	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1337	if (retcode == NO_ERROR) {
1338		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
1339		drbd_resync_after_changed(device);
1340	}
1341	write_unlock_irq(&global_state_lock);
1342
1343	if (retcode != NO_ERROR)
1344		goto fail_unlock;
1345
1346	if (new_plan) {
1347		old_plan = device->rs_plan_s;
1348		rcu_assign_pointer(device->rs_plan_s, new_plan);
1349	}
1350
1351	mutex_unlock(&device->resource->conf_update);
1352
1353	if (new_disk_conf->al_updates)
1354		device->ldev->md.flags &= ~MDF_AL_DISABLED;
1355	else
1356		device->ldev->md.flags |= MDF_AL_DISABLED;
1357
1358	if (new_disk_conf->md_flushes)
1359		clear_bit(MD_NO_FUA, &device->flags);
1360	else
1361		set_bit(MD_NO_FUA, &device->flags);
1362
1363	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1364
1365	drbd_md_sync(device);
1366
1367	if (device->state.conn >= C_CONNECTED) {
1368		struct drbd_peer_device *peer_device;
1369
1370		for_each_peer_device(peer_device, device)
1371			drbd_send_sync_param(peer_device);
1372	}
1373
1374	synchronize_rcu();
1375	kfree(old_disk_conf);
1376	kfree(old_plan);
1377	mod_timer(&device->request_timer, jiffies + HZ);
1378	goto success;
1379
1380fail_unlock:
1381	mutex_unlock(&device->resource->conf_update);
1382 fail:
1383	kfree(new_disk_conf);
1384	kfree(new_plan);
1385success:
1386	put_ldev(device);
1387 out:
1388	drbd_adm_finish(info, retcode);
1389	return 0;
1390}
1391
1392int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1393{
1394	struct drbd_device *device;
1395	int err;
1396	enum drbd_ret_code retcode;
1397	enum determine_dev_size dd;
1398	sector_t max_possible_sectors;
1399	sector_t min_md_device_sectors;
1400	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1401	struct disk_conf *new_disk_conf = NULL;
1402	struct block_device *bdev;
1403	struct lru_cache *resync_lru = NULL;
1404	struct fifo_buffer *new_plan = NULL;
1405	union drbd_state ns, os;
1406	enum drbd_state_rv rv;
1407	struct net_conf *nc;
1408
1409	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1410	if (!adm_ctx.reply_skb)
1411		return retcode;
1412	if (retcode != NO_ERROR)
1413		goto finish;
1414
1415	device = adm_ctx.device;
1416	conn_reconfig_start(first_peer_device(device)->connection);
1417
1418	/* if you want to reconfigure, please tear down first */
1419	if (device->state.disk > D_DISKLESS) {
1420		retcode = ERR_DISK_CONFIGURED;
1421		goto fail;
1422	}
1423	/* It may just now have detached because of IO error.  Make sure
1424	 * drbd_ldev_destroy is done already, we may end up here very fast,
1425	 * e.g. if someone calls attach from the on-io-error handler,
1426	 * to realize a "hot spare" feature (not that I'd recommend that) */
1427	wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
1428
1429	/* make sure there is no leftover from previous force-detach attempts */
1430	clear_bit(FORCE_DETACH, &device->flags);
1431	clear_bit(WAS_IO_ERROR, &device->flags);
1432	clear_bit(WAS_READ_ERROR, &device->flags);
1433
1434	/* and no leftover from previously aborted resync or verify, either */
1435	device->rs_total = 0;
1436	device->rs_failed = 0;
1437	atomic_set(&device->rs_pending_cnt, 0);
1438
1439	/* allocation not in the IO path, drbdsetup context */
1440	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
1441	if (!nbc) {
1442		retcode = ERR_NOMEM;
1443		goto fail;
1444	}
1445	spin_lock_init(&nbc->md.uuid_lock);
1446
1447	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
1448	if (!new_disk_conf) {
1449		retcode = ERR_NOMEM;
1450		goto fail;
1451	}
1452	nbc->disk_conf = new_disk_conf;
1453
1454	set_disk_conf_defaults(new_disk_conf);
1455	err = disk_conf_from_attrs(new_disk_conf, info);
1456	if (err) {
1457		retcode = ERR_MANDATORY_TAG;
1458		drbd_msg_put_info(from_attrs_err_to_txt(err));
1459		goto fail;
1460	}
1461
1462	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1463		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1464
1465	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1466	if (!new_plan) {
1467		retcode = ERR_NOMEM;
1468		goto fail;
1469	}
1470
1471	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
1472		retcode = ERR_MD_IDX_INVALID;
1473		goto fail;
1474	}
1475
1476	write_lock_irq(&global_state_lock);
1477	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1478	write_unlock_irq(&global_state_lock);
1479	if (retcode != NO_ERROR)
1480		goto fail;
1481
1482	rcu_read_lock();
1483	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
1484	if (nc) {
1485		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
1486			rcu_read_unlock();
1487			retcode = ERR_STONITH_AND_PROT_A;
1488			goto fail;
1489		}
1490	}
1491	rcu_read_unlock();
1492
1493	bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
1494				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
1495	if (IS_ERR(bdev)) {
1496		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1497			PTR_ERR(bdev));
1498		retcode = ERR_OPEN_DISK;
1499		goto fail;
1500	}
1501	nbc->backing_bdev = bdev;
1502
1503	/*
1504	 * meta_dev_idx >= 0: external fixed size, possibly multiple
1505	 * drbd sharing one meta device.  TODO in that case, paranoia
1506	 * check that [md_bdev, meta_dev_idx] is not yet used by some
1507	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
1508	 * should check it for you already; but if you don't, or
1509	 * someone fooled it, we need to double check here)
1510	 */
1511	bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1512				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1513				  (new_disk_conf->meta_dev_idx < 0) ?
1514				  (void *)device : (void *)drbd_m_holder);
1515	if (IS_ERR(bdev)) {
1516		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1517			PTR_ERR(bdev));
1518		retcode = ERR_OPEN_MD_DISK;
1519		goto fail;
1520	}
1521	nbc->md_bdev = bdev;
1522
1523	if ((nbc->backing_bdev == nbc->md_bdev) !=
1524	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1525	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
1526		retcode = ERR_MD_IDX_INVALID;
1527		goto fail;
1528	}
1529
1530	resync_lru = lc_create("resync", drbd_bm_ext_cache,
1531			1, 61, sizeof(struct bm_extent),
1532			offsetof(struct bm_extent, lce));
1533	if (!resync_lru) {
1534		retcode = ERR_NOMEM;
1535		goto fail;
1536	}
1537
1538	/* Read our meta data super block early.
1539	 * This also sets other on-disk offsets. */
1540	retcode = drbd_md_read(device, nbc);
1541	if (retcode != NO_ERROR)
1542		goto fail;
1543
1544	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1545		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1546	if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1547		new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1548
1549	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1550		drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
1551			(unsigned long long) drbd_get_max_capacity(nbc),
1552			(unsigned long long) new_disk_conf->disk_size);
1553		retcode = ERR_DISK_TOO_SMALL;
1554		goto fail;
1555	}
1556
1557	if (new_disk_conf->meta_dev_idx < 0) {
1558		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
1559		/* at least one MB, otherwise it does not make sense */
1560		min_md_device_sectors = (2<<10);
1561	} else {
1562		max_possible_sectors = DRBD_MAX_SECTORS;
1563		min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1564	}
1565
1566	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
1567		retcode = ERR_MD_DISK_TOO_SMALL;
1568		drbd_warn(device, "refusing attach: md-device too small, "
1569		     "at least %llu sectors needed for this meta-disk type\n",
1570		     (unsigned long long) min_md_device_sectors);
1571		goto fail;
1572	}
1573
1574	/* Make sure the new disk is big enough
1575	 * (we may currently be R_PRIMARY with no local disk...) */
1576	if (drbd_get_max_capacity(nbc) <
1577	    drbd_get_capacity(device->this_bdev)) {
1578		retcode = ERR_DISK_TOO_SMALL;
1579		goto fail;
1580	}
1581
1582	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
1583
1584	if (nbc->known_size > max_possible_sectors) {
1585		drbd_warn(device, "==> truncating very big lower level device "
1586			"to currently maximum possible %llu sectors <==\n",
1587			(unsigned long long) max_possible_sectors);
1588		if (new_disk_conf->meta_dev_idx >= 0)
1589			drbd_warn(device, "==>> using internal or flexible "
1590				      "meta data may help <<==\n");
1591	}
1592
1593	drbd_suspend_io(device);
1594	/* also wait for the last barrier ack. */
1595	/* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
1596	 * We need a way to either ignore barrier acks for barriers sent before a device
1597	 * was attached, or a way to wait for all pending barrier acks to come in.
1598	 * As barriers are counted per resource,
1599	 * we'd need to suspend io on all devices of a resource.
1600	 */
1601	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
1602	/* and for any other previously queued work */
1603	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
1604
1605	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
1606	retcode = rv;  /* FIXME: Type mismatch. */
1607	drbd_resume_io(device);
1608	if (rv < SS_SUCCESS)
1609		goto fail;
1610
1611	if (!get_ldev_if_state(device, D_ATTACHING))
1612		goto force_diskless;
1613
1614	if (!device->bitmap) {
1615		if (drbd_bm_init(device)) {
1616			retcode = ERR_NOMEM;
1617			goto force_diskless_dec;
1618		}
1619	}
1620
1621	if (device->state.conn < C_CONNECTED &&
1622	    device->state.role == R_PRIMARY &&
1623	    (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1624		drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
1625		    (unsigned long long)device->ed_uuid);
1626		retcode = ERR_DATA_NOT_CURRENT;
1627		goto force_diskless_dec;
1628	}
1629
1630	/* Since we are diskless, fix the activity log first... */
1631	if (drbd_check_al_size(device, new_disk_conf)) {
1632		retcode = ERR_NOMEM;
1633		goto force_diskless_dec;
1634	}
1635
1636	/* Prevent shrinking of consistent devices ! */
1637	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1638	    drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
1639		drbd_warn(device, "refusing to truncate a consistent device\n");
1640		retcode = ERR_DISK_TOO_SMALL;
1641		goto force_diskless_dec;
1642	}
1643
1644	/* Reset the "barriers don't work" bits here, then force meta data to
1645	 * be written, to ensure we determine if barriers are supported. */
1646	if (new_disk_conf->md_flushes)
1647		clear_bit(MD_NO_FUA, &device->flags);
1648	else
1649		set_bit(MD_NO_FUA, &device->flags);
1650
1651	/* Point of no return reached.
1652	 * Devices and memory are no longer released by error cleanup below.
1653	 * now device takes over responsibility, and the state engine should
1654	 * clean it up somewhere.  */
1655	D_ASSERT(device, device->ldev == NULL);
1656	device->ldev = nbc;
1657	device->resync = resync_lru;
1658	device->rs_plan_s = new_plan;
1659	nbc = NULL;
1660	resync_lru = NULL;
1661	new_disk_conf = NULL;
1662	new_plan = NULL;
1663
1664	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1665
1666	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1667		set_bit(CRASHED_PRIMARY, &device->flags);
1668	else
1669		clear_bit(CRASHED_PRIMARY, &device->flags);
1670
1671	if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1672	    !(device->state.role == R_PRIMARY && device->resource->susp_nod))
1673		set_bit(CRASHED_PRIMARY, &device->flags);
1674
1675	device->send_cnt = 0;
1676	device->recv_cnt = 0;
1677	device->read_cnt = 0;
1678	device->writ_cnt = 0;
1679
1680	drbd_reconsider_max_bio_size(device);
1681
1682	/* If I am currently not R_PRIMARY,
1683	 * but meta data primary indicator is set,
1684	 * I just now recover from a hard crash,
1685	 * and have been R_PRIMARY before that crash.
1686	 *
1687	 * Now, if I had no connection before that crash
1688	 * (have been degraded R_PRIMARY), chances are that
1689	 * I won't find my peer now either.
1690	 *
1691	 * In that case, and _only_ in that case,
1692	 * we use the degr-wfc-timeout instead of the default,
1693	 * so we can automatically recover from a crash of a
1694	 * degraded but active "cluster" after a certain timeout.
1695	 */
1696	clear_bit(USE_DEGR_WFC_T, &device->flags);
1697	if (device->state.role != R_PRIMARY &&
1698	     drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1699	    !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
1700		set_bit(USE_DEGR_WFC_T, &device->flags);
1701
1702	dd = drbd_determine_dev_size(device, 0, NULL);
1703	if (dd <= DS_ERROR) {
1704		retcode = ERR_NOMEM_BITMAP;
1705		goto force_diskless_dec;
1706	} else if (dd == DS_GREW)
1707		set_bit(RESYNC_AFTER_NEG, &device->flags);
1708
1709	if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
1710	    (test_bit(CRASHED_PRIMARY, &device->flags) &&
1711	     drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
1712		drbd_info(device, "Assuming that all blocks are out of sync "
1713		     "(aka FullSync)\n");
1714		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
1715			"set_n_write from attaching", BM_LOCKED_MASK)) {
1716			retcode = ERR_IO_MD_DISK;
1717			goto force_diskless_dec;
1718		}
1719	} else {
1720		if (drbd_bitmap_io(device, &drbd_bm_read,
1721			"read from attaching", BM_LOCKED_MASK)) {
1722			retcode = ERR_IO_MD_DISK;
1723			goto force_diskless_dec;
1724		}
1725	}
1726
1727	if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
1728		drbd_suspend_al(device); /* IO is still suspended here... */
1729
1730	spin_lock_irq(&device->resource->req_lock);
1731	os = drbd_read_state(device);
1732	ns = os;
1733	/* If MDF_CONSISTENT is not set go into inconsistent state,
1734	   otherwise investigate MDF_WasUpToDate...
1735	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1736	   otherwise into D_CONSISTENT state.
1737	*/
1738	if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
1739		if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
1740			ns.disk = D_CONSISTENT;
1741		else
1742			ns.disk = D_OUTDATED;
1743	} else {
1744		ns.disk = D_INCONSISTENT;
1745	}
1746
1747	if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
1748		ns.pdsk = D_OUTDATED;
1749
1750	rcu_read_lock();
1751	if (ns.disk == D_CONSISTENT &&
1752	    (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
1753		ns.disk = D_UP_TO_DATE;
1754
1755	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1756	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1757	   this point, because drbd_request_state() modifies these
1758	   flags. */
1759
1760	if (rcu_dereference(device->ldev->disk_conf)->al_updates)
1761		device->ldev->md.flags &= ~MDF_AL_DISABLED;
1762	else
1763		device->ldev->md.flags |= MDF_AL_DISABLED;
1764
1765	rcu_read_unlock();
1766
1767	/* In case we are C_CONNECTED postpone any decision on the new disk
1768	   state after the negotiation phase. */
1769	if (device->state.conn == C_CONNECTED) {
1770		device->new_state_tmp.i = ns.i;
1771		ns.i = os.i;
1772		ns.disk = D_NEGOTIATING;
1773
1774		/* We expect to receive up-to-date UUIDs soon.
1775		   To avoid a race in receive_state, free p_uuid while
1776		   holding req_lock. I.e. atomic with the state change */
1777		kfree(device->p_uuid);
1778		device->p_uuid = NULL;
1779	}
1780
1781	rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1782	spin_unlock_irq(&device->resource->req_lock);
1783
1784	if (rv < SS_SUCCESS)
1785		goto force_diskless_dec;
1786
1787	mod_timer(&device->request_timer, jiffies + HZ);
1788
1789	if (device->state.role == R_PRIMARY)
1790		device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
1791	else
1792		device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1793
1794	drbd_md_mark_dirty(device);
1795	drbd_md_sync(device);
1796
1797	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
1798	put_ldev(device);
1799	conn_reconfig_done(first_peer_device(device)->connection);
1800	drbd_adm_finish(info, retcode);
1801	return 0;
1802
1803 force_diskless_dec:
1804	put_ldev(device);
1805 force_diskless:
1806	drbd_force_state(device, NS(disk, D_DISKLESS));
1807	drbd_md_sync(device);
1808 fail:
1809	conn_reconfig_done(first_peer_device(device)->connection);
1810	if (nbc) {
1811		if (nbc->backing_bdev)
1812			blkdev_put(nbc->backing_bdev,
1813				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1814		if (nbc->md_bdev)
1815			blkdev_put(nbc->md_bdev,
1816				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1817		kfree(nbc);
1818	}
1819	kfree(new_disk_conf);
1820	lc_destroy(resync_lru);
1821	kfree(new_plan);
1822
1823 finish:
1824	drbd_adm_finish(info, retcode);
1825	return 0;
1826}
1827
1828static int adm_detach(struct drbd_device *device, int force)
1829{
1830	enum drbd_state_rv retcode;
1831	int ret;
1832
1833	if (force) {
1834		set_bit(FORCE_DETACH, &device->flags);
1835		drbd_force_state(device, NS(disk, D_FAILED));
1836		retcode = SS_SUCCESS;
1837		goto out;
1838	}
1839
1840	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1841	drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
1842	retcode = drbd_request_state(device, NS(disk, D_FAILED));
1843	drbd_md_put_buffer(device);
1844	/* D_FAILED will transition to DISKLESS. */
1845	ret = wait_event_interruptible(device->misc_wait,
1846			device->state.disk != D_FAILED);
1847	drbd_resume_io(device);
1848	if ((int)retcode == (int)SS_IS_DISKLESS)
1849		retcode = SS_NOTHING_TO_DO;
1850	if (ret)
1851		retcode = ERR_INTR;
1852out:
1853	return retcode;
1854}
1855
1856/* Detaching the disk is a process in multiple stages.  First we need to lock
1857 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1858 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1859 * internal references as well.
1860 * Only then we have finally detached. */
1861int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1862{
1863	enum drbd_ret_code retcode;
1864	struct detach_parms parms = { };
1865	int err;
1866
1867	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1868	if (!adm_ctx.reply_skb)
1869		return retcode;
1870	if (retcode != NO_ERROR)
1871		goto out;
1872
1873	if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
1874		err = detach_parms_from_attrs(&parms, info);
1875		if (err) {
1876			retcode = ERR_MANDATORY_TAG;
1877			drbd_msg_put_info(from_attrs_err_to_txt(err));
1878			goto out;
1879		}
1880	}
1881
1882	retcode = adm_detach(adm_ctx.device, parms.force_detach);
1883out:
1884	drbd_adm_finish(info, retcode);
1885	return 0;
1886}
1887
1888static bool conn_resync_running(struct drbd_connection *connection)
1889{
1890	struct drbd_peer_device *peer_device;
1891	bool rv = false;
1892	int vnr;
1893
1894	rcu_read_lock();
1895	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1896		struct drbd_device *device = peer_device->device;
1897		if (device->state.conn == C_SYNC_SOURCE ||
1898		    device->state.conn == C_SYNC_TARGET ||
1899		    device->state.conn == C_PAUSED_SYNC_S ||
1900		    device->state.conn == C_PAUSED_SYNC_T) {
1901			rv = true;
1902			break;
1903		}
1904	}
1905	rcu_read_unlock();
1906
1907	return rv;
1908}
1909
1910static bool conn_ov_running(struct drbd_connection *connection)
1911{
1912	struct drbd_peer_device *peer_device;
1913	bool rv = false;
1914	int vnr;
1915
1916	rcu_read_lock();
1917	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1918		struct drbd_device *device = peer_device->device;
1919		if (device->state.conn == C_VERIFY_S ||
1920		    device->state.conn == C_VERIFY_T) {
1921			rv = true;
1922			break;
1923		}
1924	}
1925	rcu_read_unlock();
1926
1927	return rv;
1928}
1929
1930static enum drbd_ret_code
1931_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
1932{
1933	struct drbd_peer_device *peer_device;
1934	int i;
1935
1936	if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
1937		if (new_net_conf->wire_protocol != old_net_conf->wire_protocol)
1938			return ERR_NEED_APV_100;
1939
1940		if (new_net_conf->two_primaries != old_net_conf->two_primaries)
1941			return ERR_NEED_APV_100;
1942
1943		if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg))
1944			return ERR_NEED_APV_100;
1945	}
1946
1947	if (!new_net_conf->two_primaries &&
1948	    conn_highest_role(connection) == R_PRIMARY &&
1949	    conn_highest_peer(connection) == R_PRIMARY)
1950		return ERR_NEED_ALLOW_TWO_PRI;
1951
1952	if (new_net_conf->two_primaries &&
1953	    (new_net_conf->wire_protocol != DRBD_PROT_C))
1954		return ERR_NOT_PROTO_C;
1955
1956	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
1957		struct drbd_device *device = peer_device->device;
1958		if (get_ldev(device)) {
1959			enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1960			put_ldev(device);
1961			if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
1962				return ERR_STONITH_AND_PROT_A;
1963		}
1964		if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data)
1965			return ERR_DISCARD_IMPOSSIBLE;
1966	}
1967
1968	if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A)
1969		return ERR_CONG_NOT_PROTO_A;
1970
1971	return NO_ERROR;
1972}
1973
1974static enum drbd_ret_code
1975check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
1976{
1977	static enum drbd_ret_code rv;
1978	struct drbd_peer_device *peer_device;
1979	int i;
1980
1981	rcu_read_lock();
1982	rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
1983	rcu_read_unlock();
1984
1985	/* connection->volumes protected by genl_lock() here */
1986	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
1987		struct drbd_device *device = peer_device->device;
1988		if (!device->bitmap) {
1989			if (drbd_bm_init(device))
1990				return ERR_NOMEM;
1991		}
1992	}
1993
1994	return rv;
1995}
1996
/*
 * Temporary holder for the hash transforms allocated by alloc_crypto()
 * for a new net_conf, before they are installed into the connection or
 * released again with free_crypto().
 */
struct crypto {
	struct crypto_hash *verify_tfm;		/* from net_conf->verify_alg */
	struct crypto_hash *csums_tfm;		/* from net_conf->csums_alg */
	struct crypto_hash *cram_hmac_tfm;	/* "hmac(<net_conf->cram_hmac_alg>)" */
	struct crypto_hash *integrity_tfm;	/* from net_conf->integrity_alg */
};
2003
2004static int
2005alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
2006{
2007	if (!tfm_name[0])
2008		return NO_ERROR;
2009
2010	*tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
2011	if (IS_ERR(*tfm)) {
2012		*tfm = NULL;
2013		return err_alg;
2014	}
2015
2016	return NO_ERROR;
2017}
2018
2019static enum drbd_ret_code
2020alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
2021{
2022	char hmac_name[CRYPTO_MAX_ALG_NAME];
2023	enum drbd_ret_code rv;
2024
2025	rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg,
2026		       ERR_CSUMS_ALG);
2027	if (rv != NO_ERROR)
2028		return rv;
2029	rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg,
2030		       ERR_VERIFY_ALG);
2031	if (rv != NO_ERROR)
2032		return rv;
2033	rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg,
2034		       ERR_INTEGRITY_ALG);
2035	if (rv != NO_ERROR)
2036		return rv;
2037	if (new_net_conf->cram_hmac_alg[0] != 0) {
2038		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
2039			 new_net_conf->cram_hmac_alg);
2040
2041		rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
2042			       ERR_AUTH_ALG);
2043	}
2044
2045	return rv;
2046}
2047
2048static void free_crypto(struct crypto *crypto)
2049{
2050	crypto_free_hash(crypto->cram_hmac_tfm);
2051	crypto_free_hash(crypto->integrity_tfm);
2052	crypto_free_hash(crypto->csums_tfm);
2053	crypto_free_hash(crypto->verify_tfm);
2054}
2055
2056int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2057{
2058	enum drbd_ret_code retcode;
2059	struct drbd_connection *connection;
2060	struct net_conf *old_net_conf, *new_net_conf = NULL;
2061	int err;
2062	int ovr; /* online verify running */
2063	int rsr; /* re-sync running */
2064	struct crypto crypto = { };
2065
2066	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2067	if (!adm_ctx.reply_skb)
2068		return retcode;
2069	if (retcode != NO_ERROR)
2070		goto out;
2071
2072	connection = adm_ctx.connection;
2073
2074	new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
2075	if (!new_net_conf) {
2076		retcode = ERR_NOMEM;
2077		goto out;
2078	}
2079
2080	conn_reconfig_start(connection);
2081
2082	mutex_lock(&connection->data.mutex);
2083	mutex_lock(&connection->resource->conf_update);
2084	old_net_conf = connection->net_conf;
2085
2086	if (!old_net_conf) {
2087		drbd_msg_put_info("net conf missing, try connect");
2088		retcode = ERR_INVALID_REQUEST;
2089		goto fail;
2090	}
2091
2092	*new_net_conf = *old_net_conf;
2093	if (should_set_defaults(info))
2094		set_net_conf_defaults(new_net_conf);
2095
2096	err = net_conf_from_attrs_for_change(new_net_conf, info);
2097	if (err && err != -ENOMSG) {
2098		retcode = ERR_MANDATORY_TAG;
2099		drbd_msg_put_info(from_attrs_err_to_txt(err));
2100		goto fail;
2101	}
2102
2103	retcode = check_net_options(connection, new_net_conf);
2104	if (retcode != NO_ERROR)
2105		goto fail;
2106
2107	/* re-sync running */
2108	rsr = conn_resync_running(connection);
2109	if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
2110		retcode = ERR_CSUMS_RESYNC_RUNNING;
2111		goto fail;
2112	}
2113
2114	/* online verify running */
2115	ovr = conn_ov_running(connection);
2116	if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
2117		retcode = ERR_VERIFY_RUNNING;
2118		goto fail;
2119	}
2120
2121	retcode = alloc_crypto(&crypto, new_net_conf);
2122	if (retcode != NO_ERROR)
2123		goto fail;
2124
2125	rcu_assign_pointer(connection->net_conf, new_net_conf);
2126
2127	if (!rsr) {
2128		crypto_free_hash(connection->csums_tfm);
2129		connection->csums_tfm = crypto.csums_tfm;
2130		crypto.csums_tfm = NULL;
2131	}
2132	if (!ovr) {
2133		crypto_free_hash(connection->verify_tfm);
2134		connection->verify_tfm = crypto.verify_tfm;
2135		crypto.verify_tfm = NULL;
2136	}
2137
2138	crypto_free_hash(connection->integrity_tfm);
2139	connection->integrity_tfm = crypto.integrity_tfm;
2140	if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
2141		/* Do this without trying to take connection->data.mutex again.  */
2142		__drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
2143
2144	crypto_free_hash(connection->cram_hmac_tfm);
2145	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
2146
2147	mutex_unlock(&connection->resource->conf_update);
2148	mutex_unlock(&connection->data.mutex);
2149	synchronize_rcu();
2150	kfree(old_net_conf);
2151
2152	if (connection->cstate >= C_WF_REPORT_PARAMS) {
2153		struct drbd_peer_device *peer_device;
2154		int vnr;
2155
2156		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
2157			drbd_send_sync_param(peer_device);
2158	}
2159
2160	goto done;
2161
2162 fail:
2163	mutex_unlock(&connection->resource->conf_update);
2164	mutex_unlock(&connection->data.mutex);
2165	free_crypto(&crypto);
2166	kfree(new_net_conf);
2167 done:
2168	conn_reconfig_done(connection);
2169 out:
2170	drbd_adm_finish(info, retcode);
2171	return 0;
2172}
2173
/*
 * drbd_adm_connect() - netlink handler: configure the network side of a
 * resource and request the C_UNCONNECTED connection state.
 *
 * Requires both endpoint addresses in the request.  Fails with
 * ERR_LOCAL_ADDR / ERR_PEER_ADDR if any connection of any resource
 * already uses the same local / peer address, and with
 * ERR_NET_CONFIGURED if the connection is beyond C_STANDALONE or already
 * has a net_conf.
 *
 * On success, ownership of the new net_conf and of the allocated hash
 * transforms passes to the connection.  On failure both are released
 * here.
 *
 * The outcome is reported through drbd_adm_finish().  Returns 0, or the
 * value of drbd_adm_prepare() if no reply skb is available.
 */
int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
{
	struct drbd_peer_device *peer_device;
	struct net_conf *old_net_conf, *new_net_conf = NULL;
	struct crypto crypto = { };
	struct drbd_resource *resource;
	struct drbd_connection *connection;
	enum drbd_ret_code retcode;
	int i;
	int err;

	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);

	if (!adm_ctx.reply_skb)
		return retcode;
	if (retcode != NO_ERROR)
		goto out;
	if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
		drbd_msg_put_info("connection endpoint(s) missing");
		retcode = ERR_INVALID_REQUEST;
		goto out;
	}

	/* No need for _rcu here. All reconfiguration is
	 * strictly serialized on genl_lock(). We are protected against
	 * concurrent reconfiguration/addition/deletion */
	for_each_resource(resource, &drbd_resources) {
		for_each_connection(connection, resource) {
			/* same length and same bytes: address already in use */
			if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
			    !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
				    connection->my_addr_len)) {
				retcode = ERR_LOCAL_ADDR;
				goto out;
			}

			if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
			    !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
				    connection->peer_addr_len)) {
				retcode = ERR_PEER_ADDR;
				goto out;
			}
		}
	}

	/* "connection" was only a loop cursor above; from here on it is the
	 * connection being configured. */
	connection = first_connection(adm_ctx.resource);
	conn_reconfig_start(connection);

	if (connection->cstate > C_STANDALONE) {
		retcode = ERR_NET_CONFIGURED;
		goto fail;
	}

	/* allocation not in the IO path, drbdsetup / netlink process context */
	new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
	if (!new_net_conf) {
		retcode = ERR_NOMEM;
		goto fail;
	}

	set_net_conf_defaults(new_net_conf);

	err = net_conf_from_attrs(new_net_conf, info);
	if (err && err != -ENOMSG) {
		retcode = ERR_MANDATORY_TAG;
		drbd_msg_put_info(from_attrs_err_to_txt(err));
		goto fail;
	}

	retcode = check_net_options(connection, new_net_conf);
	if (retcode != NO_ERROR)
		goto fail;

	retcode = alloc_crypto(&crypto, new_net_conf);
	if (retcode != NO_ERROR)
		goto fail;

	/* force NUL termination of the shared secret */
	((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;

	drbd_flush_workqueue(&connection->sender_work);

	mutex_lock(&adm_ctx.resource->conf_update);
	old_net_conf = connection->net_conf;
	if (old_net_conf) {
		retcode = ERR_NET_CONFIGURED;
		mutex_unlock(&adm_ctx.resource->conf_update);
		goto fail;
	}
	rcu_assign_pointer(connection->net_conf, new_net_conf);

	/* drop the transforms of the connection, install the new ones */
	conn_free_crypto(connection);
	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
	connection->integrity_tfm = crypto.integrity_tfm;
	connection->csums_tfm = crypto.csums_tfm;
	connection->verify_tfm = crypto.verify_tfm;

	connection->my_addr_len = nla_len(adm_ctx.my_addr);
	memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
	connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
	memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);

	mutex_unlock(&adm_ctx.resource->conf_update);

	/* reset the send/receive counters of all devices of this connection */
	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
		struct drbd_device *device = peer_device->device;
		device->send_cnt = 0;
		device->recv_cnt = 0;
	}
	rcu_read_unlock();

	retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);

	conn_reconfig_done(connection);
	drbd_adm_finish(info, retcode);
	return 0;

fail:
	/* nothing was handed over to the connection: release it all */
	free_crypto(&crypto);
	kfree(new_net_conf);

	conn_reconfig_done(connection);
out:
	drbd_adm_finish(info, retcode);
	return 0;
}
2299
/*
 * conn_try_disconnect() - request the transition of @connection to
 * C_DISCONNECTING, with CS_HARD if @force is set.
 *
 * Depending on the result of the first request, it is retried with a
 * modified state request (see the cases below).  If the final result is
 * >= SS_SUCCESS, the receiver thread is stopped synchronously and
 * C_STANDALONE is requested.
 *
 * Returns SS_SUCCESS right away if the first request yields
 * SS_ALREADY_STANDALONE, otherwise the result of the last
 * C_DISCONNECTING request.
 */
static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
{
	enum drbd_state_rv rv;

	rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
			force ? CS_HARD : 0);

	switch (rv) {
	case SS_NOTHING_TO_DO:
		break;
	case SS_ALREADY_STANDALONE:
		return SS_SUCCESS;
	case SS_PRIMARY_NOP:
		/* Our state checking code wants to see the peer outdated. */
		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);

		if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);

		break;
	case SS_CW_FAILED_BY_PEER:
		/* The peer probably wants to see us outdated. */
		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
							disk, D_OUTDATED), 0);
		/* we cannot be outdated in these cases: disconnect hard */
		if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
					CS_HARD);
		}
		break;
	default:;
		/* no special handling necessary */
	}

	if (rv >= SS_SUCCESS) {
		enum drbd_state_rv rv2;
		/* No one else can reconfigure the network while I am here.
		 * The state handling only uses drbd_thread_stop_nowait(),
		 * we want to really wait here until the receiver is no more.
		 */
		drbd_thread_stop(&connection->receiver);

		/* Race breaker.  This additional state change request may be
		 * necessary, if this was a forced disconnect during a receiver
		 * restart.  We may have "killed" the receiver thread just
		 * after drbd_receiver() returned.  Typically, we should be
		 * C_STANDALONE already, now, and this becomes a no-op.
		 */
		rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
				CS_VERBOSE | CS_HARD);
		/* a failure here is only logged; rv is returned unchanged */
		if (rv2 < SS_SUCCESS)
			drbd_err(connection,
				"unexpected rv2=%d in conn_try_disconnect()\n",
				rv2);
	}
	return rv;
}
2356
2357int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2358{
2359	struct disconnect_parms parms;
2360	struct drbd_connection *connection;
2361	enum drbd_state_rv rv;
2362	enum drbd_ret_code retcode;
2363	int err;
2364
2365	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2366	if (!adm_ctx.reply_skb)
2367		return retcode;
2368	if (retcode != NO_ERROR)
2369		goto fail;
2370
2371	connection = adm_ctx.connection;
2372	memset(&parms, 0, sizeof(parms));
2373	if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
2374		err = disconnect_parms_from_attrs(&parms, info);
2375		if (err) {
2376			retcode = ERR_MANDATORY_TAG;
2377			drbd_msg_put_info(from_attrs_err_to_txt(err));
2378			goto fail;
2379		}
2380	}
2381
2382	rv = conn_try_disconnect(connection, parms.force_disconnect);
2383	if (rv < SS_SUCCESS)
2384		retcode = rv;  /* FIXME: Type mismatch. */
2385	else
2386		retcode = NO_ERROR;
2387 fail:
2388	drbd_adm_finish(info, retcode);
2389	return 0;
2390}
2391
2392void resync_after_online_grow(struct drbd_device *device)
2393{
2394	int iass; /* I am sync source */
2395
2396	drbd_info(device, "Resync of new storage after online grow\n");
2397	if (device->state.role != device->state.peer)
2398		iass = (device->state.role == R_PRIMARY);
2399	else
2400		iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
2401
2402	if (iass)
2403		drbd_start_resync(device, C_SYNC_SOURCE);
2404	else
2405		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
2406}
2407
2408int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2409{
2410	struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
2411	struct resize_parms rs;
2412	struct drbd_device *device;
2413	enum drbd_ret_code retcode;
2414	enum determine_dev_size dd;
2415	bool change_al_layout = false;
2416	enum dds_flags ddsf;
2417	sector_t u_size;
2418	int err;
2419
2420	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2421	if (!adm_ctx.reply_skb)
2422		return retcode;
2423	if (retcode != NO_ERROR)
2424		goto fail;
2425
2426	device = adm_ctx.device;
2427	if (!get_ldev(device)) {
2428		retcode = ERR_NO_DISK;
2429		goto fail;
2430	}
2431
2432	memset(&rs, 0, sizeof(struct resize_parms));
2433	rs.al_stripes = device->ldev->md.al_stripes;
2434	rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
2435	if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
2436		err = resize_parms_from_attrs(&rs, info);
2437		if (err) {
2438			retcode = ERR_MANDATORY_TAG;
2439			drbd_msg_put_info(from_attrs_err_to_txt(err));
2440			goto fail_ldev;
2441		}
2442	}
2443
2444	if (device->state.conn > C_CONNECTED) {
2445		retcode = ERR_RESIZE_RESYNC;
2446		goto fail_ldev;
2447	}
2448
2449	if (device->state.role == R_SECONDARY &&
2450	    device->state.peer == R_SECONDARY) {
2451		retcode = ERR_NO_PRIMARY;
2452		goto fail_ldev;
2453	}
2454
2455	if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
2456		retcode = ERR_NEED_APV_93;
2457		goto fail_ldev;
2458	}
2459
2460	rcu_read_lock();
2461	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
2462	rcu_read_unlock();
2463	if (u_size != (sector_t)rs.resize_size) {
2464		new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
2465		if (!new_disk_conf) {
2466			retcode = ERR_NOMEM;
2467			goto fail_ldev;
2468		}
2469	}
2470
2471	if (device->ldev->md.al_stripes != rs.al_stripes ||
2472	    device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
2473		u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
2474
2475		if (al_size_k > (16 * 1024 * 1024)) {
2476			retcode = ERR_MD_LAYOUT_TOO_BIG;
2477			goto fail_ldev;
2478		}
2479
2480		if (al_size_k < MD_32kB_SECT/2) {
2481			retcode = ERR_MD_LAYOUT_TOO_SMALL;
2482			goto fail_ldev;
2483		}
2484
2485		if (device->state.conn != C_CONNECTED) {
2486			retcode = ERR_MD_LAYOUT_CONNECTED;
2487			goto fail_ldev;
2488		}
2489
2490		change_al_layout = true;
2491	}
2492
2493	if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
2494		device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
2495
2496	if (new_disk_conf) {
2497		mutex_lock(&device->resource->conf_update);
2498		old_disk_conf = device->ldev->disk_conf;
2499		*new_disk_conf = *old_disk_conf;
2500		new_disk_conf->disk_size = (sector_t)rs.resize_size;
2501		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
2502		mutex_unlock(&device->resource->conf_update);
2503		synchronize_rcu();
2504		kfree(old_disk_conf);
2505	}
2506
2507	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
2508	dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
2509	drbd_md_sync(device);
2510	put_ldev(device);
2511	if (dd == DS_ERROR) {
2512		retcode = ERR_NOMEM_BITMAP;
2513		goto fail;
2514	} else if (dd == DS_ERROR_SPACE_MD) {
2515		retcode = ERR_MD_LAYOUT_NO_FIT;
2516		goto fail;
2517	} else if (dd == DS_ERROR_SHRINK) {
2518		retcode = ERR_IMPLICIT_SHRINK;
2519		goto fail;
2520	}
2521
2522	if (device->state.conn == C_CONNECTED) {
2523		if (dd == DS_GREW)
2524			set_bit(RESIZE_PENDING, &device->flags);
2525
2526		drbd_send_uuids(first_peer_device(device));
2527		drbd_send_sizes(first_peer_device(device), 1, ddsf);
2528	}
2529
2530 fail:
2531	drbd_adm_finish(info, retcode);
2532	return 0;
2533
2534 fail_ldev:
2535	put_ldev(device);
2536	goto fail;
2537}
2538
2539int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2540{
2541	enum drbd_ret_code retcode;
2542	struct res_opts res_opts;
2543	int err;
2544
2545	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2546	if (!adm_ctx.reply_skb)
2547		return retcode;
2548	if (retcode != NO_ERROR)
2549		goto fail;
2550
2551	res_opts = adm_ctx.resource->res_opts;
2552	if (should_set_defaults(info))
2553		set_res_opts_defaults(&res_opts);
2554
2555	err = res_opts_from_attrs(&res_opts, info);
2556	if (err && err != -ENOMSG) {
2557		retcode = ERR_MANDATORY_TAG;
2558		drbd_msg_put_info(from_attrs_err_to_txt(err));
2559		goto fail;
2560	}
2561
2562	err = set_resource_options(adm_ctx.resource, &res_opts);
2563	if (err) {
2564		retcode = ERR_INVALID_REQUEST;
2565		if (err == -ENOMEM)
2566			retcode = ERR_NOMEM;
2567	}
2568
2569fail:
2570	drbd_adm_finish(info, retcode);
2571	return 0;
2572}
2573
2574int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2575{
2576	struct drbd_device *device;
2577	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2578
2579	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2580	if (!adm_ctx.reply_skb)
2581		return retcode;
2582	if (retcode != NO_ERROR)
2583		goto out;
2584
2585	device = adm_ctx.device;
2586
2587	/* If there is still bitmap IO pending, probably because of a previous
2588	 * resync just being finished, wait for it before requesting a new resync.
2589	 * Also wait for it's after_state_ch(). */
2590	drbd_suspend_io(device);
2591	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2592	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
2593
2594	/* If we happen to be C_STANDALONE R_SECONDARY, just change to
2595	 * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
2596	 * try to start a resync handshake as sync target for full sync.
2597	 */
2598	if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
2599		retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
2600		if (retcode >= SS_SUCCESS) {
2601			if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
2602				"set_n_write from invalidate", BM_LOCKED_MASK))
2603				retcode = ERR_IO_MD_DISK;
2604		}
2605	} else
2606		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
2607	drbd_resume_io(device);
2608
2609out:
2610	drbd_adm_finish(info, retcode);
2611	return 0;
2612}
2613
2614static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2615		union drbd_state mask, union drbd_state val)
2616{
2617	enum drbd_ret_code retcode;
2618
2619	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2620	if (!adm_ctx.reply_skb)
2621		return retcode;
2622	if (retcode != NO_ERROR)
2623		goto out;
2624
2625	retcode = drbd_request_state(adm_ctx.device, mask, val);
2626out:
2627	drbd_adm_finish(info, retcode);
2628	return 0;
2629}
2630
/* Bitmap IO callback: run drbd_bmio_set_n_write(), then suspend the
 * activity log regardless of its result.  Returns that result. */
static int drbd_bmio_set_susp_al(struct drbd_device *device)
{
	int rv = drbd_bmio_set_n_write(device);

	drbd_suspend_al(device);
	return rv;
}
2639
2640int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2641{
2642	int retcode; /* drbd_ret_code, drbd_state_rv */
2643	struct drbd_device *device;
2644
2645	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2646	if (!adm_ctx.reply_skb)
2647		return retcode;
2648	if (retcode != NO_ERROR)
2649		goto out;
2650
2651	device = adm_ctx.device;
2652
2653	/* If there is still bitmap IO pending, probably because of a previous
2654	 * resync just being finished, wait for it before requesting a new resync.
2655	 * Also wait for it's after_state_ch(). */
2656	drbd_suspend_io(device);
2657	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2658	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
2659
2660	/* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2661	 * in the bitmap.  Otherwise, try to start a resync handshake
2662	 * as sync source for full sync.
2663	 */
2664	if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
2665		/* The peer will get a resync upon connect anyways. Just make that
2666		   into a full resync. */
2667		retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
2668		if (retcode >= SS_SUCCESS) {
2669			if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
2670				"set_n_write from invalidate_peer",
2671				BM_LOCKED_SET_ALLOWED))
2672				retcode = ERR_IO_MD_DISK;
2673		}
2674	} else
2675		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
2676	drbd_resume_io(device);
2677
2678out:
2679	drbd_adm_finish(info, retcode);
2680	return 0;
2681}
2682
2683int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
2684{
2685	enum drbd_ret_code retcode;
2686
2687	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2688	if (!adm_ctx.reply_skb)
2689		return retcode;
2690	if (retcode != NO_ERROR)
2691		goto out;
2692
2693	if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
2694		retcode = ERR_PAUSE_IS_SET;
2695out:
2696	drbd_adm_finish(info, retcode);
2697	return 0;
2698}
2699
2700int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2701{
2702	union drbd_dev_state s;
2703	enum drbd_ret_code retcode;
2704
2705	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2706	if (!adm_ctx.reply_skb)
2707		return retcode;
2708	if (retcode != NO_ERROR)
2709		goto out;
2710
2711	if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
2712		s = adm_ctx.device->state;
2713		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
2714			retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
2715				  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
2716		} else {
2717			retcode = ERR_PAUSE_IS_CLEAR;
2718		}
2719	}
2720
2721out:
2722	drbd_adm_finish(info, retcode);
2723	return 0;
2724}
2725
/* Netlink handler: request susp = 1 for the minor addressed by the
 * request, via drbd_adm_simple_request_state(). */
int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
{
	return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
}
2730
2731int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2732{
2733	struct drbd_device *device;
2734	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2735
2736	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2737	if (!adm_ctx.reply_skb)
2738		return retcode;
2739	if (retcode != NO_ERROR)
2740		goto out;
2741
2742	device = adm_ctx.device;
2743	if (test_bit(NEW_CUR_UUID, &device->flags)) {
2744		drbd_uuid_new_current(device);
2745		clear_bit(NEW_CUR_UUID, &device->flags);
2746	}
2747	drbd_suspend_io(device);
2748	retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
2749	if (retcode == SS_SUCCESS) {
2750		if (device->state.conn < C_CONNECTED)
2751			tl_clear(first_peer_device(device)->connection);
2752		if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
2753			tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
2754	}
2755	drbd_resume_io(device);
2756
2757out:
2758	drbd_adm_finish(info, retcode);
2759	return 0;
2760}
2761
/* Netlink handler: request disk state D_OUTDATED for the minor addressed
 * by the request, via drbd_adm_simple_request_state(). */
int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
	return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
}
2766
2767static int nla_put_drbd_cfg_context(struct sk_buff *skb,
2768				    struct drbd_resource *resource,
2769				    struct drbd_connection *connection,
2770				    struct drbd_device *device)
2771{
2772	struct nlattr *nla;
2773	nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
2774	if (!nla)
2775		goto nla_put_failure;
2776	if (device &&
2777	    nla_put_u32(skb, T_ctx_volume, device->vnr))
2778		goto nla_put_failure;
2779	if (nla_put_string(skb, T_ctx_resource_name, resource->name))
2780		goto nla_put_failure;
2781	if (connection) {
2782		if (connection->my_addr_len &&
2783		    nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
2784			goto nla_put_failure;
2785		if (connection->peer_addr_len &&
2786		    nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
2787			goto nla_put_failure;
2788	}
2789	nla_nest_end(skb, nla);
2790	return 0;
2791
2792nla_put_failure:
2793	if (nla)
2794		nla_nest_cancel(skb, nla);
2795	return -EMSGSIZE;
2796}
2797
2798/*
2799 * Return the connection of @resource if @resource has exactly one connection.
2800 */
2801static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
2802{
2803	struct list_head *connections = &resource->connections;
2804
2805	if (list_empty(connections) || connections->next->next != connections)
2806		return NULL;
2807	return list_first_entry(&resource->connections, struct drbd_connection, connections);
2808}
2809
2810int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
2811		const struct sib_info *sib)
2812{
2813	struct drbd_resource *resource = device->resource;
2814	struct state_info *si = NULL; /* for sizeof(si->member); */
2815	struct nlattr *nla;
2816	int got_ldev;
2817	int err = 0;
2818	int exclude_sensitive;
2819
2820	/* If sib != NULL, this is drbd_bcast_event, which anyone can listen
2821	 * to.  So we better exclude_sensitive information.
2822	 *
2823	 * If sib == NULL, this is drbd_adm_get_status, executed synchronously
2824	 * in the context of the requesting user process. Exclude sensitive
2825	 * information, unless current has superuser.
2826	 *
2827	 * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
2828	 * relies on the current implementation of netlink_dump(), which
2829	 * executes the dump callback successively from netlink_recvmsg(),
2830	 * always in the context of the receiving process */
2831	exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
2832
2833	got_ldev = get_ldev(device);
2834
2835	/* We need to add connection name and volume number information still.
2836	 * Minor number is in drbd_genlmsghdr. */
2837	if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
2838		goto nla_put_failure;
2839
2840	if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
2841		goto nla_put_failure;
2842
2843	rcu_read_lock();
2844	if (got_ldev) {
2845		struct disk_conf *disk_conf;
2846
2847		disk_conf = rcu_dereference(device->ldev->disk_conf);
2848		err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
2849	}
2850	if (!err) {
2851		struct net_conf *nc;
2852
2853		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
2854		if (nc)
2855			err = net_conf_to_skb(skb, nc, exclude_sensitive);
2856	}
2857	rcu_read_unlock();
2858	if (err)
2859		goto nla_put_failure;
2860
2861	nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
2862	if (!nla)
2863		goto nla_put_failure;
2864	if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
2865	    nla_put_u32(skb, T_current_state, device->state.i) ||
2866	    nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
2867	    nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
2868	    nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
2869	    nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
2870	    nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
2871	    nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
2872	    nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
2873	    nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
2874	    nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
2875	    nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
2876	    nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
2877		goto nla_put_failure;
2878
2879	if (got_ldev) {
2880		int err;
2881
2882		spin_lock_irq(&device->ldev->md.uuid_lock);
2883		err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
2884		spin_unlock_irq(&device->ldev->md.uuid_lock);
2885
2886		if (err)
2887			goto nla_put_failure;
2888
2889		if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
2890		    nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
2891		    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
2892			goto nla_put_failure;
2893		if (C_SYNC_SOURCE <= device->state.conn &&
2894		    C_PAUSED_SYNC_T >= device->state.conn) {
2895			if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
2896			    nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
2897				goto nla_put_failure;
2898		}
2899	}
2900
2901	if (sib) {
2902		switch(sib->sib_reason) {
2903		case SIB_SYNC_PROGRESS:
2904		case SIB_GET_STATUS_REPLY:
2905			break;
2906		case SIB_STATE_CHANGE:
2907			if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
2908			    nla_put_u32(skb, T_new_state, sib->ns.i))
2909				goto nla_put_failure;
2910			break;
2911		case SIB_HELPER_POST:
2912			if (nla_put_u32(skb, T_helper_exit_code,
2913					sib->helper_exit_code))
2914				goto nla_put_failure;
2915			/* fall through */
2916		case SIB_HELPER_PRE:
2917			if (nla_put_string(skb, T_helper, sib->helper_name))
2918				goto nla_put_failure;
2919			break;
2920		}
2921	}
2922	nla_nest_end(skb, nla);
2923
2924	if (0)
2925nla_put_failure:
2926		err = -EMSGSIZE;
2927	if (got_ldev)
2928		put_ldev(device);
2929	return err;
2930}
2931
2932int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2933{
2934	enum drbd_ret_code retcode;
2935	int err;
2936
2937	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2938	if (!adm_ctx.reply_skb)
2939		return retcode;
2940	if (retcode != NO_ERROR)
2941		goto out;
2942
2943	err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
2944	if (err) {
2945		nlmsg_free(adm_ctx.reply_skb);
2946		return err;
2947	}
2948out:
2949	drbd_adm_finish(info, retcode);
2950	return 0;
2951}
2952
/*
 * Body of the status dump: puts at most one DRBD_ADM_GET_STATUS message
 * into @skb per call and stores in @cb->args[] where the next call resumes.
 *
 *   cb->args[0]  resource to continue with; only used after it was found
 *                again in drbd_resources under rcu_read_lock()
 *   cb->args[1]  volume number from which idr_get_next() continues
 *   cb->args[2]  non-zero if only one resource is to be dumped
 *
 * Returns skb->len; per the comment at the end of the function, an empty
 * skb terminates the dump.
 */
static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct drbd_device *device;
	struct drbd_genlmsghdr *dh;
	struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
	struct drbd_resource *resource = NULL;
	struct drbd_resource *tmp;
	unsigned volume = cb->args[1];

	/* Open coded, deferred, iteration:
	 * for_each_resource_safe(resource, tmp, &drbd_resources) {
	 *      connection = "first connection of resource or undefined";
	 *	idr_for_each_entry(&resource->devices, device, i) {
	 *	  ...
	 *	}
	 * }
	 * where resource is cb->args[0];
	 * and i is cb->args[1];
	 *
	 * cb->args[2] indicates if we shall loop over all resources,
	 * or just dump all volumes of a single resource.
	 *
	 * This may miss entries inserted after this dump started,
	 * or entries deleted before they are reached.
	 *
	 * We need to make sure the device won't disappear while
	 * we are looking at it, and revalidate our iterators
	 * on each iteration.
	 */

	/* synchronize with conn_create()/drbd_destroy_connection() */
	rcu_read_lock();
	/* revalidate iterator position: "resource" stays NULL unless the
	 * saved pointer is still a member of drbd_resources */
	for_each_resource_rcu(tmp, &drbd_resources) {
		if (pos == NULL) {
			/* first iteration */
			pos = tmp;
			resource = pos;
			break;
		}
		if (tmp == pos) {
			resource = pos;
			break;
		}
	}
	if (resource) {
next_resource:
		device = idr_get_next(&resource->devices, &volume);
		if (!device) {
			/* No more volumes to dump on this resource.
			 * Advance resource iterator. */
			pos = list_entry_rcu(resource->resources.next,
					     struct drbd_resource, resources);
			/* Did we dump any volume of this resource yet? */
			if (volume != 0) {
				/* If we reached the end of the list,
				 * or only a single resource dump was requested,
				 * we are done. */
				if (&pos->resources == &drbd_resources || cb->args[2])
					goto out;
				volume = 0;
				resource = pos;
				goto next_resource;
			}
			/* volume == 0: nothing was dumped for this resource;
			 * fall through with device == NULL so it still gets
			 * one message below. */
		}

		dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				cb->nlh->nlmsg_seq, &drbd_genl_family,
				NLM_F_MULTI, DRBD_ADM_GET_STATUS);
		if (!dh)
			goto out;

		if (!device) {
			/* This is a connection without a single volume.
			 * Surprisingly enough, it may have a network
			 * configuration. */
			struct drbd_connection *connection;

			dh->minor = -1U;
			dh->ret_code = NO_ERROR;
			connection = the_only_connection(resource);
			if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
				goto cancel;
			if (connection) {
				struct net_conf *nc;

				nc = rcu_dereference(connection->net_conf);
				/* third argument 1: exclude sensitive fields */
				if (nc && net_conf_to_skb(skb, nc, 1) != 0)
					goto cancel;
			}
			goto done;
		}

		D_ASSERT(device, device->vnr == volume);
		D_ASSERT(device, device->resource == resource);

		dh->minor = device_to_minor(device);
		dh->ret_code = NO_ERROR;

		if (nla_put_status_info(skb, device, NULL)) {
cancel:
			/* shared failure exit, also reached from the
			 * volume-less branch above */
			genlmsg_cancel(skb, dh);
			goto out;
		}
done:
		genlmsg_end(skb, dh);
	}

out:
	rcu_read_unlock();
	/* where to start the next iteration */
	cb->args[0] = (long)pos;
	/* pos still equal to resource: continue behind the current volume;
	 * otherwise the resource iterator moved on, restart at volume 0 */
	cb->args[1] = (pos == resource) ? volume + 1 : 0;

	/* No more resources/volumes/minors found results in an empty skb.
	 * Which will terminate the dump. */
        return skb->len;
}
3071
3072/*
3073 * Request status of all resources, or of all volumes within a single resource.
3074 *
3075 * This is a dump, as the answer may not fit in a single reply skb otherwise.
3076 * Which means we cannot use the family->attrbuf or other such members, because
3077 * dump is NOT protected by the genl_lock().  During dump, we only have access
3078 * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
3079 *
3080 * Once things are setup properly, we call into get_one_status().
3081 */
3082int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
3083{
3084	const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3085	struct nlattr *nla;
3086	const char *resource_name;
3087	struct drbd_resource *resource;
3088	int maxtype;
3089
3090	/* Is this a followup call? */
3091	if (cb->args[0]) {
3092		/* ... of a single resource dump,
3093		 * and the resource iterator has been advanced already? */
3094		if (cb->args[2] && cb->args[2] != cb->args[0])
3095			return 0; /* DONE. */
3096		goto dump;
3097	}
3098
3099	/* First call (from netlink_dump_start).  We need to figure out
3100	 * which resource(s) the user wants us to dump. */
3101	nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
3102			nlmsg_attrlen(cb->nlh, hdrlen),
3103			DRBD_NLA_CFG_CONTEXT);
3104
3105	/* No explicit context given.  Dump all. */
3106	if (!nla)
3107		goto dump;
3108	maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3109	nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
3110	if (IS_ERR(nla))
3111		return PTR_ERR(nla);
3112	/* context given, but no name present? */
3113	if (!nla)
3114		return -EINVAL;
3115	resource_name = nla_data(nla);
3116	if (!*resource_name)
3117		return -ENODEV;
3118	resource = drbd_find_resource(resource_name);
3119	if (!resource)
3120		return -ENODEV;
3121
3122	kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
3123
3124	/* prime iterators, and set "filter" mode mark:
3125	 * only dump this connection. */
3126	cb->args[0] = (long)resource;
3127	/* cb->args[1] = 0; passed in this way. */
3128	cb->args[2] = (long)resource;
3129
3130dump:
3131	return get_one_status(skb, cb);
3132}
3133
3134int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3135{
3136	enum drbd_ret_code retcode;
3137	struct timeout_parms tp;
3138	int err;
3139
3140	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3141	if (!adm_ctx.reply_skb)
3142		return retcode;
3143	if (retcode != NO_ERROR)
3144		goto out;
3145
3146	tp.timeout_type =
3147		adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
3148		test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
3149		UT_DEFAULT;
3150
3151	err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
3152	if (err) {
3153		nlmsg_free(adm_ctx.reply_skb);
3154		return err;
3155	}
3156out:
3157	drbd_adm_finish(info, retcode);
3158	return 0;
3159}
3160
3161int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3162{
3163	struct drbd_device *device;
3164	enum drbd_ret_code retcode;
3165	struct start_ov_parms parms;
3166
3167	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3168	if (!adm_ctx.reply_skb)
3169		return retcode;
3170	if (retcode != NO_ERROR)
3171		goto out;
3172
3173	device = adm_ctx.device;
3174
3175	/* resume from last known position, if possible */
3176	parms.ov_start_sector = device->ov_start_sector;
3177	parms.ov_stop_sector = ULLONG_MAX;
3178	if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
3179		int err = start_ov_parms_from_attrs(&parms, info);
3180		if (err) {
3181			retcode = ERR_MANDATORY_TAG;
3182			drbd_msg_put_info(from_attrs_err_to_txt(err));
3183			goto out;
3184		}
3185	}
3186	/* w_make_ov_request expects position to be aligned */
3187	device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
3188	device->ov_stop_sector = parms.ov_stop_sector;
3189
3190	/* If there is still bitmap IO pending, e.g. previous resync or verify
3191	 * just being finished, wait for it before requesting a new resync. */
3192	drbd_suspend_io(device);
3193	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
3194	retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
3195	drbd_resume_io(device);
3196out:
3197	drbd_adm_finish(info, retcode);
3198	return 0;
3199}
3200
3201
3202int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
3203{
3204	struct drbd_device *device;
3205	enum drbd_ret_code retcode;
3206	int skip_initial_sync = 0;
3207	int err;
3208	struct new_c_uuid_parms args;
3209
3210	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3211	if (!adm_ctx.reply_skb)
3212		return retcode;
3213	if (retcode != NO_ERROR)
3214		goto out_nolock;
3215
3216	device = adm_ctx.device;
3217	memset(&args, 0, sizeof(args));
3218	if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
3219		err = new_c_uuid_parms_from_attrs(&args, info);
3220		if (err) {
3221			retcode = ERR_MANDATORY_TAG;
3222			drbd_msg_put_info(from_attrs_err_to_txt(err));
3223			goto out_nolock;
3224		}
3225	}
3226
3227	mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
3228
3229	if (!get_ldev(device)) {
3230		retcode = ERR_NO_DISK;
3231		goto out;
3232	}
3233
3234	/* this is "skip initial sync", assume to be clean */
3235	if (device->state.conn == C_CONNECTED &&
3236	    first_peer_device(device)->connection->agreed_pro_version >= 90 &&
3237	    device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
3238		drbd_info(device, "Preparing to skip initial sync\n");
3239		skip_initial_sync = 1;
3240	} else if (device->state.conn != C_STANDALONE) {
3241		retcode = ERR_CONNECTED;
3242		goto out_dec;
3243	}
3244
3245	drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
3246	drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
3247
3248	if (args.clear_bm) {
3249		err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
3250			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
3251		if (err) {
3252			drbd_err(device, "Writing bitmap failed with %d\n", err);
3253			retcode = ERR_IO_MD_DISK;
3254		}
3255		if (skip_initial_sync) {
3256			drbd_send_uuids_skip_initial_sync(first_peer_device(device));
3257			_drbd_uuid_set(device, UI_BITMAP, 0);
3258			drbd_print_uuids(device, "cleared bitmap UUID");
3259			spin_lock_irq(&device->resource->req_lock);
3260			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3261					CS_VERBOSE, NULL);
3262			spin_unlock_irq(&device->resource->req_lock);
3263		}
3264	}
3265
3266	drbd_md_sync(device);
3267out_dec:
3268	put_ldev(device);
3269out:
3270	mutex_unlock(device->state_mutex);
3271out_nolock:
3272	drbd_adm_finish(info, retcode);
3273	return 0;
3274}
3275
3276static enum drbd_ret_code
3277drbd_check_resource_name(const char *name)
3278{
3279	if (!name || !name[0]) {
3280		drbd_msg_put_info("resource name missing");
3281		return ERR_MANDATORY_TAG;
3282	}
3283	/* if we want to use these in sysfs/configfs/debugfs some day,
3284	 * we must not allow slashes */
3285	if (strchr(name, '/')) {
3286		drbd_msg_put_info("invalid resource name");
3287		return ERR_INVALID_REQUEST;
3288	}
3289	return NO_ERROR;
3290}
3291
3292int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3293{
3294	enum drbd_ret_code retcode;
3295	struct res_opts res_opts;
3296	int err;
3297
3298	retcode = drbd_adm_prepare(skb, info, 0);
3299	if (!adm_ctx.reply_skb)
3300		return retcode;
3301	if (retcode != NO_ERROR)
3302		goto out;
3303
3304	set_res_opts_defaults(&res_opts);
3305	err = res_opts_from_attrs(&res_opts, info);
3306	if (err && err != -ENOMSG) {
3307		retcode = ERR_MANDATORY_TAG;
3308		drbd_msg_put_info(from_attrs_err_to_txt(err));
3309		goto out;
3310	}
3311
3312	retcode = drbd_check_resource_name(adm_ctx.resource_name);
3313	if (retcode != NO_ERROR)
3314		goto out;
3315
3316	if (adm_ctx.resource) {
3317		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
3318			retcode = ERR_INVALID_REQUEST;
3319			drbd_msg_put_info("resource exists");
3320		}
3321		/* else: still NO_ERROR */
3322		goto out;
3323	}
3324
3325	if (!conn_create(adm_ctx.resource_name, &res_opts))
3326		retcode = ERR_NOMEM;
3327out:
3328	drbd_adm_finish(info, retcode);
3329	return 0;
3330}
3331
3332int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3333{
3334	struct drbd_genlmsghdr *dh = info->userhdr;
3335	enum drbd_ret_code retcode;
3336
3337	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3338	if (!adm_ctx.reply_skb)
3339		return retcode;
3340	if (retcode != NO_ERROR)
3341		goto out;
3342
3343	if (dh->minor > MINORMASK) {
3344		drbd_msg_put_info("requested minor out of range");
3345		retcode = ERR_INVALID_REQUEST;
3346		goto out;
3347	}
3348	if (adm_ctx.volume > DRBD_VOLUME_MAX) {
3349		drbd_msg_put_info("requested volume id out of range");
3350		retcode = ERR_INVALID_REQUEST;
3351		goto out;
3352	}
3353
3354	/* drbd_adm_prepare made sure already
3355	 * that first_peer_device(device)->connection and device->vnr match the request. */
3356	if (adm_ctx.device) {
3357		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
3358			retcode = ERR_MINOR_EXISTS;
3359		/* else: still NO_ERROR */
3360		goto out;
3361	}
3362
3363	retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume);
3364out:
3365	drbd_adm_finish(info, retcode);
3366	return 0;
3367}
3368
3369static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3370{
3371	if (device->state.disk == D_DISKLESS &&
3372	    /* no need to be device->state.conn == C_STANDALONE &&
3373	     * we may want to delete a minor from a live replication group.
3374	     */
3375	    device->state.role == R_SECONDARY) {
3376		_drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
3377				    CS_VERBOSE + CS_WAIT_COMPLETE);
3378		drbd_delete_device(device);
3379		return NO_ERROR;
3380	} else
3381		return ERR_MINOR_CONFIGURED;
3382}
3383
3384int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
3385{
3386	enum drbd_ret_code retcode;
3387
3388	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3389	if (!adm_ctx.reply_skb)
3390		return retcode;
3391	if (retcode != NO_ERROR)
3392		goto out;
3393
3394	retcode = adm_del_minor(adm_ctx.device);
3395out:
3396	drbd_adm_finish(info, retcode);
3397	return 0;
3398}
3399
3400int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3401{
3402	struct drbd_resource *resource;
3403	struct drbd_connection *connection;
3404	struct drbd_device *device;
3405	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
3406	unsigned i;
3407
3408	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3409	if (!adm_ctx.reply_skb)
3410		return retcode;
3411	if (retcode != NO_ERROR)
3412		goto out;
3413
3414	resource = adm_ctx.resource;
3415	/* demote */
3416	for_each_connection(connection, resource) {
3417		struct drbd_peer_device *peer_device;
3418
3419		idr_for_each_entry(&connection->peer_devices, peer_device, i) {
3420			retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
3421			if (retcode < SS_SUCCESS) {
3422				drbd_msg_put_info("failed to demote");
3423				goto out;
3424			}
3425		}
3426
3427		retcode = conn_try_disconnect(connection, 0);
3428		if (retcode < SS_SUCCESS) {
3429			drbd_msg_put_info("failed to disconnect");
3430			goto out;
3431		}
3432	}
3433
3434	/* detach */
3435	idr_for_each_entry(&resource->devices, device, i) {
3436		retcode = adm_detach(device, 0);
3437		if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
3438			drbd_msg_put_info("failed to detach");
3439			goto out;
3440		}
3441	}
3442
3443	/* If we reach this, all volumes (of this connection) are Secondary,
3444	 * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
3445	 * actually stopped, state handling only does drbd_thread_stop_nowait(). */
3446	for_each_connection(connection, resource)
3447		drbd_thread_stop(&connection->worker);
3448
3449	/* Now, nothing can fail anymore */
3450
3451	/* delete volumes */
3452	idr_for_each_entry(&resource->devices, device, i) {
3453		retcode = adm_del_minor(device);
3454		if (retcode != NO_ERROR) {
3455			/* "can not happen" */
3456			drbd_msg_put_info("failed to delete volume");
3457			goto out;
3458		}
3459	}
3460
3461	list_del_rcu(&resource->resources);
3462	synchronize_rcu();
3463	drbd_free_resource(resource);
3464	retcode = NO_ERROR;
3465
3466out:
3467	drbd_adm_finish(info, retcode);
3468	return 0;
3469}
3470
3471int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3472{
3473	struct drbd_resource *resource;
3474	struct drbd_connection *connection;
3475	enum drbd_ret_code retcode;
3476
3477	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3478	if (!adm_ctx.reply_skb)
3479		return retcode;
3480	if (retcode != NO_ERROR)
3481		goto out;
3482
3483	resource = adm_ctx.resource;
3484	for_each_connection(connection, resource) {
3485		if (connection->cstate > C_STANDALONE) {
3486			retcode = ERR_NET_CONFIGURED;
3487			goto out;
3488		}
3489	}
3490	if (!idr_is_empty(&resource->devices)) {
3491		retcode = ERR_RES_IN_USE;
3492		goto out;
3493	}
3494
3495	list_del_rcu(&resource->resources);
3496	for_each_connection(connection, resource)
3497		drbd_thread_stop(&connection->worker);
3498	synchronize_rcu();
3499	drbd_free_resource(resource);
3500	retcode = NO_ERROR;
3501out:
3502	drbd_adm_finish(info, retcode);
3503	return 0;
3504}
3505
3506void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3507{
3508	static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3509	struct sk_buff *msg;
3510	struct drbd_genlmsghdr *d_out;
3511	unsigned seq;
3512	int err = -ENOMEM;
3513
3514	if (sib->sib_reason == SIB_SYNC_PROGRESS) {
3515		if (time_after(jiffies, device->rs_last_bcast + HZ))
3516			device->rs_last_bcast = jiffies;
3517		else
3518			return;
3519	}
3520
3521	seq = atomic_inc_return(&drbd_genl_seq);
3522	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
3523	if (!msg)
3524		goto failed;
3525
3526	err = -EMSGSIZE;
3527	d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
3528	if (!d_out) /* cannot happen, but anyways. */
3529		goto nla_put_failure;
3530	d_out->minor = device_to_minor(device);
3531	d_out->ret_code = NO_ERROR;
3532
3533	if (nla_put_status_info(msg, device, sib))
3534		goto nla_put_failure;
3535	genlmsg_end(msg, d_out);
3536	err = drbd_genl_multicast_events(msg, 0);
3537	/* msg has been consumed or freed in netlink_broadcast() */
3538	if (err && err != -ESRCH)
3539		goto failed;
3540
3541	return;
3542
3543nla_put_failure:
3544	nlmsg_free(msg);
3545failed:
3546	drbd_err(device, "Error %d while broadcasting event. "
3547			"Event seq:%u sib_reason:%u\n",
3548			err, seq, sib->sib_reason);
3549}
Configure Feed

Configure Feed