drivers/nvme/target/core.c at v5.7

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / nvme / target / core.c
at v5.7 1527 lines 37 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Common code for the NVMe target.
   4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/random.h>
   9#include <linux/rculist.h>
  10#include <linux/pci-p2pdma.h>
  11#include <linux/scatterlist.h>
  12
  13#define CREATE_TRACE_POINTS
  14#include "trace.h"
  15
  16#include "nvmet.h"
  17
/*
 * Workqueue handle with external linkage. It is only defined here; its users
 * are not visible in this part of the file.
 * NOTE(review): presumably used for buffered file-backed I/O - confirm.
 */
   18struct workqueue_struct *buffered_io_wq;
/*
 * Registered fabrics transport operations, indexed by transport type
 * (ops->type / port->disc_addr.trtype). Updated under nvmet_config_sem.
 */
   19static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
/*
 * ID allocator. NOTE(review): presumably hands out controller IDs (cntlid);
 * the allocation site is not visible in this part of the file.
 */
   20static DEFINE_IDA(cntlid_ida);
  21
  22/*
  23 * This read/write semaphore is used to synchronize access to configuration
  24 * information on a target system that will result in discovery log page
  25 * information change for at least one host.
   26 * The full list of resources protected by this semaphore is:
  27 *
  28 *  - subsystems list
  29 *  - per-subsystem allowed hosts list
  30 *  - allow_any_host subsystem attribute
  31 *  - nvmet_genctr
  32 *  - the nvmet_transports array
  33 *
  34 * When updating any of those lists/structures write lock should be obtained,
   35 * while when reading (populating discovery log page or checking host-subsystem
  36 * link) read lock is obtained to allow concurrent reads.
  37 */
   38DECLARE_RWSEM(nvmet_config_sem);
   39
/*
 * Per-ANA-group counter, indexed by ANA group ID: nvmet_ns_alloc() increments
 * the slot of the namespace's group and nvmet_ns_free() decrements it, both
 * with nvmet_ana_sem held for writing.
 */
   40u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
/*
 * NOTE(review): presumably the ANA change counter; it is not touched in the
 * visible part of the file - confirm against its users.
 */
   41u64 nvmet_ana_chgcnt;
/* Read/write semaphore taken for writing around nvmet_ana_group_enabled updates. */
   42DECLARE_RWSEM(nvmet_ana_sem);
  43
  44inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
  45{
  46	u16 status;
  47
  48	switch (errno) {
  49	case 0:
  50		status = NVME_SC_SUCCESS;
  51		break;
  52	case -ENOSPC:
  53		req->error_loc = offsetof(struct nvme_rw_command, length);
  54		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
  55		break;
  56	case -EREMOTEIO:
  57		req->error_loc = offsetof(struct nvme_rw_command, slba);
  58		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
  59		break;
  60	case -EOPNOTSUPP:
  61		req->error_loc = offsetof(struct nvme_common_command, opcode);
  62		switch (req->cmd->common.opcode) {
  63		case nvme_cmd_dsm:
  64		case nvme_cmd_write_zeroes:
  65			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
  66			break;
  67		default:
  68			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
  69		}
  70		break;
  71	case -ENODATA:
  72		req->error_loc = offsetof(struct nvme_rw_command, nsid);
  73		status = NVME_SC_ACCESS_DENIED;
  74		break;
  75	case -EIO:
  76		/* FALLTHRU */
  77	default:
  78		req->error_loc = offsetof(struct nvme_common_command, opcode);
  79		status = NVME_SC_INTERNAL | NVME_SC_DNR;
  80	}
  81
  82	return status;
  83}
  84
/*
 * Forward declaration: nvmet_ctrl_find_get() below calls this before its
 * definition appears. A NULL return is treated there as "no such subsystem",
 * and a non-NULL result is released with nvmet_subsys_put().
 */
   85static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
   86		const char *subsysnqn);
  87
  88u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
  89		size_t len)
  90{
  91	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
  92		req->error_loc = offsetof(struct nvme_common_command, dptr);
  93		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
  94	}
  95	return 0;
  96}
  97
  98u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
  99{
 100	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 101		req->error_loc = offsetof(struct nvme_common_command, dptr);
 102		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 103	}
 104	return 0;
 105}
 106
 107u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 108{
 109	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
 110		req->error_loc = offsetof(struct nvme_common_command, dptr);
 111		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 112	}
 113	return 0;
 114}
 115
 116static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
 117{
 118	struct nvmet_ns *ns;
 119
 120	if (list_empty(&subsys->namespaces))
 121		return 0;
 122
 123	ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
 124	return ns->nsid;
 125}
 126
 127static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
 128{
 129	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
 130}
 131
 132static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
 133{
 134	struct nvmet_async_event *aen;
 135	struct nvmet_req *req;
 136
 137	while (1) {
 138		mutex_lock(&ctrl->lock);
 139		aen = list_first_entry_or_null(&ctrl->async_events,
 140				struct nvmet_async_event, entry);
 141		if (!aen || !ctrl->nr_async_event_cmds) {
 142			mutex_unlock(&ctrl->lock);
 143			break;
 144		}
 145
 146		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 147		if (status == 0)
 148			nvmet_set_result(req, nvmet_async_event_result(aen));
 149
 150		list_del(&aen->entry);
 151		kfree(aen);
 152
 153		mutex_unlock(&ctrl->lock);
 154		nvmet_req_complete(req, status);
 155	}
 156}
 157
 158static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
 159{
 160	struct nvmet_req *req;
 161
 162	mutex_lock(&ctrl->lock);
 163	while (ctrl->nr_async_event_cmds) {
 164		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 165		mutex_unlock(&ctrl->lock);
 166		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
 167		mutex_lock(&ctrl->lock);
 168	}
 169	mutex_unlock(&ctrl->lock);
 170}
 171
 172static void nvmet_async_event_work(struct work_struct *work)
 173{
 174	struct nvmet_ctrl *ctrl =
 175		container_of(work, struct nvmet_ctrl, async_event_work);
 176
 177	nvmet_async_events_process(ctrl, 0);
 178}
 179
 180void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 181		u8 event_info, u8 log_page)
 182{
 183	struct nvmet_async_event *aen;
 184
 185	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
 186	if (!aen)
 187		return;
 188
 189	aen->event_type = event_type;
 190	aen->event_info = event_info;
 191	aen->log_page = log_page;
 192
 193	mutex_lock(&ctrl->lock);
 194	list_add_tail(&aen->entry, &ctrl->async_events);
 195	mutex_unlock(&ctrl->lock);
 196
 197	schedule_work(&ctrl->async_event_work);
 198}
 199
 200static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 201{
 202	u32 i;
 203
 204	mutex_lock(&ctrl->lock);
 205	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
 206		goto out_unlock;
 207
 208	for (i = 0; i < ctrl->nr_changed_ns; i++) {
 209		if (ctrl->changed_ns_list[i] == nsid)
 210			goto out_unlock;
 211	}
 212
 213	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
 214		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
 215		ctrl->nr_changed_ns = U32_MAX;
 216		goto out_unlock;
 217	}
 218
 219	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
 220out_unlock:
 221	mutex_unlock(&ctrl->lock);
 222}
 223
 224void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 225{
 226	struct nvmet_ctrl *ctrl;
 227
 228	lockdep_assert_held(&subsys->lock);
 229
 230	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 231		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 232		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 233			continue;
 234		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 235				NVME_AER_NOTICE_NS_CHANGED,
 236				NVME_LOG_CHANGED_NS);
 237	}
 238}
 239
 240void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 241		struct nvmet_port *port)
 242{
 243	struct nvmet_ctrl *ctrl;
 244
 245	mutex_lock(&subsys->lock);
 246	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 247		if (port && ctrl->port != port)
 248			continue;
 249		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 250			continue;
 251		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 252				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 253	}
 254	mutex_unlock(&subsys->lock);
 255}
 256
 257void nvmet_port_send_ana_event(struct nvmet_port *port)
 258{
 259	struct nvmet_subsys_link *p;
 260
 261	down_read(&nvmet_config_sem);
 262	list_for_each_entry(p, &port->subsystems, entry)
 263		nvmet_send_ana_event(p->subsys, port);
 264	up_read(&nvmet_config_sem);
 265}
 266
 267int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 268{
 269	int ret = 0;
 270
 271	down_write(&nvmet_config_sem);
 272	if (nvmet_transports[ops->type])
 273		ret = -EINVAL;
 274	else
 275		nvmet_transports[ops->type] = ops;
 276	up_write(&nvmet_config_sem);
 277
 278	return ret;
 279}
 280EXPORT_SYMBOL_GPL(nvmet_register_transport);
 281
 282void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 283{
 284	down_write(&nvmet_config_sem);
 285	nvmet_transports[ops->type] = NULL;
 286	up_write(&nvmet_config_sem);
 287}
 288EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
 289
 290void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
 291{
 292	struct nvmet_ctrl *ctrl;
 293
 294	mutex_lock(&subsys->lock);
 295	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 296		if (ctrl->port == port)
 297			ctrl->ops->delete_ctrl(ctrl);
 298	}
 299	mutex_unlock(&subsys->lock);
 300}
 301
 302int nvmet_enable_port(struct nvmet_port *port)
 303{
 304	const struct nvmet_fabrics_ops *ops;
 305	int ret;
 306
 307	lockdep_assert_held(&nvmet_config_sem);
 308
 309	ops = nvmet_transports[port->disc_addr.trtype];
 310	if (!ops) {
 311		up_write(&nvmet_config_sem);
 312		request_module("nvmet-transport-%d", port->disc_addr.trtype);
 313		down_write(&nvmet_config_sem);
 314		ops = nvmet_transports[port->disc_addr.trtype];
 315		if (!ops) {
 316			pr_err("transport type %d not supported\n",
 317				port->disc_addr.trtype);
 318			return -EINVAL;
 319		}
 320	}
 321
 322	if (!try_module_get(ops->owner))
 323		return -EINVAL;
 324
 325	ret = ops->add_port(port);
 326	if (ret) {
 327		module_put(ops->owner);
 328		return ret;
 329	}
 330
 331	/* If the transport didn't set inline_data_size, then disable it. */
 332	if (port->inline_data_size < 0)
 333		port->inline_data_size = 0;
 334
 335	port->enabled = true;
 336	port->tr_ops = ops;
 337	return 0;
 338}
 339
 340void nvmet_disable_port(struct nvmet_port *port)
 341{
 342	const struct nvmet_fabrics_ops *ops;
 343
 344	lockdep_assert_held(&nvmet_config_sem);
 345
 346	port->enabled = false;
 347	port->tr_ops = NULL;
 348
 349	ops = nvmet_transports[port->disc_addr.trtype];
 350	ops->remove_port(port);
 351	module_put(ops->owner);
 352}
 353
 354static void nvmet_keep_alive_timer(struct work_struct *work)
 355{
 356	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 357			struct nvmet_ctrl, ka_work);
 358	bool cmd_seen = ctrl->cmd_seen;
 359
 360	ctrl->cmd_seen = false;
 361	if (cmd_seen) {
 362		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
 363			ctrl->cntlid);
 364		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 365		return;
 366	}
 367
 368	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
 369		ctrl->cntlid, ctrl->kato);
 370
 371	nvmet_ctrl_fatal_error(ctrl);
 372}
 373
 374static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 375{
 376	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
 377		ctrl->cntlid, ctrl->kato);
 378
 379	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
 380	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 381}
 382
 383static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 384{
 385	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 386
 387	cancel_delayed_work_sync(&ctrl->ka_work);
 388}
 389
 390static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
 391		__le32 nsid)
 392{
 393	struct nvmet_ns *ns;
 394
 395	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
 396		if (ns->nsid == le32_to_cpu(nsid))
 397			return ns;
 398	}
 399
 400	return NULL;
 401}
 402
 403struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
 404{
 405	struct nvmet_ns *ns;
 406
 407	rcu_read_lock();
 408	ns = __nvmet_find_namespace(ctrl, nsid);
 409	if (ns)
 410		percpu_ref_get(&ns->ref);
 411	rcu_read_unlock();
 412
 413	return ns;
 414}
 415
 416static void nvmet_destroy_namespace(struct percpu_ref *ref)
 417{
 418	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
 419
 420	complete(&ns->disable_done);
 421}
 422
 423void nvmet_put_namespace(struct nvmet_ns *ns)
 424{
 425	percpu_ref_put(&ns->ref);
 426}
 427
 428static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 429{
 430	nvmet_bdev_ns_disable(ns);
 431	nvmet_file_ns_disable(ns);
 432}
 433
 434static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 435{
 436	int ret;
 437	struct pci_dev *p2p_dev;
 438
 439	if (!ns->use_p2pmem)
 440		return 0;
 441
 442	if (!ns->bdev) {
 443		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
 444		return -EINVAL;
 445	}
 446
 447	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
 448		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
 449		       ns->device_path);
 450		return -EINVAL;
 451	}
 452
 453	if (ns->p2p_dev) {
 454		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
 455		if (ret < 0)
 456			return -EINVAL;
 457	} else {
 458		/*
 459		 * Right now we just check that there is p2pmem available so
 460		 * we can report an error to the user right away if there
 461		 * is not. We'll find the actual device to use once we
 462		 * setup the controller when the port's device is available.
 463		 */
 464
 465		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
 466		if (!p2p_dev) {
 467			pr_err("no peer-to-peer memory is available for %s\n",
 468			       ns->device_path);
 469			return -EINVAL;
 470		}
 471
 472		pci_dev_put(p2p_dev);
 473	}
 474
 475	return 0;
 476}
 477
 478/*
 479 * Note: ctrl->subsys->lock should be held when calling this function
 480 */
 481static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 482				    struct nvmet_ns *ns)
 483{
 484	struct device *clients[2];
 485	struct pci_dev *p2p_dev;
 486	int ret;
 487
 488	if (!ctrl->p2p_client || !ns->use_p2pmem)
 489		return;
 490
 491	if (ns->p2p_dev) {
 492		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
 493		if (ret < 0)
 494			return;
 495
 496		p2p_dev = pci_dev_get(ns->p2p_dev);
 497	} else {
 498		clients[0] = ctrl->p2p_client;
 499		clients[1] = nvmet_ns_dev(ns);
 500
 501		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
 502		if (!p2p_dev) {
 503			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
 504			       dev_name(ctrl->p2p_client), ns->device_path);
 505			return;
 506		}
 507	}
 508
 509	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
 510	if (ret < 0)
 511		pci_dev_put(p2p_dev);
 512
 513	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
 514		ns->nsid);
 515}
 516
 517int nvmet_ns_enable(struct nvmet_ns *ns)
 518{
 519	struct nvmet_subsys *subsys = ns->subsys;
 520	struct nvmet_ctrl *ctrl;
 521	int ret;
 522
 523	mutex_lock(&subsys->lock);
 524	ret = 0;
 525	if (ns->enabled)
 526		goto out_unlock;
 527
 528	ret = -EMFILE;
 529	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
 530		goto out_unlock;
 531
 532	ret = nvmet_bdev_ns_enable(ns);
 533	if (ret == -ENOTBLK)
 534		ret = nvmet_file_ns_enable(ns);
 535	if (ret)
 536		goto out_unlock;
 537
 538	ret = nvmet_p2pmem_ns_enable(ns);
 539	if (ret)
 540		goto out_dev_disable;
 541
 542	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 543		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 544
 545	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
 546				0, GFP_KERNEL);
 547	if (ret)
 548		goto out_dev_put;
 549
 550	if (ns->nsid > subsys->max_nsid)
 551		subsys->max_nsid = ns->nsid;
 552
 553	/*
 554	 * The namespaces list needs to be sorted to simplify the implementation
 555	 * of the Identify Namepace List subcommand.
 556	 */
 557	if (list_empty(&subsys->namespaces)) {
 558		list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
 559	} else {
 560		struct nvmet_ns *old;
 561
 562		list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
 563					lockdep_is_held(&subsys->lock)) {
 564			BUG_ON(ns->nsid == old->nsid);
 565			if (ns->nsid < old->nsid)
 566				break;
 567		}
 568
 569		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
 570	}
 571	subsys->nr_namespaces++;
 572
 573	nvmet_ns_changed(subsys, ns->nsid);
 574	ns->enabled = true;
 575	ret = 0;
 576out_unlock:
 577	mutex_unlock(&subsys->lock);
 578	return ret;
 579out_dev_put:
 580	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 581		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 582out_dev_disable:
 583	nvmet_ns_dev_disable(ns);
 584	goto out_unlock;
 585}
 586
 587void nvmet_ns_disable(struct nvmet_ns *ns)
 588{
 589	struct nvmet_subsys *subsys = ns->subsys;
 590	struct nvmet_ctrl *ctrl;
 591
 592	mutex_lock(&subsys->lock);
 593	if (!ns->enabled)
 594		goto out_unlock;
 595
 596	ns->enabled = false;
 597	list_del_rcu(&ns->dev_link);
 598	if (ns->nsid == subsys->max_nsid)
 599		subsys->max_nsid = nvmet_max_nsid(subsys);
 600
 601	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 602		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 603
 604	mutex_unlock(&subsys->lock);
 605
 606	/*
 607	 * Now that we removed the namespaces from the lookup list, we
 608	 * can kill the per_cpu ref and wait for any remaining references
 609	 * to be dropped, as well as a RCU grace period for anyone only
 610	 * using the namepace under rcu_read_lock().  Note that we can't
 611	 * use call_rcu here as we need to ensure the namespaces have
 612	 * been fully destroyed before unloading the module.
 613	 */
 614	percpu_ref_kill(&ns->ref);
 615	synchronize_rcu();
 616	wait_for_completion(&ns->disable_done);
 617	percpu_ref_exit(&ns->ref);
 618
 619	mutex_lock(&subsys->lock);
 620
 621	subsys->nr_namespaces--;
 622	nvmet_ns_changed(subsys, ns->nsid);
 623	nvmet_ns_dev_disable(ns);
 624out_unlock:
 625	mutex_unlock(&subsys->lock);
 626}
 627
 628void nvmet_ns_free(struct nvmet_ns *ns)
 629{
 630	nvmet_ns_disable(ns);
 631
 632	down_write(&nvmet_ana_sem);
 633	nvmet_ana_group_enabled[ns->anagrpid]--;
 634	up_write(&nvmet_ana_sem);
 635
 636	kfree(ns->device_path);
 637	kfree(ns);
 638}
 639
 640struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 641{
 642	struct nvmet_ns *ns;
 643
 644	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 645	if (!ns)
 646		return NULL;
 647
 648	INIT_LIST_HEAD(&ns->dev_link);
 649	init_completion(&ns->disable_done);
 650
 651	ns->nsid = nsid;
 652	ns->subsys = subsys;
 653
 654	down_write(&nvmet_ana_sem);
 655	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 656	nvmet_ana_group_enabled[ns->anagrpid]++;
 657	up_write(&nvmet_ana_sem);
 658
 659	uuid_gen(&ns->uuid);
 660	ns->buffered_io = false;
 661
 662	return ns;
 663}
 664
 665static void nvmet_update_sq_head(struct nvmet_req *req)
 666{
 667	if (req->sq->size) {
 668		u32 old_sqhd, new_sqhd;
 669
 670		do {
 671			old_sqhd = req->sq->sqhd;
 672			new_sqhd = (old_sqhd + 1) % req->sq->size;
 673		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 674					old_sqhd);
 675	}
 676	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 677}
 678
 679static void nvmet_set_error(struct nvmet_req *req, u16 status)
 680{
 681	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 682	struct nvme_error_slot *new_error_slot;
 683	unsigned long flags;
 684
 685	req->cqe->status = cpu_to_le16(status << 1);
 686
 687	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
 688		return;
 689
 690	spin_lock_irqsave(&ctrl->error_lock, flags);
 691	ctrl->err_counter++;
 692	new_error_slot =
 693		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
 694
 695	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
 696	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
 697	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
 698	new_error_slot->status_field = cpu_to_le16(status << 1);
 699	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
 700	new_error_slot->lba = cpu_to_le64(req->error_slba);
 701	new_error_slot->nsid = req->cmd->common.nsid;
 702	spin_unlock_irqrestore(&ctrl->error_lock, flags);
 703
 704	/* set the more bit for this request */
 705	req->cqe->status |= cpu_to_le16(1 << 14);
 706}
 707
 708static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 709{
 710	if (!req->sq->sqhd_disabled)
 711		nvmet_update_sq_head(req);
 712	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
 713	req->cqe->command_id = req->cmd->common.command_id;
 714
 715	if (unlikely(status))
 716		nvmet_set_error(req, status);
 717
 718	trace_nvmet_req_complete(req);
 719
 720	if (req->ns)
 721		nvmet_put_namespace(req->ns);
 722	req->ops->queue_response(req);
 723}
 724
 725void nvmet_req_complete(struct nvmet_req *req, u16 status)
 726{
 727	__nvmet_req_complete(req, status);
 728	percpu_ref_put(&req->sq->ref);
 729}
 730EXPORT_SYMBOL_GPL(nvmet_req_complete);
 731
 732void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 733		u16 qid, u16 size)
 734{
 735	cq->qid = qid;
 736	cq->size = size;
 737
 738	ctrl->cqs[qid] = cq;
 739}
 740
 741void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
 742		u16 qid, u16 size)
 743{
 744	sq->sqhd = 0;
 745	sq->qid = qid;
 746	sq->size = size;
 747
 748	ctrl->sqs[qid] = sq;
 749}
 750
 751static void nvmet_confirm_sq(struct percpu_ref *ref)
 752{
 753	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 754
 755	complete(&sq->confirm_done);
 756}
 757
 758void nvmet_sq_destroy(struct nvmet_sq *sq)
 759{
 760	u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
 761	struct nvmet_ctrl *ctrl = sq->ctrl;
 762
 763	/*
 764	 * If this is the admin queue, complete all AERs so that our
 765	 * queue doesn't have outstanding requests on it.
 766	 */
 767	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq) {
 768		nvmet_async_events_process(ctrl, status);
 769		nvmet_async_events_free(ctrl);
 770	}
 771	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
 772	wait_for_completion(&sq->confirm_done);
 773	wait_for_completion(&sq->free_done);
 774	percpu_ref_exit(&sq->ref);
 775
 776	if (ctrl) {
 777		nvmet_ctrl_put(ctrl);
 778		sq->ctrl = NULL; /* allows reusing the queue later */
 779	}
 780}
 781EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
 782
 783static void nvmet_sq_free(struct percpu_ref *ref)
 784{
 785	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 786
 787	complete(&sq->free_done);
 788}
 789
 790int nvmet_sq_init(struct nvmet_sq *sq)
 791{
 792	int ret;
 793
 794	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
 795	if (ret) {
 796		pr_err("percpu_ref init failed!\n");
 797		return ret;
 798	}
 799	init_completion(&sq->free_done);
 800	init_completion(&sq->confirm_done);
 801
 802	return 0;
 803}
 804EXPORT_SYMBOL_GPL(nvmet_sq_init);
 805
 806static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
 807		struct nvmet_ns *ns)
 808{
 809	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
 810
 811	if (unlikely(state == NVME_ANA_INACCESSIBLE))
 812		return NVME_SC_ANA_INACCESSIBLE;
 813	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
 814		return NVME_SC_ANA_PERSISTENT_LOSS;
 815	if (unlikely(state == NVME_ANA_CHANGE))
 816		return NVME_SC_ANA_TRANSITION;
 817	return 0;
 818}
 819
 820static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 821{
 822	if (unlikely(req->ns->readonly)) {
 823		switch (req->cmd->common.opcode) {
 824		case nvme_cmd_read:
 825		case nvme_cmd_flush:
 826			break;
 827		default:
 828			return NVME_SC_NS_WRITE_PROTECTED;
 829		}
 830	}
 831
 832	return 0;
 833}
 834
 835static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 836{
 837	struct nvme_command *cmd = req->cmd;
 838	u16 ret;
 839
 840	ret = nvmet_check_ctrl_status(req, cmd);
 841	if (unlikely(ret))
 842		return ret;
 843
 844	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
 845	if (unlikely(!req->ns)) {
 846		req->error_loc = offsetof(struct nvme_common_command, nsid);
 847		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 848	}
 849	ret = nvmet_check_ana_state(req->port, req->ns);
 850	if (unlikely(ret)) {
 851		req->error_loc = offsetof(struct nvme_common_command, nsid);
 852		return ret;
 853	}
 854	ret = nvmet_io_cmd_check_access(req);
 855	if (unlikely(ret)) {
 856		req->error_loc = offsetof(struct nvme_common_command, nsid);
 857		return ret;
 858	}
 859
 860	if (req->ns->file)
 861		return nvmet_file_parse_io_cmd(req);
 862	else
 863		return nvmet_bdev_parse_io_cmd(req);
 864}
 865
 866bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 867		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
 868{
 869	u8 flags = req->cmd->common.flags;
 870	u16 status;
 871
 872	req->cq = cq;
 873	req->sq = sq;
 874	req->ops = ops;
 875	req->sg = NULL;
 876	req->sg_cnt = 0;
 877	req->transfer_len = 0;
 878	req->cqe->status = 0;
 879	req->cqe->sq_head = 0;
 880	req->ns = NULL;
 881	req->error_loc = NVMET_NO_ERROR_LOC;
 882	req->error_slba = 0;
 883
 884	trace_nvmet_req_init(req, req->cmd);
 885
 886	/* no support for fused commands yet */
 887	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
 888		req->error_loc = offsetof(struct nvme_common_command, flags);
 889		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 890		goto fail;
 891	}
 892
 893	/*
 894	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
 895	 * contains an address of a single contiguous physical buffer that is
 896	 * byte aligned.
 897	 */
 898	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
 899		req->error_loc = offsetof(struct nvme_common_command, flags);
 900		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 901		goto fail;
 902	}
 903
 904	if (unlikely(!req->sq->ctrl))
 905		/* will return an error for any non-connect command: */
 906		status = nvmet_parse_connect_cmd(req);
 907	else if (likely(req->sq->qid != 0))
 908		status = nvmet_parse_io_cmd(req);
 909	else
 910		status = nvmet_parse_admin_cmd(req);
 911
 912	if (status)
 913		goto fail;
 914
 915	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
 916		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 917		goto fail;
 918	}
 919
 920	if (sq->ctrl)
 921		sq->ctrl->cmd_seen = true;
 922
 923	return true;
 924
 925fail:
 926	__nvmet_req_complete(req, status);
 927	return false;
 928}
 929EXPORT_SYMBOL_GPL(nvmet_req_init);
 930
 931void nvmet_req_uninit(struct nvmet_req *req)
 932{
 933	percpu_ref_put(&req->sq->ref);
 934	if (req->ns)
 935		nvmet_put_namespace(req->ns);
 936}
 937EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 938
 939bool nvmet_check_data_len(struct nvmet_req *req, size_t data_len)
 940{
 941	if (unlikely(data_len != req->transfer_len)) {
 942		req->error_loc = offsetof(struct nvme_common_command, dptr);
 943		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 944		return false;
 945	}
 946
 947	return true;
 948}
 949EXPORT_SYMBOL_GPL(nvmet_check_data_len);
 950
 951bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
 952{
 953	if (unlikely(data_len > req->transfer_len)) {
 954		req->error_loc = offsetof(struct nvme_common_command, dptr);
 955		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 956		return false;
 957	}
 958
 959	return true;
 960}
 961
 962int nvmet_req_alloc_sgl(struct nvmet_req *req)
 963{
 964	struct pci_dev *p2p_dev = NULL;
 965
 966	if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
 967		if (req->sq->ctrl && req->ns)
 968			p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
 969						    req->ns->nsid);
 970
 971		req->p2p_dev = NULL;
 972		if (req->sq->qid && p2p_dev) {
 973			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
 974						       req->transfer_len);
 975			if (req->sg) {
 976				req->p2p_dev = p2p_dev;
 977				return 0;
 978			}
 979		}
 980
 981		/*
 982		 * If no P2P memory was available we fallback to using
 983		 * regular memory
 984		 */
 985	}
 986
 987	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
 988	if (unlikely(!req->sg))
 989		return -ENOMEM;
 990
 991	return 0;
 992}
 993EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
 994
 995void nvmet_req_free_sgl(struct nvmet_req *req)
 996{
 997	if (req->p2p_dev)
 998		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
 999	else
1000		sgl_free(req->sg);
1001
1002	req->sg = NULL;
1003	req->sg_cnt = 0;
1004}
1005EXPORT_SYMBOL_GPL(nvmet_req_free_sgl);
1006
1007static inline bool nvmet_cc_en(u32 cc)
1008{
1009	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
1010}
1011
1012static inline u8 nvmet_cc_css(u32 cc)
1013{
1014	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
1015}
1016
1017static inline u8 nvmet_cc_mps(u32 cc)
1018{
1019	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
1020}
1021
1022static inline u8 nvmet_cc_ams(u32 cc)
1023{
1024	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
1025}
1026
1027static inline u8 nvmet_cc_shn(u32 cc)
1028{
1029	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
1030}
1031
1032static inline u8 nvmet_cc_iosqes(u32 cc)
1033{
1034	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
1035}
1036
1037static inline u8 nvmet_cc_iocqes(u32 cc)
1038{
1039	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1040}
1041
1042static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1043{
1044	lockdep_assert_held(&ctrl->lock);
1045
1046	if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1047	    nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
1048	    nvmet_cc_mps(ctrl->cc) != 0 ||
1049	    nvmet_cc_ams(ctrl->cc) != 0 ||
1050	    nvmet_cc_css(ctrl->cc) != 0) {
1051		ctrl->csts = NVME_CSTS_CFS;
1052		return;
1053	}
1054
1055	ctrl->csts = NVME_CSTS_RDY;
1056
1057	/*
1058	 * Controllers that are not yet enabled should not really enforce the
1059	 * keep alive timeout, but we still want to track a timeout and cleanup
1060	 * in case a host died before it enabled the controller.  Hence, simply
1061	 * reset the keep alive timer when the controller is enabled.
1062	 */
1063	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1064}
1065
1066static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1067{
1068	lockdep_assert_held(&ctrl->lock);
1069
1070	/* XXX: tear down queues? */
1071	ctrl->csts &= ~NVME_CSTS_RDY;
1072	ctrl->cc = 0;
1073}
1074
1075void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1076{
1077	u32 old;
1078
1079	mutex_lock(&ctrl->lock);
1080	old = ctrl->cc;
1081	ctrl->cc = new;
1082
1083	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1084		nvmet_start_ctrl(ctrl);
1085	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1086		nvmet_clear_ctrl(ctrl);
1087	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1088		nvmet_clear_ctrl(ctrl);
1089		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1090	}
1091	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1092		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1093	mutex_unlock(&ctrl->lock);
1094}
1095
1096static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1097{
1098	/* command sets supported: NVMe command set: */
1099	ctrl->cap = (1ULL << 37);
1100	/* CC.EN timeout in 500msec units: */
1101	ctrl->cap |= (15ULL << 24);
1102	/* maximum queue entries supported: */
1103	ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1104}
1105
1106u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1107		struct nvmet_req *req, struct nvmet_ctrl **ret)
1108{
1109	struct nvmet_subsys *subsys;
1110	struct nvmet_ctrl *ctrl;
1111	u16 status = 0;
1112
1113	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1114	if (!subsys) {
1115		pr_warn("connect request for invalid subsystem %s!\n",
1116			subsysnqn);
1117		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1118		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1119	}
1120
1121	mutex_lock(&subsys->lock);
1122	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1123		if (ctrl->cntlid == cntlid) {
1124			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1125				pr_warn("hostnqn mismatch.\n");
1126				continue;
1127			}
1128			if (!kref_get_unless_zero(&ctrl->ref))
1129				continue;
1130
1131			*ret = ctrl;
1132			goto out;
1133		}
1134	}
1135
1136	pr_warn("could not find controller %d for subsys %s / host %s\n",
1137		cntlid, subsysnqn, hostnqn);
1138	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1139	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1140
1141out:
1142	mutex_unlock(&subsys->lock);
1143	nvmet_subsys_put(subsys);
1144	return status;
1145}
1146
1147u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1148{
1149	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1150		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1151		       cmd->common.opcode, req->sq->qid);
1152		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1153	}
1154
1155	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1156		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1157		       cmd->common.opcode, req->sq->qid);
1158		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1159	}
1160	return 0;
1161}
1162
1163bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1164{
1165	struct nvmet_host_link *p;
1166
1167	lockdep_assert_held(&nvmet_config_sem);
1168
1169	if (subsys->allow_any_host)
1170		return true;
1171
1172	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1173		return true;
1174
1175	list_for_each_entry(p, &subsys->hosts, entry) {
1176		if (!strcmp(nvmet_host_name(p->host), hostnqn))
1177			return true;
1178	}
1179
1180	return false;
1181}
1182
1183/*
1184 * Note: ctrl->subsys->lock should be held when calling this function
1185 */
1186static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1187		struct nvmet_req *req)
1188{
1189	struct nvmet_ns *ns;
1190
1191	if (!req->p2p_client)
1192		return;
1193
1194	ctrl->p2p_client = get_device(req->p2p_client);
1195
1196	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
1197				lockdep_is_held(&ctrl->subsys->lock))
1198		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1199}
1200
1201/*
1202 * Note: ctrl->subsys->lock should be held when calling this function
1203 */
1204static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1205{
1206	struct radix_tree_iter iter;
1207	void __rcu **slot;
1208
1209	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1210		pci_dev_put(radix_tree_deref_slot(slot));
1211
1212	put_device(ctrl->p2p_client);
1213}
1214
1215static void nvmet_fatal_error_handler(struct work_struct *work)
1216{
1217	struct nvmet_ctrl *ctrl =
1218			container_of(work, struct nvmet_ctrl, fatal_err_work);
1219
1220	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1221	ctrl->ops->delete_ctrl(ctrl);
1222}
1223
1224u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1225		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1226{
1227	struct nvmet_subsys *subsys;
1228	struct nvmet_ctrl *ctrl;
1229	int ret;
1230	u16 status;
1231
1232	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1233	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1234	if (!subsys) {
1235		pr_warn("connect request for invalid subsystem %s!\n",
1236			subsysnqn);
1237		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1238		goto out;
1239	}
1240
1241	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1242	down_read(&nvmet_config_sem);
1243	if (!nvmet_host_allowed(subsys, hostnqn)) {
1244		pr_info("connect by host %s for subsystem %s not allowed\n",
1245			hostnqn, subsysnqn);
1246		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1247		up_read(&nvmet_config_sem);
1248		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1249		goto out_put_subsystem;
1250	}
1251	up_read(&nvmet_config_sem);
1252
1253	status = NVME_SC_INTERNAL;
1254	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1255	if (!ctrl)
1256		goto out_put_subsystem;
1257	mutex_init(&ctrl->lock);
1258
1259	nvmet_init_cap(ctrl);
1260
1261	ctrl->port = req->port;
1262
1263	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1264	INIT_LIST_HEAD(&ctrl->async_events);
1265	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1266	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1267
1268	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1269	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1270
1271	kref_init(&ctrl->ref);
1272	ctrl->subsys = subsys;
1273	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1274
1275	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1276			sizeof(__le32), GFP_KERNEL);
1277	if (!ctrl->changed_ns_list)
1278		goto out_free_ctrl;
1279
1280	ctrl->cqs = kcalloc(subsys->max_qid + 1,
1281			sizeof(struct nvmet_cq *),
1282			GFP_KERNEL);
1283	if (!ctrl->cqs)
1284		goto out_free_changed_ns_list;
1285
1286	ctrl->sqs = kcalloc(subsys->max_qid + 1,
1287			sizeof(struct nvmet_sq *),
1288			GFP_KERNEL);
1289	if (!ctrl->sqs)
1290		goto out_free_cqs;
1291
1292	if (subsys->cntlid_min > subsys->cntlid_max)
1293		goto out_free_cqs;
1294
1295	ret = ida_simple_get(&cntlid_ida,
1296			     subsys->cntlid_min, subsys->cntlid_max,
1297			     GFP_KERNEL);
1298	if (ret < 0) {
1299		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1300		goto out_free_sqs;
1301	}
1302	ctrl->cntlid = ret;
1303
1304	ctrl->ops = req->ops;
1305
1306	/*
1307	 * Discovery controllers may use some arbitrary high value
1308	 * in order to cleanup stale discovery sessions
1309	 */
1310	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1311		kato = NVMET_DISC_KATO_MS;
1312
1313	/* keep-alive timeout in seconds */
1314	ctrl->kato = DIV_ROUND_UP(kato, 1000);
1315
1316	ctrl->err_counter = 0;
1317	spin_lock_init(&ctrl->error_lock);
1318
1319	nvmet_start_keep_alive_timer(ctrl);
1320
1321	mutex_lock(&subsys->lock);
1322	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1323	nvmet_setup_p2p_ns_map(ctrl, req);
1324	mutex_unlock(&subsys->lock);
1325
1326	*ctrlp = ctrl;
1327	return 0;
1328
1329out_free_sqs:
1330	kfree(ctrl->sqs);
1331out_free_cqs:
1332	kfree(ctrl->cqs);
1333out_free_changed_ns_list:
1334	kfree(ctrl->changed_ns_list);
1335out_free_ctrl:
1336	kfree(ctrl);
1337out_put_subsystem:
1338	nvmet_subsys_put(subsys);
1339out:
1340	return status;
1341}
1342
/*
 * kref release callback for a controller, invoked through nvmet_ctrl_put()
 * when the last reference is dropped.  Unlinks the controller from its
 * subsystem, quiesces its work items, returns the controller ID to
 * cntlid_ida, frees the controller memory and finally drops the
 * controller's reference on the subsystem.
 */
static void nvmet_ctrl_free(struct kref *ref)
{
	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
	/* keep a local copy: ctrl is freed before the subsystem is put */
	struct nvmet_subsys *subsys = ctrl->subsys;

	mutex_lock(&subsys->lock);
	nvmet_release_p2p_ns_map(ctrl);
	list_del(&ctrl->subsys_entry);
	mutex_unlock(&subsys->lock);

	nvmet_stop_keep_alive_timer(ctrl);

	/* no work item may still reference ctrl once it is freed below */
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fatal_err_work);

	ida_simple_remove(&cntlid_ida, ctrl->cntlid);

	kfree(ctrl->sqs);
	kfree(ctrl->cqs);
	kfree(ctrl->changed_ns_list);
	kfree(ctrl);

	nvmet_subsys_put(subsys);
}
1367
/*
 * Drop one reference on @ctrl; nvmet_ctrl_free() runs when the last
 * reference goes away.
 */
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
	kref_put(&ctrl->ref, nvmet_ctrl_free);
}
1372
1373void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1374{
1375	mutex_lock(&ctrl->lock);
1376	if (!(ctrl->csts & NVME_CSTS_CFS)) {
1377		ctrl->csts |= NVME_CSTS_CFS;
1378		schedule_work(&ctrl->fatal_err_work);
1379	}
1380	mutex_unlock(&ctrl->lock);
1381}
1382EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1383
1384static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1385		const char *subsysnqn)
1386{
1387	struct nvmet_subsys_link *p;
1388
1389	if (!port)
1390		return NULL;
1391
1392	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1393		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1394			return NULL;
1395		return nvmet_disc_subsys;
1396	}
1397
1398	down_read(&nvmet_config_sem);
1399	list_for_each_entry(p, &port->subsystems, entry) {
1400		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1401				NVMF_NQN_SIZE)) {
1402			if (!kref_get_unless_zero(&p->subsys->ref))
1403				break;
1404			up_read(&nvmet_config_sem);
1405			return p->subsys;
1406		}
1407	}
1408	up_read(&nvmet_config_sem);
1409	return NULL;
1410}
1411
1412struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1413		enum nvme_subsys_type type)
1414{
1415	struct nvmet_subsys *subsys;
1416
1417	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1418	if (!subsys)
1419		return ERR_PTR(-ENOMEM);
1420
1421	subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
1422	/* generate a random serial number as our controllers are ephemeral: */
1423	get_random_bytes(&subsys->serial, sizeof(subsys->serial));
1424
1425	switch (type) {
1426	case NVME_NQN_NVME:
1427		subsys->max_qid = NVMET_NR_QUEUES;
1428		break;
1429	case NVME_NQN_DISC:
1430		subsys->max_qid = 0;
1431		break;
1432	default:
1433		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1434		kfree(subsys);
1435		return ERR_PTR(-EINVAL);
1436	}
1437	subsys->type = type;
1438	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1439			GFP_KERNEL);
1440	if (!subsys->subsysnqn) {
1441		kfree(subsys);
1442		return ERR_PTR(-ENOMEM);
1443	}
1444	subsys->cntlid_min = NVME_CNTLID_MIN;
1445	subsys->cntlid_max = NVME_CNTLID_MAX;
1446	kref_init(&subsys->ref);
1447
1448	mutex_init(&subsys->lock);
1449	INIT_LIST_HEAD(&subsys->namespaces);
1450	INIT_LIST_HEAD(&subsys->ctrls);
1451	INIT_LIST_HEAD(&subsys->hosts);
1452
1453	return subsys;
1454}
1455
1456static void nvmet_subsys_free(struct kref *ref)
1457{
1458	struct nvmet_subsys *subsys =
1459		container_of(ref, struct nvmet_subsys, ref);
1460
1461	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
1462
1463	kfree(subsys->subsysnqn);
1464	kfree_rcu(subsys->model, rcuhead);
1465	kfree(subsys);
1466}
1467
1468void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1469{
1470	struct nvmet_ctrl *ctrl;
1471
1472	mutex_lock(&subsys->lock);
1473	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1474		ctrl->ops->delete_ctrl(ctrl);
1475	mutex_unlock(&subsys->lock);
1476}
1477
/*
 * Drop one reference on @subsys; nvmet_subsys_free() runs when the last
 * reference goes away.
 */
void nvmet_subsys_put(struct nvmet_subsys *subsys)
{
	kref_put(&subsys->ref, nvmet_subsys_free);
}
1482
1483static int __init nvmet_init(void)
1484{
1485	int error;
1486
1487	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1488
1489	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1490			WQ_MEM_RECLAIM, 0);
1491	if (!buffered_io_wq) {
1492		error = -ENOMEM;
1493		goto out;
1494	}
1495
1496	error = nvmet_init_discovery();
1497	if (error)
1498		goto out_free_work_queue;
1499
1500	error = nvmet_init_configfs();
1501	if (error)
1502		goto out_exit_discovery;
1503	return 0;
1504
1505out_exit_discovery:
1506	nvmet_exit_discovery();
1507out_free_work_queue:
1508	destroy_workqueue(buffered_io_wq);
1509out:
1510	return error;
1511}
1512
1513static void __exit nvmet_exit(void)
1514{
1515	nvmet_exit_configfs();
1516	nvmet_exit_discovery();
1517	ida_destroy(&cntlid_ida);
1518	destroy_workqueue(buffered_io_wq);
1519
1520	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1521	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1522}
1523
/* Register nvmet_init()/nvmet_exit() as the module's entry and exit points. */
module_init(nvmet_init);
module_exit(nvmet_exit);

MODULE_LICENSE("GPL v2");