drivers/nvme/target/core.c at v5.8 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / nvme / target / core.c
at v5.8 1604 lines 38 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Common code for the NVMe target.
   4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/random.h>
   9#include <linux/rculist.h>
  10#include <linux/pci-p2pdma.h>
  11#include <linux/scatterlist.h>
  12
  13#define CREATE_TRACE_POINTS
  14#include "trace.h"
  15
  16#include "nvmet.h"
  17
  18struct workqueue_struct *buffered_io_wq;
  19static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
  20static DEFINE_IDA(cntlid_ida);
  21
  22/*
  23 * This read/write semaphore is used to synchronize access to configuration
  24 * information on a target system that will result in discovery log page
  25 * information change for at least one host.
   26 * The full list of resources protected by this semaphore is:
  27 *
  28 *  - subsystems list
  29 *  - per-subsystem allowed hosts list
  30 *  - allow_any_host subsystem attribute
  31 *  - nvmet_genctr
  32 *  - the nvmet_transports array
  33 *
  34 * When updating any of those lists/structures write lock should be obtained,
   35 * while when reading (populating discovery log page or checking host-subsystem
  36 * link) read lock is obtained to allow concurrent reads.
  37 */
  38DECLARE_RWSEM(nvmet_config_sem);
  39
  40u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
  41u64 nvmet_ana_chgcnt;
  42DECLARE_RWSEM(nvmet_ana_sem);
  43
  44inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
  45{
  46	u16 status;
  47
  48	switch (errno) {
  49	case 0:
  50		status = NVME_SC_SUCCESS;
  51		break;
  52	case -ENOSPC:
  53		req->error_loc = offsetof(struct nvme_rw_command, length);
  54		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
  55		break;
  56	case -EREMOTEIO:
  57		req->error_loc = offsetof(struct nvme_rw_command, slba);
  58		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
  59		break;
  60	case -EOPNOTSUPP:
  61		req->error_loc = offsetof(struct nvme_common_command, opcode);
  62		switch (req->cmd->common.opcode) {
  63		case nvme_cmd_dsm:
  64		case nvme_cmd_write_zeroes:
  65			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
  66			break;
  67		default:
  68			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
  69		}
  70		break;
  71	case -ENODATA:
  72		req->error_loc = offsetof(struct nvme_rw_command, nsid);
  73		status = NVME_SC_ACCESS_DENIED;
  74		break;
  75	case -EIO:
  76		/* FALLTHRU */
  77	default:
  78		req->error_loc = offsetof(struct nvme_common_command, opcode);
  79		status = NVME_SC_INTERNAL | NVME_SC_DNR;
  80	}
  81
  82	return status;
  83}
  84
  85static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
  86		const char *subsysnqn);
  87
  88u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
  89		size_t len)
  90{
  91	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
  92		req->error_loc = offsetof(struct nvme_common_command, dptr);
  93		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
  94	}
  95	return 0;
  96}
  97
  98u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
  99{
 100	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 101		req->error_loc = offsetof(struct nvme_common_command, dptr);
 102		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 103	}
 104	return 0;
 105}
 106
 107u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 108{
 109	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
 110		req->error_loc = offsetof(struct nvme_common_command, dptr);
 111		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 112	}
 113	return 0;
 114}
 115
 116static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
 117{
 118	struct nvmet_ns *ns;
 119
 120	if (list_empty(&subsys->namespaces))
 121		return 0;
 122
 123	ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
 124	return ns->nsid;
 125}
 126
 127static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
 128{
 129	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
 130}
 131
 132static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
 133{
 134	u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
 135	struct nvmet_req *req;
 136
 137	mutex_lock(&ctrl->lock);
 138	while (ctrl->nr_async_event_cmds) {
 139		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 140		mutex_unlock(&ctrl->lock);
 141		nvmet_req_complete(req, status);
 142		mutex_lock(&ctrl->lock);
 143	}
 144	mutex_unlock(&ctrl->lock);
 145}
 146
 147static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
 148{
 149	struct nvmet_async_event *aen;
 150	struct nvmet_req *req;
 151
 152	mutex_lock(&ctrl->lock);
 153	while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
 154		aen = list_first_entry(&ctrl->async_events,
 155				       struct nvmet_async_event, entry);
 156		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 157		nvmet_set_result(req, nvmet_async_event_result(aen));
 158
 159		list_del(&aen->entry);
 160		kfree(aen);
 161
 162		mutex_unlock(&ctrl->lock);
 163		trace_nvmet_async_event(ctrl, req->cqe->result.u32);
 164		nvmet_req_complete(req, 0);
 165		mutex_lock(&ctrl->lock);
 166	}
 167	mutex_unlock(&ctrl->lock);
 168}
 169
 170static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
 171{
 172	struct nvmet_async_event *aen, *tmp;
 173
 174	mutex_lock(&ctrl->lock);
 175	list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
 176		list_del(&aen->entry);
 177		kfree(aen);
 178	}
 179	mutex_unlock(&ctrl->lock);
 180}
 181
 182static void nvmet_async_event_work(struct work_struct *work)
 183{
 184	struct nvmet_ctrl *ctrl =
 185		container_of(work, struct nvmet_ctrl, async_event_work);
 186
 187	nvmet_async_events_process(ctrl);
 188}
 189
 190void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 191		u8 event_info, u8 log_page)
 192{
 193	struct nvmet_async_event *aen;
 194
 195	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
 196	if (!aen)
 197		return;
 198
 199	aen->event_type = event_type;
 200	aen->event_info = event_info;
 201	aen->log_page = log_page;
 202
 203	mutex_lock(&ctrl->lock);
 204	list_add_tail(&aen->entry, &ctrl->async_events);
 205	mutex_unlock(&ctrl->lock);
 206
 207	schedule_work(&ctrl->async_event_work);
 208}
 209
 210static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 211{
 212	u32 i;
 213
 214	mutex_lock(&ctrl->lock);
 215	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
 216		goto out_unlock;
 217
 218	for (i = 0; i < ctrl->nr_changed_ns; i++) {
 219		if (ctrl->changed_ns_list[i] == nsid)
 220			goto out_unlock;
 221	}
 222
 223	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
 224		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
 225		ctrl->nr_changed_ns = U32_MAX;
 226		goto out_unlock;
 227	}
 228
 229	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
 230out_unlock:
 231	mutex_unlock(&ctrl->lock);
 232}
 233
 234void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 235{
 236	struct nvmet_ctrl *ctrl;
 237
 238	lockdep_assert_held(&subsys->lock);
 239
 240	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 241		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 242		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 243			continue;
 244		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 245				NVME_AER_NOTICE_NS_CHANGED,
 246				NVME_LOG_CHANGED_NS);
 247	}
 248}
 249
 250void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 251		struct nvmet_port *port)
 252{
 253	struct nvmet_ctrl *ctrl;
 254
 255	mutex_lock(&subsys->lock);
 256	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 257		if (port && ctrl->port != port)
 258			continue;
 259		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 260			continue;
 261		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 262				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 263	}
 264	mutex_unlock(&subsys->lock);
 265}
 266
 267void nvmet_port_send_ana_event(struct nvmet_port *port)
 268{
 269	struct nvmet_subsys_link *p;
 270
 271	down_read(&nvmet_config_sem);
 272	list_for_each_entry(p, &port->subsystems, entry)
 273		nvmet_send_ana_event(p->subsys, port);
 274	up_read(&nvmet_config_sem);
 275}
 276
 277int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 278{
 279	int ret = 0;
 280
 281	down_write(&nvmet_config_sem);
 282	if (nvmet_transports[ops->type])
 283		ret = -EINVAL;
 284	else
 285		nvmet_transports[ops->type] = ops;
 286	up_write(&nvmet_config_sem);
 287
 288	return ret;
 289}
 290EXPORT_SYMBOL_GPL(nvmet_register_transport);
 291
 292void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 293{
 294	down_write(&nvmet_config_sem);
 295	nvmet_transports[ops->type] = NULL;
 296	up_write(&nvmet_config_sem);
 297}
 298EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
 299
 300void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
 301{
 302	struct nvmet_ctrl *ctrl;
 303
 304	mutex_lock(&subsys->lock);
 305	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 306		if (ctrl->port == port)
 307			ctrl->ops->delete_ctrl(ctrl);
 308	}
 309	mutex_unlock(&subsys->lock);
 310}
 311
 312int nvmet_enable_port(struct nvmet_port *port)
 313{
 314	const struct nvmet_fabrics_ops *ops;
 315	int ret;
 316
 317	lockdep_assert_held(&nvmet_config_sem);
 318
 319	ops = nvmet_transports[port->disc_addr.trtype];
 320	if (!ops) {
 321		up_write(&nvmet_config_sem);
 322		request_module("nvmet-transport-%d", port->disc_addr.trtype);
 323		down_write(&nvmet_config_sem);
 324		ops = nvmet_transports[port->disc_addr.trtype];
 325		if (!ops) {
 326			pr_err("transport type %d not supported\n",
 327				port->disc_addr.trtype);
 328			return -EINVAL;
 329		}
 330	}
 331
 332	if (!try_module_get(ops->owner))
 333		return -EINVAL;
 334
 335	/*
 336	 * If the user requested PI support and the transport isn't pi capable,
 337	 * don't enable the port.
 338	 */
 339	if (port->pi_enable && !ops->metadata_support) {
 340		pr_err("T10-PI is not supported by transport type %d\n",
 341		       port->disc_addr.trtype);
 342		ret = -EINVAL;
 343		goto out_put;
 344	}
 345
 346	ret = ops->add_port(port);
 347	if (ret)
 348		goto out_put;
 349
 350	/* If the transport didn't set inline_data_size, then disable it. */
 351	if (port->inline_data_size < 0)
 352		port->inline_data_size = 0;
 353
 354	port->enabled = true;
 355	port->tr_ops = ops;
 356	return 0;
 357
 358out_put:
 359	module_put(ops->owner);
 360	return ret;
 361}
 362
 363void nvmet_disable_port(struct nvmet_port *port)
 364{
 365	const struct nvmet_fabrics_ops *ops;
 366
 367	lockdep_assert_held(&nvmet_config_sem);
 368
 369	port->enabled = false;
 370	port->tr_ops = NULL;
 371
 372	ops = nvmet_transports[port->disc_addr.trtype];
 373	ops->remove_port(port);
 374	module_put(ops->owner);
 375}
 376
 377static void nvmet_keep_alive_timer(struct work_struct *work)
 378{
 379	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 380			struct nvmet_ctrl, ka_work);
 381	bool cmd_seen = ctrl->cmd_seen;
 382
 383	ctrl->cmd_seen = false;
 384	if (cmd_seen) {
 385		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
 386			ctrl->cntlid);
 387		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 388		return;
 389	}
 390
 391	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
 392		ctrl->cntlid, ctrl->kato);
 393
 394	nvmet_ctrl_fatal_error(ctrl);
 395}
 396
 397static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 398{
 399	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
 400		ctrl->cntlid, ctrl->kato);
 401
 402	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
 403	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 404}
 405
 406static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 407{
 408	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 409
 410	cancel_delayed_work_sync(&ctrl->ka_work);
 411}
 412
 413static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
 414		__le32 nsid)
 415{
 416	struct nvmet_ns *ns;
 417
 418	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
 419		if (ns->nsid == le32_to_cpu(nsid))
 420			return ns;
 421	}
 422
 423	return NULL;
 424}
 425
 426struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
 427{
 428	struct nvmet_ns *ns;
 429
 430	rcu_read_lock();
 431	ns = __nvmet_find_namespace(ctrl, nsid);
 432	if (ns)
 433		percpu_ref_get(&ns->ref);
 434	rcu_read_unlock();
 435
 436	return ns;
 437}
 438
 439static void nvmet_destroy_namespace(struct percpu_ref *ref)
 440{
 441	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
 442
 443	complete(&ns->disable_done);
 444}
 445
 446void nvmet_put_namespace(struct nvmet_ns *ns)
 447{
 448	percpu_ref_put(&ns->ref);
 449}
 450
 451static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 452{
 453	nvmet_bdev_ns_disable(ns);
 454	nvmet_file_ns_disable(ns);
 455}
 456
 457static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 458{
 459	int ret;
 460	struct pci_dev *p2p_dev;
 461
 462	if (!ns->use_p2pmem)
 463		return 0;
 464
 465	if (!ns->bdev) {
 466		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
 467		return -EINVAL;
 468	}
 469
 470	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
 471		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
 472		       ns->device_path);
 473		return -EINVAL;
 474	}
 475
 476	if (ns->p2p_dev) {
 477		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
 478		if (ret < 0)
 479			return -EINVAL;
 480	} else {
 481		/*
 482		 * Right now we just check that there is p2pmem available so
 483		 * we can report an error to the user right away if there
 484		 * is not. We'll find the actual device to use once we
 485		 * setup the controller when the port's device is available.
 486		 */
 487
 488		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
 489		if (!p2p_dev) {
 490			pr_err("no peer-to-peer memory is available for %s\n",
 491			       ns->device_path);
 492			return -EINVAL;
 493		}
 494
 495		pci_dev_put(p2p_dev);
 496	}
 497
 498	return 0;
 499}
 500
 501/*
 502 * Note: ctrl->subsys->lock should be held when calling this function
 503 */
 504static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 505				    struct nvmet_ns *ns)
 506{
 507	struct device *clients[2];
 508	struct pci_dev *p2p_dev;
 509	int ret;
 510
 511	if (!ctrl->p2p_client || !ns->use_p2pmem)
 512		return;
 513
 514	if (ns->p2p_dev) {
 515		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
 516		if (ret < 0)
 517			return;
 518
 519		p2p_dev = pci_dev_get(ns->p2p_dev);
 520	} else {
 521		clients[0] = ctrl->p2p_client;
 522		clients[1] = nvmet_ns_dev(ns);
 523
 524		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
 525		if (!p2p_dev) {
 526			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
 527			       dev_name(ctrl->p2p_client), ns->device_path);
 528			return;
 529		}
 530	}
 531
 532	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
 533	if (ret < 0)
 534		pci_dev_put(p2p_dev);
 535
 536	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
 537		ns->nsid);
 538}
 539
 540void nvmet_ns_revalidate(struct nvmet_ns *ns)
 541{
 542	loff_t oldsize = ns->size;
 543
 544	if (ns->bdev)
 545		nvmet_bdev_ns_revalidate(ns);
 546	else
 547		nvmet_file_ns_revalidate(ns);
 548
 549	if (oldsize != ns->size)
 550		nvmet_ns_changed(ns->subsys, ns->nsid);
 551}
 552
 553int nvmet_ns_enable(struct nvmet_ns *ns)
 554{
 555	struct nvmet_subsys *subsys = ns->subsys;
 556	struct nvmet_ctrl *ctrl;
 557	int ret;
 558
 559	mutex_lock(&subsys->lock);
 560	ret = 0;
 561	if (ns->enabled)
 562		goto out_unlock;
 563
 564	ret = -EMFILE;
 565	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
 566		goto out_unlock;
 567
 568	ret = nvmet_bdev_ns_enable(ns);
 569	if (ret == -ENOTBLK)
 570		ret = nvmet_file_ns_enable(ns);
 571	if (ret)
 572		goto out_unlock;
 573
 574	ret = nvmet_p2pmem_ns_enable(ns);
 575	if (ret)
 576		goto out_dev_disable;
 577
 578	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 579		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 580
 581	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
 582				0, GFP_KERNEL);
 583	if (ret)
 584		goto out_dev_put;
 585
 586	if (ns->nsid > subsys->max_nsid)
 587		subsys->max_nsid = ns->nsid;
 588
 589	/*
 590	 * The namespaces list needs to be sorted to simplify the implementation
 591	 * of the Identify Namepace List subcommand.
 592	 */
 593	if (list_empty(&subsys->namespaces)) {
 594		list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
 595	} else {
 596		struct nvmet_ns *old;
 597
 598		list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
 599					lockdep_is_held(&subsys->lock)) {
 600			BUG_ON(ns->nsid == old->nsid);
 601			if (ns->nsid < old->nsid)
 602				break;
 603		}
 604
 605		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
 606	}
 607	subsys->nr_namespaces++;
 608
 609	nvmet_ns_changed(subsys, ns->nsid);
 610	ns->enabled = true;
 611	ret = 0;
 612out_unlock:
 613	mutex_unlock(&subsys->lock);
 614	return ret;
 615out_dev_put:
 616	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 617		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 618out_dev_disable:
 619	nvmet_ns_dev_disable(ns);
 620	goto out_unlock;
 621}
 622
 623void nvmet_ns_disable(struct nvmet_ns *ns)
 624{
 625	struct nvmet_subsys *subsys = ns->subsys;
 626	struct nvmet_ctrl *ctrl;
 627
 628	mutex_lock(&subsys->lock);
 629	if (!ns->enabled)
 630		goto out_unlock;
 631
 632	ns->enabled = false;
 633	list_del_rcu(&ns->dev_link);
 634	if (ns->nsid == subsys->max_nsid)
 635		subsys->max_nsid = nvmet_max_nsid(subsys);
 636
 637	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 638		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 639
 640	mutex_unlock(&subsys->lock);
 641
 642	/*
 643	 * Now that we removed the namespaces from the lookup list, we
 644	 * can kill the per_cpu ref and wait for any remaining references
 645	 * to be dropped, as well as a RCU grace period for anyone only
 646	 * using the namepace under rcu_read_lock().  Note that we can't
 647	 * use call_rcu here as we need to ensure the namespaces have
 648	 * been fully destroyed before unloading the module.
 649	 */
 650	percpu_ref_kill(&ns->ref);
 651	synchronize_rcu();
 652	wait_for_completion(&ns->disable_done);
 653	percpu_ref_exit(&ns->ref);
 654
 655	mutex_lock(&subsys->lock);
 656
 657	subsys->nr_namespaces--;
 658	nvmet_ns_changed(subsys, ns->nsid);
 659	nvmet_ns_dev_disable(ns);
 660out_unlock:
 661	mutex_unlock(&subsys->lock);
 662}
 663
 664void nvmet_ns_free(struct nvmet_ns *ns)
 665{
 666	nvmet_ns_disable(ns);
 667
 668	down_write(&nvmet_ana_sem);
 669	nvmet_ana_group_enabled[ns->anagrpid]--;
 670	up_write(&nvmet_ana_sem);
 671
 672	kfree(ns->device_path);
 673	kfree(ns);
 674}
 675
 676struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 677{
 678	struct nvmet_ns *ns;
 679
 680	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 681	if (!ns)
 682		return NULL;
 683
 684	INIT_LIST_HEAD(&ns->dev_link);
 685	init_completion(&ns->disable_done);
 686
 687	ns->nsid = nsid;
 688	ns->subsys = subsys;
 689
 690	down_write(&nvmet_ana_sem);
 691	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 692	nvmet_ana_group_enabled[ns->anagrpid]++;
 693	up_write(&nvmet_ana_sem);
 694
 695	uuid_gen(&ns->uuid);
 696	ns->buffered_io = false;
 697
 698	return ns;
 699}
 700
 701static void nvmet_update_sq_head(struct nvmet_req *req)
 702{
 703	if (req->sq->size) {
 704		u32 old_sqhd, new_sqhd;
 705
 706		do {
 707			old_sqhd = req->sq->sqhd;
 708			new_sqhd = (old_sqhd + 1) % req->sq->size;
 709		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 710					old_sqhd);
 711	}
 712	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 713}
 714
 715static void nvmet_set_error(struct nvmet_req *req, u16 status)
 716{
 717	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 718	struct nvme_error_slot *new_error_slot;
 719	unsigned long flags;
 720
 721	req->cqe->status = cpu_to_le16(status << 1);
 722
 723	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
 724		return;
 725
 726	spin_lock_irqsave(&ctrl->error_lock, flags);
 727	ctrl->err_counter++;
 728	new_error_slot =
 729		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
 730
 731	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
 732	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
 733	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
 734	new_error_slot->status_field = cpu_to_le16(status << 1);
 735	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
 736	new_error_slot->lba = cpu_to_le64(req->error_slba);
 737	new_error_slot->nsid = req->cmd->common.nsid;
 738	spin_unlock_irqrestore(&ctrl->error_lock, flags);
 739
 740	/* set the more bit for this request */
 741	req->cqe->status |= cpu_to_le16(1 << 14);
 742}
 743
 744static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 745{
 746	if (!req->sq->sqhd_disabled)
 747		nvmet_update_sq_head(req);
 748	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
 749	req->cqe->command_id = req->cmd->common.command_id;
 750
 751	if (unlikely(status))
 752		nvmet_set_error(req, status);
 753
 754	trace_nvmet_req_complete(req);
 755
 756	if (req->ns)
 757		nvmet_put_namespace(req->ns);
 758	req->ops->queue_response(req);
 759}
 760
 761void nvmet_req_complete(struct nvmet_req *req, u16 status)
 762{
 763	__nvmet_req_complete(req, status);
 764	percpu_ref_put(&req->sq->ref);
 765}
 766EXPORT_SYMBOL_GPL(nvmet_req_complete);
 767
 768void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 769		u16 qid, u16 size)
 770{
 771	cq->qid = qid;
 772	cq->size = size;
 773
 774	ctrl->cqs[qid] = cq;
 775}
 776
 777void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
 778		u16 qid, u16 size)
 779{
 780	sq->sqhd = 0;
 781	sq->qid = qid;
 782	sq->size = size;
 783
 784	ctrl->sqs[qid] = sq;
 785}
 786
 787static void nvmet_confirm_sq(struct percpu_ref *ref)
 788{
 789	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 790
 791	complete(&sq->confirm_done);
 792}
 793
 794void nvmet_sq_destroy(struct nvmet_sq *sq)
 795{
 796	struct nvmet_ctrl *ctrl = sq->ctrl;
 797
 798	/*
 799	 * If this is the admin queue, complete all AERs so that our
 800	 * queue doesn't have outstanding requests on it.
 801	 */
 802	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
 803		nvmet_async_events_failall(ctrl);
 804	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
 805	wait_for_completion(&sq->confirm_done);
 806	wait_for_completion(&sq->free_done);
 807	percpu_ref_exit(&sq->ref);
 808
 809	if (ctrl) {
 810		nvmet_ctrl_put(ctrl);
 811		sq->ctrl = NULL; /* allows reusing the queue later */
 812	}
 813}
 814EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
 815
 816static void nvmet_sq_free(struct percpu_ref *ref)
 817{
 818	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 819
 820	complete(&sq->free_done);
 821}
 822
 823int nvmet_sq_init(struct nvmet_sq *sq)
 824{
 825	int ret;
 826
 827	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
 828	if (ret) {
 829		pr_err("percpu_ref init failed!\n");
 830		return ret;
 831	}
 832	init_completion(&sq->free_done);
 833	init_completion(&sq->confirm_done);
 834
 835	return 0;
 836}
 837EXPORT_SYMBOL_GPL(nvmet_sq_init);
 838
 839static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
 840		struct nvmet_ns *ns)
 841{
 842	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
 843
 844	if (unlikely(state == NVME_ANA_INACCESSIBLE))
 845		return NVME_SC_ANA_INACCESSIBLE;
 846	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
 847		return NVME_SC_ANA_PERSISTENT_LOSS;
 848	if (unlikely(state == NVME_ANA_CHANGE))
 849		return NVME_SC_ANA_TRANSITION;
 850	return 0;
 851}
 852
 853static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 854{
 855	if (unlikely(req->ns->readonly)) {
 856		switch (req->cmd->common.opcode) {
 857		case nvme_cmd_read:
 858		case nvme_cmd_flush:
 859			break;
 860		default:
 861			return NVME_SC_NS_WRITE_PROTECTED;
 862		}
 863	}
 864
 865	return 0;
 866}
 867
 868static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 869{
 870	struct nvme_command *cmd = req->cmd;
 871	u16 ret;
 872
 873	ret = nvmet_check_ctrl_status(req, cmd);
 874	if (unlikely(ret))
 875		return ret;
 876
 877	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
 878	if (unlikely(!req->ns)) {
 879		req->error_loc = offsetof(struct nvme_common_command, nsid);
 880		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 881	}
 882	ret = nvmet_check_ana_state(req->port, req->ns);
 883	if (unlikely(ret)) {
 884		req->error_loc = offsetof(struct nvme_common_command, nsid);
 885		return ret;
 886	}
 887	ret = nvmet_io_cmd_check_access(req);
 888	if (unlikely(ret)) {
 889		req->error_loc = offsetof(struct nvme_common_command, nsid);
 890		return ret;
 891	}
 892
 893	if (req->ns->file)
 894		return nvmet_file_parse_io_cmd(req);
 895	else
 896		return nvmet_bdev_parse_io_cmd(req);
 897}
 898
 899bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 900		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
 901{
 902	u8 flags = req->cmd->common.flags;
 903	u16 status;
 904
 905	req->cq = cq;
 906	req->sq = sq;
 907	req->ops = ops;
 908	req->sg = NULL;
 909	req->metadata_sg = NULL;
 910	req->sg_cnt = 0;
 911	req->metadata_sg_cnt = 0;
 912	req->transfer_len = 0;
 913	req->metadata_len = 0;
 914	req->cqe->status = 0;
 915	req->cqe->sq_head = 0;
 916	req->ns = NULL;
 917	req->error_loc = NVMET_NO_ERROR_LOC;
 918	req->error_slba = 0;
 919
 920	trace_nvmet_req_init(req, req->cmd);
 921
 922	/* no support for fused commands yet */
 923	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
 924		req->error_loc = offsetof(struct nvme_common_command, flags);
 925		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 926		goto fail;
 927	}
 928
 929	/*
 930	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
 931	 * contains an address of a single contiguous physical buffer that is
 932	 * byte aligned.
 933	 */
 934	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
 935		req->error_loc = offsetof(struct nvme_common_command, flags);
 936		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 937		goto fail;
 938	}
 939
 940	if (unlikely(!req->sq->ctrl))
 941		/* will return an error for any non-connect command: */
 942		status = nvmet_parse_connect_cmd(req);
 943	else if (likely(req->sq->qid != 0))
 944		status = nvmet_parse_io_cmd(req);
 945	else
 946		status = nvmet_parse_admin_cmd(req);
 947
 948	if (status)
 949		goto fail;
 950
 951	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
 952		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 953		goto fail;
 954	}
 955
 956	if (sq->ctrl)
 957		sq->ctrl->cmd_seen = true;
 958
 959	return true;
 960
 961fail:
 962	__nvmet_req_complete(req, status);
 963	return false;
 964}
 965EXPORT_SYMBOL_GPL(nvmet_req_init);
 966
 967void nvmet_req_uninit(struct nvmet_req *req)
 968{
 969	percpu_ref_put(&req->sq->ref);
 970	if (req->ns)
 971		nvmet_put_namespace(req->ns);
 972}
 973EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 974
 975bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
 976{
 977	if (unlikely(len != req->transfer_len)) {
 978		req->error_loc = offsetof(struct nvme_common_command, dptr);
 979		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 980		return false;
 981	}
 982
 983	return true;
 984}
 985EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
 986
 987bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
 988{
 989	if (unlikely(data_len > req->transfer_len)) {
 990		req->error_loc = offsetof(struct nvme_common_command, dptr);
 991		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 992		return false;
 993	}
 994
 995	return true;
 996}
 997
 998static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
 999{
1000	return req->transfer_len - req->metadata_len;
1001}
1002
1003static int nvmet_req_alloc_p2pmem_sgls(struct nvmet_req *req)
1004{
1005	req->sg = pci_p2pmem_alloc_sgl(req->p2p_dev, &req->sg_cnt,
1006			nvmet_data_transfer_len(req));
1007	if (!req->sg)
1008		goto out_err;
1009
1010	if (req->metadata_len) {
1011		req->metadata_sg = pci_p2pmem_alloc_sgl(req->p2p_dev,
1012				&req->metadata_sg_cnt, req->metadata_len);
1013		if (!req->metadata_sg)
1014			goto out_free_sg;
1015	}
1016	return 0;
1017out_free_sg:
1018	pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1019out_err:
1020	return -ENOMEM;
1021}
1022
1023static bool nvmet_req_find_p2p_dev(struct nvmet_req *req)
1024{
1025	if (!IS_ENABLED(CONFIG_PCI_P2PDMA))
1026		return false;
1027
1028	if (req->sq->ctrl && req->sq->qid && req->ns) {
1029		req->p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
1030						 req->ns->nsid);
1031		if (req->p2p_dev)
1032			return true;
1033	}
1034
1035	req->p2p_dev = NULL;
1036	return false;
1037}
1038
1039int nvmet_req_alloc_sgls(struct nvmet_req *req)
1040{
1041	if (nvmet_req_find_p2p_dev(req) && !nvmet_req_alloc_p2pmem_sgls(req))
1042		return 0;
1043
1044	req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
1045			    &req->sg_cnt);
1046	if (unlikely(!req->sg))
1047		goto out;
1048
1049	if (req->metadata_len) {
1050		req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
1051					     &req->metadata_sg_cnt);
1052		if (unlikely(!req->metadata_sg))
1053			goto out_free;
1054	}
1055
1056	return 0;
1057out_free:
1058	sgl_free(req->sg);
1059out:
1060	return -ENOMEM;
1061}
1062EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
1063
1064void nvmet_req_free_sgls(struct nvmet_req *req)
1065{
1066	if (req->p2p_dev) {
1067		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1068		if (req->metadata_sg)
1069			pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
1070	} else {
1071		sgl_free(req->sg);
1072		if (req->metadata_sg)
1073			sgl_free(req->metadata_sg);
1074	}
1075
1076	req->sg = NULL;
1077	req->metadata_sg = NULL;
1078	req->sg_cnt = 0;
1079	req->metadata_sg_cnt = 0;
1080}
1081EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
1082
1083static inline bool nvmet_cc_en(u32 cc)
1084{
1085	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
1086}
1087
1088static inline u8 nvmet_cc_css(u32 cc)
1089{
1090	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
1091}
1092
1093static inline u8 nvmet_cc_mps(u32 cc)
1094{
1095	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
1096}
1097
1098static inline u8 nvmet_cc_ams(u32 cc)
1099{
1100	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
1101}
1102
1103static inline u8 nvmet_cc_shn(u32 cc)
1104{
1105	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
1106}
1107
1108static inline u8 nvmet_cc_iosqes(u32 cc)
1109{
1110	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
1111}
1112
1113static inline u8 nvmet_cc_iocqes(u32 cc)
1114{
1115	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1116}
1117
1118static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1119{
1120	lockdep_assert_held(&ctrl->lock);
1121
1122	if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1123	    nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
1124	    nvmet_cc_mps(ctrl->cc) != 0 ||
1125	    nvmet_cc_ams(ctrl->cc) != 0 ||
1126	    nvmet_cc_css(ctrl->cc) != 0) {
1127		ctrl->csts = NVME_CSTS_CFS;
1128		return;
1129	}
1130
1131	ctrl->csts = NVME_CSTS_RDY;
1132
1133	/*
1134	 * Controllers that are not yet enabled should not really enforce the
1135	 * keep alive timeout, but we still want to track a timeout and cleanup
1136	 * in case a host died before it enabled the controller.  Hence, simply
1137	 * reset the keep alive timer when the controller is enabled.
1138	 */
1139	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1140}
1141
1142static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1143{
1144	lockdep_assert_held(&ctrl->lock);
1145
1146	/* XXX: tear down queues? */
1147	ctrl->csts &= ~NVME_CSTS_RDY;
1148	ctrl->cc = 0;
1149}
1150
1151void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1152{
1153	u32 old;
1154
1155	mutex_lock(&ctrl->lock);
1156	old = ctrl->cc;
1157	ctrl->cc = new;
1158
1159	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1160		nvmet_start_ctrl(ctrl);
1161	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1162		nvmet_clear_ctrl(ctrl);
1163	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1164		nvmet_clear_ctrl(ctrl);
1165		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1166	}
1167	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1168		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1169	mutex_unlock(&ctrl->lock);
1170}
1171
1172static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1173{
1174	/* command sets supported: NVMe command set: */
1175	ctrl->cap = (1ULL << 37);
1176	/* CC.EN timeout in 500msec units: */
1177	ctrl->cap |= (15ULL << 24);
1178	/* maximum queue entries supported: */
1179	ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1180}
1181
1182u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1183		struct nvmet_req *req, struct nvmet_ctrl **ret)
1184{
1185	struct nvmet_subsys *subsys;
1186	struct nvmet_ctrl *ctrl;
1187	u16 status = 0;
1188
1189	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1190	if (!subsys) {
1191		pr_warn("connect request for invalid subsystem %s!\n",
1192			subsysnqn);
1193		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1194		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1195	}
1196
1197	mutex_lock(&subsys->lock);
1198	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1199		if (ctrl->cntlid == cntlid) {
1200			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1201				pr_warn("hostnqn mismatch.\n");
1202				continue;
1203			}
1204			if (!kref_get_unless_zero(&ctrl->ref))
1205				continue;
1206
1207			*ret = ctrl;
1208			goto out;
1209		}
1210	}
1211
1212	pr_warn("could not find controller %d for subsys %s / host %s\n",
1213		cntlid, subsysnqn, hostnqn);
1214	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1215	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1216
1217out:
1218	mutex_unlock(&subsys->lock);
1219	nvmet_subsys_put(subsys);
1220	return status;
1221}
1222
1223u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1224{
1225	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1226		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1227		       cmd->common.opcode, req->sq->qid);
1228		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1229	}
1230
1231	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1232		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1233		       cmd->common.opcode, req->sq->qid);
1234		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1235	}
1236	return 0;
1237}
1238
1239bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1240{
1241	struct nvmet_host_link *p;
1242
1243	lockdep_assert_held(&nvmet_config_sem);
1244
1245	if (subsys->allow_any_host)
1246		return true;
1247
1248	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1249		return true;
1250
1251	list_for_each_entry(p, &subsys->hosts, entry) {
1252		if (!strcmp(nvmet_host_name(p->host), hostnqn))
1253			return true;
1254	}
1255
1256	return false;
1257}
1258
1259/*
1260 * Note: ctrl->subsys->lock should be held when calling this function
1261 */
1262static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1263		struct nvmet_req *req)
1264{
1265	struct nvmet_ns *ns;
1266
1267	if (!req->p2p_client)
1268		return;
1269
1270	ctrl->p2p_client = get_device(req->p2p_client);
1271
1272	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
1273				lockdep_is_held(&ctrl->subsys->lock))
1274		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1275}
1276
1277/*
1278 * Note: ctrl->subsys->lock should be held when calling this function
1279 */
1280static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1281{
1282	struct radix_tree_iter iter;
1283	void __rcu **slot;
1284
1285	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1286		pci_dev_put(radix_tree_deref_slot(slot));
1287
1288	put_device(ctrl->p2p_client);
1289}
1290
1291static void nvmet_fatal_error_handler(struct work_struct *work)
1292{
1293	struct nvmet_ctrl *ctrl =
1294			container_of(work, struct nvmet_ctrl, fatal_err_work);
1295
1296	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1297	ctrl->ops->delete_ctrl(ctrl);
1298}
1299
1300u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1301		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1302{
1303	struct nvmet_subsys *subsys;
1304	struct nvmet_ctrl *ctrl;
1305	int ret;
1306	u16 status;
1307
1308	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1309	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1310	if (!subsys) {
1311		pr_warn("connect request for invalid subsystem %s!\n",
1312			subsysnqn);
1313		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1314		goto out;
1315	}
1316
1317	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1318	down_read(&nvmet_config_sem);
1319	if (!nvmet_host_allowed(subsys, hostnqn)) {
1320		pr_info("connect by host %s for subsystem %s not allowed\n",
1321			hostnqn, subsysnqn);
1322		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1323		up_read(&nvmet_config_sem);
1324		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1325		goto out_put_subsystem;
1326	}
1327	up_read(&nvmet_config_sem);
1328
1329	status = NVME_SC_INTERNAL;
1330	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1331	if (!ctrl)
1332		goto out_put_subsystem;
1333	mutex_init(&ctrl->lock);
1334
1335	nvmet_init_cap(ctrl);
1336
1337	ctrl->port = req->port;
1338
1339	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1340	INIT_LIST_HEAD(&ctrl->async_events);
1341	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1342	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1343
1344	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1345	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1346
1347	kref_init(&ctrl->ref);
1348	ctrl->subsys = subsys;
1349	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1350
1351	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1352			sizeof(__le32), GFP_KERNEL);
1353	if (!ctrl->changed_ns_list)
1354		goto out_free_ctrl;
1355
1356	ctrl->cqs = kcalloc(subsys->max_qid + 1,
1357			sizeof(struct nvmet_cq *),
1358			GFP_KERNEL);
1359	if (!ctrl->cqs)
1360		goto out_free_changed_ns_list;
1361
1362	ctrl->sqs = kcalloc(subsys->max_qid + 1,
1363			sizeof(struct nvmet_sq *),
1364			GFP_KERNEL);
1365	if (!ctrl->sqs)
1366		goto out_free_cqs;
1367
1368	if (subsys->cntlid_min > subsys->cntlid_max)
1369		goto out_free_cqs;
1370
1371	ret = ida_simple_get(&cntlid_ida,
1372			     subsys->cntlid_min, subsys->cntlid_max,
1373			     GFP_KERNEL);
1374	if (ret < 0) {
1375		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1376		goto out_free_sqs;
1377	}
1378	ctrl->cntlid = ret;
1379
1380	ctrl->ops = req->ops;
1381
1382	/*
1383	 * Discovery controllers may use some arbitrary high value
1384	 * in order to cleanup stale discovery sessions
1385	 */
1386	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1387		kato = NVMET_DISC_KATO_MS;
1388
1389	/* keep-alive timeout in seconds */
1390	ctrl->kato = DIV_ROUND_UP(kato, 1000);
1391
1392	ctrl->err_counter = 0;
1393	spin_lock_init(&ctrl->error_lock);
1394
1395	nvmet_start_keep_alive_timer(ctrl);
1396
1397	mutex_lock(&subsys->lock);
1398	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1399	nvmet_setup_p2p_ns_map(ctrl, req);
1400	mutex_unlock(&subsys->lock);
1401
1402	*ctrlp = ctrl;
1403	return 0;
1404
1405out_free_sqs:
1406	kfree(ctrl->sqs);
1407out_free_cqs:
1408	kfree(ctrl->cqs);
1409out_free_changed_ns_list:
1410	kfree(ctrl->changed_ns_list);
1411out_free_ctrl:
1412	kfree(ctrl);
1413out_put_subsystem:
1414	nvmet_subsys_put(subsys);
1415out:
1416	return status;
1417}
1418
1419static void nvmet_ctrl_free(struct kref *ref)
1420{
1421	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
1422	struct nvmet_subsys *subsys = ctrl->subsys;
1423
1424	mutex_lock(&subsys->lock);
1425	nvmet_release_p2p_ns_map(ctrl);
1426	list_del(&ctrl->subsys_entry);
1427	mutex_unlock(&subsys->lock);
1428
1429	nvmet_stop_keep_alive_timer(ctrl);
1430
1431	flush_work(&ctrl->async_event_work);
1432	cancel_work_sync(&ctrl->fatal_err_work);
1433
1434	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
1435
1436	nvmet_async_events_free(ctrl);
1437	kfree(ctrl->sqs);
1438	kfree(ctrl->cqs);
1439	kfree(ctrl->changed_ns_list);
1440	kfree(ctrl);
1441
1442	nvmet_subsys_put(subsys);
1443}
1444
1445void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
1446{
1447	kref_put(&ctrl->ref, nvmet_ctrl_free);
1448}
1449
1450void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1451{
1452	mutex_lock(&ctrl->lock);
1453	if (!(ctrl->csts & NVME_CSTS_CFS)) {
1454		ctrl->csts |= NVME_CSTS_CFS;
1455		schedule_work(&ctrl->fatal_err_work);
1456	}
1457	mutex_unlock(&ctrl->lock);
1458}
1459EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1460
1461static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1462		const char *subsysnqn)
1463{
1464	struct nvmet_subsys_link *p;
1465
1466	if (!port)
1467		return NULL;
1468
1469	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1470		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1471			return NULL;
1472		return nvmet_disc_subsys;
1473	}
1474
1475	down_read(&nvmet_config_sem);
1476	list_for_each_entry(p, &port->subsystems, entry) {
1477		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1478				NVMF_NQN_SIZE)) {
1479			if (!kref_get_unless_zero(&p->subsys->ref))
1480				break;
1481			up_read(&nvmet_config_sem);
1482			return p->subsys;
1483		}
1484	}
1485	up_read(&nvmet_config_sem);
1486	return NULL;
1487}
1488
1489struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1490		enum nvme_subsys_type type)
1491{
1492	struct nvmet_subsys *subsys;
1493
1494	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1495	if (!subsys)
1496		return ERR_PTR(-ENOMEM);
1497
1498	subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
1499	/* generate a random serial number as our controllers are ephemeral: */
1500	get_random_bytes(&subsys->serial, sizeof(subsys->serial));
1501
1502	switch (type) {
1503	case NVME_NQN_NVME:
1504		subsys->max_qid = NVMET_NR_QUEUES;
1505		break;
1506	case NVME_NQN_DISC:
1507		subsys->max_qid = 0;
1508		break;
1509	default:
1510		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1511		kfree(subsys);
1512		return ERR_PTR(-EINVAL);
1513	}
1514	subsys->type = type;
1515	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1516			GFP_KERNEL);
1517	if (!subsys->subsysnqn) {
1518		kfree(subsys);
1519		return ERR_PTR(-ENOMEM);
1520	}
1521	subsys->cntlid_min = NVME_CNTLID_MIN;
1522	subsys->cntlid_max = NVME_CNTLID_MAX;
1523	kref_init(&subsys->ref);
1524
1525	mutex_init(&subsys->lock);
1526	INIT_LIST_HEAD(&subsys->namespaces);
1527	INIT_LIST_HEAD(&subsys->ctrls);
1528	INIT_LIST_HEAD(&subsys->hosts);
1529
1530	return subsys;
1531}
1532
1533static void nvmet_subsys_free(struct kref *ref)
1534{
1535	struct nvmet_subsys *subsys =
1536		container_of(ref, struct nvmet_subsys, ref);
1537
1538	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
1539
1540	kfree(subsys->subsysnqn);
1541	kfree_rcu(subsys->model, rcuhead);
1542	kfree(subsys);
1543}
1544
1545void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1546{
1547	struct nvmet_ctrl *ctrl;
1548
1549	mutex_lock(&subsys->lock);
1550	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1551		ctrl->ops->delete_ctrl(ctrl);
1552	mutex_unlock(&subsys->lock);
1553}
1554
1555void nvmet_subsys_put(struct nvmet_subsys *subsys)
1556{
1557	kref_put(&subsys->ref, nvmet_subsys_free);
1558}
1559
1560static int __init nvmet_init(void)
1561{
1562	int error;
1563
1564	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1565
1566	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1567			WQ_MEM_RECLAIM, 0);
1568	if (!buffered_io_wq) {
1569		error = -ENOMEM;
1570		goto out;
1571	}
1572
1573	error = nvmet_init_discovery();
1574	if (error)
1575		goto out_free_work_queue;
1576
1577	error = nvmet_init_configfs();
1578	if (error)
1579		goto out_exit_discovery;
1580	return 0;
1581
1582out_exit_discovery:
1583	nvmet_exit_discovery();
1584out_free_work_queue:
1585	destroy_workqueue(buffered_io_wq);
1586out:
1587	return error;
1588}
1589
1590static void __exit nvmet_exit(void)
1591{
1592	nvmet_exit_configfs();
1593	nvmet_exit_discovery();
1594	ida_destroy(&cntlid_ida);
1595	destroy_workqueue(buffered_io_wq);
1596
1597	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1598	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1599}
1600
1601module_init(nvmet_init);
1602module_exit(nvmet_exit);
1603
1604MODULE_LICENSE("GPL v2");