drivers/nvme/target/core.c at v5.0 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / nvme / target / core.c
at v5.0 1481 lines 36 kB view raw
   1/*
   2 * Common code for the NVMe target.
   3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  15#include <linux/module.h>
  16#include <linux/random.h>
  17#include <linux/rculist.h>
  18#include <linux/pci-p2pdma.h>
  19
  20#include "nvmet.h"
  21
  22struct workqueue_struct *buffered_io_wq;
  23static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
  24static DEFINE_IDA(cntlid_ida);
  25
/*
 * This read/write semaphore is used to synchronize access to configuration
 * information on a target system that will result in discovery log page
 * information change for at least one host.
 * The full list of resources protected by this semaphore is:
 *
 *  - subsystems list
 *  - per-subsystem allowed hosts list
 *  - allow_any_host subsystem attribute
 *  - nvmet_genctr
 *  - the nvmet_transports array
 *
 * When updating any of those lists/structures the write lock should be
 * obtained, while when reading (populating the discovery log page or
 * checking a host-subsystem link) the read lock is obtained to allow
 * concurrent reads.
 */
  42DECLARE_RWSEM(nvmet_config_sem);
  43
  44u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
  45u64 nvmet_ana_chgcnt;
  46DECLARE_RWSEM(nvmet_ana_sem);
  47
  48inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
  49{
  50	u16 status;
  51
  52	switch (errno) {
  53	case -ENOSPC:
  54		req->error_loc = offsetof(struct nvme_rw_command, length);
  55		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
  56		break;
  57	case -EREMOTEIO:
  58		req->error_loc = offsetof(struct nvme_rw_command, slba);
  59		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
  60		break;
  61	case -EOPNOTSUPP:
  62		req->error_loc = offsetof(struct nvme_common_command, opcode);
  63		switch (req->cmd->common.opcode) {
  64		case nvme_cmd_dsm:
  65		case nvme_cmd_write_zeroes:
  66			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
  67			break;
  68		default:
  69			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
  70		}
  71		break;
  72	case -ENODATA:
  73		req->error_loc = offsetof(struct nvme_rw_command, nsid);
  74		status = NVME_SC_ACCESS_DENIED;
  75		break;
  76	case -EIO:
  77		/* FALLTHRU */
  78	default:
  79		req->error_loc = offsetof(struct nvme_common_command, opcode);
  80		status = NVME_SC_INTERNAL | NVME_SC_DNR;
  81	}
  82
  83	return status;
  84}
  85
  86static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
  87		const char *subsysnqn);
  88
  89u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
  90		size_t len)
  91{
  92	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
  93		req->error_loc = offsetof(struct nvme_common_command, dptr);
  94		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
  95	}
  96	return 0;
  97}
  98
  99u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
 100{
 101	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 102		req->error_loc = offsetof(struct nvme_common_command, dptr);
 103		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 104	}
 105	return 0;
 106}
 107
 108u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 109{
 110	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
 111		req->error_loc = offsetof(struct nvme_common_command, dptr);
 112		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 113	}
 114	return 0;
 115}
 116
 117static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
 118{
 119	struct nvmet_ns *ns;
 120
 121	if (list_empty(&subsys->namespaces))
 122		return 0;
 123
 124	ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
 125	return ns->nsid;
 126}
 127
 128static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
 129{
 130	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
 131}
 132
 133static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
 134{
 135	struct nvmet_req *req;
 136
 137	while (1) {
 138		mutex_lock(&ctrl->lock);
 139		if (!ctrl->nr_async_event_cmds) {
 140			mutex_unlock(&ctrl->lock);
 141			return;
 142		}
 143
 144		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 145		mutex_unlock(&ctrl->lock);
 146		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
 147	}
 148}
 149
 150static void nvmet_async_event_work(struct work_struct *work)
 151{
 152	struct nvmet_ctrl *ctrl =
 153		container_of(work, struct nvmet_ctrl, async_event_work);
 154	struct nvmet_async_event *aen;
 155	struct nvmet_req *req;
 156
 157	while (1) {
 158		mutex_lock(&ctrl->lock);
 159		aen = list_first_entry_or_null(&ctrl->async_events,
 160				struct nvmet_async_event, entry);
 161		if (!aen || !ctrl->nr_async_event_cmds) {
 162			mutex_unlock(&ctrl->lock);
 163			return;
 164		}
 165
 166		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 167		nvmet_set_result(req, nvmet_async_event_result(aen));
 168
 169		list_del(&aen->entry);
 170		kfree(aen);
 171
 172		mutex_unlock(&ctrl->lock);
 173		nvmet_req_complete(req, 0);
 174	}
 175}
 176
 177void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 178		u8 event_info, u8 log_page)
 179{
 180	struct nvmet_async_event *aen;
 181
 182	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
 183	if (!aen)
 184		return;
 185
 186	aen->event_type = event_type;
 187	aen->event_info = event_info;
 188	aen->log_page = log_page;
 189
 190	mutex_lock(&ctrl->lock);
 191	list_add_tail(&aen->entry, &ctrl->async_events);
 192	mutex_unlock(&ctrl->lock);
 193
 194	schedule_work(&ctrl->async_event_work);
 195}
 196
 197static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 198{
 199	u32 i;
 200
 201	mutex_lock(&ctrl->lock);
 202	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
 203		goto out_unlock;
 204
 205	for (i = 0; i < ctrl->nr_changed_ns; i++) {
 206		if (ctrl->changed_ns_list[i] == nsid)
 207			goto out_unlock;
 208	}
 209
 210	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
 211		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
 212		ctrl->nr_changed_ns = U32_MAX;
 213		goto out_unlock;
 214	}
 215
 216	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
 217out_unlock:
 218	mutex_unlock(&ctrl->lock);
 219}
 220
 221void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 222{
 223	struct nvmet_ctrl *ctrl;
 224
 225	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 226		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 227		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 228			continue;
 229		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 230				NVME_AER_NOTICE_NS_CHANGED,
 231				NVME_LOG_CHANGED_NS);
 232	}
 233}
 234
 235void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 236		struct nvmet_port *port)
 237{
 238	struct nvmet_ctrl *ctrl;
 239
 240	mutex_lock(&subsys->lock);
 241	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 242		if (port && ctrl->port != port)
 243			continue;
 244		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 245			continue;
 246		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 247				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 248	}
 249	mutex_unlock(&subsys->lock);
 250}
 251
 252void nvmet_port_send_ana_event(struct nvmet_port *port)
 253{
 254	struct nvmet_subsys_link *p;
 255
 256	down_read(&nvmet_config_sem);
 257	list_for_each_entry(p, &port->subsystems, entry)
 258		nvmet_send_ana_event(p->subsys, port);
 259	up_read(&nvmet_config_sem);
 260}
 261
 262int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 263{
 264	int ret = 0;
 265
 266	down_write(&nvmet_config_sem);
 267	if (nvmet_transports[ops->type])
 268		ret = -EINVAL;
 269	else
 270		nvmet_transports[ops->type] = ops;
 271	up_write(&nvmet_config_sem);
 272
 273	return ret;
 274}
 275EXPORT_SYMBOL_GPL(nvmet_register_transport);
 276
 277void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 278{
 279	down_write(&nvmet_config_sem);
 280	nvmet_transports[ops->type] = NULL;
 281	up_write(&nvmet_config_sem);
 282}
 283EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
 284
 285int nvmet_enable_port(struct nvmet_port *port)
 286{
 287	const struct nvmet_fabrics_ops *ops;
 288	int ret;
 289
 290	lockdep_assert_held(&nvmet_config_sem);
 291
 292	ops = nvmet_transports[port->disc_addr.trtype];
 293	if (!ops) {
 294		up_write(&nvmet_config_sem);
 295		request_module("nvmet-transport-%d", port->disc_addr.trtype);
 296		down_write(&nvmet_config_sem);
 297		ops = nvmet_transports[port->disc_addr.trtype];
 298		if (!ops) {
 299			pr_err("transport type %d not supported\n",
 300				port->disc_addr.trtype);
 301			return -EINVAL;
 302		}
 303	}
 304
 305	if (!try_module_get(ops->owner))
 306		return -EINVAL;
 307
 308	ret = ops->add_port(port);
 309	if (ret) {
 310		module_put(ops->owner);
 311		return ret;
 312	}
 313
 314	/* If the transport didn't set inline_data_size, then disable it. */
 315	if (port->inline_data_size < 0)
 316		port->inline_data_size = 0;
 317
 318	port->enabled = true;
 319	return 0;
 320}
 321
 322void nvmet_disable_port(struct nvmet_port *port)
 323{
 324	const struct nvmet_fabrics_ops *ops;
 325
 326	lockdep_assert_held(&nvmet_config_sem);
 327
 328	port->enabled = false;
 329
 330	ops = nvmet_transports[port->disc_addr.trtype];
 331	ops->remove_port(port);
 332	module_put(ops->owner);
 333}
 334
 335static void nvmet_keep_alive_timer(struct work_struct *work)
 336{
 337	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 338			struct nvmet_ctrl, ka_work);
 339	bool cmd_seen = ctrl->cmd_seen;
 340
 341	ctrl->cmd_seen = false;
 342	if (cmd_seen) {
 343		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
 344			ctrl->cntlid);
 345		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 346		return;
 347	}
 348
 349	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
 350		ctrl->cntlid, ctrl->kato);
 351
 352	nvmet_ctrl_fatal_error(ctrl);
 353}
 354
 355static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 356{
 357	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
 358		ctrl->cntlid, ctrl->kato);
 359
 360	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
 361	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 362}
 363
 364static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 365{
 366	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 367
 368	cancel_delayed_work_sync(&ctrl->ka_work);
 369}
 370
 371static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
 372		__le32 nsid)
 373{
 374	struct nvmet_ns *ns;
 375
 376	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
 377		if (ns->nsid == le32_to_cpu(nsid))
 378			return ns;
 379	}
 380
 381	return NULL;
 382}
 383
 384struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
 385{
 386	struct nvmet_ns *ns;
 387
 388	rcu_read_lock();
 389	ns = __nvmet_find_namespace(ctrl, nsid);
 390	if (ns)
 391		percpu_ref_get(&ns->ref);
 392	rcu_read_unlock();
 393
 394	return ns;
 395}
 396
 397static void nvmet_destroy_namespace(struct percpu_ref *ref)
 398{
 399	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
 400
 401	complete(&ns->disable_done);
 402}
 403
 404void nvmet_put_namespace(struct nvmet_ns *ns)
 405{
 406	percpu_ref_put(&ns->ref);
 407}
 408
 409static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 410{
 411	nvmet_bdev_ns_disable(ns);
 412	nvmet_file_ns_disable(ns);
 413}
 414
 415static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 416{
 417	int ret;
 418	struct pci_dev *p2p_dev;
 419
 420	if (!ns->use_p2pmem)
 421		return 0;
 422
 423	if (!ns->bdev) {
 424		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
 425		return -EINVAL;
 426	}
 427
 428	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
 429		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
 430		       ns->device_path);
 431		return -EINVAL;
 432	}
 433
 434	if (ns->p2p_dev) {
 435		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
 436		if (ret < 0)
 437			return -EINVAL;
 438	} else {
 439		/*
 440		 * Right now we just check that there is p2pmem available so
 441		 * we can report an error to the user right away if there
 442		 * is not. We'll find the actual device to use once we
 443		 * setup the controller when the port's device is available.
 444		 */
 445
 446		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
 447		if (!p2p_dev) {
 448			pr_err("no peer-to-peer memory is available for %s\n",
 449			       ns->device_path);
 450			return -EINVAL;
 451		}
 452
 453		pci_dev_put(p2p_dev);
 454	}
 455
 456	return 0;
 457}
 458
 459/*
 460 * Note: ctrl->subsys->lock should be held when calling this function
 461 */
 462static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 463				    struct nvmet_ns *ns)
 464{
 465	struct device *clients[2];
 466	struct pci_dev *p2p_dev;
 467	int ret;
 468
 469	if (!ctrl->p2p_client || !ns->use_p2pmem)
 470		return;
 471
 472	if (ns->p2p_dev) {
 473		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
 474		if (ret < 0)
 475			return;
 476
 477		p2p_dev = pci_dev_get(ns->p2p_dev);
 478	} else {
 479		clients[0] = ctrl->p2p_client;
 480		clients[1] = nvmet_ns_dev(ns);
 481
 482		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
 483		if (!p2p_dev) {
 484			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
 485			       dev_name(ctrl->p2p_client), ns->device_path);
 486			return;
 487		}
 488	}
 489
 490	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
 491	if (ret < 0)
 492		pci_dev_put(p2p_dev);
 493
 494	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
 495		ns->nsid);
 496}
 497
 498int nvmet_ns_enable(struct nvmet_ns *ns)
 499{
 500	struct nvmet_subsys *subsys = ns->subsys;
 501	struct nvmet_ctrl *ctrl;
 502	int ret;
 503
 504	mutex_lock(&subsys->lock);
 505	ret = -EMFILE;
 506	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
 507		goto out_unlock;
 508	ret = 0;
 509	if (ns->enabled)
 510		goto out_unlock;
 511
 512	ret = nvmet_bdev_ns_enable(ns);
 513	if (ret == -ENOTBLK)
 514		ret = nvmet_file_ns_enable(ns);
 515	if (ret)
 516		goto out_unlock;
 517
 518	ret = nvmet_p2pmem_ns_enable(ns);
 519	if (ret)
 520		goto out_unlock;
 521
 522	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 523		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 524
 525	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
 526				0, GFP_KERNEL);
 527	if (ret)
 528		goto out_dev_put;
 529
 530	if (ns->nsid > subsys->max_nsid)
 531		subsys->max_nsid = ns->nsid;
 532
 533	/*
 534	 * The namespaces list needs to be sorted to simplify the implementation
 535	 * of the Identify Namepace List subcommand.
 536	 */
 537	if (list_empty(&subsys->namespaces)) {
 538		list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
 539	} else {
 540		struct nvmet_ns *old;
 541
 542		list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
 543			BUG_ON(ns->nsid == old->nsid);
 544			if (ns->nsid < old->nsid)
 545				break;
 546		}
 547
 548		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
 549	}
 550	subsys->nr_namespaces++;
 551
 552	nvmet_ns_changed(subsys, ns->nsid);
 553	ns->enabled = true;
 554	ret = 0;
 555out_unlock:
 556	mutex_unlock(&subsys->lock);
 557	return ret;
 558out_dev_put:
 559	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 560		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 561
 562	nvmet_ns_dev_disable(ns);
 563	goto out_unlock;
 564}
 565
 566void nvmet_ns_disable(struct nvmet_ns *ns)
 567{
 568	struct nvmet_subsys *subsys = ns->subsys;
 569	struct nvmet_ctrl *ctrl;
 570
 571	mutex_lock(&subsys->lock);
 572	if (!ns->enabled)
 573		goto out_unlock;
 574
 575	ns->enabled = false;
 576	list_del_rcu(&ns->dev_link);
 577	if (ns->nsid == subsys->max_nsid)
 578		subsys->max_nsid = nvmet_max_nsid(subsys);
 579
 580	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 581		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 582
 583	mutex_unlock(&subsys->lock);
 584
 585	/*
 586	 * Now that we removed the namespaces from the lookup list, we
 587	 * can kill the per_cpu ref and wait for any remaining references
 588	 * to be dropped, as well as a RCU grace period for anyone only
 589	 * using the namepace under rcu_read_lock().  Note that we can't
 590	 * use call_rcu here as we need to ensure the namespaces have
 591	 * been fully destroyed before unloading the module.
 592	 */
 593	percpu_ref_kill(&ns->ref);
 594	synchronize_rcu();
 595	wait_for_completion(&ns->disable_done);
 596	percpu_ref_exit(&ns->ref);
 597
 598	mutex_lock(&subsys->lock);
 599
 600	subsys->nr_namespaces--;
 601	nvmet_ns_changed(subsys, ns->nsid);
 602	nvmet_ns_dev_disable(ns);
 603out_unlock:
 604	mutex_unlock(&subsys->lock);
 605}
 606
 607void nvmet_ns_free(struct nvmet_ns *ns)
 608{
 609	nvmet_ns_disable(ns);
 610
 611	down_write(&nvmet_ana_sem);
 612	nvmet_ana_group_enabled[ns->anagrpid]--;
 613	up_write(&nvmet_ana_sem);
 614
 615	kfree(ns->device_path);
 616	kfree(ns);
 617}
 618
 619struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 620{
 621	struct nvmet_ns *ns;
 622
 623	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 624	if (!ns)
 625		return NULL;
 626
 627	INIT_LIST_HEAD(&ns->dev_link);
 628	init_completion(&ns->disable_done);
 629
 630	ns->nsid = nsid;
 631	ns->subsys = subsys;
 632
 633	down_write(&nvmet_ana_sem);
 634	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 635	nvmet_ana_group_enabled[ns->anagrpid]++;
 636	up_write(&nvmet_ana_sem);
 637
 638	uuid_gen(&ns->uuid);
 639	ns->buffered_io = false;
 640
 641	return ns;
 642}
 643
 644static void nvmet_update_sq_head(struct nvmet_req *req)
 645{
 646	if (req->sq->size) {
 647		u32 old_sqhd, new_sqhd;
 648
 649		do {
 650			old_sqhd = req->sq->sqhd;
 651			new_sqhd = (old_sqhd + 1) % req->sq->size;
 652		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 653					old_sqhd);
 654	}
 655	req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 656}
 657
 658static void nvmet_set_error(struct nvmet_req *req, u16 status)
 659{
 660	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 661	struct nvme_error_slot *new_error_slot;
 662	unsigned long flags;
 663
 664	req->rsp->status = cpu_to_le16(status << 1);
 665
 666	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
 667		return;
 668
 669	spin_lock_irqsave(&ctrl->error_lock, flags);
 670	ctrl->err_counter++;
 671	new_error_slot =
 672		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
 673
 674	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
 675	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
 676	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
 677	new_error_slot->status_field = cpu_to_le16(status << 1);
 678	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
 679	new_error_slot->lba = cpu_to_le64(req->error_slba);
 680	new_error_slot->nsid = req->cmd->common.nsid;
 681	spin_unlock_irqrestore(&ctrl->error_lock, flags);
 682
 683	/* set the more bit for this request */
 684	req->rsp->status |= cpu_to_le16(1 << 14);
 685}
 686
 687static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 688{
 689	if (!req->sq->sqhd_disabled)
 690		nvmet_update_sq_head(req);
 691	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
 692	req->rsp->command_id = req->cmd->common.command_id;
 693
 694	if (unlikely(status))
 695		nvmet_set_error(req, status);
 696	if (req->ns)
 697		nvmet_put_namespace(req->ns);
 698	req->ops->queue_response(req);
 699}
 700
 701void nvmet_req_complete(struct nvmet_req *req, u16 status)
 702{
 703	__nvmet_req_complete(req, status);
 704	percpu_ref_put(&req->sq->ref);
 705}
 706EXPORT_SYMBOL_GPL(nvmet_req_complete);
 707
 708void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 709		u16 qid, u16 size)
 710{
 711	cq->qid = qid;
 712	cq->size = size;
 713
 714	ctrl->cqs[qid] = cq;
 715}
 716
 717void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
 718		u16 qid, u16 size)
 719{
 720	sq->sqhd = 0;
 721	sq->qid = qid;
 722	sq->size = size;
 723
 724	ctrl->sqs[qid] = sq;
 725}
 726
 727static void nvmet_confirm_sq(struct percpu_ref *ref)
 728{
 729	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 730
 731	complete(&sq->confirm_done);
 732}
 733
 734void nvmet_sq_destroy(struct nvmet_sq *sq)
 735{
 736	/*
 737	 * If this is the admin queue, complete all AERs so that our
 738	 * queue doesn't have outstanding requests on it.
 739	 */
 740	if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq)
 741		nvmet_async_events_free(sq->ctrl);
 742	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
 743	wait_for_completion(&sq->confirm_done);
 744	wait_for_completion(&sq->free_done);
 745	percpu_ref_exit(&sq->ref);
 746
 747	if (sq->ctrl) {
 748		nvmet_ctrl_put(sq->ctrl);
 749		sq->ctrl = NULL; /* allows reusing the queue later */
 750	}
 751}
 752EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
 753
 754static void nvmet_sq_free(struct percpu_ref *ref)
 755{
 756	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 757
 758	complete(&sq->free_done);
 759}
 760
 761int nvmet_sq_init(struct nvmet_sq *sq)
 762{
 763	int ret;
 764
 765	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
 766	if (ret) {
 767		pr_err("percpu_ref init failed!\n");
 768		return ret;
 769	}
 770	init_completion(&sq->free_done);
 771	init_completion(&sq->confirm_done);
 772
 773	return 0;
 774}
 775EXPORT_SYMBOL_GPL(nvmet_sq_init);
 776
 777static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
 778		struct nvmet_ns *ns)
 779{
 780	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
 781
 782	if (unlikely(state == NVME_ANA_INACCESSIBLE))
 783		return NVME_SC_ANA_INACCESSIBLE;
 784	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
 785		return NVME_SC_ANA_PERSISTENT_LOSS;
 786	if (unlikely(state == NVME_ANA_CHANGE))
 787		return NVME_SC_ANA_TRANSITION;
 788	return 0;
 789}
 790
 791static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 792{
 793	if (unlikely(req->ns->readonly)) {
 794		switch (req->cmd->common.opcode) {
 795		case nvme_cmd_read:
 796		case nvme_cmd_flush:
 797			break;
 798		default:
 799			return NVME_SC_NS_WRITE_PROTECTED;
 800		}
 801	}
 802
 803	return 0;
 804}
 805
 806static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 807{
 808	struct nvme_command *cmd = req->cmd;
 809	u16 ret;
 810
 811	ret = nvmet_check_ctrl_status(req, cmd);
 812	if (unlikely(ret))
 813		return ret;
 814
 815	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
 816	if (unlikely(!req->ns)) {
 817		req->error_loc = offsetof(struct nvme_common_command, nsid);
 818		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 819	}
 820	ret = nvmet_check_ana_state(req->port, req->ns);
 821	if (unlikely(ret)) {
 822		req->error_loc = offsetof(struct nvme_common_command, nsid);
 823		return ret;
 824	}
 825	ret = nvmet_io_cmd_check_access(req);
 826	if (unlikely(ret)) {
 827		req->error_loc = offsetof(struct nvme_common_command, nsid);
 828		return ret;
 829	}
 830
 831	if (req->ns->file)
 832		return nvmet_file_parse_io_cmd(req);
 833	else
 834		return nvmet_bdev_parse_io_cmd(req);
 835}
 836
 837bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 838		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
 839{
 840	u8 flags = req->cmd->common.flags;
 841	u16 status;
 842
 843	req->cq = cq;
 844	req->sq = sq;
 845	req->ops = ops;
 846	req->sg = NULL;
 847	req->sg_cnt = 0;
 848	req->transfer_len = 0;
 849	req->rsp->status = 0;
 850	req->rsp->sq_head = 0;
 851	req->ns = NULL;
 852	req->error_loc = NVMET_NO_ERROR_LOC;
 853	req->error_slba = 0;
 854
 855	/* no support for fused commands yet */
 856	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
 857		req->error_loc = offsetof(struct nvme_common_command, flags);
 858		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 859		goto fail;
 860	}
 861
 862	/*
 863	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
 864	 * contains an address of a single contiguous physical buffer that is
 865	 * byte aligned.
 866	 */
 867	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
 868		req->error_loc = offsetof(struct nvme_common_command, flags);
 869		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 870		goto fail;
 871	}
 872
 873	if (unlikely(!req->sq->ctrl))
 874		/* will return an error for any Non-connect command: */
 875		status = nvmet_parse_connect_cmd(req);
 876	else if (likely(req->sq->qid != 0))
 877		status = nvmet_parse_io_cmd(req);
 878	else if (req->cmd->common.opcode == nvme_fabrics_command)
 879		status = nvmet_parse_fabrics_cmd(req);
 880	else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
 881		status = nvmet_parse_discovery_cmd(req);
 882	else
 883		status = nvmet_parse_admin_cmd(req);
 884
 885	if (status)
 886		goto fail;
 887
 888	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
 889		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 890		goto fail;
 891	}
 892
 893	if (sq->ctrl)
 894		sq->ctrl->cmd_seen = true;
 895
 896	return true;
 897
 898fail:
 899	__nvmet_req_complete(req, status);
 900	return false;
 901}
 902EXPORT_SYMBOL_GPL(nvmet_req_init);
 903
 904void nvmet_req_uninit(struct nvmet_req *req)
 905{
 906	percpu_ref_put(&req->sq->ref);
 907	if (req->ns)
 908		nvmet_put_namespace(req->ns);
 909}
 910EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 911
 912void nvmet_req_execute(struct nvmet_req *req)
 913{
 914	if (unlikely(req->data_len != req->transfer_len)) {
 915		req->error_loc = offsetof(struct nvme_common_command, dptr);
 916		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 917	} else
 918		req->execute(req);
 919}
 920EXPORT_SYMBOL_GPL(nvmet_req_execute);
 921
 922int nvmet_req_alloc_sgl(struct nvmet_req *req)
 923{
 924	struct pci_dev *p2p_dev = NULL;
 925
 926	if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
 927		if (req->sq->ctrl && req->ns)
 928			p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
 929						    req->ns->nsid);
 930
 931		req->p2p_dev = NULL;
 932		if (req->sq->qid && p2p_dev) {
 933			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
 934						       req->transfer_len);
 935			if (req->sg) {
 936				req->p2p_dev = p2p_dev;
 937				return 0;
 938			}
 939		}
 940
 941		/*
 942		 * If no P2P memory was available we fallback to using
 943		 * regular memory
 944		 */
 945	}
 946
 947	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
 948	if (!req->sg)
 949		return -ENOMEM;
 950
 951	return 0;
 952}
 953EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
 954
 955void nvmet_req_free_sgl(struct nvmet_req *req)
 956{
 957	if (req->p2p_dev)
 958		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
 959	else
 960		sgl_free(req->sg);
 961
 962	req->sg = NULL;
 963	req->sg_cnt = 0;
 964}
 965EXPORT_SYMBOL_GPL(nvmet_req_free_sgl);
 966
 967static inline bool nvmet_cc_en(u32 cc)
 968{
 969	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
 970}
 971
 972static inline u8 nvmet_cc_css(u32 cc)
 973{
 974	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
 975}
 976
 977static inline u8 nvmet_cc_mps(u32 cc)
 978{
 979	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
 980}
 981
 982static inline u8 nvmet_cc_ams(u32 cc)
 983{
 984	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
 985}
 986
 987static inline u8 nvmet_cc_shn(u32 cc)
 988{
 989	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
 990}
 991
 992static inline u8 nvmet_cc_iosqes(u32 cc)
 993{
 994	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
 995}
 996
 997static inline u8 nvmet_cc_iocqes(u32 cc)
 998{
 999	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1000}
1001
1002static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1003{
1004	lockdep_assert_held(&ctrl->lock);
1005
1006	if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1007	    nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
1008	    nvmet_cc_mps(ctrl->cc) != 0 ||
1009	    nvmet_cc_ams(ctrl->cc) != 0 ||
1010	    nvmet_cc_css(ctrl->cc) != 0) {
1011		ctrl->csts = NVME_CSTS_CFS;
1012		return;
1013	}
1014
1015	ctrl->csts = NVME_CSTS_RDY;
1016
1017	/*
1018	 * Controllers that are not yet enabled should not really enforce the
1019	 * keep alive timeout, but we still want to track a timeout and cleanup
1020	 * in case a host died before it enabled the controller.  Hence, simply
1021	 * reset the keep alive timer when the controller is enabled.
1022	 */
1023	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1024}
1025
1026static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1027{
1028	lockdep_assert_held(&ctrl->lock);
1029
1030	/* XXX: tear down queues? */
1031	ctrl->csts &= ~NVME_CSTS_RDY;
1032	ctrl->cc = 0;
1033}
1034
1035void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1036{
1037	u32 old;
1038
1039	mutex_lock(&ctrl->lock);
1040	old = ctrl->cc;
1041	ctrl->cc = new;
1042
1043	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1044		nvmet_start_ctrl(ctrl);
1045	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1046		nvmet_clear_ctrl(ctrl);
1047	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1048		nvmet_clear_ctrl(ctrl);
1049		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1050	}
1051	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1052		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1053	mutex_unlock(&ctrl->lock);
1054}
1055
1056static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1057{
1058	/* command sets supported: NVMe command set: */
1059	ctrl->cap = (1ULL << 37);
1060	/* CC.EN timeout in 500msec units: */
1061	ctrl->cap |= (15ULL << 24);
1062	/* maximum queue entries supported: */
1063	ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1064}
1065
1066u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1067		struct nvmet_req *req, struct nvmet_ctrl **ret)
1068{
1069	struct nvmet_subsys *subsys;
1070	struct nvmet_ctrl *ctrl;
1071	u16 status = 0;
1072
1073	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1074	if (!subsys) {
1075		pr_warn("connect request for invalid subsystem %s!\n",
1076			subsysnqn);
1077		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1078		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1079	}
1080
1081	mutex_lock(&subsys->lock);
1082	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1083		if (ctrl->cntlid == cntlid) {
1084			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1085				pr_warn("hostnqn mismatch.\n");
1086				continue;
1087			}
1088			if (!kref_get_unless_zero(&ctrl->ref))
1089				continue;
1090
1091			*ret = ctrl;
1092			goto out;
1093		}
1094	}
1095
1096	pr_warn("could not find controller %d for subsys %s / host %s\n",
1097		cntlid, subsysnqn, hostnqn);
1098	req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1099	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1100
1101out:
1102	mutex_unlock(&subsys->lock);
1103	nvmet_subsys_put(subsys);
1104	return status;
1105}
1106
1107u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1108{
1109	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1110		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1111		       cmd->common.opcode, req->sq->qid);
1112		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1113	}
1114
1115	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1116		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1117		       cmd->common.opcode, req->sq->qid);
1118		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1119	}
1120	return 0;
1121}
1122
1123bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1124{
1125	struct nvmet_host_link *p;
1126
1127	lockdep_assert_held(&nvmet_config_sem);
1128
1129	if (subsys->allow_any_host)
1130		return true;
1131
1132	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1133		return true;
1134
1135	list_for_each_entry(p, &subsys->hosts, entry) {
1136		if (!strcmp(nvmet_host_name(p->host), hostnqn))
1137			return true;
1138	}
1139
1140	return false;
1141}
1142
1143/*
1144 * Note: ctrl->subsys->lock should be held when calling this function
1145 */
1146static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1147		struct nvmet_req *req)
1148{
1149	struct nvmet_ns *ns;
1150
1151	if (!req->p2p_client)
1152		return;
1153
1154	ctrl->p2p_client = get_device(req->p2p_client);
1155
1156	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
1157		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1158}
1159
1160/*
1161 * Note: ctrl->subsys->lock should be held when calling this function
1162 */
1163static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1164{
1165	struct radix_tree_iter iter;
1166	void __rcu **slot;
1167
1168	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1169		pci_dev_put(radix_tree_deref_slot(slot));
1170
1171	put_device(ctrl->p2p_client);
1172}
1173
1174u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1175		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1176{
1177	struct nvmet_subsys *subsys;
1178	struct nvmet_ctrl *ctrl;
1179	int ret;
1180	u16 status;
1181
1182	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1183	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1184	if (!subsys) {
1185		pr_warn("connect request for invalid subsystem %s!\n",
1186			subsysnqn);
1187		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1188		goto out;
1189	}
1190
1191	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1192	down_read(&nvmet_config_sem);
1193	if (!nvmet_host_allowed(subsys, hostnqn)) {
1194		pr_info("connect by host %s for subsystem %s not allowed\n",
1195			hostnqn, subsysnqn);
1196		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1197		up_read(&nvmet_config_sem);
1198		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1199		goto out_put_subsystem;
1200	}
1201	up_read(&nvmet_config_sem);
1202
1203	status = NVME_SC_INTERNAL;
1204	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1205	if (!ctrl)
1206		goto out_put_subsystem;
1207	mutex_init(&ctrl->lock);
1208
1209	nvmet_init_cap(ctrl);
1210
1211	ctrl->port = req->port;
1212
1213	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1214	INIT_LIST_HEAD(&ctrl->async_events);
1215	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1216
1217	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1218	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1219
1220	kref_init(&ctrl->ref);
1221	ctrl->subsys = subsys;
1222	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1223
1224	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1225			sizeof(__le32), GFP_KERNEL);
1226	if (!ctrl->changed_ns_list)
1227		goto out_free_ctrl;
1228
1229	ctrl->cqs = kcalloc(subsys->max_qid + 1,
1230			sizeof(struct nvmet_cq *),
1231			GFP_KERNEL);
1232	if (!ctrl->cqs)
1233		goto out_free_changed_ns_list;
1234
1235	ctrl->sqs = kcalloc(subsys->max_qid + 1,
1236			sizeof(struct nvmet_sq *),
1237			GFP_KERNEL);
1238	if (!ctrl->sqs)
1239		goto out_free_cqs;
1240
1241	ret = ida_simple_get(&cntlid_ida,
1242			     NVME_CNTLID_MIN, NVME_CNTLID_MAX,
1243			     GFP_KERNEL);
1244	if (ret < 0) {
1245		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1246		goto out_free_sqs;
1247	}
1248	ctrl->cntlid = ret;
1249
1250	ctrl->ops = req->ops;
1251
1252	/*
1253	 * Discovery controllers may use some arbitrary high value
1254	 * in order to cleanup stale discovery sessions
1255	 */
1256	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1257		kato = NVMET_DISC_KATO_MS;
1258
1259	/* keep-alive timeout in seconds */
1260	ctrl->kato = DIV_ROUND_UP(kato, 1000);
1261
1262	ctrl->err_counter = 0;
1263	spin_lock_init(&ctrl->error_lock);
1264
1265	nvmet_start_keep_alive_timer(ctrl);
1266
1267	mutex_lock(&subsys->lock);
1268	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1269	nvmet_setup_p2p_ns_map(ctrl, req);
1270	mutex_unlock(&subsys->lock);
1271
1272	*ctrlp = ctrl;
1273	return 0;
1274
1275out_free_sqs:
1276	kfree(ctrl->sqs);
1277out_free_cqs:
1278	kfree(ctrl->cqs);
1279out_free_changed_ns_list:
1280	kfree(ctrl->changed_ns_list);
1281out_free_ctrl:
1282	kfree(ctrl);
1283out_put_subsystem:
1284	nvmet_subsys_put(subsys);
1285out:
1286	return status;
1287}
1288
/*
 * kref release callback for a controller (see nvmet_ctrl_put()): unlink it
 * from its subsystem, stop its timer and work items, return its controller
 * ID and free all memory owned by it.
 */
static void nvmet_ctrl_free(struct kref *ref)
{
	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
	struct nvmet_subsys *subsys = ctrl->subsys;

	/* the p2p map and the controller list are handled under subsys->lock */
	mutex_lock(&subsys->lock);
	nvmet_release_p2p_ns_map(ctrl);
	list_del(&ctrl->subsys_entry);
	mutex_unlock(&subsys->lock);

	nvmet_stop_keep_alive_timer(ctrl);

	/* both work items are embedded in ctrl, so finish them before kfree */
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fatal_err_work);

	ida_simple_remove(&cntlid_ida, ctrl->cntlid);

	kfree(ctrl->sqs);
	kfree(ctrl->cqs);
	kfree(ctrl->changed_ns_list);
	kfree(ctrl);

	/* subsys was saved above because ctrl has just been freed */
	nvmet_subsys_put(subsys);
}
1313
/*
 * Drop a reference on @ctrl; nvmet_ctrl_free() runs when the last
 * reference goes away.
 */
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
	kref_put(&ctrl->ref, nvmet_ctrl_free);
}
1318
1319static void nvmet_fatal_error_handler(struct work_struct *work)
1320{
1321	struct nvmet_ctrl *ctrl =
1322			container_of(work, struct nvmet_ctrl, fatal_err_work);
1323
1324	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1325	ctrl->ops->delete_ctrl(ctrl);
1326}
1327
1328void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1329{
1330	mutex_lock(&ctrl->lock);
1331	if (!(ctrl->csts & NVME_CSTS_CFS)) {
1332		ctrl->csts |= NVME_CSTS_CFS;
1333		INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1334		schedule_work(&ctrl->fatal_err_work);
1335	}
1336	mutex_unlock(&ctrl->lock);
1337}
1338EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1339
1340static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1341		const char *subsysnqn)
1342{
1343	struct nvmet_subsys_link *p;
1344
1345	if (!port)
1346		return NULL;
1347
1348	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1349		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1350			return NULL;
1351		return nvmet_disc_subsys;
1352	}
1353
1354	down_read(&nvmet_config_sem);
1355	list_for_each_entry(p, &port->subsystems, entry) {
1356		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1357				NVMF_NQN_SIZE)) {
1358			if (!kref_get_unless_zero(&p->subsys->ref))
1359				break;
1360			up_read(&nvmet_config_sem);
1361			return p->subsys;
1362		}
1363	}
1364	up_read(&nvmet_config_sem);
1365	return NULL;
1366}
1367
1368struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1369		enum nvme_subsys_type type)
1370{
1371	struct nvmet_subsys *subsys;
1372
1373	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1374	if (!subsys)
1375		return NULL;
1376
1377	subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
1378	/* generate a random serial number as our controllers are ephemeral: */
1379	get_random_bytes(&subsys->serial, sizeof(subsys->serial));
1380
1381	switch (type) {
1382	case NVME_NQN_NVME:
1383		subsys->max_qid = NVMET_NR_QUEUES;
1384		break;
1385	case NVME_NQN_DISC:
1386		subsys->max_qid = 0;
1387		break;
1388	default:
1389		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1390		kfree(subsys);
1391		return NULL;
1392	}
1393	subsys->type = type;
1394	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1395			GFP_KERNEL);
1396	if (!subsys->subsysnqn) {
1397		kfree(subsys);
1398		return NULL;
1399	}
1400
1401	kref_init(&subsys->ref);
1402
1403	mutex_init(&subsys->lock);
1404	INIT_LIST_HEAD(&subsys->namespaces);
1405	INIT_LIST_HEAD(&subsys->ctrls);
1406	INIT_LIST_HEAD(&subsys->hosts);
1407
1408	return subsys;
1409}
1410
/*
 * kref release callback for a subsystem (see nvmet_subsys_put()): free the
 * NQN string and the subsystem itself.
 */
static void nvmet_subsys_free(struct kref *ref)
{
	struct nvmet_subsys *subsys =
		container_of(ref, struct nvmet_subsys, ref);

	/* warn once if the namespace list is not empty at this point */
	WARN_ON_ONCE(!list_empty(&subsys->namespaces));

	kfree(subsys->subsysnqn);
	kfree(subsys);
}
1421
1422void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1423{
1424	struct nvmet_ctrl *ctrl;
1425
1426	mutex_lock(&subsys->lock);
1427	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1428		ctrl->ops->delete_ctrl(ctrl);
1429	mutex_unlock(&subsys->lock);
1430}
1431
/*
 * Drop a reference on @subsys; nvmet_subsys_free() runs when the last
 * reference goes away.
 */
void nvmet_subsys_put(struct nvmet_subsys *subsys)
{
	kref_put(&subsys->ref, nvmet_subsys_free);
}
1436
1437static int __init nvmet_init(void)
1438{
1439	int error;
1440
1441	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1442
1443	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1444			WQ_MEM_RECLAIM, 0);
1445	if (!buffered_io_wq) {
1446		error = -ENOMEM;
1447		goto out;
1448	}
1449
1450	error = nvmet_init_discovery();
1451	if (error)
1452		goto out_free_work_queue;
1453
1454	error = nvmet_init_configfs();
1455	if (error)
1456		goto out_exit_discovery;
1457	return 0;
1458
1459out_exit_discovery:
1460	nvmet_exit_discovery();
1461out_free_work_queue:
1462	destroy_workqueue(buffered_io_wq);
1463out:
1464	return error;
1465}
1466
1467static void __exit nvmet_exit(void)
1468{
1469	nvmet_exit_configfs();
1470	nvmet_exit_discovery();
1471	ida_destroy(&cntlid_ida);
1472	destroy_workqueue(buffered_io_wq);
1473
1474	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1475	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1476}
1477
/* Register the module's init and exit entry points and its license. */
module_init(nvmet_init);
module_exit(nvmet_exit);

MODULE_LICENSE("GPL v2");