Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Common code for the NVMe target.
4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/random.h>
9#include <linux/rculist.h>
10#include <linux/pci-p2pdma.h>
11#include <linux/scatterlist.h>
12
13#define CREATE_TRACE_POINTS
14#include "trace.h"
15
16#include "nvmet.h"
17
18struct workqueue_struct *buffered_io_wq;
19struct workqueue_struct *zbd_wq;
20static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
21static DEFINE_IDA(cntlid_ida);
22
23struct workqueue_struct *nvmet_wq;
24EXPORT_SYMBOL_GPL(nvmet_wq);
25
26/*
27 * This read/write semaphore is used to synchronize access to configuration
28 * information on a target system that will result in discovery log page
29 * information change for at least one host.
30 * The full list of resources protected by this semaphore is:
31 *
32 * - subsystems list
33 * - per-subsystem allowed hosts list
34 * - allow_any_host subsystem attribute
35 * - nvmet_genctr
36 * - the nvmet_transports array
37 *
38 * A write lock must be held when updating any of those lists/structures,
39 * while a read lock is taken when reading them (e.g. populating the
40 * discovery log page or checking a host-subsystem link), allowing concurrent readers.
41 */
42DECLARE_RWSEM(nvmet_config_sem);
43
44u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
45u64 nvmet_ana_chgcnt;
46DECLARE_RWSEM(nvmet_ana_sem);
47
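/*
 * Map a Linux errno returned by a backend into an NVMe status code and
 * record the matching error location in the request.
 */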
48inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
49{
50 switch (errno) {
51 case 0:
52 return NVME_SC_SUCCESS;
53 case -ENOSPC:
54 req->error_loc = offsetof(struct nvme_rw_command, length);
55 return NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
56 case -EREMOTEIO:
57 req->error_loc = offsetof(struct nvme_rw_command, slba);
58 return NVME_SC_LBA_RANGE | NVME_SC_DNR;
59 case -EOPNOTSUPP:
60 req->error_loc = offsetof(struct nvme_common_command, opcode);
61 switch (req->cmd->common.opcode) {
62 case nvme_cmd_dsm:
63 case nvme_cmd_write_zeroes:
64 return NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
65 default:
66 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
67 }
68 break;
69 case -ENODATA:
70 req->error_loc = offsetof(struct nvme_rw_command, nsid);
71 return NVME_SC_ACCESS_DENIED;
72 case -EIO:
73 fallthrough;
74 default:
75 req->error_loc = offsetof(struct nvme_common_command, opcode);
76 return NVME_SC_INTERNAL | NVME_SC_DNR;
77 }
78}
79
80u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
81{
82 pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
83 req->sq->qid);
84
85 req->error_loc = offsetof(struct nvme_common_command, opcode);
86 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
87}
88
89static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
90 const char *subsysnqn);
91
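/*
 * Helpers for copying between a kernel buffer and the request's data
 * scatterlist (and for zeroing part of it); a short copy is reported as
 * an SGL data error.
 */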
92u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
93 size_t len)
94{
95 if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
96 req->error_loc = offsetof(struct nvme_common_command, dptr);
97 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
98 }
99 return 0;
100}
101
102u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
103{
104 if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
105 req->error_loc = offsetof(struct nvme_common_command, dptr);
106 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
107 }
108 return 0;
109}
110
111u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
112{
113 if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
114 req->error_loc = offsetof(struct nvme_common_command, dptr);
115 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
116 }
117 return 0;
118}
119
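/*
 * Return the highest allocated nsid in the subsystem, or 0 if there are
 * no namespaces; relies on xa_for_each() iterating in ascending index order.
 */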
120static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
121{
122 struct nvmet_ns *cur;
123 unsigned long idx;
124 u32 nsid = 0;
125
126 xa_for_each(&subsys->namespaces, idx, cur)
127 nsid = cur->nsid;
128
129 return nsid;
130}
131
132static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
133{
134 return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
135}
136
137static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
138{
139 struct nvmet_req *req;
140
141 mutex_lock(&ctrl->lock);
142 while (ctrl->nr_async_event_cmds) {
143 req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
144 mutex_unlock(&ctrl->lock);
145 nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
146 mutex_lock(&ctrl->lock);
147 }
148 mutex_unlock(&ctrl->lock);
149}
150
151static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
152{
153 struct nvmet_async_event *aen;
154 struct nvmet_req *req;
155
156 mutex_lock(&ctrl->lock);
157 while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
158 aen = list_first_entry(&ctrl->async_events,
159 struct nvmet_async_event, entry);
160 req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
161 nvmet_set_result(req, nvmet_async_event_result(aen));
162
163 list_del(&aen->entry);
164 kfree(aen);
165
166 mutex_unlock(&ctrl->lock);
167 trace_nvmet_async_event(ctrl, req->cqe->result.u32);
168 nvmet_req_complete(req, 0);
169 mutex_lock(&ctrl->lock);
170 }
171 mutex_unlock(&ctrl->lock);
172}
173
174static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
175{
176 struct nvmet_async_event *aen, *tmp;
177
178 mutex_lock(&ctrl->lock);
179 list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
180 list_del(&aen->entry);
181 kfree(aen);
182 }
183 mutex_unlock(&ctrl->lock);
184}
185
186static void nvmet_async_event_work(struct work_struct *work)
187{
188 struct nvmet_ctrl *ctrl =
189 container_of(work, struct nvmet_ctrl, async_event_work);
190
191 nvmet_async_events_process(ctrl);
192}
193
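/*
 * Queue an asynchronous event and schedule the work that matches pending
 * events with outstanding AER commands from the host.
 */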
194void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
195 u8 event_info, u8 log_page)
196{
197 struct nvmet_async_event *aen;
198
199 aen = kmalloc(sizeof(*aen), GFP_KERNEL);
200 if (!aen)
201 return;
202
203 aen->event_type = event_type;
204 aen->event_info = event_info;
205 aen->log_page = log_page;
206
207 mutex_lock(&ctrl->lock);
208 list_add_tail(&aen->entry, &ctrl->async_events);
209 mutex_unlock(&ctrl->lock);
210
211 queue_work(nvmet_wq, &ctrl->async_event_work);
212}
213
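/*
 * Record a namespace ID in the controller's Changed Namespace List log.
 * When the list overflows it is replaced by a single 0xffffffff entry,
 * as required by the spec.
 */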
214static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
215{
216 u32 i;
217
218 mutex_lock(&ctrl->lock);
219 if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
220 goto out_unlock;
221
222 for (i = 0; i < ctrl->nr_changed_ns; i++) {
223 if (ctrl->changed_ns_list[i] == nsid)
224 goto out_unlock;
225 }
226
227 if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
228 ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
229 ctrl->nr_changed_ns = U32_MAX;
230 goto out_unlock;
231 }
232
233 ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
234out_unlock:
235 mutex_unlock(&ctrl->lock);
236}
237
238void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
239{
240 struct nvmet_ctrl *ctrl;
241
242 lockdep_assert_held(&subsys->lock);
243
244 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
245 nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
246 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
247 continue;
248 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
249 NVME_AER_NOTICE_NS_CHANGED,
250 NVME_LOG_CHANGED_NS);
251 }
252}
253
254void nvmet_send_ana_event(struct nvmet_subsys *subsys,
255 struct nvmet_port *port)
256{
257 struct nvmet_ctrl *ctrl;
258
259 mutex_lock(&subsys->lock);
260 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
261 if (port && ctrl->port != port)
262 continue;
263 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
264 continue;
265 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
266 NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
267 }
268 mutex_unlock(&subsys->lock);
269}
270
271void nvmet_port_send_ana_event(struct nvmet_port *port)
272{
273 struct nvmet_subsys_link *p;
274
275 down_read(&nvmet_config_sem);
276 list_for_each_entry(p, &port->subsystems, entry)
277 nvmet_send_ana_event(p->subsys, port);
278 up_read(&nvmet_config_sem);
279}
280
281int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
282{
283 int ret = 0;
284
285 down_write(&nvmet_config_sem);
286 if (nvmet_transports[ops->type])
287 ret = -EINVAL;
288 else
289 nvmet_transports[ops->type] = ops;
290 up_write(&nvmet_config_sem);
291
292 return ret;
293}
294EXPORT_SYMBOL_GPL(nvmet_register_transport);
295
296void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
297{
298 down_write(&nvmet_config_sem);
299 nvmet_transports[ops->type] = NULL;
300 up_write(&nvmet_config_sem);
301}
302EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
303
304void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
305{
306 struct nvmet_ctrl *ctrl;
307
308 mutex_lock(&subsys->lock);
309 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
310 if (ctrl->port == port)
311 ctrl->ops->delete_ctrl(ctrl);
312 }
313 mutex_unlock(&subsys->lock);
314}
315
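/*
 * Bind a port to its transport: load the transport module on demand (the
 * config semaphore is dropped around request_module()), check T10-PI
 * support and call the transport's add_port() method.
 */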
316int nvmet_enable_port(struct nvmet_port *port)
317{
318 const struct nvmet_fabrics_ops *ops;
319 int ret;
320
321 lockdep_assert_held(&nvmet_config_sem);
322
323 ops = nvmet_transports[port->disc_addr.trtype];
324 if (!ops) {
325 up_write(&nvmet_config_sem);
326 request_module("nvmet-transport-%d", port->disc_addr.trtype);
327 down_write(&nvmet_config_sem);
328 ops = nvmet_transports[port->disc_addr.trtype];
329 if (!ops) {
330 pr_err("transport type %d not supported\n",
331 port->disc_addr.trtype);
332 return -EINVAL;
333 }
334 }
335
336 if (!try_module_get(ops->owner))
337 return -EINVAL;
338
339 /*
 340 * If the user requested PI support and the transport isn't PI-capable,
341 * don't enable the port.
342 */
343 if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
344 pr_err("T10-PI is not supported by transport type %d\n",
345 port->disc_addr.trtype);
346 ret = -EINVAL;
347 goto out_put;
348 }
349
350 ret = ops->add_port(port);
351 if (ret)
352 goto out_put;
353
354 /* If the transport didn't set inline_data_size, then disable it. */
355 if (port->inline_data_size < 0)
356 port->inline_data_size = 0;
357
358 port->enabled = true;
359 port->tr_ops = ops;
360 return 0;
361
362out_put:
363 module_put(ops->owner);
364 return ret;
365}
366
367void nvmet_disable_port(struct nvmet_port *port)
368{
369 const struct nvmet_fabrics_ops *ops;
370
371 lockdep_assert_held(&nvmet_config_sem);
372
373 port->enabled = false;
374 port->tr_ops = NULL;
375
376 ops = nvmet_transports[port->disc_addr.trtype];
377 ops->remove_port(port);
378 module_put(ops->owner);
379}
380
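/*
 * Keep-alive timer expiry: if there has been command traffic (or queue
 * teardown) since the last expiry, re-arm the timer (traffic based
 * keep-alive); otherwise treat the missing keep-alive as a fatal
 * controller error.
 */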
381static void nvmet_keep_alive_timer(struct work_struct *work)
382{
383 struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
384 struct nvmet_ctrl, ka_work);
385 bool reset_tbkas = ctrl->reset_tbkas;
386
387 ctrl->reset_tbkas = false;
388 if (reset_tbkas) {
389 pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
390 ctrl->cntlid);
391 queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
392 return;
393 }
394
395 pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
396 ctrl->cntlid, ctrl->kato);
397
398 nvmet_ctrl_fatal_error(ctrl);
399}
400
401void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
402{
403 if (unlikely(ctrl->kato == 0))
404 return;
405
406 pr_debug("ctrl %d start keep-alive timer for %d secs\n",
407 ctrl->cntlid, ctrl->kato);
408
409 queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
410}
411
412void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
413{
414 if (unlikely(ctrl->kato == 0))
415 return;
416
417 pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
418
419 cancel_delayed_work_sync(&ctrl->ka_work);
420}
421
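/*
 * Resolve the namespace addressed by the command's NSID and take a
 * reference on it; the reference is dropped when the request completes.
 */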
422u16 nvmet_req_find_ns(struct nvmet_req *req)
423{
424 u32 nsid = le32_to_cpu(req->cmd->common.nsid);
425
426 req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
427 if (unlikely(!req->ns)) {
428 req->error_loc = offsetof(struct nvme_common_command, nsid);
429 return NVME_SC_INVALID_NS | NVME_SC_DNR;
430 }
431
432 percpu_ref_get(&req->ns->ref);
433 return NVME_SC_SUCCESS;
434}
435
436static void nvmet_destroy_namespace(struct percpu_ref *ref)
437{
438 struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
439
440 complete(&ns->disable_done);
441}
442
443void nvmet_put_namespace(struct nvmet_ns *ns)
444{
445 percpu_ref_put(&ns->ref);
446}
447
448static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
449{
450 nvmet_bdev_ns_disable(ns);
451 nvmet_file_ns_disable(ns);
452}
453
454static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
455{
456 int ret;
457 struct pci_dev *p2p_dev;
458
459 if (!ns->use_p2pmem)
460 return 0;
461
462 if (!ns->bdev) {
463 pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
464 return -EINVAL;
465 }
466
467 if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
468 pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
469 ns->device_path);
470 return -EINVAL;
471 }
472
473 if (ns->p2p_dev) {
474 ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
475 if (ret < 0)
476 return -EINVAL;
477 } else {
478 /*
 479 * Right now we just check that some p2pmem is available so that
 480 * we can report an error to the user right away if there is
 481 * none. The actual device to use is picked later, when we set
 482 * up the controller and the port's device is available.
483 */
484
485 p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
486 if (!p2p_dev) {
487 pr_err("no peer-to-peer memory is available for %s\n",
488 ns->device_path);
489 return -EINVAL;
490 }
491
492 pci_dev_put(p2p_dev);
493 }
494
495 return 0;
496}
497
498/*
499 * Note: ctrl->subsys->lock should be held when calling this function
500 */
501static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
502 struct nvmet_ns *ns)
503{
504 struct device *clients[2];
505 struct pci_dev *p2p_dev;
506 int ret;
507
508 if (!ctrl->p2p_client || !ns->use_p2pmem)
509 return;
510
511 if (ns->p2p_dev) {
512 ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
513 if (ret < 0)
514 return;
515
516 p2p_dev = pci_dev_get(ns->p2p_dev);
517 } else {
518 clients[0] = ctrl->p2p_client;
519 clients[1] = nvmet_ns_dev(ns);
520
521 p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
522 if (!p2p_dev) {
523 pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
524 dev_name(ctrl->p2p_client), ns->device_path);
525 return;
526 }
527 }
528
529 ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
530 if (ret < 0)
531 pci_dev_put(p2p_dev);
532
533 pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
534 ns->nsid);
535}
536
537bool nvmet_ns_revalidate(struct nvmet_ns *ns)
538{
539 loff_t oldsize = ns->size;
540
541 if (ns->bdev)
542 nvmet_bdev_ns_revalidate(ns);
543 else
544 nvmet_file_ns_revalidate(ns);
545
546 return oldsize != ns->size;
547}
548
549int nvmet_ns_enable(struct nvmet_ns *ns)
550{
551 struct nvmet_subsys *subsys = ns->subsys;
552 struct nvmet_ctrl *ctrl;
553 int ret;
554
555 mutex_lock(&subsys->lock);
556 ret = 0;
557
558 if (nvmet_is_passthru_subsys(subsys)) {
 559 pr_info("cannot enable both passthru and regular namespaces for a single subsystem\n");
560 goto out_unlock;
561 }
562
563 if (ns->enabled)
564 goto out_unlock;
565
566 ret = -EMFILE;
567 if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
568 goto out_unlock;
569
570 ret = nvmet_bdev_ns_enable(ns);
571 if (ret == -ENOTBLK)
572 ret = nvmet_file_ns_enable(ns);
573 if (ret)
574 goto out_unlock;
575
576 ret = nvmet_p2pmem_ns_enable(ns);
577 if (ret)
578 goto out_dev_disable;
579
580 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
581 nvmet_p2pmem_ns_add_p2p(ctrl, ns);
582
583 ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
584 0, GFP_KERNEL);
585 if (ret)
586 goto out_dev_put;
587
588 if (ns->nsid > subsys->max_nsid)
589 subsys->max_nsid = ns->nsid;
590
591 ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
592 if (ret)
593 goto out_restore_subsys_maxnsid;
594
595 subsys->nr_namespaces++;
596
597 nvmet_ns_changed(subsys, ns->nsid);
598 ns->enabled = true;
599 ret = 0;
600out_unlock:
601 mutex_unlock(&subsys->lock);
602 return ret;
603
604out_restore_subsys_maxnsid:
605 subsys->max_nsid = nvmet_max_nsid(subsys);
606 percpu_ref_exit(&ns->ref);
607out_dev_put:
608 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
609 pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
610out_dev_disable:
611 nvmet_ns_dev_disable(ns);
612 goto out_unlock;
613}
614
615void nvmet_ns_disable(struct nvmet_ns *ns)
616{
617 struct nvmet_subsys *subsys = ns->subsys;
618 struct nvmet_ctrl *ctrl;
619
620 mutex_lock(&subsys->lock);
621 if (!ns->enabled)
622 goto out_unlock;
623
624 ns->enabled = false;
625 xa_erase(&ns->subsys->namespaces, ns->nsid);
626 if (ns->nsid == subsys->max_nsid)
627 subsys->max_nsid = nvmet_max_nsid(subsys);
628
629 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
630 pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
631
632 mutex_unlock(&subsys->lock);
633
634 /*
635 * Now that we removed the namespaces from the lookup list, we
636 * can kill the per_cpu ref and wait for any remaining references
 637 * to be dropped, as well as an RCU grace period for anyone only
 638 * using the namespace under rcu_read_lock(). Note that we can't
639 * use call_rcu here as we need to ensure the namespaces have
640 * been fully destroyed before unloading the module.
641 */
642 percpu_ref_kill(&ns->ref);
643 synchronize_rcu();
644 wait_for_completion(&ns->disable_done);
645 percpu_ref_exit(&ns->ref);
646
647 mutex_lock(&subsys->lock);
648
649 subsys->nr_namespaces--;
650 nvmet_ns_changed(subsys, ns->nsid);
651 nvmet_ns_dev_disable(ns);
652out_unlock:
653 mutex_unlock(&subsys->lock);
654}
655
656void nvmet_ns_free(struct nvmet_ns *ns)
657{
658 nvmet_ns_disable(ns);
659
660 down_write(&nvmet_ana_sem);
661 nvmet_ana_group_enabled[ns->anagrpid]--;
662 up_write(&nvmet_ana_sem);
663
664 kfree(ns->device_path);
665 kfree(ns);
666}
667
668struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
669{
670 struct nvmet_ns *ns;
671
672 ns = kzalloc(sizeof(*ns), GFP_KERNEL);
673 if (!ns)
674 return NULL;
675
676 init_completion(&ns->disable_done);
677
678 ns->nsid = nsid;
679 ns->subsys = subsys;
680
681 down_write(&nvmet_ana_sem);
682 ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
683 nvmet_ana_group_enabled[ns->anagrpid]++;
684 up_write(&nvmet_ana_sem);
685
686 uuid_gen(&ns->uuid);
687 ns->buffered_io = false;
688 ns->csi = NVME_CSI_NVM;
689
690 return ns;
691}
692
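/*
 * Advance the submission queue head pointer with a lockless cmpxchg()
 * loop so that completions from multiple contexts remain consistent.
 */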
693static void nvmet_update_sq_head(struct nvmet_req *req)
694{
695 if (req->sq->size) {
696 u32 old_sqhd, new_sqhd;
697
698 do {
699 old_sqhd = req->sq->sqhd;
700 new_sqhd = (old_sqhd + 1) % req->sq->size;
701 } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
702 old_sqhd);
703 }
704 req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
705}
706
707static void nvmet_set_error(struct nvmet_req *req, u16 status)
708{
709 struct nvmet_ctrl *ctrl = req->sq->ctrl;
710 struct nvme_error_slot *new_error_slot;
711 unsigned long flags;
712
713 req->cqe->status = cpu_to_le16(status << 1);
714
715 if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
716 return;
717
718 spin_lock_irqsave(&ctrl->error_lock, flags);
719 ctrl->err_counter++;
720 new_error_slot =
721 &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
722
723 new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
724 new_error_slot->sqid = cpu_to_le16(req->sq->qid);
725 new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
726 new_error_slot->status_field = cpu_to_le16(status << 1);
727 new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
728 new_error_slot->lba = cpu_to_le64(req->error_slba);
729 new_error_slot->nsid = req->cmd->common.nsid;
730 spin_unlock_irqrestore(&ctrl->error_lock, flags);
731
732 /* set the more bit for this request */
733 req->cqe->status |= cpu_to_le16(1 << 14);
734}
735
736static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
737{
738 if (!req->sq->sqhd_disabled)
739 nvmet_update_sq_head(req);
740 req->cqe->sq_id = cpu_to_le16(req->sq->qid);
741 req->cqe->command_id = req->cmd->common.command_id;
742
743 if (unlikely(status))
744 nvmet_set_error(req, status);
745
746 trace_nvmet_req_complete(req);
747
748 if (req->ns)
749 nvmet_put_namespace(req->ns);
750 req->ops->queue_response(req);
751}
752
753void nvmet_req_complete(struct nvmet_req *req, u16 status)
754{
755 __nvmet_req_complete(req, status);
756 percpu_ref_put(&req->sq->ref);
757}
758EXPORT_SYMBOL_GPL(nvmet_req_complete);
759
760void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
761 u16 qid, u16 size)
762{
763 cq->qid = qid;
764 cq->size = size;
765}
766
767void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
768 u16 qid, u16 size)
769{
770 sq->sqhd = 0;
771 sq->qid = qid;
772 sq->size = size;
773
774 ctrl->sqs[qid] = sq;
775}
776
777static void nvmet_confirm_sq(struct percpu_ref *ref)
778{
779 struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
780
781 complete(&sq->confirm_done);
782}
783
784void nvmet_sq_destroy(struct nvmet_sq *sq)
785{
786 struct nvmet_ctrl *ctrl = sq->ctrl;
787
788 /*
789 * If this is the admin queue, complete all AERs so that our
790 * queue doesn't have outstanding requests on it.
791 */
792 if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
793 nvmet_async_events_failall(ctrl);
794 percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
795 wait_for_completion(&sq->confirm_done);
796 wait_for_completion(&sq->free_done);
797 percpu_ref_exit(&sq->ref);
798
799 if (ctrl) {
800 /*
801 * The teardown flow may take some time, and the host may not
802 * send us keep-alive during this period, hence reset the
803 * traffic based keep-alive timer so we don't trigger a
804 * controller teardown as a result of a keep-alive expiration.
805 */
806 ctrl->reset_tbkas = true;
807 sq->ctrl->sqs[sq->qid] = NULL;
808 nvmet_ctrl_put(ctrl);
809 sq->ctrl = NULL; /* allows reusing the queue later */
810 }
811}
812EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
813
814static void nvmet_sq_free(struct percpu_ref *ref)
815{
816 struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
817
818 complete(&sq->free_done);
819}
820
821int nvmet_sq_init(struct nvmet_sq *sq)
822{
823 int ret;
824
825 ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
826 if (ret) {
827 pr_err("percpu_ref init failed!\n");
828 return ret;
829 }
830 init_completion(&sq->free_done);
831 init_completion(&sq->confirm_done);
832
833 return 0;
834}
835EXPORT_SYMBOL_GPL(nvmet_sq_init);
836
837static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
838 struct nvmet_ns *ns)
839{
840 enum nvme_ana_state state = port->ana_state[ns->anagrpid];
841
842 if (unlikely(state == NVME_ANA_INACCESSIBLE))
843 return NVME_SC_ANA_INACCESSIBLE;
844 if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
845 return NVME_SC_ANA_PERSISTENT_LOSS;
846 if (unlikely(state == NVME_ANA_CHANGE))
847 return NVME_SC_ANA_TRANSITION;
848 return 0;
849}
850
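/*
 * On a read-only namespace only reads and flushes are allowed; any other
 * command fails with "namespace is write protected".
 */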
851static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
852{
853 if (unlikely(req->ns->readonly)) {
854 switch (req->cmd->common.opcode) {
855 case nvme_cmd_read:
856 case nvme_cmd_flush:
857 break;
858 default:
859 return NVME_SC_NS_WRITE_PROTECTED;
860 }
861 }
862
863 return 0;
864}
865
866static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
867{
868 u16 ret;
869
870 ret = nvmet_check_ctrl_status(req);
871 if (unlikely(ret))
872 return ret;
873
874 if (nvmet_is_passthru_req(req))
875 return nvmet_parse_passthru_io_cmd(req);
876
877 ret = nvmet_req_find_ns(req);
878 if (unlikely(ret))
879 return ret;
880
881 ret = nvmet_check_ana_state(req->port, req->ns);
882 if (unlikely(ret)) {
883 req->error_loc = offsetof(struct nvme_common_command, nsid);
884 return ret;
885 }
886 ret = nvmet_io_cmd_check_access(req);
887 if (unlikely(ret)) {
888 req->error_loc = offsetof(struct nvme_common_command, nsid);
889 return ret;
890 }
891
892 switch (req->ns->csi) {
893 case NVME_CSI_NVM:
894 if (req->ns->file)
895 return nvmet_file_parse_io_cmd(req);
896 return nvmet_bdev_parse_io_cmd(req);
897 case NVME_CSI_ZNS:
898 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
899 return nvmet_bdev_zns_parse_io_cmd(req);
900 return NVME_SC_INVALID_IO_CMD_SET;
901 default:
902 return NVME_SC_INVALID_IO_CMD_SET;
903 }
904}
905
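/*
 * Initialise a request received from the transport: reset per-request
 * state, validate the command's flags and SGL descriptor type, and
 * dispatch it to the connect, admin or I/O parser. Returns false (and
 * completes the request with an error) if the command cannot be handled.
 */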
906bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
907 struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
908{
909 u8 flags = req->cmd->common.flags;
910 u16 status;
911
912 req->cq = cq;
913 req->sq = sq;
914 req->ops = ops;
915 req->sg = NULL;
916 req->metadata_sg = NULL;
917 req->sg_cnt = 0;
918 req->metadata_sg_cnt = 0;
919 req->transfer_len = 0;
920 req->metadata_len = 0;
921 req->cqe->status = 0;
922 req->cqe->sq_head = 0;
923 req->ns = NULL;
924 req->error_loc = NVMET_NO_ERROR_LOC;
925 req->error_slba = 0;
926
927 /* no support for fused commands yet */
928 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
929 req->error_loc = offsetof(struct nvme_common_command, flags);
930 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
931 goto fail;
932 }
933
934 /*
935 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
936 * contains an address of a single contiguous physical buffer that is
937 * byte aligned.
938 */
939 if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
940 req->error_loc = offsetof(struct nvme_common_command, flags);
941 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
942 goto fail;
943 }
944
945 if (unlikely(!req->sq->ctrl))
946 /* will return an error for any non-connect command: */
947 status = nvmet_parse_connect_cmd(req);
948 else if (likely(req->sq->qid != 0))
949 status = nvmet_parse_io_cmd(req);
950 else
951 status = nvmet_parse_admin_cmd(req);
952
953 if (status)
954 goto fail;
955
956 trace_nvmet_req_init(req, req->cmd);
957
958 if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
959 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
960 goto fail;
961 }
962
963 if (sq->ctrl)
964 sq->ctrl->reset_tbkas = true;
965
966 return true;
967
968fail:
969 __nvmet_req_complete(req, status);
970 return false;
971}
972EXPORT_SYMBOL_GPL(nvmet_req_init);
973
974void nvmet_req_uninit(struct nvmet_req *req)
975{
976 percpu_ref_put(&req->sq->ref);
977 if (req->ns)
978 nvmet_put_namespace(req->ns);
979}
980EXPORT_SYMBOL_GPL(nvmet_req_uninit);
981
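/*
 * Fail the request if the transport-provided transfer length does not
 * match the length implied by the command.
 */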
982bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
983{
984 if (unlikely(len != req->transfer_len)) {
985 req->error_loc = offsetof(struct nvme_common_command, dptr);
986 nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
987 return false;
988 }
989
990 return true;
991}
992EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
993
994bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
995{
996 if (unlikely(data_len > req->transfer_len)) {
997 req->error_loc = offsetof(struct nvme_common_command, dptr);
998 nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
999 return false;
1000 }
1001
1002 return true;
1003}
1004
1005static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
1006{
1007 return req->transfer_len - req->metadata_len;
1008}
1009
1010static int nvmet_req_alloc_p2pmem_sgls(struct pci_dev *p2p_dev,
1011 struct nvmet_req *req)
1012{
1013 req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
1014 nvmet_data_transfer_len(req));
1015 if (!req->sg)
1016 goto out_err;
1017
1018 if (req->metadata_len) {
1019 req->metadata_sg = pci_p2pmem_alloc_sgl(p2p_dev,
1020 &req->metadata_sg_cnt, req->metadata_len);
1021 if (!req->metadata_sg)
1022 goto out_free_sg;
1023 }
1024
1025 req->p2p_dev = p2p_dev;
1026
1027 return 0;
1028out_free_sg:
1029 pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1030out_err:
1031 return -ENOMEM;
1032}
1033
1034static struct pci_dev *nvmet_req_find_p2p_dev(struct nvmet_req *req)
1035{
1036 if (!IS_ENABLED(CONFIG_PCI_P2PDMA) ||
1037 !req->sq->ctrl || !req->sq->qid || !req->ns)
1038 return NULL;
1039 return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid);
1040}
1041
1042int nvmet_req_alloc_sgls(struct nvmet_req *req)
1043{
1044 struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req);
1045
1046 if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req))
1047 return 0;
1048
1049 req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
1050 &req->sg_cnt);
1051 if (unlikely(!req->sg))
1052 goto out;
1053
1054 if (req->metadata_len) {
1055 req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
1056 &req->metadata_sg_cnt);
1057 if (unlikely(!req->metadata_sg))
1058 goto out_free;
1059 }
1060
1061 return 0;
1062out_free:
1063 sgl_free(req->sg);
1064out:
1065 return -ENOMEM;
1066}
1067EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
1068
1069void nvmet_req_free_sgls(struct nvmet_req *req)
1070{
1071 if (req->p2p_dev) {
1072 pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1073 if (req->metadata_sg)
1074 pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
1075 req->p2p_dev = NULL;
1076 } else {
1077 sgl_free(req->sg);
1078 if (req->metadata_sg)
1079 sgl_free(req->metadata_sg);
1080 }
1081
1082 req->sg = NULL;
1083 req->metadata_sg = NULL;
1084 req->sg_cnt = 0;
1085 req->metadata_sg_cnt = 0;
1086}
1087EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
1088
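/*
 * Helpers extracting individual fields from the Controller Configuration
 * (CC) register value written by the host.
 */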
1089static inline bool nvmet_cc_en(u32 cc)
1090{
1091 return (cc >> NVME_CC_EN_SHIFT) & 0x1;
1092}
1093
1094static inline u8 nvmet_cc_css(u32 cc)
1095{
1096 return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
1097}
1098
1099static inline u8 nvmet_cc_mps(u32 cc)
1100{
1101 return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
1102}
1103
1104static inline u8 nvmet_cc_ams(u32 cc)
1105{
1106 return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
1107}
1108
1109static inline u8 nvmet_cc_shn(u32 cc)
1110{
1111 return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
1112}
1113
1114static inline u8 nvmet_cc_iosqes(u32 cc)
1115{
1116 return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
1117}
1118
1119static inline u8 nvmet_cc_iocqes(u32 cc)
1120{
1121 return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1122}
1123
1124static inline bool nvmet_css_supported(u8 cc_css)
1125{
1126 switch (cc_css << NVME_CC_CSS_SHIFT) {
1127 case NVME_CC_CSS_NVM:
1128 case NVME_CC_CSS_CSI:
1129 return true;
1130 default:
1131 return false;
1132 }
1133}
1134
1135static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1136{
1137 lockdep_assert_held(&ctrl->lock);
1138
1139 /*
1140 * Only I/O controllers should verify iosqes,iocqes.
1141 * Strictly speaking, the spec says a discovery controller
1142 * should verify iosqes,iocqes are zeroed, however that
1143 * would break backwards compatibility, so don't enforce it.
1144 */
1145 if (!nvmet_is_disc_subsys(ctrl->subsys) &&
1146 (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1147 nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
1148 ctrl->csts = NVME_CSTS_CFS;
1149 return;
1150 }
1151
1152 if (nvmet_cc_mps(ctrl->cc) != 0 ||
1153 nvmet_cc_ams(ctrl->cc) != 0 ||
1154 !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) {
1155 ctrl->csts = NVME_CSTS_CFS;
1156 return;
1157 }
1158
1159 ctrl->csts = NVME_CSTS_RDY;
1160
1161 /*
1162 * Controllers that are not yet enabled should not really enforce the
1163 * keep alive timeout, but we still want to track a timeout and cleanup
1164 * in case a host died before it enabled the controller. Hence, simply
1165 * reset the keep alive timer when the controller is enabled.
1166 */
1167 if (ctrl->kato)
1168 mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1169}
1170
1171static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1172{
1173 lockdep_assert_held(&ctrl->lock);
1174
1175 /* XXX: tear down queues? */
1176 ctrl->csts &= ~NVME_CSTS_RDY;
1177 ctrl->cc = 0;
1178}
1179
1180void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1181{
1182 u32 old;
1183
1184 mutex_lock(&ctrl->lock);
1185 old = ctrl->cc;
1186 ctrl->cc = new;
1187
1188 if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1189 nvmet_start_ctrl(ctrl);
1190 if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1191 nvmet_clear_ctrl(ctrl);
1192 if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1193 nvmet_clear_ctrl(ctrl);
1194 ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1195 }
1196 if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1197 ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1198 mutex_unlock(&ctrl->lock);
1199}
1200
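/*
 * Build the Controller Capabilities (CAP) register value advertised to
 * the host.
 */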
1201static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1202{
1203 /* command sets supported: NVMe command set: */
1204 ctrl->cap = (1ULL << 37);
1205 /* Controller supports one or more I/O Command Sets */
1206 ctrl->cap |= (1ULL << 43);
1207 /* CC.EN timeout in 500msec units: */
1208 ctrl->cap |= (15ULL << 24);
1209 /* maximum queue entries supported: */
1210 if (ctrl->ops->get_max_queue_size)
1211 ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1;
1212 else
1213 ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1214
1215 if (nvmet_is_passthru_subsys(ctrl->subsys))
1216 nvmet_passthrough_override_cap(ctrl);
1217}
1218
1219struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
1220 const char *hostnqn, u16 cntlid,
1221 struct nvmet_req *req)
1222{
1223 struct nvmet_ctrl *ctrl = NULL;
1224 struct nvmet_subsys *subsys;
1225
1226 subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1227 if (!subsys) {
1228 pr_warn("connect request for invalid subsystem %s!\n",
1229 subsysnqn);
1230 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1231 goto out;
1232 }
1233
1234 mutex_lock(&subsys->lock);
1235 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1236 if (ctrl->cntlid == cntlid) {
1237 if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1238 pr_warn("hostnqn mismatch.\n");
1239 continue;
1240 }
1241 if (!kref_get_unless_zero(&ctrl->ref))
1242 continue;
1243
1244 /* ctrl found */
1245 goto found;
1246 }
1247 }
1248
1249 ctrl = NULL; /* ctrl not found */
1250 pr_warn("could not find controller %d for subsys %s / host %s\n",
1251 cntlid, subsysnqn, hostnqn);
1252 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1253
1254found:
1255 mutex_unlock(&subsys->lock);
1256 nvmet_subsys_put(subsys);
1257out:
1258 return ctrl;
1259}
1260
1261u16 nvmet_check_ctrl_status(struct nvmet_req *req)
1262{
1263 if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1264 pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1265 req->cmd->common.opcode, req->sq->qid);
1266 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1267 }
1268
1269 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1270 pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1271 req->cmd->common.opcode, req->sq->qid);
1272 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1273 }
1274 return 0;
1275}
1276
1277bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1278{
1279 struct nvmet_host_link *p;
1280
1281 lockdep_assert_held(&nvmet_config_sem);
1282
1283 if (subsys->allow_any_host)
1284 return true;
1285
1286 if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
1287 return true;
1288
1289 list_for_each_entry(p, &subsys->hosts, entry) {
1290 if (!strcmp(nvmet_host_name(p->host), hostnqn))
1291 return true;
1292 }
1293
1294 return false;
1295}
1296
1297/*
1298 * Note: ctrl->subsys->lock should be held when calling this function
1299 */
1300static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1301 struct nvmet_req *req)
1302{
1303 struct nvmet_ns *ns;
1304 unsigned long idx;
1305
1306 if (!req->p2p_client)
1307 return;
1308
1309 ctrl->p2p_client = get_device(req->p2p_client);
1310
1311 xa_for_each(&ctrl->subsys->namespaces, idx, ns)
1312 nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1313}
1314
1315/*
1316 * Note: ctrl->subsys->lock should be held when calling this function
1317 */
1318static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1319{
1320 struct radix_tree_iter iter;
1321 void __rcu **slot;
1322
1323 radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1324 pci_dev_put(radix_tree_deref_slot(slot));
1325
1326 put_device(ctrl->p2p_client);
1327}
1328
1329static void nvmet_fatal_error_handler(struct work_struct *work)
1330{
1331 struct nvmet_ctrl *ctrl =
1332 container_of(work, struct nvmet_ctrl, fatal_err_work);
1333
1334 pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1335 ctrl->ops->delete_ctrl(ctrl);
1336}
1337
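/*
 * Allocate a controller for a Fabrics Connect command: validate the
 * subsystem and host NQNs, allocate a controller ID, set up the AEN and
 * keep-alive machinery, and link the controller into the subsystem.
 */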
1338u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1339 struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1340{
1341 struct nvmet_subsys *subsys;
1342 struct nvmet_ctrl *ctrl;
1343 int ret;
1344 u16 status;
1345
1346 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1347 subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1348 if (!subsys) {
1349 pr_warn("connect request for invalid subsystem %s!\n",
1350 subsysnqn);
1351 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1352 req->error_loc = offsetof(struct nvme_common_command, dptr);
1353 goto out;
1354 }
1355
1356 down_read(&nvmet_config_sem);
1357 if (!nvmet_host_allowed(subsys, hostnqn)) {
1358 pr_info("connect by host %s for subsystem %s not allowed\n",
1359 hostnqn, subsysnqn);
1360 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1361 up_read(&nvmet_config_sem);
1362 status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1363 req->error_loc = offsetof(struct nvme_common_command, dptr);
1364 goto out_put_subsystem;
1365 }
1366 up_read(&nvmet_config_sem);
1367
1368 status = NVME_SC_INTERNAL;
1369 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1370 if (!ctrl)
1371 goto out_put_subsystem;
1372 mutex_init(&ctrl->lock);
1373
1374 ctrl->port = req->port;
1375 ctrl->ops = req->ops;
1376
1377 INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1378 INIT_LIST_HEAD(&ctrl->async_events);
1379 INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1380 INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1381 INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
1382
1383 memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1384 memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1385
1386 kref_init(&ctrl->ref);
1387 ctrl->subsys = subsys;
1388 nvmet_init_cap(ctrl);
1389 WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1390
1391 ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1392 sizeof(__le32), GFP_KERNEL);
1393 if (!ctrl->changed_ns_list)
1394 goto out_free_ctrl;
1395
1396 ctrl->sqs = kcalloc(subsys->max_qid + 1,
1397 sizeof(struct nvmet_sq *),
1398 GFP_KERNEL);
1399 if (!ctrl->sqs)
1400 goto out_free_changed_ns_list;
1401
1402 if (subsys->cntlid_min > subsys->cntlid_max)
1403 goto out_free_sqs;
1404
1405 ret = ida_alloc_range(&cntlid_ida,
1406 subsys->cntlid_min, subsys->cntlid_max,
1407 GFP_KERNEL);
1408 if (ret < 0) {
1409 status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1410 goto out_free_sqs;
1411 }
1412 ctrl->cntlid = ret;
1413
1414 /*
 1415 * Discovery controllers may use an arbitrarily high value
 1416 * in order to clean up stale discovery sessions.
1417 */
1418 if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
1419 kato = NVMET_DISC_KATO_MS;
1420
1421 /* keep-alive timeout in seconds */
1422 ctrl->kato = DIV_ROUND_UP(kato, 1000);
1423
1424 ctrl->err_counter = 0;
1425 spin_lock_init(&ctrl->error_lock);
1426
1427 nvmet_start_keep_alive_timer(ctrl);
1428
1429 mutex_lock(&subsys->lock);
1430 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1431 nvmet_setup_p2p_ns_map(ctrl, req);
1432 mutex_unlock(&subsys->lock);
1433
1434 *ctrlp = ctrl;
1435 return 0;
1436
1437out_free_sqs:
1438 kfree(ctrl->sqs);
1439out_free_changed_ns_list:
1440 kfree(ctrl->changed_ns_list);
1441out_free_ctrl:
1442 kfree(ctrl);
1443out_put_subsystem:
1444 nvmet_subsys_put(subsys);
1445out:
1446 return status;
1447}
1448
1449static void nvmet_ctrl_free(struct kref *ref)
1450{
1451 struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
1452 struct nvmet_subsys *subsys = ctrl->subsys;
1453
1454 mutex_lock(&subsys->lock);
1455 nvmet_release_p2p_ns_map(ctrl);
1456 list_del(&ctrl->subsys_entry);
1457 mutex_unlock(&subsys->lock);
1458
1459 nvmet_stop_keep_alive_timer(ctrl);
1460
1461 flush_work(&ctrl->async_event_work);
1462 cancel_work_sync(&ctrl->fatal_err_work);
1463
1464 ida_free(&cntlid_ida, ctrl->cntlid);
1465
1466 nvmet_async_events_free(ctrl);
1467 kfree(ctrl->sqs);
1468 kfree(ctrl->changed_ns_list);
1469 kfree(ctrl);
1470
1471 nvmet_subsys_put(subsys);
1472}
1473
1474void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
1475{
1476 kref_put(&ctrl->ref, nvmet_ctrl_free);
1477}
1478
1479void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1480{
1481 mutex_lock(&ctrl->lock);
1482 if (!(ctrl->csts & NVME_CSTS_CFS)) {
1483 ctrl->csts |= NVME_CSTS_CFS;
1484 queue_work(nvmet_wq, &ctrl->fatal_err_work);
1485 }
1486 mutex_unlock(&ctrl->lock);
1487}
1488EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1489
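/*
 * Look up a subsystem by NQN on the given port and return it with a
 * reference held; the well-known discovery NQN maps to the global
 * discovery subsystem.
 */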
1490static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1491 const char *subsysnqn)
1492{
1493 struct nvmet_subsys_link *p;
1494
1495 if (!port)
1496 return NULL;
1497
1498 if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1499 if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1500 return NULL;
1501 return nvmet_disc_subsys;
1502 }
1503
1504 down_read(&nvmet_config_sem);
1505 list_for_each_entry(p, &port->subsystems, entry) {
1506 if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1507 NVMF_NQN_SIZE)) {
1508 if (!kref_get_unless_zero(&p->subsys->ref))
1509 break;
1510 up_read(&nvmet_config_sem);
1511 return p->subsys;
1512 }
1513 }
1514 up_read(&nvmet_config_sem);
1515 return NULL;
1516}
1517
1518struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1519 enum nvme_subsys_type type)
1520{
1521 struct nvmet_subsys *subsys;
1522 char serial[NVMET_SN_MAX_SIZE / 2];
1523 int ret;
1524
1525 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1526 if (!subsys)
1527 return ERR_PTR(-ENOMEM);
1528
1529 subsys->ver = NVMET_DEFAULT_VS;
1530 /* generate a random serial number as our controllers are ephemeral: */
1531 get_random_bytes(&serial, sizeof(serial));
1532 bin2hex(subsys->serial, &serial, sizeof(serial));
1533
1534 subsys->model_number = kstrdup(NVMET_DEFAULT_CTRL_MODEL, GFP_KERNEL);
1535 if (!subsys->model_number) {
1536 ret = -ENOMEM;
1537 goto free_subsys;
1538 }
1539
1540 switch (type) {
1541 case NVME_NQN_NVME:
1542 subsys->max_qid = NVMET_NR_QUEUES;
1543 break;
1544 case NVME_NQN_DISC:
1545 case NVME_NQN_CURR:
1546 subsys->max_qid = 0;
1547 break;
1548 default:
1549 pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1550 ret = -EINVAL;
1551 goto free_mn;
1552 }
1553 subsys->type = type;
1554 subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1555 GFP_KERNEL);
1556 if (!subsys->subsysnqn) {
1557 ret = -ENOMEM;
1558 goto free_mn;
1559 }
1560 subsys->cntlid_min = NVME_CNTLID_MIN;
1561 subsys->cntlid_max = NVME_CNTLID_MAX;
1562 kref_init(&subsys->ref);
1563
1564 mutex_init(&subsys->lock);
1565 xa_init(&subsys->namespaces);
1566 INIT_LIST_HEAD(&subsys->ctrls);
1567 INIT_LIST_HEAD(&subsys->hosts);
1568
1569 return subsys;
1570
1571free_mn:
1572 kfree(subsys->model_number);
1573free_subsys:
1574 kfree(subsys);
1575 return ERR_PTR(ret);
1576}
1577
1578static void nvmet_subsys_free(struct kref *ref)
1579{
1580 struct nvmet_subsys *subsys =
1581 container_of(ref, struct nvmet_subsys, ref);
1582
1583 WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
1584
1585 xa_destroy(&subsys->namespaces);
1586 nvmet_passthru_subsys_free(subsys);
1587
1588 kfree(subsys->subsysnqn);
1589 kfree(subsys->model_number);
1590 kfree(subsys);
1591}
1592
1593void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1594{
1595 struct nvmet_ctrl *ctrl;
1596
1597 mutex_lock(&subsys->lock);
1598 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1599 ctrl->ops->delete_ctrl(ctrl);
1600 mutex_unlock(&subsys->lock);
1601}
1602
1603void nvmet_subsys_put(struct nvmet_subsys *subsys)
1604{
1605 kref_put(&subsys->ref, nvmet_subsys_free);
1606}
1607
1608static int __init nvmet_init(void)
1609{
1610 int error;
1611
1612 nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1613
1614 zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
1615 if (!zbd_wq)
1616 return -ENOMEM;
1617
1618 buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1619 WQ_MEM_RECLAIM, 0);
1620 if (!buffered_io_wq) {
1621 error = -ENOMEM;
1622 goto out_free_zbd_work_queue;
1623 }
1624
1625 nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0);
1626 if (!nvmet_wq) {
1627 error = -ENOMEM;
1628 goto out_free_buffered_work_queue;
1629 }
1630
1631 error = nvmet_init_discovery();
1632 if (error)
1633 goto out_free_nvmet_work_queue;
1634
1635 error = nvmet_init_configfs();
1636 if (error)
1637 goto out_exit_discovery;
1638 return 0;
1639
1640out_exit_discovery:
1641 nvmet_exit_discovery();
1642out_free_nvmet_work_queue:
1643 destroy_workqueue(nvmet_wq);
1644out_free_buffered_work_queue:
1645 destroy_workqueue(buffered_io_wq);
1646out_free_zbd_work_queue:
1647 destroy_workqueue(zbd_wq);
1648 return error;
1649}
1650
1651static void __exit nvmet_exit(void)
1652{
1653 nvmet_exit_configfs();
1654 nvmet_exit_discovery();
1655 ida_destroy(&cntlid_ida);
1656 destroy_workqueue(nvmet_wq);
1657 destroy_workqueue(buffered_io_wq);
1658 destroy_workqueue(zbd_wq);
1659
1660 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1661 BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1662}
1663
1664module_init(nvmet_init);
1665module_exit(nvmet_exit);
1666
1667MODULE_LICENSE("GPL v2");