drivers/nvme/host/core.c at v6.19

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / nvme / host / core.c
at v6.19 5451 lines 147 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVM Express device driver
   4 * Copyright (c) 2011-2014, Intel Corporation.
   5 */
   6
   7#include <linux/async.h>
   8#include <linux/blkdev.h>
   9#include <linux/blk-mq.h>
  10#include <linux/blk-integrity.h>
  11#include <linux/compat.h>
  12#include <linux/delay.h>
  13#include <linux/errno.h>
  14#include <linux/hdreg.h>
  15#include <linux/kernel.h>
  16#include <linux/module.h>
  17#include <linux/backing-dev.h>
  18#include <linux/slab.h>
  19#include <linux/types.h>
  20#include <linux/pr.h>
  21#include <linux/ptrace.h>
  22#include <linux/nvme_ioctl.h>
  23#include <linux/pm_qos.h>
  24#include <linux/ratelimit.h>
  25#include <linux/unaligned.h>
  26
  27#include "nvme.h"
  28#include "fabrics.h"
  29#include <linux/nvme-auth.h>
  30
  31#define CREATE_TRACE_POINTS
  32#include "trace.h"
  33
  34#define NVME_MINORS		(1U << MINORBITS)
  35
  36struct nvme_ns_info {
  37	struct nvme_ns_ids ids;
  38	u32 nsid;
  39	__le32 anagrpid;
  40	u8 pi_offset;
  41	u16 endgid;
  42	u64 runs;
  43	bool is_shared;
  44	bool is_readonly;
  45	bool is_ready;
  46	bool is_removed;
  47	bool is_rotational;
  48	bool no_vwc;
  49};
  50
  51unsigned int admin_timeout = 60;
  52module_param(admin_timeout, uint, 0644);
  53MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  54EXPORT_SYMBOL_GPL(admin_timeout);
  55
  56unsigned int nvme_io_timeout = 30;
  57module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
  58MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  59EXPORT_SYMBOL_GPL(nvme_io_timeout);
  60
  61static unsigned char shutdown_timeout = 5;
  62module_param(shutdown_timeout, byte, 0644);
  63MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  64
  65static u8 nvme_max_retries = 5;
  66module_param_named(max_retries, nvme_max_retries, byte, 0644);
  67MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
  68
  69static unsigned long default_ps_max_latency_us = 100000;
  70module_param(default_ps_max_latency_us, ulong, 0644);
  71MODULE_PARM_DESC(default_ps_max_latency_us,
  72		 "max power saving latency for new devices; use PM QOS to change per device");
  73
  74static bool force_apst;
  75module_param(force_apst, bool, 0644);
  76MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
  77
  78static unsigned long apst_primary_timeout_ms = 100;
  79module_param(apst_primary_timeout_ms, ulong, 0644);
  80MODULE_PARM_DESC(apst_primary_timeout_ms,
  81	"primary APST timeout in ms");
  82
  83static unsigned long apst_secondary_timeout_ms = 2000;
  84module_param(apst_secondary_timeout_ms, ulong, 0644);
  85MODULE_PARM_DESC(apst_secondary_timeout_ms,
  86	"secondary APST timeout in ms");
  87
  88static unsigned long apst_primary_latency_tol_us = 15000;
  89module_param(apst_primary_latency_tol_us, ulong, 0644);
  90MODULE_PARM_DESC(apst_primary_latency_tol_us,
  91	"primary APST latency tolerance in us");
  92
  93static unsigned long apst_secondary_latency_tol_us = 100000;
  94module_param(apst_secondary_latency_tol_us, ulong, 0644);
  95MODULE_PARM_DESC(apst_secondary_latency_tol_us,
  96	"secondary APST latency tolerance in us");
  97
  98/*
  99 * Older kernels didn't enable protection information if it was at an offset.
 100 * Newer kernels do, so it breaks reads on the upgrade if such formats were
 101 * used in prior kernels since the metadata written did not contain a valid
 102 * checksum.
 103 */
 104static bool disable_pi_offsets = false;
 105module_param(disable_pi_offsets, bool, 0444);
 106MODULE_PARM_DESC(disable_pi_offsets,
 107	"disable protection information if it has an offset");
 108
 109/*
 110 * nvme_wq - hosts nvme related works that are not reset or delete
 111 * nvme_reset_wq - hosts nvme reset works
 112 * nvme_delete_wq - hosts nvme delete works
 113 *
 114 * nvme_wq will host works such as scan, aen handling, fw activation,
 115 * keep-alive, periodic reconnects etc. nvme_reset_wq
 116 * runs reset works which also flush works hosted on nvme_wq for
 117 * serialization purposes. nvme_delete_wq host controller deletion
 118 * works which flush reset works for serialization.
 119 */
 120struct workqueue_struct *nvme_wq;
 121EXPORT_SYMBOL_GPL(nvme_wq);
 122
 123struct workqueue_struct *nvme_reset_wq;
 124EXPORT_SYMBOL_GPL(nvme_reset_wq);
 125
 126struct workqueue_struct *nvme_delete_wq;
 127EXPORT_SYMBOL_GPL(nvme_delete_wq);
 128
 129static LIST_HEAD(nvme_subsystems);
 130DEFINE_MUTEX(nvme_subsystems_lock);
 131
 132static DEFINE_IDA(nvme_instance_ida);
 133static dev_t nvme_ctrl_base_chr_devt;
 134static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
 135static const struct class nvme_class = {
 136	.name = "nvme",
 137	.dev_uevent = nvme_class_uevent,
 138};
 139
 140static const struct class nvme_subsys_class = {
 141	.name = "nvme-subsystem",
 142};
 143
 144static DEFINE_IDA(nvme_ns_chr_minor_ida);
 145static dev_t nvme_ns_chr_devt;
 146static const struct class nvme_ns_chr_class = {
 147	.name = "nvme-generic",
 148};
 149
 150static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 151static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 152					   unsigned nsid);
 153static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
 154				   struct nvme_command *cmd);
 155static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
 156		u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
 157
 158void nvme_queue_scan(struct nvme_ctrl *ctrl)
 159{
 160	/*
 161	 * Only new queue scan work when admin and IO queues are both alive
 162	 */
 163	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
 164		queue_work(nvme_wq, &ctrl->scan_work);
 165}
 166
 167/*
 168 * Use this function to proceed with scheduling reset_work for a controller
 169 * that had previously been set to the resetting state. This is intended for
 170 * code paths that can't be interrupted by other reset attempts. A hot removal
 171 * may prevent this from succeeding.
 172 */
 173int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
 174{
 175	if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
 176		return -EBUSY;
 177	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 178		return -EBUSY;
 179	return 0;
 180}
 181EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
 182
 183static void nvme_failfast_work(struct work_struct *work)
 184{
 185	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 186			struct nvme_ctrl, failfast_work);
 187
 188	if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
 189		return;
 190
 191	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 192	dev_info(ctrl->device, "failfast expired\n");
 193	nvme_kick_requeue_lists(ctrl);
 194}
 195
 196static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
 197{
 198	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
 199		return;
 200
 201	schedule_delayed_work(&ctrl->failfast_work,
 202			      ctrl->opts->fast_io_fail_tmo * HZ);
 203}
 204
 205static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
 206{
 207	if (!ctrl->opts)
 208		return;
 209
 210	cancel_delayed_work_sync(&ctrl->failfast_work);
 211	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 212}
 213
 214
 215int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 216{
 217	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 218		return -EBUSY;
 219	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 220		return -EBUSY;
 221	return 0;
 222}
 223EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
 224
 225int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 226{
 227	int ret;
 228
 229	ret = nvme_reset_ctrl(ctrl);
 230	if (!ret) {
 231		flush_work(&ctrl->reset_work);
 232		if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
 233			ret = -ENETRESET;
 234	}
 235
 236	return ret;
 237}
 238
 239static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 240{
 241	dev_info(ctrl->device,
 242		 "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
 243
 244	flush_work(&ctrl->reset_work);
 245	nvme_stop_ctrl(ctrl);
 246	nvme_remove_namespaces(ctrl);
 247	ctrl->ops->delete_ctrl(ctrl);
 248	nvme_uninit_ctrl(ctrl);
 249}
 250
 251static void nvme_delete_ctrl_work(struct work_struct *work)
 252{
 253	struct nvme_ctrl *ctrl =
 254		container_of(work, struct nvme_ctrl, delete_work);
 255
 256	nvme_do_delete_ctrl(ctrl);
 257}
 258
 259int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
 260{
 261	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 262		return -EBUSY;
 263	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 264		return -EBUSY;
 265	return 0;
 266}
 267EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
 268
 269void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 270{
 271	/*
 272	 * Keep a reference until nvme_do_delete_ctrl() complete,
 273	 * since ->delete_ctrl can free the controller.
 274	 */
 275	nvme_get_ctrl(ctrl);
 276	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 277		nvme_do_delete_ctrl(ctrl);
 278	nvme_put_ctrl(ctrl);
 279}
 280
 281static blk_status_t nvme_error_status(u16 status)
 282{
 283	switch (status & NVME_SCT_SC_MASK) {
 284	case NVME_SC_SUCCESS:
 285		return BLK_STS_OK;
 286	case NVME_SC_CAP_EXCEEDED:
 287		return BLK_STS_NOSPC;
 288	case NVME_SC_LBA_RANGE:
 289	case NVME_SC_CMD_INTERRUPTED:
 290	case NVME_SC_NS_NOT_READY:
 291		return BLK_STS_TARGET;
 292	case NVME_SC_BAD_ATTRIBUTES:
 293	case NVME_SC_INVALID_OPCODE:
 294	case NVME_SC_INVALID_FIELD:
 295	case NVME_SC_INVALID_NS:
 296		return BLK_STS_NOTSUPP;
 297	case NVME_SC_WRITE_FAULT:
 298	case NVME_SC_READ_ERROR:
 299	case NVME_SC_UNWRITTEN_BLOCK:
 300	case NVME_SC_ACCESS_DENIED:
 301	case NVME_SC_READ_ONLY:
 302	case NVME_SC_COMPARE_FAILED:
 303		return BLK_STS_MEDIUM;
 304	case NVME_SC_GUARD_CHECK:
 305	case NVME_SC_APPTAG_CHECK:
 306	case NVME_SC_REFTAG_CHECK:
 307	case NVME_SC_INVALID_PI:
 308		return BLK_STS_PROTECTION;
 309	case NVME_SC_RESERVATION_CONFLICT:
 310		return BLK_STS_RESV_CONFLICT;
 311	case NVME_SC_HOST_PATH_ERROR:
 312		return BLK_STS_TRANSPORT;
 313	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
 314		return BLK_STS_ZONE_ACTIVE_RESOURCE;
 315	case NVME_SC_ZONE_TOO_MANY_OPEN:
 316		return BLK_STS_ZONE_OPEN_RESOURCE;
 317	default:
 318		return BLK_STS_IOERR;
 319	}
 320}
 321
 322static void nvme_retry_req(struct request *req)
 323{
 324	unsigned long delay = 0;
 325	u16 crd;
 326
 327	/* The mask and shift result must be <= 3 */
 328	crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
 329	if (crd)
 330		delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
 331
 332	nvme_req(req)->retries++;
 333	blk_mq_requeue_request(req, false);
 334	blk_mq_delay_kick_requeue_list(req->q, delay);
 335}
 336
 337static void nvme_log_error(struct request *req)
 338{
 339	struct nvme_ns *ns = req->q->queuedata;
 340	struct nvme_request *nr = nvme_req(req);
 341
 342	if (ns) {
 343		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
 344		       ns->disk ? ns->disk->disk_name : "?",
 345		       nvme_get_opcode_str(nr->cmd->common.opcode),
 346		       nr->cmd->common.opcode,
 347		       nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
 348		       blk_rq_bytes(req) >> ns->head->lba_shift,
 349		       nvme_get_error_status_str(nr->status),
 350		       NVME_SCT(nr->status),		/* Status Code Type */
 351		       nr->status & NVME_SC_MASK,	/* Status Code */
 352		       nr->status & NVME_STATUS_MORE ? "MORE " : "",
 353		       nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 354		return;
 355	}
 356
 357	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
 358			   dev_name(nr->ctrl->device),
 359			   nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 360			   nr->cmd->common.opcode,
 361			   nvme_get_error_status_str(nr->status),
 362			   NVME_SCT(nr->status),	/* Status Code Type */
 363			   nr->status & NVME_SC_MASK,	/* Status Code */
 364			   nr->status & NVME_STATUS_MORE ? "MORE " : "",
 365			   nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
 366}
 367
 368static void nvme_log_err_passthru(struct request *req)
 369{
 370	struct nvme_ns *ns = req->q->queuedata;
 371	struct nvme_request *nr = nvme_req(req);
 372
 373	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
 374		"cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
 375		ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
 376		ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
 377		     nvme_get_admin_opcode_str(nr->cmd->common.opcode),
 378		nr->cmd->common.opcode,
 379		nvme_get_error_status_str(nr->status),
 380		NVME_SCT(nr->status),		/* Status Code Type */
 381		nr->status & NVME_SC_MASK,	/* Status Code */
 382		nr->status & NVME_STATUS_MORE ? "MORE " : "",
 383		nr->status & NVME_STATUS_DNR  ? "DNR "  : "",
 384		le32_to_cpu(nr->cmd->common.cdw10),
 385		le32_to_cpu(nr->cmd->common.cdw11),
 386		le32_to_cpu(nr->cmd->common.cdw12),
 387		le32_to_cpu(nr->cmd->common.cdw13),
 388		le32_to_cpu(nr->cmd->common.cdw14),
 389		le32_to_cpu(nr->cmd->common.cdw15));
 390}
 391
 392enum nvme_disposition {
 393	COMPLETE,
 394	RETRY,
 395	FAILOVER,
 396	AUTHENTICATE,
 397};
 398
 399static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 400{
 401	if (likely(nvme_req(req)->status == 0))
 402		return COMPLETE;
 403
 404	if (blk_noretry_request(req) ||
 405	    (nvme_req(req)->status & NVME_STATUS_DNR) ||
 406	    nvme_req(req)->retries >= nvme_max_retries)
 407		return COMPLETE;
 408
 409	if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
 410		return AUTHENTICATE;
 411
 412	if (req->cmd_flags & REQ_NVME_MPATH) {
 413		if (nvme_is_path_error(nvme_req(req)->status) ||
 414		    blk_queue_dying(req->q))
 415			return FAILOVER;
 416	} else {
 417		if (blk_queue_dying(req->q))
 418			return COMPLETE;
 419	}
 420
 421	return RETRY;
 422}
 423
 424static inline void nvme_end_req_zoned(struct request *req)
 425{
 426	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 427	    req_op(req) == REQ_OP_ZONE_APPEND) {
 428		struct nvme_ns *ns = req->q->queuedata;
 429
 430		req->__sector = nvme_lba_to_sect(ns->head,
 431			le64_to_cpu(nvme_req(req)->result.u64));
 432	}
 433}
 434
 435static inline void __nvme_end_req(struct request *req)
 436{
 437	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
 438		if (blk_rq_is_passthrough(req))
 439			nvme_log_err_passthru(req);
 440		else
 441			nvme_log_error(req);
 442	}
 443	nvme_end_req_zoned(req);
 444	nvme_trace_bio_complete(req);
 445	if (req->cmd_flags & REQ_NVME_MPATH)
 446		nvme_mpath_end_request(req);
 447}
 448
 449void nvme_end_req(struct request *req)
 450{
 451	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 452
 453	__nvme_end_req(req);
 454	blk_mq_end_request(req, status);
 455}
 456
 457void nvme_complete_rq(struct request *req)
 458{
 459	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
 460
 461	trace_nvme_complete_rq(req);
 462	nvme_cleanup_cmd(req);
 463
 464	/*
 465	 * Completions of long-running commands should not be able to
 466	 * defer sending of periodic keep alives, since the controller
 467	 * may have completed processing such commands a long time ago
 468	 * (arbitrarily close to command submission time).
 469	 * req->deadline - req->timeout is the command submission time
 470	 * in jiffies.
 471	 */
 472	if (ctrl->kas &&
 473	    req->deadline - req->timeout >= ctrl->ka_last_check_time)
 474		ctrl->comp_seen = true;
 475
 476	switch (nvme_decide_disposition(req)) {
 477	case COMPLETE:
 478		nvme_end_req(req);
 479		return;
 480	case RETRY:
 481		nvme_retry_req(req);
 482		return;
 483	case FAILOVER:
 484		nvme_failover_req(req);
 485		return;
 486	case AUTHENTICATE:
 487#ifdef CONFIG_NVME_HOST_AUTH
 488		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
 489		nvme_retry_req(req);
 490#else
 491		nvme_end_req(req);
 492#endif
 493		return;
 494	}
 495}
 496EXPORT_SYMBOL_GPL(nvme_complete_rq);
 497
 498void nvme_complete_batch_req(struct request *req)
 499{
 500	trace_nvme_complete_rq(req);
 501	nvme_cleanup_cmd(req);
 502	__nvme_end_req(req);
 503}
 504EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
 505
 506/*
 507 * Called to unwind from ->queue_rq on a failed command submission so that the
 508 * multipathing code gets called to potentially failover to another path.
 509 * The caller needs to unwind all transport specific resource allocations and
 510 * must return propagate the return value.
 511 */
 512blk_status_t nvme_host_path_error(struct request *req)
 513{
 514	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
 515	blk_mq_set_request_complete(req);
 516	nvme_complete_rq(req);
 517	return BLK_STS_OK;
 518}
 519EXPORT_SYMBOL_GPL(nvme_host_path_error);
 520
 521bool nvme_cancel_request(struct request *req, void *data)
 522{
 523	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 524				"Cancelling I/O %d", req->tag);
 525
 526	/* don't abort one completed or idle request */
 527	if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
 528		return true;
 529
 530	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
 531	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 532	blk_mq_complete_request(req);
 533	return true;
 534}
 535EXPORT_SYMBOL_GPL(nvme_cancel_request);
 536
 537void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
 538{
 539	if (ctrl->tagset) {
 540		blk_mq_tagset_busy_iter(ctrl->tagset,
 541				nvme_cancel_request, ctrl);
 542		blk_mq_tagset_wait_completed_request(ctrl->tagset);
 543	}
 544}
 545EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
 546
 547void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
 548{
 549	if (ctrl->admin_tagset) {
 550		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 551				nvme_cancel_request, ctrl);
 552		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
 553	}
 554}
 555EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
 556
 557bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 558		enum nvme_ctrl_state new_state)
 559{
 560	enum nvme_ctrl_state old_state;
 561	unsigned long flags;
 562	bool changed = false;
 563
 564	spin_lock_irqsave(&ctrl->lock, flags);
 565
 566	old_state = nvme_ctrl_state(ctrl);
 567	switch (new_state) {
 568	case NVME_CTRL_LIVE:
 569		switch (old_state) {
 570		case NVME_CTRL_CONNECTING:
 571			changed = true;
 572			fallthrough;
 573		default:
 574			break;
 575		}
 576		break;
 577	case NVME_CTRL_RESETTING:
 578		switch (old_state) {
 579		case NVME_CTRL_NEW:
 580		case NVME_CTRL_LIVE:
 581			changed = true;
 582			fallthrough;
 583		default:
 584			break;
 585		}
 586		break;
 587	case NVME_CTRL_CONNECTING:
 588		switch (old_state) {
 589		case NVME_CTRL_NEW:
 590		case NVME_CTRL_RESETTING:
 591			changed = true;
 592			fallthrough;
 593		default:
 594			break;
 595		}
 596		break;
 597	case NVME_CTRL_DELETING:
 598		switch (old_state) {
 599		case NVME_CTRL_LIVE:
 600		case NVME_CTRL_RESETTING:
 601		case NVME_CTRL_CONNECTING:
 602			changed = true;
 603			fallthrough;
 604		default:
 605			break;
 606		}
 607		break;
 608	case NVME_CTRL_DELETING_NOIO:
 609		switch (old_state) {
 610		case NVME_CTRL_DELETING:
 611		case NVME_CTRL_DEAD:
 612			changed = true;
 613			fallthrough;
 614		default:
 615			break;
 616		}
 617		break;
 618	case NVME_CTRL_DEAD:
 619		switch (old_state) {
 620		case NVME_CTRL_DELETING:
 621			changed = true;
 622			fallthrough;
 623		default:
 624			break;
 625		}
 626		break;
 627	default:
 628		break;
 629	}
 630
 631	if (changed) {
 632		WRITE_ONCE(ctrl->state, new_state);
 633		wake_up_all(&ctrl->state_wq);
 634	}
 635
 636	spin_unlock_irqrestore(&ctrl->lock, flags);
 637	if (!changed)
 638		return false;
 639
 640	if (new_state == NVME_CTRL_LIVE) {
 641		if (old_state == NVME_CTRL_CONNECTING)
 642			nvme_stop_failfast_work(ctrl);
 643		nvme_kick_requeue_lists(ctrl);
 644	} else if (new_state == NVME_CTRL_CONNECTING &&
 645		old_state == NVME_CTRL_RESETTING) {
 646		nvme_start_failfast_work(ctrl);
 647	}
 648	return changed;
 649}
 650EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 651
 652/*
 653 * Waits for the controller state to be resetting, or returns false if it is
 654 * not possible to ever transition to that state.
 655 */
 656bool nvme_wait_reset(struct nvme_ctrl *ctrl)
 657{
 658	wait_event(ctrl->state_wq,
 659		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
 660		   nvme_state_terminal(ctrl));
 661	return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
 662}
 663EXPORT_SYMBOL_GPL(nvme_wait_reset);
 664
 665static void nvme_free_ns_head(struct kref *ref)
 666{
 667	struct nvme_ns_head *head =
 668		container_of(ref, struct nvme_ns_head, ref);
 669
 670	nvme_mpath_put_disk(head);
 671	ida_free(&head->subsys->ns_ida, head->instance);
 672	cleanup_srcu_struct(&head->srcu);
 673	nvme_put_subsystem(head->subsys);
 674	kfree(head->plids);
 675	kfree(head);
 676}
 677
 678bool nvme_tryget_ns_head(struct nvme_ns_head *head)
 679{
 680	return kref_get_unless_zero(&head->ref);
 681}
 682
 683void nvme_put_ns_head(struct nvme_ns_head *head)
 684{
 685	kref_put(&head->ref, nvme_free_ns_head);
 686}
 687
 688static void nvme_free_ns(struct kref *kref)
 689{
 690	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 691
 692	put_disk(ns->disk);
 693	nvme_put_ns_head(ns->head);
 694	nvme_put_ctrl(ns->ctrl);
 695	kfree(ns);
 696}
 697
 698bool nvme_get_ns(struct nvme_ns *ns)
 699{
 700	return kref_get_unless_zero(&ns->kref);
 701}
 702
 703void nvme_put_ns(struct nvme_ns *ns)
 704{
 705	kref_put(&ns->kref, nvme_free_ns);
 706}
 707EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");
 708
 709static inline void nvme_clear_nvme_request(struct request *req)
 710{
 711	nvme_req(req)->status = 0;
 712	nvme_req(req)->retries = 0;
 713	nvme_req(req)->flags = 0;
 714	req->rq_flags |= RQF_DONTPREP;
 715}
 716
 717/* initialize a passthrough request */
 718void nvme_init_request(struct request *req, struct nvme_command *cmd)
 719{
 720	struct nvme_request *nr = nvme_req(req);
 721	bool logging_enabled;
 722
 723	if (req->q->queuedata) {
 724		struct nvme_ns *ns = req->q->disk->private_data;
 725
 726		logging_enabled = ns->head->passthru_err_log_enabled;
 727		req->timeout = NVME_IO_TIMEOUT;
 728	} else { /* no queuedata implies admin queue */
 729		logging_enabled = nr->ctrl->passthru_err_log_enabled;
 730		req->timeout = NVME_ADMIN_TIMEOUT;
 731	}
 732
 733	if (!logging_enabled)
 734		req->rq_flags |= RQF_QUIET;
 735
 736	/* passthru commands should let the driver set the SGL flags */
 737	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
 738
 739	req->cmd_flags |= REQ_FAILFAST_DRIVER;
 740	if (req->mq_hctx->type == HCTX_TYPE_POLL)
 741		req->cmd_flags |= REQ_POLLED;
 742	nvme_clear_nvme_request(req);
 743	memcpy(nr->cmd, cmd, sizeof(*cmd));
 744}
 745EXPORT_SYMBOL_GPL(nvme_init_request);
 746
 747/*
 748 * For something we're not in a state to send to the device the default action
 749 * is to busy it and retry it after the controller state is recovered.  However,
 750 * if the controller is deleting or if anything is marked for failfast or
 751 * nvme multipath it is immediately failed.
 752 *
 753 * Note: commands used to initialize the controller will be marked for failfast.
 754 * Note: nvme cli/ioctl commands are marked for failfast.
 755 */
 756blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
 757		struct request *rq)
 758{
 759	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 760
 761	if (state != NVME_CTRL_DELETING_NOIO &&
 762	    state != NVME_CTRL_DELETING &&
 763	    state != NVME_CTRL_DEAD &&
 764	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 765	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 766		return BLK_STS_RESOURCE;
 767
 768	if (!(rq->rq_flags & RQF_DONTPREP))
 769		nvme_clear_nvme_request(rq);
 770
 771	return nvme_host_path_error(rq);
 772}
 773EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
 774
 775bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 776		bool queue_live, enum nvme_ctrl_state state)
 777{
 778	struct nvme_request *req = nvme_req(rq);
 779
 780	/*
 781	 * currently we have a problem sending passthru commands
 782	 * on the admin_q if the controller is not LIVE because we can't
 783	 * make sure that they are going out after the admin connect,
 784	 * controller enable and/or other commands in the initialization
 785	 * sequence. until the controller will be LIVE, fail with
 786	 * BLK_STS_RESOURCE so that they will be rescheduled.
 787	 */
 788	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
 789		return false;
 790
 791	if (ctrl->ops->flags & NVME_F_FABRICS) {
 792		/*
 793		 * Only allow commands on a live queue, except for the connect
 794		 * command, which is require to set the queue live in the
 795		 * appropinquate states.
 796		 */
 797		switch (state) {
 798		case NVME_CTRL_CONNECTING:
 799			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
 800			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
 801			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
 802			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 803				return true;
 804			break;
 805		default:
 806			break;
 807		case NVME_CTRL_DEAD:
 808			return false;
 809		}
 810	}
 811
 812	return queue_live;
 813}
 814EXPORT_SYMBOL_GPL(__nvme_check_ready);
 815
 816static inline void nvme_setup_flush(struct nvme_ns *ns,
 817		struct nvme_command *cmnd)
 818{
 819	memset(cmnd, 0, sizeof(*cmnd));
 820	cmnd->common.opcode = nvme_cmd_flush;
 821	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 822}
 823
 824static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 825		struct nvme_command *cmnd)
 826{
 827	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 828	struct nvme_dsm_range *range;
 829	struct bio *bio;
 830
 831	/*
 832	 * Some devices do not consider the DSM 'Number of Ranges' field when
 833	 * determining how much data to DMA. Always allocate memory for maximum
 834	 * number of segments to prevent device reading beyond end of buffer.
 835	 */
 836	static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
 837
 838	range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
 839	if (!range) {
 840		/*
 841		 * If we fail allocation our range, fallback to the controller
 842		 * discard page. If that's also busy, it's safe to return
 843		 * busy, as we know we can make progress once that's freed.
 844		 */
 845		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
 846			return BLK_STS_RESOURCE;
 847
 848		range = page_address(ns->ctrl->discard_page);
 849	}
 850
 851	if (queue_max_discard_segments(req->q) == 1) {
 852		u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
 853		u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
 854
 855		range[0].cattr = cpu_to_le32(0);
 856		range[0].nlb = cpu_to_le32(nlb);
 857		range[0].slba = cpu_to_le64(slba);
 858		n = 1;
 859	} else {
 860		__rq_for_each_bio(bio, req) {
 861			u64 slba = nvme_sect_to_lba(ns->head,
 862						    bio->bi_iter.bi_sector);
 863			u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
 864
 865			if (n < segments) {
 866				range[n].cattr = cpu_to_le32(0);
 867				range[n].nlb = cpu_to_le32(nlb);
 868				range[n].slba = cpu_to_le64(slba);
 869			}
 870			n++;
 871		}
 872	}
 873
 874	if (WARN_ON_ONCE(n != segments)) {
 875		if (virt_to_page(range) == ns->ctrl->discard_page)
 876			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
 877		else
 878			kfree(range);
 879		return BLK_STS_IOERR;
 880	}
 881
 882	memset(cmnd, 0, sizeof(*cmnd));
 883	cmnd->dsm.opcode = nvme_cmd_dsm;
 884	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 885	cmnd->dsm.nr = cpu_to_le32(segments - 1);
 886	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 887
 888	bvec_set_virt(&req->special_vec, range, alloc_size);
 889	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 890
 891	return BLK_STS_OK;
 892}
 893
 894static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
 895{
 896	cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
 897	cmnd->rw.lbatm = cpu_to_le16(0xffff);
 898}
 899
 900static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 901			      struct request *req)
 902{
 903	u32 upper, lower;
 904	u64 ref48;
 905
 906	/* only type1 and type 2 PI formats have a reftag */
 907	switch (ns->head->pi_type) {
 908	case NVME_NS_DPS_PI_TYPE1:
 909	case NVME_NS_DPS_PI_TYPE2:
 910		break;
 911	default:
 912		return;
 913	}
 914
 915	/* both rw and write zeroes share the same reftag format */
 916	switch (ns->head->guard_type) {
 917	case NVME_NVM_NS_16B_GUARD:
 918		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 919		break;
 920	case NVME_NVM_NS_64B_GUARD:
 921		ref48 = ext_pi_ref_tag(req);
 922		lower = lower_32_bits(ref48);
 923		upper = upper_32_bits(ref48);
 924
 925		cmnd->rw.reftag = cpu_to_le32(lower);
 926		cmnd->rw.cdw3 = cpu_to_le32(upper);
 927		break;
 928	default:
 929		break;
 930	}
 931}
 932
 933static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 934		struct request *req, struct nvme_command *cmnd)
 935{
 936	memset(cmnd, 0, sizeof(*cmnd));
 937
 938	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 939		return nvme_setup_discard(ns, req, cmnd);
 940
 941	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 942	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 943	cmnd->write_zeroes.slba =
 944		cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 945	cmnd->write_zeroes.length =
 946		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 947
 948	if (!(req->cmd_flags & REQ_NOUNMAP) &&
 949	    (ns->head->features & NVME_NS_DEAC))
 950		cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
 951
 952	if (nvme_ns_has_pi(ns->head)) {
 953		cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
 954		nvme_set_ref_tag(ns, cmnd, req);
 955	}
 956
 957	return BLK_STS_OK;
 958}
 959
 960/*
 961 * NVMe does not support a dedicated command to issue an atomic write. A write
 962 * which does adhere to the device atomic limits will silently be executed
 963 * non-atomically. The request issuer should ensure that the write is within
 964 * the queue atomic writes limits, but just validate this in case it is not.
 965 */
 966static bool nvme_valid_atomic_write(struct request *req)
 967{
 968	struct request_queue *q = req->q;
 969	u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
 970
 971	if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
 972		return false;
 973
 974	if (boundary_bytes) {
 975		u64 mask = boundary_bytes - 1, imask = ~mask;
 976		u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
 977		u64 end = start + blk_rq_bytes(req) - 1;
 978
 979		/* If greater then must be crossing a boundary */
 980		if (blk_rq_bytes(req) > boundary_bytes)
 981			return false;
 982
 983		if ((start & imask) != (end & imask))
 984			return false;
 985	}
 986
 987	return true;
 988}
 989
 990static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 991		struct request *req, struct nvme_command *cmnd,
 992		enum nvme_opcode op)
 993{
 994	u16 control = 0;
 995	u32 dsmgmt = 0;
 996
 997	if (req->cmd_flags & REQ_FUA)
 998		control |= NVME_RW_FUA;
 999	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
1000		control |= NVME_RW_LR;
1001
1002	if (req->cmd_flags & REQ_RAHEAD)
1003		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
1004
1005	if (op == nvme_cmd_write && ns->head->nr_plids) {
1006		u16 write_stream = req->bio->bi_write_stream;
1007
1008		if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
1009			return BLK_STS_INVAL;
1010
1011		if (write_stream) {
1012			dsmgmt |= ns->head->plids[write_stream - 1] << 16;
1013			control |= NVME_RW_DTYPE_DPLCMT;
1014		}
1015	}
1016
1017	if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
1018		return BLK_STS_INVAL;
1019
1020	cmnd->rw.opcode = op;
1021	cmnd->rw.flags = 0;
1022	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
1023	cmnd->rw.cdw2 = 0;
1024	cmnd->rw.cdw3 = 0;
1025	cmnd->rw.metadata = 0;
1026	cmnd->rw.slba =
1027		cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
1028	cmnd->rw.length =
1029		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
1030	cmnd->rw.reftag = 0;
1031	cmnd->rw.lbat = 0;
1032	cmnd->rw.lbatm = 0;
1033
1034	if (ns->head->ms) {
1035		/*
1036		 * If formatted with metadata, the block layer always provides a
1037		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
1038		 * we enable the PRACT bit for protection information or set the
1039		 * namespace capacity to zero to prevent any I/O.
1040		 */
1041		if (!blk_integrity_rq(req)) {
1042			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
1043				return BLK_STS_NOTSUPP;
1044			control |= NVME_RW_PRINFO_PRACT;
1045			nvme_set_ref_tag(ns, cmnd, req);
1046		}
1047
1048		if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
1049			control |= NVME_RW_PRINFO_PRCHK_GUARD;
1050		if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
1051			control |= NVME_RW_PRINFO_PRCHK_REF;
1052			if (op == nvme_cmd_zone_append)
1053				control |= NVME_RW_APPEND_PIREMAP;
1054			nvme_set_ref_tag(ns, cmnd, req);
1055		}
1056		if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
1057			control |= NVME_RW_PRINFO_PRCHK_APP;
1058			nvme_set_app_tag(req, cmnd);
1059		}
1060	}
1061
1062	cmnd->rw.control = cpu_to_le16(control);
1063	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
1064	return 0;
1065}
1066
1067void nvme_cleanup_cmd(struct request *req)
1068{
1069	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
1070		struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
1071
1072		if (req->special_vec.bv_page == ctrl->discard_page)
1073			clear_bit_unlock(0, &ctrl->discard_page_busy);
1074		else
1075			kfree(bvec_virt(&req->special_vec));
1076		req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
1077	}
1078}
1079EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
1080
1081blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
1082{
1083	struct nvme_command *cmd = nvme_req(req)->cmd;
1084	blk_status_t ret = BLK_STS_OK;
1085
1086	if (!(req->rq_flags & RQF_DONTPREP))
1087		nvme_clear_nvme_request(req);
1088
1089	switch (req_op(req)) {
1090	case REQ_OP_DRV_IN:
1091	case REQ_OP_DRV_OUT:
1092		/* these are setup prior to execution in nvme_init_request() */
1093		break;
1094	case REQ_OP_FLUSH:
1095		nvme_setup_flush(ns, cmd);
1096		break;
1097	case REQ_OP_ZONE_RESET_ALL:
1098	case REQ_OP_ZONE_RESET:
1099		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
1100		break;
1101	case REQ_OP_ZONE_OPEN:
1102		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
1103		break;
1104	case REQ_OP_ZONE_CLOSE:
1105		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
1106		break;
1107	case REQ_OP_ZONE_FINISH:
1108		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
1109		break;
1110	case REQ_OP_WRITE_ZEROES:
1111		ret = nvme_setup_write_zeroes(ns, req, cmd);
1112		break;
1113	case REQ_OP_DISCARD:
1114		ret = nvme_setup_discard(ns, req, cmd);
1115		break;
1116	case REQ_OP_READ:
1117		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
1118		break;
1119	case REQ_OP_WRITE:
1120		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
1121		break;
1122	case REQ_OP_ZONE_APPEND:
1123		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
1124		break;
1125	default:
1126		WARN_ON_ONCE(1);
1127		return BLK_STS_IOERR;
1128	}
1129
1130	cmd->common.command_id = nvme_cid(req);
1131	trace_nvme_setup_cmd(req, cmd);
1132	return ret;
1133}
1134EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1135
1136/*
1137 * Return values:
1138 * 0:  success
1139 * >0: nvme controller's cqe status response
1140 * <0: kernel error in lieu of controller response
1141 */
1142int nvme_execute_rq(struct request *rq, bool at_head)
1143{
1144	blk_status_t status;
1145
1146	status = blk_execute_rq(rq, at_head);
1147	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1148		return -EINTR;
1149	if (nvme_req(rq)->status)
1150		return nvme_req(rq)->status;
1151	return blk_status_to_errno(status);
1152}
1153EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
1154
1155/*
1156 * Returns 0 on success.  If the result is negative, it's a Linux error code;
1157 * if the result is positive, it's an NVM Express status code
1158 */
1159int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1160		union nvme_result *result, void *buffer, unsigned bufflen,
1161		int qid, nvme_submit_flags_t flags)
1162{
1163	struct request *req;
1164	int ret;
1165	blk_mq_req_flags_t blk_flags = 0;
1166
1167	if (flags & NVME_SUBMIT_NOWAIT)
1168		blk_flags |= BLK_MQ_REQ_NOWAIT;
1169	if (flags & NVME_SUBMIT_RESERVED)
1170		blk_flags |= BLK_MQ_REQ_RESERVED;
1171	if (qid == NVME_QID_ANY)
1172		req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
1173	else
1174		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
1175						qid - 1);
1176
1177	if (IS_ERR(req))
1178		return PTR_ERR(req);
1179	nvme_init_request(req, cmd);
1180	if (flags & NVME_SUBMIT_RETRY)
1181		req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
1182
1183	if (buffer && bufflen) {
1184		ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
1185		if (ret)
1186			goto out;
1187	}
1188
1189	ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
1190	if (result && ret >= 0)
1191		*result = nvme_req(req)->result;
1192 out:
1193	blk_mq_free_request(req);
1194	return ret;
1195}
1196EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1197
1198int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1199		void *buffer, unsigned bufflen)
1200{
1201	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1202			NVME_QID_ANY, 0);
1203}
1204EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1205
1206u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1207{
1208	u32 effects = 0;
1209
1210	if (ns) {
1211		effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1212		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1213			dev_warn_once(ctrl->device,
1214				"IO command:%02x has unusual effects:%08x\n",
1215				opcode, effects);
1216
1217		/*
1218		 * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues,
1219		 * which would deadlock when done on an I/O command.  Note that
1220		 * We already warn about an unusual effect above.
1221		 */
1222		effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1223	} else {
1224		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1225
1226		/* Ignore execution restrictions if any relaxation bits are set */
1227		if (effects & NVME_CMD_EFFECTS_CSER_MASK)
1228			effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1229	}
1230
1231	return effects;
1232}
1233EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
1234
1235u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1236{
1237	u32 effects = nvme_command_effects(ctrl, ns, opcode);
1238
1239	/*
1240	 * For simplicity, IO to all namespaces is quiesced even if the command
1241	 * effects say only one namespace is affected.
1242	 */
1243	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1244		mutex_lock(&ctrl->scan_lock);
1245		mutex_lock(&ctrl->subsys->lock);
1246		nvme_mpath_start_freeze(ctrl->subsys);
1247		nvme_mpath_wait_freeze(ctrl->subsys);
1248		nvme_start_freeze(ctrl);
1249		nvme_wait_freeze(ctrl);
1250	}
1251	return effects;
1252}
1253EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
1254
1255void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
1256		       struct nvme_command *cmd, int status)
1257{
1258	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1259		nvme_unfreeze(ctrl);
1260		nvme_mpath_unfreeze(ctrl->subsys);
1261		mutex_unlock(&ctrl->subsys->lock);
1262		mutex_unlock(&ctrl->scan_lock);
1263	}
1264	if (effects & NVME_CMD_EFFECTS_CCC) {
1265		if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
1266				      &ctrl->flags)) {
1267			dev_info(ctrl->device,
1268"controller capabilities changed, reset may be required to take effect.\n");
1269		}
1270	}
1271	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1272		nvme_queue_scan(ctrl);
1273		flush_work(&ctrl->scan_work);
1274	}
1275	if (ns)
1276		return;
1277
1278	switch (cmd->common.opcode) {
1279	case nvme_admin_set_features:
1280		switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1281		case NVME_FEAT_KATO:
1282			/*
1283			 * Keep alive commands interval on the host should be
1284			 * updated when KATO is modified by Set Features
1285			 * commands.
1286			 */
1287			if (!status)
1288				nvme_update_keep_alive(ctrl, cmd);
1289			break;
1290		default:
1291			break;
1292		}
1293		break;
1294	default:
1295		break;
1296	}
1297}
1298EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
1299
1300/*
1301 * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1302 *
1303 *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1304 *   accounting for transport roundtrip times [..].
1305 */
1306static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
1307{
1308	unsigned long delay = ctrl->kato * HZ / 2;
1309
1310	/*
1311	 * When using Traffic Based Keep Alive, we need to run
1312	 * nvme_keep_alive_work at twice the normal frequency, as one
1313	 * command completion can postpone sending a keep alive command
1314	 * by up to twice the delay between runs.
1315	 */
1316	if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
1317		delay /= 2;
1318	return delay;
1319}
1320
1321static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1322{
1323	unsigned long now = jiffies;
1324	unsigned long delay = nvme_keep_alive_work_period(ctrl);
1325	unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
1326
1327	if (time_after(now, ka_next_check_tm))
1328		delay = 0;
1329	else
1330		delay = ka_next_check_tm - now;
1331
1332	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1333}
1334
1335static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
1336						 blk_status_t status)
1337{
1338	struct nvme_ctrl *ctrl = rq->end_io_data;
1339	unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
1340	unsigned long delay = nvme_keep_alive_work_period(ctrl);
1341	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
1342
1343	/*
1344	 * Subtract off the keepalive RTT so nvme_keep_alive_work runs
1345	 * at the desired frequency.
1346	 */
1347	if (rtt <= delay) {
1348		delay -= rtt;
1349	} else {
1350		dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
1351			 jiffies_to_msecs(rtt));
1352		delay = 0;
1353	}
1354
1355	blk_mq_free_request(rq);
1356
1357	if (status) {
1358		dev_err(ctrl->device,
1359			"failed nvme_keep_alive_end_io error=%d\n",
1360				status);
1361		return RQ_END_IO_NONE;
1362	}
1363
1364	ctrl->ka_last_check_time = jiffies;
1365	ctrl->comp_seen = false;
1366	if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
1367		queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1368	return RQ_END_IO_NONE;
1369}
1370
1371static void nvme_keep_alive_work(struct work_struct *work)
1372{
1373	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1374			struct nvme_ctrl, ka_work);
1375	bool comp_seen = ctrl->comp_seen;
1376	struct request *rq;
1377
1378	ctrl->ka_last_check_time = jiffies;
1379
1380	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1381		dev_dbg(ctrl->device,
1382			"reschedule traffic based keep-alive timer\n");
1383		ctrl->comp_seen = false;
1384		nvme_queue_keep_alive_work(ctrl);
1385		return;
1386	}
1387
1388	rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1389				  BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1390	if (IS_ERR(rq)) {
1391		/* allocation failure, reset the controller */
1392		dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1393		nvme_reset_ctrl(ctrl);
1394		return;
1395	}
1396	nvme_init_request(rq, &ctrl->ka_cmd);
1397
1398	rq->timeout = ctrl->kato * HZ;
1399	rq->end_io = nvme_keep_alive_end_io;
1400	rq->end_io_data = ctrl;
1401	blk_execute_rq_nowait(rq, false);
1402}
1403
1404static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1405{
1406	if (unlikely(ctrl->kato == 0))
1407		return;
1408
1409	nvme_queue_keep_alive_work(ctrl);
1410}
1411
1412void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1413{
1414	if (unlikely(ctrl->kato == 0))
1415		return;
1416
1417	cancel_delayed_work_sync(&ctrl->ka_work);
1418}
1419EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1420
1421static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1422				   struct nvme_command *cmd)
1423{
1424	unsigned int new_kato =
1425		DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1426
1427	dev_info(ctrl->device,
1428		 "keep alive interval updated from %u ms to %u ms\n",
1429		 ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1430
1431	nvme_stop_keep_alive(ctrl);
1432	ctrl->kato = new_kato;
1433	nvme_start_keep_alive(ctrl);
1434}
1435
1436static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
1437{
1438	/*
1439	 * The CNS field occupies a full byte starting with NVMe 1.2
1440	 */
1441	if (ctrl->vs >= NVME_VS(1, 2, 0))
1442		return true;
1443
1444	/*
1445	 * NVMe 1.1 expanded the CNS value to two bits, which means values
1446	 * larger than that could get truncated and treated as an incorrect
1447	 * value.
1448	 *
1449	 * Qemu implemented 1.0 behavior for controllers claiming 1.1
1450	 * compliance, so they need to be quirked here.
1451	 */
1452	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1453	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
1454		return cns <= 3;
1455
1456	/*
1457	 * NVMe 1.0 used a single bit for the CNS value.
1458	 */
1459	return cns <= 1;
1460}
1461
1462static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1463{
1464	struct nvme_command c = { };
1465	int error;
1466
1467	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1468	c.identify.opcode = nvme_admin_identify;
1469	c.identify.cns = NVME_ID_CNS_CTRL;
1470
1471	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1472	if (!*id)
1473		return -ENOMEM;
1474
1475	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1476			sizeof(struct nvme_id_ctrl));
1477	if (error) {
1478		kfree(*id);
1479		*id = NULL;
1480	}
1481	return error;
1482}
1483
1484static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1485		struct nvme_ns_id_desc *cur, bool *csi_seen)
1486{
1487	const char *warn_str = "ctrl returned bogus length:";
1488	void *data = cur;
1489
1490	switch (cur->nidt) {
1491	case NVME_NIDT_EUI64:
1492		if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1493			dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1494				 warn_str, cur->nidl);
1495			return -1;
1496		}
1497		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1498			return NVME_NIDT_EUI64_LEN;
1499		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1500		return NVME_NIDT_EUI64_LEN;
1501	case NVME_NIDT_NGUID:
1502		if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1503			dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1504				 warn_str, cur->nidl);
1505			return -1;
1506		}
1507		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1508			return NVME_NIDT_NGUID_LEN;
1509		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1510		return NVME_NIDT_NGUID_LEN;
1511	case NVME_NIDT_UUID:
1512		if (cur->nidl != NVME_NIDT_UUID_LEN) {
1513			dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1514				 warn_str, cur->nidl);
1515			return -1;
1516		}
1517		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1518			return NVME_NIDT_UUID_LEN;
1519		uuid_copy(&ids->uuid, data + sizeof(*cur));
1520		return NVME_NIDT_UUID_LEN;
1521	case NVME_NIDT_CSI:
1522		if (cur->nidl != NVME_NIDT_CSI_LEN) {
1523			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1524				 warn_str, cur->nidl);
1525			return -1;
1526		}
1527		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1528		*csi_seen = true;
1529		return NVME_NIDT_CSI_LEN;
1530	default:
1531		/* Skip unknown types */
1532		return cur->nidl;
1533	}
1534}
1535
1536static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1537		struct nvme_ns_info *info)
1538{
1539	struct nvme_command c = { };
1540	bool csi_seen = false;
1541	int status, pos, len;
1542	void *data;
1543
1544	if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1545		return 0;
1546	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1547		return 0;
1548
1549	c.identify.opcode = nvme_admin_identify;
1550	c.identify.nsid = cpu_to_le32(info->nsid);
1551	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1552
1553	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1554	if (!data)
1555		return -ENOMEM;
1556
1557	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1558				      NVME_IDENTIFY_DATA_SIZE);
1559	if (status) {
1560		dev_warn(ctrl->device,
1561			"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1562			info->nsid, status);
1563		goto free_data;
1564	}
1565
1566	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1567		struct nvme_ns_id_desc *cur = data + pos;
1568
1569		if (cur->nidl == 0)
1570			break;
1571
1572		len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
1573		if (len < 0)
1574			break;
1575
1576		len += sizeof(*cur);
1577	}
1578
1579	if (nvme_multi_css(ctrl) && !csi_seen) {
1580		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1581			 info->nsid);
1582		status = -EINVAL;
1583	}
1584
1585free_data:
1586	kfree(data);
1587	return status;
1588}
1589
1590int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1591			struct nvme_id_ns **id)
1592{
1593	struct nvme_command c = { };
1594	int error;
1595
1596	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1597	c.identify.opcode = nvme_admin_identify;
1598	c.identify.nsid = cpu_to_le32(nsid);
1599	c.identify.cns = NVME_ID_CNS_NS;
1600
1601	*id = kmalloc(sizeof(**id), GFP_KERNEL);
1602	if (!*id)
1603		return -ENOMEM;
1604
1605	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1606	if (error) {
1607		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1608		kfree(*id);
1609		*id = NULL;
1610	}
1611	return error;
1612}
1613
1614static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1615		struct nvme_ns_info *info)
1616{
1617	struct nvme_ns_ids *ids = &info->ids;
1618	struct nvme_id_ns *id;
1619	int ret;
1620
1621	ret = nvme_identify_ns(ctrl, info->nsid, &id);
1622	if (ret)
1623		return ret;
1624
1625	if (id->ncap == 0) {
1626		/* namespace not allocated or attached */
1627		info->is_removed = true;
1628		ret = -ENODEV;
1629		goto error;
1630	}
1631
1632	info->anagrpid = id->anagrpid;
1633	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1634	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1635	info->is_ready = true;
1636	info->endgid = le16_to_cpu(id->endgid);
1637	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1638		dev_info(ctrl->device,
1639			 "Ignoring bogus Namespace Identifiers\n");
1640	} else {
1641		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1642		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1643			memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1644		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1645		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1646			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1647	}
1648
1649error:
1650	kfree(id);
1651	return ret;
1652}
1653
1654static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1655		struct nvme_ns_info *info)
1656{
1657	struct nvme_id_ns_cs_indep *id;
1658	struct nvme_command c = {
1659		.identify.opcode	= nvme_admin_identify,
1660		.identify.nsid		= cpu_to_le32(info->nsid),
1661		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
1662	};
1663	int ret;
1664
1665	id = kmalloc(sizeof(*id), GFP_KERNEL);
1666	if (!id)
1667		return -ENOMEM;
1668
1669	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1670	if (!ret) {
1671		info->anagrpid = id->anagrpid;
1672		info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1673		info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1674		info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1675		info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
1676		info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
1677		info->endgid = le16_to_cpu(id->endgid);
1678	}
1679	kfree(id);
1680	return ret;
1681}
1682
1683static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1684		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1685{
1686	union nvme_result res = { 0 };
1687	struct nvme_command c = { };
1688	int ret;
1689
1690	c.features.opcode = op;
1691	c.features.fid = cpu_to_le32(fid);
1692	c.features.dword11 = cpu_to_le32(dword11);
1693
1694	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1695			buffer, buflen, NVME_QID_ANY, 0);
1696	if (ret >= 0 && result)
1697		*result = le32_to_cpu(res.u32);
1698	return ret;
1699}
1700
1701int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1702		      unsigned int dword11, void *buffer, size_t buflen,
1703		      void *result)
1704{
1705	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1706			     buflen, result);
1707}
1708EXPORT_SYMBOL_GPL(nvme_set_features);
1709
1710int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1711		      unsigned int dword11, void *buffer, size_t buflen,
1712		      void *result)
1713{
1714	return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1715			     buflen, result);
1716}
1717EXPORT_SYMBOL_GPL(nvme_get_features);
1718
1719int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1720{
1721	u32 q_count = (*count - 1) | ((*count - 1) << 16);
1722	u32 result;
1723	int status, nr_io_queues;
1724
1725	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1726			&result);
1727
1728	/*
1729	 * It's either a kernel error or the host observed a connection
1730	 * lost. In either case it's not possible communicate with the
1731	 * controller and thus enter the error code path.
1732	 */
1733	if (status < 0 || status == NVME_SC_HOST_PATH_ERROR)
1734		return status;
1735
1736	/*
1737	 * Degraded controllers might return an error when setting the queue
1738	 * count.  We still want to be able to bring them online and offer
1739	 * access to the admin queue, as that might be only way to fix them up.
1740	 */
1741	if (status > 0) {
1742		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1743		*count = 0;
1744	} else {
1745		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1746		*count = min(*count, nr_io_queues);
1747	}
1748
1749	return 0;
1750}
1751EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1752
1753#define NVME_AEN_SUPPORTED \
1754	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1755	 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1756
1757static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1758{
1759	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1760	int status;
1761
1762	if (!supported_aens)
1763		return;
1764
1765	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1766			NULL, 0, &result);
1767	if (status)
1768		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1769			 supported_aens);
1770
1771	queue_work(nvme_wq, &ctrl->async_event_work);
1772}
1773
1774static int nvme_ns_open(struct nvme_ns *ns)
1775{
1776
1777	/* should never be called due to GENHD_FL_HIDDEN */
1778	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1779		goto fail;
1780	if (!nvme_get_ns(ns))
1781		goto fail;
1782	if (!try_module_get(ns->ctrl->ops->module))
1783		goto fail_put_ns;
1784
1785	return 0;
1786
1787fail_put_ns:
1788	nvme_put_ns(ns);
1789fail:
1790	return -ENXIO;
1791}
1792
1793static void nvme_ns_release(struct nvme_ns *ns)
1794{
1795
1796	module_put(ns->ctrl->ops->module);
1797	nvme_put_ns(ns);
1798}
1799
1800static int nvme_open(struct gendisk *disk, blk_mode_t mode)
1801{
1802	return nvme_ns_open(disk->private_data);
1803}
1804
1805static void nvme_release(struct gendisk *disk)
1806{
1807	nvme_ns_release(disk->private_data);
1808}
1809
1810int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo)
1811{
1812	/* some standard values */
1813	geo->heads = 1 << 6;
1814	geo->sectors = 1 << 5;
1815	geo->cylinders = get_capacity(disk) >> 11;
1816	return 0;
1817}
1818
1819static bool nvme_init_integrity(struct nvme_ns_head *head,
1820		struct queue_limits *lim, struct nvme_ns_info *info)
1821{
1822	struct blk_integrity *bi = &lim->integrity;
1823
1824	memset(bi, 0, sizeof(*bi));
1825
1826	if (!head->ms)
1827		return true;
1828
1829	/*
1830	 * PI can always be supported as we can ask the controller to simply
1831	 * insert/strip it, which is not possible for other kinds of metadata.
1832	 */
1833	if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
1834	    !(head->features & NVME_NS_METADATA_SUPPORTED))
1835		return nvme_ns_has_pi(head);
1836
1837	switch (head->pi_type) {
1838	case NVME_NS_DPS_PI_TYPE3:
1839		switch (head->guard_type) {
1840		case NVME_NVM_NS_16B_GUARD:
1841			bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1842			bi->tag_size = sizeof(u16) + sizeof(u32);
1843			bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1844			break;
1845		case NVME_NVM_NS_64B_GUARD:
1846			bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1847			bi->tag_size = sizeof(u16) + 6;
1848			bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1849			break;
1850		default:
1851			break;
1852		}
1853		break;
1854	case NVME_NS_DPS_PI_TYPE1:
1855	case NVME_NS_DPS_PI_TYPE2:
1856		switch (head->guard_type) {
1857		case NVME_NVM_NS_16B_GUARD:
1858			bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1859			bi->tag_size = sizeof(u16);
1860			bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1861				     BLK_INTEGRITY_REF_TAG;
1862			break;
1863		case NVME_NVM_NS_64B_GUARD:
1864			bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1865			bi->tag_size = sizeof(u16);
1866			bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1867				     BLK_INTEGRITY_REF_TAG;
1868			break;
1869		default:
1870			break;
1871		}
1872		break;
1873	default:
1874		break;
1875	}
1876
1877	bi->metadata_size = head->ms;
1878	if (bi->csum_type) {
1879		bi->pi_tuple_size = head->pi_size;
1880		bi->pi_offset = info->pi_offset;
1881	}
1882	return true;
1883}
1884
1885static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
1886{
1887	struct nvme_ctrl *ctrl = ns->ctrl;
1888
1889	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
1890		lim->max_hw_discard_sectors =
1891			nvme_lba_to_sect(ns->head, ctrl->dmrsl);
1892	else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1893		lim->max_hw_discard_sectors = UINT_MAX;
1894	else
1895		lim->max_hw_discard_sectors = 0;
1896
1897	lim->discard_granularity = lim->logical_block_size;
1898
1899	if (ctrl->dmrl)
1900		lim->max_discard_segments = ctrl->dmrl;
1901	else
1902		lim->max_discard_segments = NVME_DSM_MAX_RANGES;
1903}
1904
1905static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1906{
1907	return uuid_equal(&a->uuid, &b->uuid) &&
1908		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1909		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1910		a->csi == b->csi;
1911}
1912
1913static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
1914		struct nvme_id_ns_nvm **nvmp)
1915{
1916	struct nvme_command c = {
1917		.identify.opcode	= nvme_admin_identify,
1918		.identify.nsid		= cpu_to_le32(nsid),
1919		.identify.cns		= NVME_ID_CNS_CS_NS,
1920		.identify.csi		= NVME_CSI_NVM,
1921	};
1922	struct nvme_id_ns_nvm *nvm;
1923	int ret;
1924
1925	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1926	if (!nvm)
1927		return -ENOMEM;
1928
1929	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
1930	if (ret)
1931		kfree(nvm);
1932	else
1933		*nvmp = nvm;
1934	return ret;
1935}
1936
1937static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
1938		struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
1939{
1940	u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
1941	u8 guard_type;
1942
1943	/* no support for storage tag formats right now */
1944	if (nvme_elbaf_sts(elbaf))
1945		return;
1946
1947	guard_type = nvme_elbaf_guard_type(elbaf);
1948	if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
1949	     guard_type == NVME_NVM_NS_QTYPE_GUARD)
1950		guard_type = nvme_elbaf_qualified_guard_type(elbaf);
1951
1952	head->guard_type = guard_type;
1953	switch (head->guard_type) {
1954	case NVME_NVM_NS_64B_GUARD:
1955		head->pi_size = sizeof(struct crc64_pi_tuple);
1956		break;
1957	case NVME_NVM_NS_16B_GUARD:
1958		head->pi_size = sizeof(struct t10_pi_tuple);
1959		break;
1960	default:
1961		break;
1962	}
1963}
1964
1965static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
1966		struct nvme_ns_head *head, struct nvme_id_ns *id,
1967		struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
1968{
1969	head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1970	head->pi_type = 0;
1971	head->pi_size = 0;
1972	head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
1973	if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1974		return;
1975
1976	if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1977		nvme_configure_pi_elbas(head, id, nvm);
1978	} else {
1979		head->pi_size = sizeof(struct t10_pi_tuple);
1980		head->guard_type = NVME_NVM_NS_16B_GUARD;
1981	}
1982
1983	if (head->pi_size && head->ms >= head->pi_size)
1984		head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1985	if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
1986		if (disable_pi_offsets)
1987			head->pi_type = 0;
1988		else
1989			info->pi_offset = head->ms - head->pi_size;
1990	}
1991
1992	if (ctrl->ops->flags & NVME_F_FABRICS) {
1993		/*
1994		 * The NVMe over Fabrics specification only supports metadata as
1995		 * part of the extended data LBA.  We rely on HCA/HBA support to
1996		 * remap the separate metadata buffer from the block layer.
1997		 */
1998		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1999			return;
2000
2001		head->features |= NVME_NS_EXT_LBAS;
2002
2003		/*
2004		 * The current fabrics transport drivers support namespace
2005		 * metadata formats only if nvme_ns_has_pi() returns true.
2006		 * Suppress support for all other formats so the namespace will
2007		 * have a 0 capacity and not be usable through the block stack.
2008		 *
2009		 * Note, this check will need to be modified if any drivers
2010		 * gain the ability to use other metadata formats.
2011		 */
2012		if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
2013			head->features |= NVME_NS_METADATA_SUPPORTED;
2014	} else {
2015		/*
2016		 * For PCIe controllers, we can't easily remap the separate
2017		 * metadata buffer from the block layer and thus require a
2018		 * separate metadata buffer for block layer metadata/PI support.
2019		 * We allow extended LBAs for the passthrough interface, though.
2020		 */
2021		if (id->flbas & NVME_NS_FLBAS_META_EXT)
2022			head->features |= NVME_NS_EXT_LBAS;
2023		else
2024			head->features |= NVME_NS_METADATA_SUPPORTED;
2025	}
2026}
2027
2028
2029static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
2030		struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
2031{
2032	u32 atomic_bs, boundary = 0;
2033
2034	/*
2035	 * We do not support an offset for the atomic boundaries.
2036	 */
2037	if (id->nabo)
2038		return bs;
2039
2040	if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
2041		/*
2042		 * Use the per-namespace atomic write unit when available.
2043		 */
2044		atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2045		if (id->nabspf)
2046			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
2047	} else {
2048		/*
2049		 * Use the controller wide atomic write unit.  This sucks
2050		 * because the limit is defined in terms of logical blocks while
2051		 * namespaces can have different formats, and because there is
2052		 * no clear language in the specification prohibiting different
2053		 * values for different controllers in the subsystem.
2054		 */
2055		atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2056	}
2057
2058	lim->atomic_write_hw_max = atomic_bs;
2059	lim->atomic_write_hw_boundary = boundary;
2060	lim->atomic_write_hw_unit_min = bs;
2061	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
2062	lim->features |= BLK_FEAT_ATOMIC_WRITES;
2063	return atomic_bs;
2064}
2065
2066static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
2067{
2068	return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
2069}
2070
2071static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
2072		struct queue_limits *lim, bool is_admin)
2073{
2074	lim->max_hw_sectors = ctrl->max_hw_sectors;
2075	lim->max_segments = min_t(u32, USHRT_MAX,
2076		min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
2077	lim->max_integrity_segments = ctrl->max_integrity_segments;
2078	lim->virt_boundary_mask = ctrl->ops->get_virt_boundary(ctrl, is_admin);
2079	lim->max_segment_size = UINT_MAX;
2080	lim->dma_alignment = 3;
2081}
2082
2083static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
2084		struct queue_limits *lim)
2085{
2086	struct nvme_ns_head *head = ns->head;
2087	u32 bs = 1U << head->lba_shift;
2088	u32 atomic_bs, phys_bs, io_opt = 0;
2089	bool valid = true;
2090
2091	/*
2092	 * The block layer can't support LBA sizes larger than the page size
2093	 * or smaller than a sector size yet, so catch this early and don't
2094	 * allow block I/O.
2095	 */
2096	if (blk_validate_block_size(bs)) {
2097		bs = (1 << 9);
2098		valid = false;
2099	}
2100
2101	phys_bs = bs;
2102	atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
2103
2104	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2105		/* NPWG = Namespace Preferred Write Granularity */
2106		phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2107		/* NOWS = Namespace Optimal Write Size */
2108		if (id->nows)
2109			io_opt = bs * (1 + le16_to_cpu(id->nows));
2110	}
2111
2112	/*
2113	 * Linux filesystems assume writing a single physical block is
2114	 * an atomic operation. Hence limit the physical block size to the
2115	 * value of the Atomic Write Unit Power Fail parameter.
2116	 */
2117	lim->logical_block_size = bs;
2118	lim->physical_block_size = min(phys_bs, atomic_bs);
2119	lim->io_min = phys_bs;
2120	lim->io_opt = io_opt;
2121	if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
2122	    (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
2123		lim->max_write_zeroes_sectors = UINT_MAX;
2124	else
2125		lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
2126	return valid;
2127}
2128
2129static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
2130{
2131	return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
2132}
2133
2134static inline bool nvme_first_scan(struct gendisk *disk)
2135{
2136	/* nvme_alloc_ns() scans the disk prior to adding it */
2137	return !disk_live(disk);
2138}
2139
2140static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
2141		struct queue_limits *lim)
2142{
2143	struct nvme_ctrl *ctrl = ns->ctrl;
2144	u32 iob;
2145
2146	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2147	    is_power_of_2(ctrl->max_hw_sectors))
2148		iob = ctrl->max_hw_sectors;
2149	else
2150		iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
2151
2152	if (!iob)
2153		return;
2154
2155	if (!is_power_of_2(iob)) {
2156		if (nvme_first_scan(ns->disk))
2157			pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2158				ns->disk->disk_name, iob);
2159		return;
2160	}
2161
2162	if (blk_queue_is_zoned(ns->disk->queue)) {
2163		if (nvme_first_scan(ns->disk))
2164			pr_warn("%s: ignoring zoned namespace IO boundary\n",
2165				ns->disk->disk_name);
2166		return;
2167	}
2168
2169	lim->chunk_sectors = iob;
2170}
2171
2172static int nvme_update_ns_info_generic(struct nvme_ns *ns,
2173		struct nvme_ns_info *info)
2174{
2175	struct queue_limits lim;
2176	unsigned int memflags;
2177	int ret;
2178
2179	lim = queue_limits_start_update(ns->disk->queue);
2180	nvme_set_ctrl_limits(ns->ctrl, &lim, false);
2181
2182	memflags = blk_mq_freeze_queue(ns->disk->queue);
2183	ret = queue_limits_commit_update(ns->disk->queue, &lim);
2184	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2185	blk_mq_unfreeze_queue(ns->disk->queue, memflags);
2186
2187	/* Hide the block-interface for these devices */
2188	if (!ret)
2189		ret = -ENODEV;
2190	return ret;
2191}
2192
2193static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
2194				      struct nvme_ns_info *info, u8 fdp_idx)
2195{
2196	struct nvme_fdp_config_log hdr, *h;
2197	struct nvme_fdp_config_desc *desc;
2198	size_t size = sizeof(hdr);
2199	void *log, *end;
2200	int i, n, ret;
2201
2202	ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
2203			       NVME_CSI_NVM, &hdr, size, 0, info->endgid);
2204	if (ret) {
2205		dev_warn(ctrl->device,
2206			 "FDP configs log header status:0x%x endgid:%d\n", ret,
2207			 info->endgid);
2208		return ret;
2209	}
2210
2211	size = le32_to_cpu(hdr.sze);
2212	if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
2213		dev_warn(ctrl->device, "FDP config size too large:%zu\n",
2214			 size);
2215		return 0;
2216	}
2217
2218	h = kvmalloc(size, GFP_KERNEL);
2219	if (!h)
2220		return -ENOMEM;
2221
2222	ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
2223			       NVME_CSI_NVM, h, size, 0, info->endgid);
2224	if (ret) {
2225		dev_warn(ctrl->device,
2226			 "FDP configs log status:0x%x endgid:%d\n", ret,
2227			 info->endgid);
2228		goto out;
2229	}
2230
2231	n = le16_to_cpu(h->numfdpc) + 1;
2232	if (fdp_idx > n) {
2233		dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
2234			 fdp_idx, n);
2235		/* Proceed without registering FDP streams */
2236		ret = 0;
2237		goto out;
2238	}
2239
2240	log = h + 1;
2241	desc = log;
2242	end = log + size - sizeof(*h);
2243	for (i = 0; i < fdp_idx; i++) {
2244		log += le16_to_cpu(desc->dsze);
2245		desc = log;
2246		if (log >= end) {
2247			dev_warn(ctrl->device,
2248				 "FDP invalid config descriptor list\n");
2249			ret = 0;
2250			goto out;
2251		}
2252	}
2253
2254	if (le32_to_cpu(desc->nrg) > 1) {
2255		dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
2256		ret = 0;
2257		goto out;
2258	}
2259
2260	info->runs = le64_to_cpu(desc->runs);
2261out:
2262	kvfree(h);
2263	return ret;
2264}
2265
2266static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
2267{
2268	struct nvme_ns_head *head = ns->head;
2269	struct nvme_ctrl *ctrl = ns->ctrl;
2270	struct nvme_fdp_ruh_status *ruhs;
2271	struct nvme_fdp_config fdp;
2272	struct nvme_command c = {};
2273	size_t size;
2274	int i, ret;
2275
2276	/*
2277	 * The FDP configuration is static for the lifetime of the namespace,
2278	 * so return immediately if we've already registered this namespace's
2279	 * streams.
2280	 */
2281	if (head->nr_plids)
2282		return 0;
2283
2284	ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
2285				&fdp);
2286	if (ret) {
2287		dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
2288		return ret;
2289	}
2290
2291	if (!(fdp.flags & FDPCFG_FDPE))
2292		return 0;
2293
2294	ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
2295	if (!info->runs)
2296		return ret;
2297
2298	size = struct_size(ruhs, ruhsd, S8_MAX - 1);
2299	ruhs = kzalloc(size, GFP_KERNEL);
2300	if (!ruhs)
2301		return -ENOMEM;
2302
2303	c.imr.opcode = nvme_cmd_io_mgmt_recv;
2304	c.imr.nsid = cpu_to_le32(head->ns_id);
2305	c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
2306	c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
2307	ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
2308	if (ret) {
2309		dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
2310		goto free;
2311	}
2312
2313	head->nr_plids = le16_to_cpu(ruhs->nruhsd);
2314	if (!head->nr_plids)
2315		goto free;
2316
2317	head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
2318			      GFP_KERNEL);
2319	if (!head->plids) {
2320		dev_warn(ctrl->device,
2321			 "failed to allocate %u FDP placement IDs\n",
2322			 head->nr_plids);
2323		head->nr_plids = 0;
2324		ret = -ENOMEM;
2325		goto free;
2326	}
2327
2328	for (i = 0; i < head->nr_plids; i++)
2329		head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
2330free:
2331	kfree(ruhs);
2332	return ret;
2333}
2334
/*
 * Refresh the block device view of a namespace from a fresh Identify
 * Namespace result: LBA format, capacity, queue limits, read-only state and,
 * for zoned namespaces, the zone information.
 *
 * Returns 0 on success.  A failing nvme_identify_ns() or limits commit is
 * returned as is, negative results of the NVM-specific identify, zone and
 * FDP queries are returned, and -ENXIO is returned (with info->is_removed
 * set) when the namespace reports a zero NCAP.  A zone revalidation failure
 * is only returned when the disk is already live.
 */
static int nvme_update_ns_info_block(struct nvme_ns *ns,
		struct nvme_ns_info *info)
{
	struct queue_limits lim;
	struct nvme_id_ns_nvm *nvm = NULL;
	struct nvme_zone_info zi = {};
	struct nvme_id_ns *id;
	unsigned int memflags;
	sector_t capacity;
	unsigned lbaf;
	int ret;

	ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
	if (ret)
		return ret;

	if (id->ncap == 0) {
		/* namespace not allocated or attached */
		info->is_removed = true;
		ret = -ENXIO;
		goto out;
	}
	lbaf = nvme_lbaf_index(id->flbas);

	/*
	 * The queries below only abort on negative return values; a positive
	 * value does not stop the update and is overwritten later.
	 */
	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
		ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
		if (ret < 0)
			goto out;
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    ns->head->ids.csi == NVME_CSI_ZNS) {
		ret = nvme_query_zone_info(ns, lbaf, &zi);
		if (ret < 0)
			goto out;
	}

	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
		ret = nvme_query_fdp_info(ns, info);
		if (ret < 0)
			goto out;
	}

	/* Start the limits update, then freeze the queue while it is applied. */
	lim = queue_limits_start_update(ns->disk->queue);

	memflags = blk_mq_freeze_queue(ns->disk->queue);
	ns->head->lba_shift = id->lbaf[lbaf].ds;
	ns->head->nuse = le64_to_cpu(id->nuse);
	capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
	nvme_set_ctrl_limits(ns->ctrl, &lim, false);
	nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
	nvme_set_chunk_sectors(ns, id, &lim);
	/* An LBA size the block layer rejects leaves a zero capacity disk. */
	if (!nvme_update_disk_info(ns, id, &lim))
		capacity = 0;

	nvme_config_discard(ns, &lim);
	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
	    ns->head->ids.csi == NVME_CSI_ZNS)
		nvme_update_zone_info(ns, &lim, &zi);

	if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
		lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
	else
		lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);

	if (info->is_rotational)
		lim.features |= BLK_FEAT_ROTATIONAL;

	/*
	 * Register a metadata profile for PI, or the plain non-integrity NVMe
	 * metadata masquerading as Type 0 if supported, otherwise reject block
	 * I/O to namespaces with metadata except when the namespace supports
	 * PI, as it can strip/insert in that case.
	 */
	if (!nvme_init_integrity(ns->head, &lim, info))
		capacity = 0;

	/* Expose the FDP placement IDs as write streams, if there are any. */
	lim.max_write_streams = ns->head->nr_plids;
	if (lim.max_write_streams)
		lim.write_stream_granularity = min(info->runs, U32_MAX);
	else
		lim.write_stream_granularity = 0;

	/*
	 * Only set the DEAC bit if the device guarantees that reads from
	 * deallocated data return zeroes.  While the DEAC bit does not
	 * require that, it must be a no-op if reads from deallocated data
	 * do not return zeroes.
	 */
	if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
		ns->head->features |= NVME_NS_DEAC;
		lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
	}

	ret = queue_limits_commit_update(ns->disk->queue, &lim);
	if (ret) {
		/* The queue is still frozen here; unfreeze before bailing. */
		blk_mq_unfreeze_queue(ns->disk->queue, memflags);
		goto out;
	}

	set_capacity_and_notify(ns->disk, capacity);
	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
	set_bit(NVME_NS_READY, &ns->flags);
	blk_mq_unfreeze_queue(ns->disk->queue, memflags);

	if (blk_queue_is_zoned(ns->queue)) {
		ret = blk_revalidate_disk_zones(ns->disk);
		/* On the first scan a revalidation failure is not fatal. */
		if (ret && !nvme_first_scan(ns->disk))
			goto out;
	}

	ret = 0;
out:
	kfree(nvm);
	kfree(id);
	return ret;
}
2452
/*
 * Update a namespace according to its command set identifier.
 *
 * NVM namespaces, and ZNS namespaces when CONFIG_BLK_DEV_ZONED is enabled,
 * get a full block device update; everything else only gets the generic
 * update.  An -ENODEV result from either path hides the disk
 * (GENHD_FL_HIDDEN), marks the namespace ready and is reported as success.
 * For multipath heads the limits, capacity and read-only state of the head
 * disk are then updated from this path.
 *
 * Returns 0 on success or the error from the update / limits commit.
 */
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	bool unsupported = false;
	int ret;

	switch (info->ids.csi) {
	case NVME_CSI_ZNS:
		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
			dev_info(ns->ctrl->device,
	"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
				info->nsid);
			ret = nvme_update_ns_info_generic(ns, info);
			break;
		}
		ret = nvme_update_ns_info_block(ns, info);
		break;
	case NVME_CSI_NVM:
		ret = nvme_update_ns_info_block(ns, info);
		break;
	default:
		dev_info(ns->ctrl->device,
			"block device for nsid %u not supported (csi %u)\n",
			info->nsid, info->ids.csi);
		ret = nvme_update_ns_info_generic(ns, info);
		break;
	}

	/*
	 * If probing fails due an unsupported feature, hide the block device,
	 * but still allow other access.
	 */
	if (ret == -ENODEV) {
		ns->disk->flags |= GENHD_FL_HIDDEN;
		set_bit(NVME_NS_READY, &ns->flags);
		unsupported = true;
		ret = 0;
	}

	if (!ret && nvme_ns_head_multipath(ns->head)) {
		/* Limits of this path, as left by the update above. */
		struct queue_limits *ns_lim = &ns->disk->queue->limits;
		struct queue_limits lim;
		unsigned int memflags;

		lim = queue_limits_start_update(ns->head->disk->queue);
		memflags = blk_mq_freeze_queue(ns->head->disk->queue);
		/*
		 * queue_limits mixes values that are the hardware limitations
		 * for bio splitting with what is the device configuration.
		 *
		 * For NVMe the device configuration can change after e.g. a
		 * Format command, and we really want to pick up the new format
		 * value here.  But we must still stack the queue limits to the
		 * least common denominator for multipathing to split the bios
		 * properly.
		 *
		 * To work around this, we explicitly set the device
		 * configuration to those that we just queried, but only stack
		 * the splitting limits in to make sure we still obey possibly
		 * lower limitations of other controllers.
		 */
		lim.logical_block_size = ns_lim->logical_block_size;
		lim.physical_block_size = ns_lim->physical_block_size;
		lim.io_min = ns_lim->io_min;
		lim.io_opt = ns_lim->io_opt;
		queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
					ns->head->disk->disk_name);
		if (unsupported)
			ns->head->disk->flags |= GENHD_FL_HIDDEN;
		else
			nvme_init_integrity(ns->head, &lim, info);
		/* Write stream limits are copied from this path's limits. */
		lim.max_write_streams = ns_lim->max_write_streams;
		lim.write_stream_granularity = ns_lim->write_stream_granularity;
		ret = queue_limits_commit_update(ns->head->disk->queue, &lim);

		/* The steps below run even if the commit above failed. */
		set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
		nvme_mpath_revalidate_paths(ns);

		blk_mq_unfreeze_queue(ns->head->disk->queue, memflags);
	}

	return ret;
}
2536
2537int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
2538		enum blk_unique_id type)
2539{
2540	struct nvme_ns_ids *ids = &ns->head->ids;
2541
2542	if (type != BLK_UID_EUI64)
2543		return -EINVAL;
2544
2545	if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
2546		memcpy(id, &ids->nguid, sizeof(ids->nguid));
2547		return sizeof(ids->nguid);
2548	}
2549	if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
2550		memcpy(id, &ids->eui64, sizeof(ids->eui64));
2551		return sizeof(ids->eui64);
2552	}
2553
2554	return -EINVAL;
2555}
2556
2557static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
2558		enum blk_unique_id type)
2559{
2560	return nvme_ns_get_unique_id(disk->private_data, id, type);
2561}
2562
2563#ifdef CONFIG_BLK_SED_OPAL
2564static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2565		bool send)
2566{
2567	struct nvme_ctrl *ctrl = data;
2568	struct nvme_command cmd = { };
2569
2570	if (send)
2571		cmd.common.opcode = nvme_admin_security_send;
2572	else
2573		cmd.common.opcode = nvme_admin_security_recv;
2574	cmd.common.nsid = 0;
2575	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2576	cmd.common.cdw11 = cpu_to_le32(len);
2577
2578	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2579			NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
2580}
2581
2582static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2583{
2584	if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2585		if (!ctrl->opal_dev)
2586			ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
2587		else if (was_suspended)
2588			opal_unlock_from_suspend(ctrl->opal_dev);
2589	} else {
2590		free_opal_dev(ctrl->opal_dev);
2591		ctrl->opal_dev = NULL;
2592	}
2593}
2594#else
/* Stub used when CONFIG_BLK_SED_OPAL is not set: there is nothing to do. */
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
}
2598#endif /* CONFIG_BLK_SED_OPAL */
2599
2600#ifdef CONFIG_BLK_DEV_ZONED
2601static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2602		unsigned int nr_zones, struct blk_report_zones_args *args)
2603{
2604	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args);
2605}
2606#else
2607#define nvme_report_zones	NULL
2608#endif /* CONFIG_BLK_DEV_ZONED */
2609
2610const struct block_device_operations nvme_bdev_ops = {
2611	.owner		= THIS_MODULE,
2612	.ioctl		= nvme_ioctl,
2613	.compat_ioctl	= blkdev_compat_ptr_ioctl,
2614	.open		= nvme_open,
2615	.release	= nvme_release,
2616	.getgeo		= nvme_getgeo,
2617	.get_unique_id	= nvme_get_unique_id,
2618	.report_zones	= nvme_report_zones,
2619	.pr_ops		= &nvme_pr_ops,
2620};
2621
2622static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2623		u32 timeout, const char *op)
2624{
2625	unsigned long timeout_jiffies = jiffies + timeout * HZ;
2626	u32 csts;
2627	int ret;
2628
2629	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2630		if (csts == ~0)
2631			return -ENODEV;
2632		if ((csts & mask) == val)
2633			break;
2634
2635		usleep_range(1000, 2000);
2636		if (fatal_signal_pending(current))
2637			return -EINTR;
2638		if (time_after(jiffies, timeout_jiffies)) {
2639			dev_err(ctrl->device,
2640				"Device not ready; aborting %s, CSTS=0x%x\n",
2641				op, csts);
2642			return -ENODEV;
2643		}
2644	}
2645
2646	return ret;
2647}
2648
2649int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2650{
2651	int ret;
2652
2653	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2654	if (shutdown)
2655		ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2656	else
2657		ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2658
2659	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2660	if (ret)
2661		return ret;
2662
2663	if (shutdown) {
2664		return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
2665				       NVME_CSTS_SHST_CMPLT,
2666				       ctrl->shutdown_timeout, "shutdown");
2667	}
2668	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2669		msleep(NVME_QUIRK_DELAY_AMOUNT);
2670	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
2671			       (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
2672}
2673EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2674
2675int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2676{
2677	unsigned dev_page_min;
2678	u32 timeout;
2679	int ret;
2680
2681	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2682	if (ret) {
2683		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2684		return ret;
2685	}
2686	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2687
2688	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2689		dev_err(ctrl->device,
2690			"Minimum device page size %u too large for host (%u)\n",
2691			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2692		return -ENODEV;
2693	}
2694
2695	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2696		ctrl->ctrl_config = NVME_CC_CSS_CSI;
2697	else
2698		ctrl->ctrl_config = NVME_CC_CSS_NVM;
2699
2700	/*
2701	 * Setting CRIME results in CSTS.RDY before the media is ready. This
2702	 * makes it possible for media related commands to return the error
2703	 * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is
2704	 * restructured to handle retries, disable CC.CRIME.
2705	 */
2706	ctrl->ctrl_config &= ~NVME_CC_CRIME;
2707
2708	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2709	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2710	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2711	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2712	if (ret)
2713		return ret;
2714
2715	/* CAP value may change after initial CC write */
2716	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2717	if (ret)
2718		return ret;
2719
2720	timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2721	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2722		u32 crto, ready_timeout;
2723
2724		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2725		if (ret) {
2726			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2727				ret);
2728			return ret;
2729		}
2730
2731		/*
2732		 * CRTO should always be greater or equal to CAP.TO, but some
2733		 * devices are known to get this wrong. Use the larger of the
2734		 * two values.
2735		 */
2736		ready_timeout = NVME_CRTO_CRWMT(crto);
2737
2738		if (ready_timeout < timeout)
2739			dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
2740				      crto, ctrl->cap);
2741		else
2742			timeout = ready_timeout;
2743	}
2744
2745	ctrl->ctrl_config |= NVME_CC_ENABLE;
2746	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2747	if (ret)
2748		return ret;
2749	return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
2750			       (timeout + 1) / 2, "initialisation");
2751}
2752EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2753
2754static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2755{
2756	__le64 ts;
2757	int ret;
2758
2759	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2760		return 0;
2761
2762	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2763	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2764			NULL);
2765	if (ret)
2766		dev_warn_once(ctrl->device,
2767			"could not set timestamp (%d)\n", ret);
2768	return ret;
2769}
2770
2771static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2772{
2773	struct nvme_feat_host_behavior *host;
2774	u8 acre = 0, lbafee = 0;
2775	int ret;
2776
2777	/* Don't bother enabling the feature if retry delay is not reported */
2778	if (ctrl->crdt[0])
2779		acre = NVME_ENABLE_ACRE;
2780	if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2781		lbafee = NVME_ENABLE_LBAFEE;
2782
2783	if (!acre && !lbafee)
2784		return 0;
2785
2786	host = kzalloc(sizeof(*host), GFP_KERNEL);
2787	if (!host)
2788		return 0;
2789
2790	host->acre = acre;
2791	host->lbafee = lbafee;
2792	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2793				host, sizeof(*host), NULL);
2794	kfree(host);
2795	return ret;
2796}
2797
2798/*
2799 * The function checks whether the given total (exlat + enlat) latency of
2800 * a power state allows the latter to be used as an APST transition target.
2801 * It does so by comparing the latency to the primary and secondary latency
2802 * tolerances defined by module params. If there's a match, the corresponding
2803 * timeout value is returned and the matching tolerance index (1 or 2) is
2804 * reported.
2805 */
2806static bool nvme_apst_get_transition_time(u64 total_latency,
2807		u64 *transition_time, unsigned *last_index)
2808{
2809	if (total_latency <= apst_primary_latency_tol_us) {
2810		if (*last_index == 1)
2811			return false;
2812		*last_index = 1;
2813		*transition_time = apst_primary_timeout_ms;
2814		return true;
2815	}
2816	if (apst_secondary_timeout_ms &&
2817		total_latency <= apst_secondary_latency_tol_us) {
2818		if (*last_index <= 2)
2819			return false;
2820		*last_index = 2;
2821		*transition_time = apst_secondary_timeout_ms;
2822		return true;
2823	}
2824	return false;
2825}
2826
/*
 * APST (Autonomous Power State Transition) lets us program a table of power
 * state transitions that the controller will perform automatically.
 *
 * Depending on module params, one of the two supported techniques will be used:
 *
 * - If the parameters provide explicit timeouts and tolerances, they will be
 *   used to build a table with up to 2 non-operational states to transition to.
 *   The default parameter values were selected based on the values used by
 *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
 *   regeneration of the APST table in the event of switching between external
 *   and battery power, the timeouts and tolerances reflect a compromise
 *   between values used by Microsoft for AC and battery scenarios.
 * - If not, we'll configure the table with a simple heuristic: we are willing
 *   to spend at most 2% of the time transitioning between power states.
 *   Therefore, when running in any given state, we will enter the next
 *   lower-power non-operational state after waiting 50 * (enlat + exlat)
 *   microseconds, as long as that state's exit latency is under the requested
 *   maximum latency.
 *
 * We will not autonomously enter any non-operational state for which the total
 * latency exceeds ps_max_latency_us.
 *
 * Users can set ps_max_latency_us to zero to turn off APST.
 *
 * Returns 0 without touching the controller when APST is not supported, NPSS
 * is invalid or the table cannot be allocated; otherwise the result of the
 * Set Features command.
 */
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
	struct nvme_feat_auto_pst *table;
	unsigned apste = 0;
	u64 max_lat_us = 0;
	__le64 target = 0;
	int max_ps = -1;
	int state;
	int ret;
	unsigned last_lt_index = UINT_MAX;

	/*
	 * If APST isn't supported or if we haven't been initialized yet,
	 * then don't do anything.
	 */
	if (!ctrl->apsta)
		return 0;

	if (ctrl->npss > 31) {
		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
		return 0;
	}

	table = kzalloc(sizeof(*table), GFP_KERNEL);
	if (!table)
		return 0;

	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
		/* Turn off APST. */
		/* apste is still 0 and the table all zeroes at this point. */
		dev_dbg(ctrl->device, "APST disabled\n");
		goto done;
	}

	/*
	 * Walk through all states from lowest- to highest-power.
	 * According to the spec, lower-numbered states use more power.  NPSS,
	 * despite the name, is the index of the lowest-power state, not the
	 * number of states.
	 */
	for (state = (int)ctrl->npss; state >= 0; state--) {
		u64 total_latency_us, exit_latency_us, transition_ms;

		/*
		 * Point this state at the most recently accepted lower-power
		 * target, if any has been found so far.
		 */
		if (target)
			table->entries[state] = target;

		/*
		 * Don't allow transitions to the deepest state if it's quirked
		 * off.
		 */
		if (state == ctrl->npss &&
		    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
			continue;

		/*
		 * Is this state a useful non-operational state for higher-power
		 * states to autonomously transition to?
		 */
		if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
			continue;

		exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
		if (exit_latency_us > ctrl->ps_max_latency_us)
			continue;

		total_latency_us = exit_latency_us +
			le32_to_cpu(ctrl->psd[state].entry_lat);

		/*
		 * This state is good. It can be used as the APST idle target
		 * for higher power states.
		 */
		if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
			if (!nvme_apst_get_transition_time(total_latency_us,
					&transition_ms, &last_lt_index))
				continue;
		} else {
			/*
			 * Heuristic: total latency divided by 20, rounded up,
			 * and capped at 2^24 - 1.
			 */
			transition_ms = total_latency_us + 19;
			do_div(transition_ms, 20);
			if (transition_ms > (1 << 24) - 1)
				transition_ms = (1 << 24) - 1;
		}

		/* Table entry: state index at bit 3, idle time from bit 8 up. */
		target = cpu_to_le64((state << 3) | (transition_ms << 8));
		/* The first accepted state is the deepest one in use. */
		if (max_ps == -1)
			max_ps = state;
		if (total_latency_us > max_lat_us)
			max_lat_us = total_latency_us;
	}

	if (max_ps == -1)
		dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
	else
		dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
			max_ps, max_lat_us, (int)sizeof(*table), table);
	apste = 1;

done:
	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
				table, sizeof(*table), NULL);
	if (ret)
		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
	kfree(table);
	return ret;
}
2956
2957static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2958{
2959	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2960	u64 latency;
2961
2962	switch (val) {
2963	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2964	case PM_QOS_LATENCY_ANY:
2965		latency = U64_MAX;
2966		break;
2967
2968	default:
2969		latency = val;
2970	}
2971
2972	if (ctrl->ps_max_latency_us != latency) {
2973		ctrl->ps_max_latency_us = latency;
2974		if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
2975			nvme_configure_apst(ctrl);
2976	}
2977}
2978
/*
 * One entry of the core quirk table: quirk flags to apply to controllers
 * whose identify data matches vid and, where given, mn and fr.
 */
struct nvme_core_quirk_entry {
	/*
	 * NVMe model and firmware strings are padded with spaces.  For
	 * simplicity, strings in the quirk table are padded with NULLs
	 * instead.
	 */
	u16 vid;		/* compared against the identify vid field */
	const char *mn;		/* model number; NULL matches any */
	const char *fr;		/* firmware revision; NULL matches any */
	unsigned long quirks;	/* NVME_QUIRK_* flags for matching devices */
};
2990
/*
 * Quirks keyed on controller identify data rather than on a bus ID.  Fields
 * left out of an entry (mn or fr) are NULL and therefore match any value.
 */
static const struct nvme_core_quirk_entry core_quirks[] = {
	{
		/*
		 * This Toshiba device seems to die using any APST states.  See:
		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
		 */
		.vid = 0x1179,
		.mn = "THNSF5256GPUK TOSHIBA",
		.quirks = NVME_QUIRK_NO_APST,
	},
	{
		/*
		 * This LiteON CL1-3D*-Q11 firmware version has a race
		 * condition associated with actions related to suspend to
		 * idle.  LiteON has resolved the problem in future firmware.
		 */
		.vid = 0x14a4,
		.fr = "22301111",
		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
	},
	{
		/*
		 * This Kioxia CD6-V Series / HPE PE8030 device times out and
		 * aborts I/O during any load, but more easily reproducible
		 * with discards (fstrim).
		 *
		 * The device is left in a state where it is also not possible
		 * to use "nvme set-feature" to disable APST, but booting with
		 * nvme_core.default_ps_max_latency=0 works.
		 */
		.vid = 0x1e0f,
		.mn = "KCD6XVUL6T40",
		.quirks = NVME_QUIRK_NO_APST,
	},
	{
		/*
		 * The external Samsung X5 SSD fails initialization without a
		 * delay before checking if it is ready and has a whole set of
		 * other problems.  To make this even more interesting, it
		 * shares the PCI ID with internal Samsung 970 Evo Plus that
		 * does not need or want these quirks.
		 */
		.vid = 0x144d,
		.mn = "Samsung Portable SSD X5",
		.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
			  NVME_QUIRK_NO_DEEPEST_PS |
			  NVME_QUIRK_IGNORE_DEV_SUBNQN,
	}
};
3040
3041/* match is null-terminated but idstr is space-padded. */
3042static bool string_matches(const char *idstr, const char *match, size_t len)
3043{
3044	size_t matchlen;
3045
3046	if (!match)
3047		return true;
3048
3049	matchlen = strlen(match);
3050	WARN_ON_ONCE(matchlen > len);
3051
3052	if (memcmp(idstr, match, matchlen))
3053		return false;
3054
3055	for (; matchlen < len; matchlen++)
3056		if (idstr[matchlen] != ' ')
3057			return false;
3058
3059	return true;
3060}
3061
3062static bool quirk_matches(const struct nvme_id_ctrl *id,
3063			  const struct nvme_core_quirk_entry *q)
3064{
3065	return q->vid == le16_to_cpu(id->vid) &&
3066		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
3067		string_matches(id->fr, q->fr, sizeof(id->fr));
3068}
3069
3070static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
3071		struct nvme_id_ctrl *id)
3072{
3073	size_t nqnlen;
3074	int off;
3075
3076	if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
3077		nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
3078		if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
3079			strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
3080			return;
3081		}
3082
3083		if (ctrl->vs >= NVME_VS(1, 2, 1))
3084			dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
3085	}
3086
3087	/*
3088	 * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
3089	 * Base Specification 2.0.  It is slightly different from the format
3090	 * specified there due to historic reasons, and we can't change it now.
3091	 */
3092	off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
3093			"nqn.2014.08.org.nvmexpress:%04x%04x",
3094			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
3095	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
3096	off += sizeof(id->sn);
3097	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
3098	off += sizeof(id->mn);
3099	memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
3100}
3101
3102static void nvme_release_subsystem(struct device *dev)
3103{
3104	struct nvme_subsystem *subsys =
3105		container_of(dev, struct nvme_subsystem, dev);
3106
3107	if (subsys->instance >= 0)
3108		ida_free(&nvme_instance_ida, subsys->instance);
3109	kfree(subsys);
3110}
3111
3112static void nvme_destroy_subsystem(struct kref *ref)
3113{
3114	struct nvme_subsystem *subsys =
3115			container_of(ref, struct nvme_subsystem, ref);
3116
3117	mutex_lock(&nvme_subsystems_lock);
3118	list_del(&subsys->entry);
3119	mutex_unlock(&nvme_subsystems_lock);
3120
3121	ida_destroy(&subsys->ns_ida);
3122	device_del(&subsys->dev);
3123	put_device(&subsys->dev);
3124}
3125
3126static void nvme_put_subsystem(struct nvme_subsystem *subsys)
3127{
3128	kref_put(&subsys->ref, nvme_destroy_subsystem);
3129}
3130
3131static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
3132{
3133	struct nvme_subsystem *subsys;
3134
3135	lockdep_assert_held(&nvme_subsystems_lock);
3136
3137	/*
3138	 * Fail matches for discovery subsystems. This results
3139	 * in each discovery controller bound to a unique subsystem.
3140	 * This avoids issues with validating controller values
3141	 * that can only be true when there is a single unique subsystem.
3142	 * There may be multiple and completely independent entities
3143	 * that provide discovery controllers.
3144	 */
3145	if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
3146		return NULL;
3147
3148	list_for_each_entry(subsys, &nvme_subsystems, entry) {
3149		if (strcmp(subsys->subnqn, subsysnqn))
3150			continue;
3151		if (!kref_get_unless_zero(&subsys->ref))
3152			continue;
3153		return subsys;
3154	}
3155
3156	return NULL;
3157}
3158
3159static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
3160{
3161	return ctrl->opts && ctrl->opts->discovery_nqn;
3162}
3163
3164static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
3165{
3166	return ctrl->cntrltype == NVME_CTRL_ADMIN;
3167}
3168
3169static inline bool nvme_is_io_ctrl(struct nvme_ctrl *ctrl)
3170{
3171	return !nvme_discovery_ctrl(ctrl) && !nvme_admin_ctrl(ctrl);
3172}
3173
3174static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
3175		struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3176{
3177	struct nvme_ctrl *tmp;
3178
3179	lockdep_assert_held(&nvme_subsystems_lock);
3180
3181	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
3182		if (nvme_state_terminal(tmp))
3183			continue;
3184
3185		if (tmp->cntlid == ctrl->cntlid) {
3186			dev_err(ctrl->device,
3187				"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
3188				ctrl->cntlid, dev_name(tmp->device),
3189				subsys->subnqn);
3190			return false;
3191		}
3192
3193		if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
3194		    nvme_discovery_ctrl(ctrl))
3195			continue;
3196
3197		dev_err(ctrl->device,
3198			"Subsystem does not support multiple controllers\n");
3199		return false;
3200	}
3201
3202	return true;
3203}
3204
3205static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3206{
3207	struct nvme_subsystem *subsys, *found;
3208	int ret;
3209
3210	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
3211	if (!subsys)
3212		return -ENOMEM;
3213
3214	subsys->instance = -1;
3215	mutex_init(&subsys->lock);
3216	kref_init(&subsys->ref);
3217	INIT_LIST_HEAD(&subsys->ctrls);
3218	INIT_LIST_HEAD(&subsys->nsheads);
3219	nvme_init_subnqn(subsys, ctrl, id);
3220	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
3221	memcpy(subsys->model, id->mn, sizeof(subsys->model));
3222	subsys->vendor_id = le16_to_cpu(id->vid);
3223	subsys->cmic = id->cmic;
3224	subsys->awupf = le16_to_cpu(id->awupf);
3225
3226	/* Versions prior to 1.4 don't necessarily report a valid type */
3227	if (id->cntrltype == NVME_CTRL_DISC ||
3228	    !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
3229		subsys->subtype = NVME_NQN_DISC;
3230	else
3231		subsys->subtype = NVME_NQN_NVME;
3232
3233	if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
3234		dev_err(ctrl->device,
3235			"Subsystem %s is not a discovery controller",
3236			subsys->subnqn);
3237		kfree(subsys);
3238		return -EINVAL;
3239	}
3240	nvme_mpath_default_iopolicy(subsys);
3241
3242	subsys->dev.class = &nvme_subsys_class;
3243	subsys->dev.release = nvme_release_subsystem;
3244	subsys->dev.groups = nvme_subsys_attrs_groups;
3245	dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
3246	device_initialize(&subsys->dev);
3247
3248	mutex_lock(&nvme_subsystems_lock);
3249	found = __nvme_find_get_subsystem(subsys->subnqn);
3250	if (found) {
3251		put_device(&subsys->dev);
3252		subsys = found;
3253
3254		if (!nvme_validate_cntlid(subsys, ctrl, id)) {
3255			ret = -EINVAL;
3256			goto out_put_subsystem;
3257		}
3258	} else {
3259		ret = device_add(&subsys->dev);
3260		if (ret) {
3261			dev_err(ctrl->device,
3262				"failed to register subsystem device.\n");
3263			put_device(&subsys->dev);
3264			goto out_unlock;
3265		}
3266		ida_init(&subsys->ns_ida);
3267		list_add_tail(&subsys->entry, &nvme_subsystems);
3268	}
3269
3270	ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
3271				dev_name(ctrl->device));
3272	if (ret) {
3273		dev_err(ctrl->device,
3274			"failed to create sysfs link from subsystem.\n");
3275		goto out_put_subsystem;
3276	}
3277
3278	if (!found)
3279		subsys->instance = ctrl->instance;
3280	ctrl->subsys = subsys;
3281	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
3282	mutex_unlock(&nvme_subsystems_lock);
3283	return 0;
3284
3285out_put_subsystem:
3286	nvme_put_subsystem(subsys);
3287out_unlock:
3288	mutex_unlock(&nvme_subsystems_lock);
3289	return ret;
3290}
3291
3292static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
3293		u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
3294{
3295	struct nvme_command c = { };
3296	u32 dwlen = nvme_bytes_to_numd(size);
3297
3298	c.get_log_page.opcode = nvme_admin_get_log_page;
3299	c.get_log_page.nsid = cpu_to_le32(nsid);
3300	c.get_log_page.lid = log_page;
3301	c.get_log_page.lsp = lsp;
3302	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3303	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3304	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3305	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3306	c.get_log_page.csi = csi;
3307	c.get_log_page.lsi = cpu_to_le16(lsi);
3308
3309	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3310}
3311
3312int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3313		void *log, size_t size, u64 offset)
3314{
3315	return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
3316			offset, 0);
3317}
3318
3319static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3320				struct nvme_effects_log **log)
3321{
3322	struct nvme_effects_log *old, *cel = xa_load(&ctrl->cels, csi);
3323	int ret;
3324
3325	if (cel)
3326		goto out;
3327
3328	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3329	if (!cel)
3330		return -ENOMEM;
3331
3332	ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3333			cel, sizeof(*cel), 0);
3334	if (ret) {
3335		kfree(cel);
3336		return ret;
3337	}
3338
3339	old = xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3340	if (xa_is_err(old)) {
3341		kfree(cel);
3342		return xa_err(old);
3343	}
3344out:
3345	*log = cel;
3346	return 0;
3347}
3348
3349static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3350{
3351	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
3352
3353	if (check_shl_overflow(1U, units + page_shift - 9, &val))
3354		return UINT_MAX;
3355	return val;
3356}
3357
3358static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3359{
3360	struct nvme_command c = { };
3361	struct nvme_id_ctrl_nvm *id;
3362	int ret;
3363
3364	/*
3365	 * Even though NVMe spec explicitly states that MDTS is not applicable
3366	 * to the write-zeroes, we are cautious and limit the size to the
3367	 * controllers max_hw_sectors value, which is based on the MDTS field
3368	 * and possibly other limiting factors.
3369	 */
3370	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3371	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3372		ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3373	else
3374		ctrl->max_zeroes_sectors = 0;
3375
3376	if (!nvme_is_io_ctrl(ctrl) ||
3377	    !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
3378	    test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
3379		return 0;
3380
3381	id = kzalloc(sizeof(*id), GFP_KERNEL);
3382	if (!id)
3383		return -ENOMEM;
3384
3385	c.identify.opcode = nvme_admin_identify;
3386	c.identify.cns = NVME_ID_CNS_CS_CTRL;
3387	c.identify.csi = NVME_CSI_NVM;
3388
3389	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3390	if (ret)
3391		goto free_data;
3392
3393	ctrl->dmrl = id->dmrl;
3394	ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3395	if (id->wzsl)
3396		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
3397
3398free_data:
3399	if (ret > 0)
3400		set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags);
3401	kfree(id);
3402	return ret;
3403}
3404
3405static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
3406		u8 csi, struct nvme_effects_log **log)
3407{
3408	struct nvme_effects_log *effects, *old;
3409
3410	effects = kzalloc(sizeof(*effects), GFP_KERNEL);
3411	if (!effects)
3412		return -ENOMEM;
3413
3414	old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
3415	if (xa_is_err(old)) {
3416		kfree(effects);
3417		return xa_err(old);
3418	}
3419
3420	*log = effects;
3421	return 0;
3422}
3423
3424static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
3425{
3426	struct nvme_effects_log	*log = ctrl->effects;
3427
3428	log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3429						NVME_CMD_EFFECTS_NCC |
3430						NVME_CMD_EFFECTS_CSE_MASK);
3431	log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3432						NVME_CMD_EFFECTS_CSE_MASK);
3433
3434	/*
3435	 * The spec says the result of a security receive command depends on
3436	 * the previous security send command. As such, many vendors log this
3437	 * command as one to submitted only when no other commands to the same
3438	 * namespace are outstanding. The intention is to tell the host to
3439	 * prevent mixing security send and receive.
3440	 *
3441	 * This driver can only enforce such exclusive access against IO
3442	 * queues, though. We are not readily able to enforce such a rule for
3443	 * two commands to the admin queue, which is the only queue that
3444	 * matters for this command.
3445	 *
3446	 * Rather than blindly freezing the IO queues for this effect that
3447	 * doesn't even apply to IO, mask it off.
3448	 */
3449	log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
3450
3451	log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3452	log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3453	log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3454}
3455
3456static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3457{
3458	int ret = 0;
3459
3460	if (ctrl->effects)
3461		return 0;
3462
3463	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3464		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3465		if (ret < 0)
3466			return ret;
3467	}
3468
3469	if (!ctrl->effects) {
3470		ret = nvme_init_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3471		if (ret < 0)
3472			return ret;
3473	}
3474
3475	nvme_init_known_nvm_effects(ctrl);
3476	return 0;
3477}
3478
3479static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3480{
3481	/*
3482	 * In fabrics we need to verify the cntlid matches the
3483	 * admin connect
3484	 */
3485	if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3486		dev_err(ctrl->device,
3487			"Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
3488			ctrl->cntlid, le16_to_cpu(id->cntlid));
3489		return -EINVAL;
3490	}
3491
3492	if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3493		dev_err(ctrl->device,
3494			"keep-alive support is mandatory for fabrics\n");
3495		return -EINVAL;
3496	}
3497
3498	if (nvme_is_io_ctrl(ctrl) && ctrl->ioccsz < 4) {
3499		dev_err(ctrl->device,
3500			"I/O queue command capsule supported size %d < 4\n",
3501			ctrl->ioccsz);
3502		return -EINVAL;
3503	}
3504
3505	if (nvme_is_io_ctrl(ctrl) && ctrl->iorcsz < 1) {
3506		dev_err(ctrl->device,
3507			"I/O queue response capsule supported size %d < 1\n",
3508			ctrl->iorcsz);
3509		return -EINVAL;
3510	}
3511
3512	if (!ctrl->maxcmd) {
3513		dev_warn(ctrl->device,
3514			"Firmware bug: maximum outstanding commands is 0\n");
3515		ctrl->maxcmd = ctrl->sqsize + 1;
3516	}
3517
3518	return 0;
3519}
3520
/*
 * Issue Identify Controller and cache the returned fields in @ctrl.
 *
 * On the first successful identification (ctrl->identified still false) this
 * also applies matching entries from core_quirks[], binds the controller to
 * its subsystem and initializes the command effects log.  On every call it
 * refreshes the cached Identify values, updates the admin queue limits,
 * validates the fabrics fields for fabrics controllers, runs the multipath
 * identify hook and exposes or hides the PM QoS latency tolerance attribute
 * when the APST enablement changed.
 *
 * Returns 0 on success, -EIO if the Identify command failed, or the error of
 * the step that failed.  The Identify buffer is freed on every path after
 * the command succeeded.
 */
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct queue_limits lim;
	struct nvme_id_ctrl *id;
	u32 max_hw_sectors;
	bool prev_apst_enabled;
	int ret;

	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	/* Fabrics controllers keep the cntlid they already have. */
	if (!(ctrl->ops->flags & NVME_F_FABRICS))
		ctrl->cntlid = le16_to_cpu(id->cntlid);

	if (!ctrl->identified) {
		unsigned int i;

		/*
		 * Check for quirks.  Quirk can depend on firmware version,
		 * so, in principle, the set of quirks present can change
		 * across a reset.  As a possible future enhancement, we
		 * could re-scan for quirks every time we reinitialize
		 * the device, but we'd have to make sure that the driver
		 * behaves intelligently if the quirks change.
		 */
		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
			if (quirk_matches(id, &core_quirks[i]))
				ctrl->quirks |= core_quirks[i].quirks;
		}

		ret = nvme_init_subsystem(ctrl, id);
		if (ret)
			goto out_free;

		ret = nvme_init_effects(ctrl, id);
		if (ret)
			goto out_free;
	}
	/* The firmware revision is refreshed on every identify. */
	memcpy(ctrl->subsys->firmware_rev, id->fr,
	       sizeof(ctrl->subsys->firmware_rev));

	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
	}

	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
	ctrl->crdt[2] = le16_to_cpu(id->crdt3);

	ctrl->oacs = le16_to_cpu(id->oacs);
	ctrl->oncs = le16_to_cpu(id->oncs);
	ctrl->mtfa = le16_to_cpu(id->mtfa);
	ctrl->oaes = le32_to_cpu(id->oaes);
	ctrl->wctemp = le16_to_cpu(id->wctemp);
	ctrl->cctemp = le16_to_cpu(id->cctemp);

	atomic_set(&ctrl->abort_limit, id->acl + 1);
	ctrl->vwc = id->vwc;
	/* An MDTS of zero is treated as "no limit reported". */
	if (id->mdts)
		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
	else
		max_hw_sectors = UINT_MAX;
	/* Keep the smaller of any existing limit and the MDTS-derived one. */
	ctrl->max_hw_sectors =
		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);

	lim = queue_limits_start_update(ctrl->admin_q);
	nvme_set_ctrl_limits(ctrl, &lim, true);
	ret = queue_limits_commit_update(ctrl->admin_q, &lim);
	if (ret)
		goto out_free;

	ctrl->sgls = le32_to_cpu(id->sgls);
	ctrl->kas = le16_to_cpu(id->kas);
	ctrl->max_namespaces = le32_to_cpu(id->mnan);
	ctrl->ctratt = le32_to_cpu(id->ctratt);

	ctrl->cntrltype = id->cntrltype;
	ctrl->dctype = id->dctype;

	if (id->rtd3e) {
		/* us -> s */
		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;

		/* Never below the module parameter, never above 60 seconds. */
		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
						 shutdown_timeout, 60);

		if (ctrl->shutdown_timeout != shutdown_timeout)
			dev_info(ctrl->device,
				 "D3 entry latency set to %u seconds\n",
				 ctrl->shutdown_timeout);
	} else
		ctrl->shutdown_timeout = shutdown_timeout;

	ctrl->npss = id->npss;
	ctrl->apsta = id->apsta;
	/* Remember the old state so a change can be detected below. */
	prev_apst_enabled = ctrl->apst_enabled;
	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
		if (force_apst && id->apsta) {
			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
			ctrl->apst_enabled = true;
		} else {
			ctrl->apst_enabled = false;
		}
	} else {
		ctrl->apst_enabled = id->apsta;
	}
	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));

	if (ctrl->ops->flags & NVME_F_FABRICS) {
		ctrl->icdoff = le16_to_cpu(id->icdoff);
		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
		ctrl->maxcmd = le16_to_cpu(id->maxcmd);

		ret = nvme_check_ctrl_fabric_info(ctrl, id);
		if (ret)
			goto out_free;
	} else {
		/* Host memory buffer fields are only cached for non-fabrics. */
		ctrl->hmpre = le32_to_cpu(id->hmpre);
		ctrl->hmmin = le32_to_cpu(id->hmmin);
		ctrl->hmminds = le32_to_cpu(id->hmminds);
		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
	}

	ret = nvme_mpath_init_identify(ctrl, id);
	if (ret < 0)
		goto out_free;

	/* Only touch the PM QoS attribute when the APST state flipped. */
	if (ctrl->apst_enabled && !prev_apst_enabled)
		dev_pm_qos_expose_latency_tolerance(ctrl->device);
	else if (!ctrl->apst_enabled && prev_apst_enabled)
		dev_pm_qos_hide_latency_tolerance(ctrl->device);
out_free:
	kfree(id);
	return ret;
}
3661
/*
 * Initialize the cached copies of the Identify data and various controller
 * register in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 *
 * @ctrl:          controller to finish initializing
 * @was_suspended: passed through unchanged to nvme_configure_opal()
 *
 * Returns 0 on success or the error of the step that failed.  On success the
 * controller is marked identified, NVME_CTRL_DIRTY_CAPABILITY is cleared and
 * keep-alive is started.
 */
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
	int ret;

	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	/* Cap the submission queue size at the MQES value from CAP. */
	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);

	/* The NSSRC capability bit is only consulted from version 1.1 on. */
	if (ctrl->vs >= NVME_VS(1, 1, 0))
		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);

	ret = nvme_init_identify(ctrl);
	if (ret)
		return ret;

	if (nvme_admin_ctrl(ctrl)) {
		/*
		 * An admin controller has one admin queue, but no I/O queues.
		 * Override queue_count so it only creates an admin queue.
		 */
		dev_dbg(ctrl->device,
			"Subsystem %s is an administrative controller",
			ctrl->subsys->subnqn);
		ctrl->queue_count = 1;
	}

	/* Only negative results from the configure steps abort the init. */
	ret = nvme_configure_apst(ctrl);
	if (ret < 0)
		return ret;

	ret = nvme_configure_timestamp(ctrl);
	if (ret < 0)
		return ret;

	ret = nvme_configure_host_options(ctrl);
	if (ret < 0)
		return ret;

	nvme_configure_opal(ctrl, was_suspended);

	/* hwmon is set up once, on first identification, and not for discovery. */
	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
		/*
		 * Do not return errors unless we are in a controller reset,
		 * the controller works perfectly fine without hwmon.
		 */
		ret = nvme_hwmon_init(ctrl);
		if (ret == -EINTR)
			return ret;
	}

	clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
	ctrl->identified = true;

	nvme_start_keep_alive(ctrl);

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3729
3730static int nvme_dev_open(struct inode *inode, struct file *file)
3731{
3732	struct nvme_ctrl *ctrl =
3733		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3734
3735	switch (nvme_ctrl_state(ctrl)) {
3736	case NVME_CTRL_LIVE:
3737		break;
3738	default:
3739		return -EWOULDBLOCK;
3740	}
3741
3742	nvme_get_ctrl(ctrl);
3743	if (!try_module_get(ctrl->ops->module)) {
3744		nvme_put_ctrl(ctrl);
3745		return -EINVAL;
3746	}
3747
3748	file->private_data = ctrl;
3749	return 0;
3750}
3751
3752static int nvme_dev_release(struct inode *inode, struct file *file)
3753{
3754	struct nvme_ctrl *ctrl =
3755		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3756
3757	module_put(ctrl->ops->module);
3758	nvme_put_ctrl(ctrl);
3759	return 0;
3760}
3761
/*
 * File operations for the character device embedded in struct nvme_ctrl
 * (nvme_dev_open() resolves the controller from inode->i_cdev).
 */
static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_dev_uring_cmd,
};
3770
3771static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3772		unsigned nsid)
3773{
3774	struct nvme_ns_head *h;
3775
3776	lockdep_assert_held(&ctrl->subsys->lock);
3777
3778	list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3779		/*
3780		 * Private namespaces can share NSIDs under some conditions.
3781		 * In that case we can't use the same ns_head for namespaces
3782		 * with the same NSID.
3783		 */
3784		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3785			continue;
3786		if (nvme_tryget_ns_head(h))
3787			return h;
3788	}
3789
3790	return NULL;
3791}
3792
3793static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3794		struct nvme_ns_ids *ids)
3795{
3796	bool has_uuid = !uuid_is_null(&ids->uuid);
3797	bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3798	bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3799	struct nvme_ns_head *h;
3800
3801	lockdep_assert_held(&subsys->lock);
3802
3803	list_for_each_entry(h, &subsys->nsheads, entry) {
3804		if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3805			return -EINVAL;
3806		if (has_nguid &&
3807		    memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3808			return -EINVAL;
3809		if (has_eui64 &&
3810		    memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3811			return -EINVAL;
3812	}
3813
3814	return 0;
3815}
3816
3817static void nvme_cdev_rel(struct device *dev)
3818{
3819	ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3820}
3821
/*
 * Tear down a character device set up by nvme_cdev_add(): remove the cdev
 * and its device, then drop the device reference.
 */
void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
{
	/* Unregister first ... */
	cdev_device_del(cdev, cdev_device);

	/* ... then let go of the device itself. */
	put_device(cdev_device);
}
3827
3828int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3829		const struct file_operations *fops, struct module *owner)
3830{
3831	int minor, ret;
3832
3833	minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3834	if (minor < 0)
3835		return minor;
3836	cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3837	cdev_device->class = &nvme_ns_chr_class;
3838	cdev_device->release = nvme_cdev_rel;
3839	device_initialize(cdev_device);
3840	cdev_init(cdev, fops);
3841	cdev->owner = owner;
3842	ret = cdev_device_add(cdev, cdev_device);
3843	if (ret)
3844		put_device(cdev_device);
3845
3846	return ret;
3847}
3848
3849static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3850{
3851	return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3852}
3853
3854static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3855{
3856	nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3857	return 0;
3858}
3859
/*
 * File operations for the character device embedded in struct nvme_ns
 * (the open/release handlers resolve the namespace from inode->i_cdev).
 */
static const struct file_operations nvme_ns_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_chr_open,
	.release	= nvme_ns_chr_release,
	.unlocked_ioctl	= nvme_ns_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};
3869
3870static int nvme_add_ns_cdev(struct nvme_ns *ns)
3871{
3872	int ret;
3873
3874	ns->cdev_device.parent = ns->ctrl->device;
3875	ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3876			   ns->ctrl->instance, ns->head->instance);
3877	if (ret)
3878		return ret;
3879
3880	return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3881			     ns->ctrl->ops->module);
3882}
3883
/*
 * Allocate and initialize a namespace head for the namespace described by
 * @info and link it into the subsystem of @ctrl.
 *
 * The head gets an instance number from the subsystem's ns_ida (starting at
 * 1), its own SRCU structure, the identifiers from @info and a command
 * effects log.  On success it is appended to the subsystem's nsheads list
 * and a reference on the subsystem is taken.
 *
 * Returns the new head or an ERR_PTR() with a negative errno.
 */
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
		struct nvme_ns_info *info)
{
	struct nvme_ns_head *head;
	size_t size = sizeof(*head);
	int ret = -ENOMEM;

	/*
	 * With multipath, reserve room for one struct nvme_ns pointer per
	 * possible node behind the head.
	 * NOTE(review): presumably backs a trailing per-node array in
	 * struct nvme_ns_head -- confirm against the struct definition.
	 */
#ifdef CONFIG_NVME_MULTIPATH
	size += num_possible_nodes() * sizeof(struct nvme_ns *);
#endif

	head = kzalloc(size, GFP_KERNEL);
	if (!head)
		goto out;
	ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
	if (ret < 0)
		goto out_free_head;
	head->instance = ret;
	INIT_LIST_HEAD(&head->list);
	ret = init_srcu_struct(&head->srcu);
	if (ret)
		goto out_ida_remove;
	head->subsys = ctrl->subsys;
	head->ns_id = info->nsid;
	head->ids = info->ids;
	head->shared = info->is_shared;
	head->rotational = info->is_rotational;
	ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
	ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
	kref_init(&head->ref);

	/* A non-zero CSI gets its own effects log; otherwise share the controller's. */
	if (head->ids.csi) {
		ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
		if (ret)
			goto out_cleanup_srcu;
	} else
		head->effects = ctrl->effects;

	ret = nvme_mpath_alloc_disk(ctrl, head);
	if (ret)
		goto out_cleanup_srcu;

	list_add_tail(&head->entry, &ctrl->subsys->nsheads);

	/* The head holds a reference on its subsystem. */
	kref_get(&ctrl->subsys->ref);

	return head;
out_cleanup_srcu:
	cleanup_srcu_struct(&head->srcu);
out_ida_remove:
	ida_free(&ctrl->subsys->ns_ida, head->instance);
out_free_head:
	kfree(head);
out:
	/* A positive value is an NVMe status; turn it into a negative errno. */
	if (ret > 0)
		ret = blk_status_to_errno(nvme_error_status(ret));
	return ERR_PTR(ret);
}
3942
3943static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3944		struct nvme_ns_ids *ids)
3945{
3946	struct nvme_subsystem *s;
3947	int ret = 0;
3948
3949	/*
3950	 * Note that this check is racy as we try to avoid holding the global
3951	 * lock over the whole ns_head creation.  But it is only intended as
3952	 * a sanity check anyway.
3953	 */
3954	mutex_lock(&nvme_subsystems_lock);
3955	list_for_each_entry(s, &nvme_subsystems, entry) {
3956		if (s == this)
3957			continue;
3958		mutex_lock(&s->lock);
3959		ret = nvme_subsys_check_duplicate_ids(s, ids);
3960		mutex_unlock(&s->lock);
3961		if (ret)
3962			break;
3963	}
3964	mutex_unlock(&nvme_subsystems_lock);
3965
3966	return ret;
3967}
3968
3969static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
3970{
3971	struct nvme_ctrl *ctrl = ns->ctrl;
3972	struct nvme_ns_head *head = NULL;
3973	int ret;
3974
3975	ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
3976	if (ret) {
3977		/*
3978		 * We've found two different namespaces on two different
3979		 * subsystems that report the same ID.  This is pretty nasty
3980		 * for anything that actually requires unique device
3981		 * identification.  In the kernel we need this for multipathing,
3982		 * and in user space the /dev/disk/by-id/ links rely on it.
3983		 *
3984		 * If the device also claims to be multi-path capable back off
3985		 * here now and refuse the probe the second device as this is a
3986		 * recipe for data corruption.  If not this is probably a
3987		 * cheap consumer device if on the PCIe bus, so let the user
3988		 * proceed and use the shiny toy, but warn that with changing
3989		 * probing order (which due to our async probing could just be
3990		 * device taking longer to startup) the other device could show
3991		 * up at any time.
3992		 */
3993		nvme_print_device_info(ctrl);
3994		if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
3995		    ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
3996		     info->is_shared)) {
3997			dev_err(ctrl->device,
3998				"ignoring nsid %d because of duplicate IDs\n",
3999				info->nsid);
4000			return ret;
4001		}
4002
4003		dev_err(ctrl->device,
4004			"clearing duplicate IDs for nsid %d\n", info->nsid);
4005		dev_err(ctrl->device,
4006			"use of /dev/disk/by-id/ may cause data corruption\n");
4007		memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
4008		memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
4009		memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
4010		ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
4011	}
4012
4013	mutex_lock(&ctrl->subsys->lock);
4014	head = nvme_find_ns_head(ctrl, info->nsid);
4015	if (!head) {
4016		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
4017		if (ret) {
4018			dev_err(ctrl->device,
4019				"duplicate IDs in subsystem for nsid %d\n",
4020				info->nsid);
4021			goto out_unlock;
4022		}
4023		head = nvme_alloc_ns_head(ctrl, info);
4024		if (IS_ERR(head)) {
4025			ret = PTR_ERR(head);
4026			goto out_unlock;
4027		}
4028	} else {
4029		ret = -EINVAL;
4030		if ((!info->is_shared || !head->shared) &&
4031		    !list_empty(&head->list)) {
4032			dev_err(ctrl->device,
4033				"Duplicate unshared namespace %d\n",
4034				info->nsid);
4035			goto out_put_ns_head;
4036		}
4037		if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
4038			dev_err(ctrl->device,
4039				"IDs don't match for shared namespace %d\n",
4040					info->nsid);
4041			goto out_put_ns_head;
4042		}
4043
4044		if (!multipath) {
4045			dev_warn(ctrl->device,
4046				"Found shared namespace %d, but multipathing not supported.\n",
4047				info->nsid);
4048			dev_warn_once(ctrl->device,
4049				"Shared namespace support requires core_nvme.multipath=Y.\n");
4050		}
4051	}
4052
4053	list_add_tail_rcu(&ns->siblings, &head->list);
4054	ns->head = head;
4055	mutex_unlock(&ctrl->subsys->lock);
4056
4057#ifdef CONFIG_NVME_MULTIPATH
4058	cancel_delayed_work(&head->remove_work);
4059#endif
4060	return 0;
4061
4062out_put_ns_head:
4063	nvme_put_ns_head(head);
4064out_unlock:
4065	mutex_unlock(&ctrl->subsys->lock);
4066	return ret;
4067}
4068
4069struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4070{
4071	struct nvme_ns *ns, *ret = NULL;
4072	int srcu_idx;
4073
4074	srcu_idx = srcu_read_lock(&ctrl->srcu);
4075	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
4076				 srcu_read_lock_held(&ctrl->srcu)) {
4077		if (ns->head->ns_id == nsid) {
4078			if (!nvme_get_ns(ns))
4079				continue;
4080			ret = ns;
4081			break;
4082		}
4083		if (ns->head->ns_id > nsid)
4084			break;
4085	}
4086	srcu_read_unlock(&ctrl->srcu, srcu_idx);
4087	return ret;
4088}
4089EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
4090
4091/*
4092 * Add the namespace to the controller list while keeping the list ordered.
4093 */
4094static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
4095{
4096	struct nvme_ns *tmp;
4097
4098	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
4099		if (tmp->head->ns_id < ns->head->ns_id) {
4100			list_add_rcu(&ns->list, &tmp->list);
4101			return;
4102		}
4103	}
4104	list_add_rcu(&ns->list, &ns->ctrl->namespaces);
4105}
4106
4107static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
4108{
4109	struct queue_limits lim = { };
4110	struct nvme_ns *ns;
4111	struct gendisk *disk;
4112	int node = ctrl->numa_node;
4113	bool last_path = false;
4114
4115	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
4116	if (!ns)
4117		return;
4118
4119	if (ctrl->opts && ctrl->opts->data_digest)
4120		lim.features |= BLK_FEAT_STABLE_WRITES;
4121	if (ctrl->ops->supports_pci_p2pdma &&
4122	    ctrl->ops->supports_pci_p2pdma(ctrl))
4123		lim.features |= BLK_FEAT_PCI_P2PDMA;
4124
4125	disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
4126	if (IS_ERR(disk))
4127		goto out_free_ns;
4128	disk->fops = &nvme_bdev_ops;
4129	disk->private_data = ns;
4130
4131	ns->disk = disk;
4132	ns->queue = disk->queue;
4133	ns->ctrl = ctrl;
4134	kref_init(&ns->kref);
4135
4136	if (nvme_init_ns_head(ns, info))
4137		goto out_cleanup_disk;
4138
4139	/*
4140	 * If multipathing is enabled, the device name for all disks and not
4141	 * just those that represent shared namespaces needs to be based on the
4142	 * subsystem instance.  Using the controller instance for private
4143	 * namespaces could lead to naming collisions between shared and private
4144	 * namespaces if they don't use a common numbering scheme.
4145	 *
4146	 * If multipathing is not enabled, disk names must use the controller
4147	 * instance as shared namespaces will show up as multiple block
4148	 * devices.
4149	 */
4150	if (nvme_ns_head_multipath(ns->head)) {
4151		sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
4152			ctrl->instance, ns->head->instance);
4153		disk->flags |= GENHD_FL_HIDDEN;
4154	} else if (multipath) {
4155		sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
4156			ns->head->instance);
4157	} else {
4158		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
4159			ns->head->instance);
4160	}
4161
4162	if (nvme_update_ns_info(ns, info))
4163		goto out_unlink_ns;
4164
4165	mutex_lock(&ctrl->namespaces_lock);
4166	/*
4167	 * Ensure that no namespaces are added to the ctrl list after the queues
4168	 * are frozen, thereby avoiding a deadlock between scan and reset.
4169	 */
4170	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
4171		mutex_unlock(&ctrl->namespaces_lock);
4172		goto out_unlink_ns;
4173	}
4174	nvme_ns_add_to_ctrl_list(ns);
4175	mutex_unlock(&ctrl->namespaces_lock);
4176	synchronize_srcu(&ctrl->srcu);
4177	nvme_get_ctrl(ctrl);
4178
4179	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
4180		goto out_cleanup_ns_from_list;
4181
4182	if (!nvme_ns_head_multipath(ns->head))
4183		nvme_add_ns_cdev(ns);
4184
4185	nvme_mpath_add_disk(ns, info->anagrpid);
4186	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
4187
4188	/*
4189	 * Set ns->disk->device->driver_data to ns so we can access
4190	 * ns->head->passthru_err_log_enabled in
4191	 * nvme_io_passthru_err_log_enabled_[store | show]().
4192	 */
4193	dev_set_drvdata(disk_to_dev(ns->disk), ns);
4194
4195	return;
4196
4197 out_cleanup_ns_from_list:
4198	nvme_put_ctrl(ctrl);
4199	mutex_lock(&ctrl->namespaces_lock);
4200	list_del_rcu(&ns->list);
4201	mutex_unlock(&ctrl->namespaces_lock);
4202	synchronize_srcu(&ctrl->srcu);
4203 out_unlink_ns:
4204	mutex_lock(&ctrl->subsys->lock);
4205	list_del_rcu(&ns->siblings);
4206	if (list_empty(&ns->head->list)) {
4207		list_del_init(&ns->head->entry);
4208		/*
4209		 * If multipath is not configured, we still create a namespace
4210		 * head (nshead), but head->disk is not initialized in that
4211		 * case.  As a result, only a single reference to nshead is held
4212		 * (via kref_init()) when it is created. Therefore, ensure that
4213		 * we do not release the reference to nshead twice if head->disk
4214		 * is not present.
4215		 */
4216		if (ns->head->disk)
4217			last_path = true;
4218	}
4219	mutex_unlock(&ctrl->subsys->lock);
4220	if (last_path)
4221		nvme_put_ns_head(ns->head);
4222	nvme_put_ns_head(ns->head);
4223 out_cleanup_disk:
4224	put_disk(disk);
4225 out_free_ns:
4226	kfree(ns);
4227}
4228
/*
 * Tear down one namespace: mark it not ready, unlink it from its namespace
 * head, delete its character device (non-multipath heads only) and its
 * disk, unlink it from the controller's namespace list and finally call
 * nvme_put_ns() on it.
 *
 * Only the first caller that manages to set NVME_NS_REMOVING does the work;
 * any later call for the same namespace returns immediately.
 */
static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool last_path = false;

	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
		return;

	clear_bit(NVME_NS_READY, &ns->flags);
	set_capacity(ns->disk, 0);
	nvme_fault_inject_fini(&ns->fault_inject);

	/*
	 * Ensure that !NVME_NS_READY is seen by other threads to prevent
	 * this ns going back into current_path.
	 */
	synchronize_srcu(&ns->head->srcu);

	/* wait for concurrent submissions */
	if (nvme_mpath_clear_current_path(ns))
		synchronize_srcu(&ns->head->srcu);

	mutex_lock(&ns->ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list)) {
		/*
		 * This was the last path of the head.  The head stays on its
		 * list when nvme_mpath_queue_if_no_path() reports true.
		 */
		if (!nvme_mpath_queue_if_no_path(ns->head))
			list_del_init(&ns->head->entry);
		last_path = true;
	}
	mutex_unlock(&ns->ctrl->subsys->lock);

	/* guarantee not available in head->list */
	synchronize_srcu(&ns->head->srcu);

	if (!nvme_ns_head_multipath(ns->head))
		nvme_cdev_del(&ns->cdev, &ns->cdev_device);

	nvme_mpath_remove_sysfs_link(ns);

	del_gendisk(ns->disk);

	/* unlink from the controller list and wait out SRCU readers */
	mutex_lock(&ns->ctrl->namespaces_lock);
	list_del_rcu(&ns->list);
	mutex_unlock(&ns->ctrl->namespaces_lock);
	synchronize_srcu(&ns->ctrl->srcu);

	if (last_path)
		nvme_mpath_remove_disk(ns->head);
	nvme_put_ns(ns);
}
4278
4279static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
4280{
4281	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4282
4283	if (ns) {
4284		nvme_ns_remove(ns);
4285		nvme_put_ns(ns);
4286	}
4287}
4288
4289static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
4290{
4291	int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
4292
4293	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
4294		dev_err(ns->ctrl->device,
4295			"identifiers changed for nsid %d\n", ns->head->ns_id);
4296		goto out;
4297	}
4298
4299	ret = nvme_update_ns_info(ns, info);
4300out:
4301	/*
4302	 * Only remove the namespace if we got a fatal error back from the
4303	 * device, otherwise ignore the error and just move on.
4304	 *
4305	 * TODO: we should probably schedule a delayed retry here.
4306	 */
4307	if (ret > 0 && (ret & NVME_STATUS_DNR))
4308		nvme_ns_remove(ns);
4309}
4310
4311static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4312{
4313	struct nvme_ns_info info = { .nsid = nsid };
4314	struct nvme_ns *ns;
4315	int ret = 1;
4316
4317	if (nvme_identify_ns_descs(ctrl, &info))
4318		return;
4319
4320	if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4321		dev_warn(ctrl->device,
4322			"command set not reported for nsid: %d\n", nsid);
4323		return;
4324	}
4325
4326	/*
4327	 * If available try to use the Command Set Independent Identify Namespace
4328	 * data structure to find all the generic information that is needed to
4329	 * set up a namespace.  If not fall back to the legacy version.
4330	 */
4331	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
4332	    (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
4333	    ctrl->vs >= NVME_VS(2, 0, 0))
4334		ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
4335	if (ret > 0)
4336		ret = nvme_ns_info_from_identify(ctrl, &info);
4337
4338	if (info.is_removed)
4339		nvme_ns_remove_by_nsid(ctrl, nsid);
4340
4341	/*
4342	 * Ignore the namespace if it is not ready. We will get an AEN once it
4343	 * becomes ready and restart the scan.
4344	 */
4345	if (ret || !info.is_ready)
4346		return;
4347
4348	ns = nvme_find_get_ns(ctrl, nsid);
4349	if (ns) {
4350		nvme_validate_ns(ns, &info);
4351		nvme_put_ns(ns);
4352	} else {
4353		nvme_alloc_ns(ctrl, &info);
4354	}
4355}
4356
/**
 * struct async_scan_info - keeps track of controller & NSIDs to scan
 * @ctrl:	Controller on which namespaces are being scanned
 * @next_nsid:	Index of next NSID to scan in ns_list; each
 *		nvme_scan_ns_async() instance claims one index with
 *		atomic_fetch_inc()
 * @ns_list:	Pointer to list of NSIDs to scan, stored as little-endian
 *		32-bit values
 *
 * Note: There is a single async_scan_info structure shared by all instances
 * of nvme_scan_ns_async() scanning a given controller, so the atomic
 * operations on next_nsid are critical to ensure each instance scans a unique
 * NSID.
 */
struct async_scan_info {
	struct nvme_ctrl *ctrl;
	atomic_t next_nsid;
	__le32 *ns_list;
};
4373
4374static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
4375{
4376	struct async_scan_info *scan_info = data;
4377	int idx;
4378	u32 nsid;
4379
4380	idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
4381	nsid = le32_to_cpu(scan_info->ns_list[idx]);
4382
4383	nvme_scan_ns(scan_info->ctrl, nsid);
4384}
4385
4386static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4387					unsigned nsid)
4388{
4389	struct nvme_ns *ns, *next;
4390	LIST_HEAD(rm_list);
4391
4392	mutex_lock(&ctrl->namespaces_lock);
4393	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4394		if (ns->head->ns_id > nsid) {
4395			list_del_rcu(&ns->list);
4396			synchronize_srcu(&ctrl->srcu);
4397			list_add_tail_rcu(&ns->list, &rm_list);
4398		}
4399	}
4400	mutex_unlock(&ctrl->namespaces_lock);
4401
4402	list_for_each_entry_safe(ns, next, &rm_list, list)
4403		nvme_ns_remove(ns);
4404}
4405
/*
 * Scan namespaces using the Identify "active namespace list" command.
 *
 * The list is fetched one NVME_IDENTIFY_DATA_SIZE buffer at a time, passing
 * the highest NSID handled so far (@prev) in the command's nsid field.  Every
 * non-zero entry is scanned asynchronously through nvme_scan_ns_async(), and
 * the NSIDs skipped between consecutive entries are removed.  A zero entry
 * ends the scan, after which all namespaces above @prev are removed as well.
 *
 * Returns 0 on success, -ENOMEM if the list buffer cannot be allocated, or
 * the non-zero result of nvme_submit_sync_cmd() if the Identify fails.
 */
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
	__le32 *ns_list;
	u32 prev = 0;
	int ret = 0, i;
	ASYNC_DOMAIN(domain);
	struct async_scan_info scan_info;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	scan_info.ctrl = ctrl;
	scan_info.ns_list = ns_list;
	for (;;) {
		struct nvme_command cmd = {
			.identify.opcode	= nvme_admin_identify,
			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
			.identify.nsid		= cpu_to_le32(prev),
		};

		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
					    NVME_IDENTIFY_DATA_SIZE);
		if (ret) {
			dev_warn(ctrl->device,
				"Identify NS List failed (status=0x%x)\n", ret);
			goto free;
		}

		/* async workers index ns_list from the start of this buffer */
		atomic_set(&scan_info.next_nsid, 0);
		for (i = 0; i < nr_entries; i++) {
			u32 nsid = le32_to_cpu(ns_list[i]);

			if (!nsid)	/* end of the list? */
				goto out;
			async_schedule_domain(nvme_scan_ns_async, &scan_info,
						&domain);
			/* NSIDs missing from the list are no longer active */
			while (++prev < nsid)
				nvme_ns_remove_by_nsid(ctrl, prev);
		}
		/*
		 * Wait for all scheduled scans: the next iteration overwrites
		 * ns_list, which the async workers still read from.
		 */
		async_synchronize_full_domain(&domain);
	}
 out:
	nvme_remove_invalid_namespaces(ctrl, prev);
 free:
	/* no worker may still reference ns_list when it is freed */
	async_synchronize_full_domain(&domain);
	kfree(ns_list);
	return ret;
}
4456
4457static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4458{
4459	struct nvme_id_ctrl *id;
4460	u32 nn, i;
4461
4462	if (nvme_identify_ctrl(ctrl, &id))
4463		return;
4464	nn = le32_to_cpu(id->nn);
4465	kfree(id);
4466
4467	for (i = 1; i <= nn; i++)
4468		nvme_scan_ns(ctrl, i);
4469
4470	nvme_remove_invalid_namespaces(ctrl, nn);
4471}
4472
4473static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4474{
4475	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4476	__le32 *log;
4477	int error;
4478
4479	log = kzalloc(log_size, GFP_KERNEL);
4480	if (!log)
4481		return;
4482
4483	/*
4484	 * We need to read the log to clear the AEN, but we don't want to rely
4485	 * on it for the changed namespace information as userspace could have
4486	 * raced with us in reading the log page, which could cause us to miss
4487	 * updates.
4488	 */
4489	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4490			NVME_CSI_NVM, log, log_size, 0);
4491	if (error)
4492		dev_warn(ctrl->device,
4493			"reading changed ns log failed: %d\n", error);
4494
4495	kfree(log);
4496}
4497
4498static void nvme_scan_work(struct work_struct *work)
4499{
4500	struct nvme_ctrl *ctrl =
4501		container_of(work, struct nvme_ctrl, scan_work);
4502	int ret;
4503
4504	/* No tagset on a live ctrl means IO queues could not created */
4505	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
4506		return;
4507
4508	/*
4509	 * Identify controller limits can change at controller reset due to
4510	 * new firmware download, even though it is not common we cannot ignore
4511	 * such scenario. Controller's non-mdts limits are reported in the unit
4512	 * of logical blocks that is dependent on the format of attached
4513	 * namespace. Hence re-read the limits at the time of ns allocation.
4514	 */
4515	ret = nvme_init_non_mdts_limits(ctrl);
4516	if (ret < 0) {
4517		dev_warn(ctrl->device,
4518			"reading non-mdts-limits failed: %d\n", ret);
4519		return;
4520	}
4521
4522	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4523		dev_info(ctrl->device, "rescanning namespaces.\n");
4524		nvme_clear_changed_ns_log(ctrl);
4525	}
4526
4527	mutex_lock(&ctrl->scan_lock);
4528	if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
4529		nvme_scan_ns_sequential(ctrl);
4530	} else {
4531		/*
4532		 * Fall back to sequential scan if DNR is set to handle broken
4533		 * devices which should support Identify NS List (as per the VS
4534		 * they report) but don't actually support it.
4535		 */
4536		ret = nvme_scan_ns_list(ctrl);
4537		if (ret > 0 && ret & NVME_STATUS_DNR)
4538			nvme_scan_ns_sequential(ctrl);
4539	}
4540	mutex_unlock(&ctrl->scan_lock);
4541
4542	/* Requeue if we have missed AENs */
4543	if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
4544		nvme_queue_scan(ctrl);
4545#ifdef CONFIG_NVME_MULTIPATH
4546	else if (ctrl->ana_log_buf)
4547		/* Re-read the ANA log page to not miss updates */
4548		queue_work(nvme_wq, &ctrl->ana_work);
4549#endif
4550}
4551
/*
 * Remove all namespaces of a controller.
 *
 * This function iterates the namespace list unlocked to allow recovery from
 * controller failure. It is up to the caller to ensure the namespace list is
 * not modified by scan work while this function is executing.
 */
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;
	LIST_HEAD(ns_list);

	/*
	 * make sure to requeue I/O to all namespaces as these
	 * might result from the scan itself and must complete
	 * for the scan_work to make progress
	 */
	nvme_mpath_clear_ctrl_paths(ctrl);

	/*
	 * Unquiesce io queues so any pending IO won't hang, especially
	 * those submitted from scan work
	 */
	nvme_unquiesce_io_queues(ctrl);

	/* prevent racing with ns scanning */
	flush_work(&ctrl->scan_work);

	/*
	 * The dead state indicates the controller was not gracefully
	 * disconnected. In that case, we won't be able to flush any data while
	 * removing the namespaces' disks; fail all the queues now to avoid
	 * potentially having to clean up the failed sync later.
	 */
	if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
		nvme_mark_namespaces_dead(ctrl);

	/* this is a no-op when called from the controller reset handler */
	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);

	/*
	 * Move the whole list onto a private head under the lock, then wait
	 * for SRCU readers before removing the entries one by one.
	 */
	mutex_lock(&ctrl->namespaces_lock);
	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
	mutex_unlock(&ctrl->namespaces_lock);
	synchronize_srcu(&ctrl->srcu);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
}
4598EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4599
4600static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
4601{
4602	const struct nvme_ctrl *ctrl =
4603		container_of(dev, struct nvme_ctrl, ctrl_device);
4604	struct nvmf_ctrl_options *opts = ctrl->opts;
4605	int ret;
4606
4607	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4608	if (ret)
4609		return ret;
4610
4611	if (opts) {
4612		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4613		if (ret)
4614			return ret;
4615
4616		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4617				opts->trsvcid ?: "none");
4618		if (ret)
4619			return ret;
4620
4621		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4622				opts->host_traddr ?: "none");
4623		if (ret)
4624			return ret;
4625
4626		ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4627				opts->host_iface ?: "none");
4628	}
4629	return ret;
4630}
4631
4632static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4633{
4634	char *envp[2] = { envdata, NULL };
4635
4636	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4637}
4638
4639static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4640{
4641	char *envp[2] = { NULL, NULL };
4642	u32 aen_result = ctrl->aen_result;
4643
4644	ctrl->aen_result = 0;
4645	if (!aen_result)
4646		return;
4647
4648	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4649	if (!envp[0])
4650		return;
4651	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4652	kfree(envp[0]);
4653}
4654
4655static void nvme_async_event_work(struct work_struct *work)
4656{
4657	struct nvme_ctrl *ctrl =
4658		container_of(work, struct nvme_ctrl, async_event_work);
4659
4660	nvme_aen_uevent(ctrl);
4661
4662	/*
4663	 * The transport drivers must guarantee AER submission here is safe by
4664	 * flushing ctrl async_event_work after changing the controller state
4665	 * from LIVE and before freeing the admin queue.
4666	*/
4667	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
4668		ctrl->ops->submit_async_event(ctrl);
4669}
4670
4671static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4672{
4673
4674	u32 csts;
4675
4676	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4677		return false;
4678
4679	if (csts == ~0)
4680		return false;
4681
4682	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4683}
4684
4685static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4686{
4687	struct nvme_fw_slot_info_log *log;
4688	u8 next_fw_slot, cur_fw_slot;
4689
4690	log = kmalloc(sizeof(*log), GFP_KERNEL);
4691	if (!log)
4692		return;
4693
4694	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4695			 log, sizeof(*log), 0)) {
4696		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4697		goto out_free_log;
4698	}
4699
4700	cur_fw_slot = log->afi & 0x7;
4701	next_fw_slot = (log->afi & 0x70) >> 4;
4702	if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
4703		dev_info(ctrl->device,
4704			 "Firmware is activated after next Controller Level Reset\n");
4705		goto out_free_log;
4706	}
4707
4708	memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1],
4709		sizeof(ctrl->subsys->firmware_rev));
4710
4711out_free_log:
4712	kfree(log);
4713}
4714
4715static void nvme_fw_act_work(struct work_struct *work)
4716{
4717	struct nvme_ctrl *ctrl = container_of(work,
4718				struct nvme_ctrl, fw_act_work);
4719	unsigned long fw_act_timeout;
4720
4721	nvme_auth_stop(ctrl);
4722
4723	if (ctrl->mtfa)
4724		fw_act_timeout = jiffies + msecs_to_jiffies(ctrl->mtfa * 100);
4725	else
4726		fw_act_timeout = jiffies + secs_to_jiffies(admin_timeout);
4727
4728	nvme_quiesce_io_queues(ctrl);
4729	while (nvme_ctrl_pp_status(ctrl)) {
4730		if (time_after(jiffies, fw_act_timeout)) {
4731			dev_warn(ctrl->device,
4732				"Fw activation timeout, reset controller\n");
4733			nvme_try_sched_reset(ctrl);
4734			return;
4735		}
4736		msleep(100);
4737	}
4738
4739	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING) ||
4740	    !nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4741		return;
4742
4743	nvme_unquiesce_io_queues(ctrl);
4744	/* read FW slot information to clear the AER */
4745	nvme_get_fw_slot_info(ctrl);
4746
4747	queue_work(nvme_wq, &ctrl->async_event_work);
4748}
4749
/* Extract the event type (bits 2:0) from an async event result dword. */
static u32 nvme_aer_type(u32 result)
{
	const u32 type_mask = 0x7;

	return result & type_mask;
}
4754
/* Extract the event subtype (bits 15:8) from an async event result dword. */
static u32 nvme_aer_subtype(u32 result)
{
	return (result >> 8) & 0xff;
}
4759
4760static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4761{
4762	u32 aer_notice_type = nvme_aer_subtype(result);
4763	bool requeue = true;
4764
4765	switch (aer_notice_type) {
4766	case NVME_AER_NOTICE_NS_CHANGED:
4767		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4768		nvme_queue_scan(ctrl);
4769		break;
4770	case NVME_AER_NOTICE_FW_ACT_STARTING:
4771		/*
4772		 * We are (ab)using the RESETTING state to prevent subsequent
4773		 * recovery actions from interfering with the controller's
4774		 * firmware activation.
4775		 */
4776		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4777			requeue = false;
4778			queue_work(nvme_wq, &ctrl->fw_act_work);
4779		}
4780		break;
4781#ifdef CONFIG_NVME_MULTIPATH
4782	case NVME_AER_NOTICE_ANA:
4783		if (!ctrl->ana_log_buf)
4784			break;
4785		queue_work(nvme_wq, &ctrl->ana_work);
4786		break;
4787#endif
4788	case NVME_AER_NOTICE_DISC_CHANGED:
4789		ctrl->aen_result = result;
4790		break;
4791	default:
4792		dev_warn(ctrl->device, "async event result %08x\n", result);
4793	}
4794	return requeue;
4795}
4796
/*
 * React to a persistent internal error event: log a warning and start a
 * controller reset via nvme_reset_ctrl().  The reset result is not checked.
 */
static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
	dev_warn(ctrl->device,
		"resetting controller due to persistent internal error\n");
	nvme_reset_ctrl(ctrl);
}
4803
4804void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4805		volatile union nvme_result *res)
4806{
4807	u32 result = le32_to_cpu(res->u32);
4808	u32 aer_type = nvme_aer_type(result);
4809	u32 aer_subtype = nvme_aer_subtype(result);
4810	bool requeue = true;
4811
4812	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4813		return;
4814
4815	trace_nvme_async_event(ctrl, result);
4816	switch (aer_type) {
4817	case NVME_AER_NOTICE:
4818		requeue = nvme_handle_aen_notice(ctrl, result);
4819		break;
4820	case NVME_AER_ERROR:
4821		/*
4822		 * For a persistent internal error, don't run async_event_work
4823		 * to submit a new AER. The controller reset will do it.
4824		 */
4825		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4826			nvme_handle_aer_persistent_error(ctrl);
4827			return;
4828		}
4829		fallthrough;
4830	case NVME_AER_SMART:
4831	case NVME_AER_CSS:
4832	case NVME_AER_VS:
4833		ctrl->aen_result = result;
4834		break;
4835	default:
4836		break;
4837	}
4838
4839	if (requeue)
4840		queue_work(nvme_wq, &ctrl->async_event_work);
4841}
4842EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4843
4844int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4845		const struct blk_mq_ops *ops, unsigned int cmd_size)
4846{
4847	struct queue_limits lim = {};
4848	int ret;
4849
4850	memset(set, 0, sizeof(*set));
4851	set->ops = ops;
4852	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4853	if (ctrl->ops->flags & NVME_F_FABRICS)
4854		/* Reserved for fabric connect and keep alive */
4855		set->reserved_tags = 2;
4856	set->numa_node = ctrl->numa_node;
4857	if (ctrl->ops->flags & NVME_F_BLOCKING)
4858		set->flags |= BLK_MQ_F_BLOCKING;
4859	set->cmd_size = cmd_size;
4860	set->driver_data = ctrl;
4861	set->nr_hw_queues = 1;
4862	set->timeout = NVME_ADMIN_TIMEOUT;
4863	ret = blk_mq_alloc_tag_set(set);
4864	if (ret)
4865		return ret;
4866
4867	ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
4868	if (IS_ERR(ctrl->admin_q)) {
4869		ret = PTR_ERR(ctrl->admin_q);
4870		goto out_free_tagset;
4871	}
4872
4873	if (ctrl->ops->flags & NVME_F_FABRICS) {
4874		ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
4875		if (IS_ERR(ctrl->fabrics_q)) {
4876			ret = PTR_ERR(ctrl->fabrics_q);
4877			goto out_cleanup_admin_q;
4878		}
4879	}
4880
4881	ctrl->admin_tagset = set;
4882	return 0;
4883
4884out_cleanup_admin_q:
4885	blk_mq_destroy_queue(ctrl->admin_q);
4886	blk_put_queue(ctrl->admin_q);
4887out_free_tagset:
4888	blk_mq_free_tag_set(set);
4889	ctrl->admin_q = NULL;
4890	ctrl->fabrics_q = NULL;
4891	return ret;
4892}
4893EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4894
/*
 * Tear down what nvme_alloc_admin_tag_set() created: stop keep-alive,
 * destroy the admin queue, destroy and put the fabrics queue on fabrics
 * transports, and free the admin tag set.
 *
 * The admin queue is only destroyed here; its reference is put in
 * nvme_free_ctrl().
 */
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
	/*
	 * As we're about to destroy the queue and free tagset
	 * we can not have keep-alive work running.
	 */
	nvme_stop_keep_alive(ctrl);
	blk_mq_destroy_queue(ctrl->admin_q);
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->fabrics_q);
		blk_put_queue(ctrl->fabrics_q);
	}
	blk_mq_free_tag_set(ctrl->admin_tagset);
}
4909EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4910
4911int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4912		const struct blk_mq_ops *ops, unsigned int nr_maps,
4913		unsigned int cmd_size)
4914{
4915	int ret;
4916
4917	memset(set, 0, sizeof(*set));
4918	set->ops = ops;
4919	set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
4920	/*
4921	 * Some Apple controllers requires tags to be unique across admin and
4922	 * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4923	 */
4924	if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4925		set->reserved_tags = NVME_AQ_DEPTH;
4926	else if (ctrl->ops->flags & NVME_F_FABRICS)
4927		/* Reserved for fabric connect */
4928		set->reserved_tags = 1;
4929	set->numa_node = ctrl->numa_node;
4930	if (ctrl->ops->flags & NVME_F_BLOCKING)
4931		set->flags |= BLK_MQ_F_BLOCKING;
4932	set->cmd_size = cmd_size;
4933	set->driver_data = ctrl;
4934	set->nr_hw_queues = ctrl->queue_count - 1;
4935	set->timeout = NVME_IO_TIMEOUT;
4936	set->nr_maps = nr_maps;
4937	ret = blk_mq_alloc_tag_set(set);
4938	if (ret)
4939		return ret;
4940
4941	if (ctrl->ops->flags & NVME_F_FABRICS) {
4942		struct queue_limits lim = {
4943			.features	= BLK_FEAT_SKIP_TAGSET_QUIESCE,
4944		};
4945
4946		ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
4947        	if (IS_ERR(ctrl->connect_q)) {
4948			ret = PTR_ERR(ctrl->connect_q);
4949			goto out_free_tag_set;
4950		}
4951	}
4952
4953	ctrl->tagset = set;
4954	return 0;
4955
4956out_free_tag_set:
4957	blk_mq_free_tag_set(set);
4958	ctrl->connect_q = NULL;
4959	return ret;
4960}
4961EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
4962
/*
 * Tear down what nvme_alloc_io_tag_set() created: on fabrics transports the
 * connect queue is destroyed and put, then the I/O tag set is freed.
 */
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
	if (ctrl->ops->flags & NVME_F_FABRICS) {
		blk_mq_destroy_queue(ctrl->connect_q);
		blk_put_queue(ctrl->connect_q);
	}
	blk_mq_free_tag_set(ctrl->tagset);
}
4971EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
4972
/*
 * Stop the controller's background activity: multipath, authentication and
 * failfast work are stopped, async_event_work is flushed, fw_act_work is
 * cancelled synchronously, and finally the transport's optional stop_ctrl
 * callback is invoked.
 */
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_mpath_stop(ctrl);
	nvme_auth_stop(ctrl);
	nvme_stop_failfast_work(ctrl);
	flush_work(&ctrl->async_event_work);
	cancel_work_sync(&ctrl->fw_act_work);
	if (ctrl->ops->stop_ctrl)
		ctrl->ops->stop_ctrl(ctrl);
}
4983EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4984
4985void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4986{
4987	nvme_enable_aen(ctrl);
4988
4989	/*
4990	 * persistent discovery controllers need to send indication to userspace
4991	 * to re-read the discovery log page to learn about possible changes
4992	 * that were missed. We identify persistent discovery controllers by
4993	 * checking that they started once before, hence are reconnecting back.
4994	 */
4995	if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
4996	    nvme_discovery_ctrl(ctrl)) {
4997		if (!ctrl->kato) {
4998			nvme_stop_keep_alive(ctrl);
4999			ctrl->kato = NVME_DEFAULT_KATO;
5000			nvme_start_keep_alive(ctrl);
5001		}
5002		nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
5003	}
5004
5005	if (ctrl->queue_count > 1) {
5006		nvme_queue_scan(ctrl);
5007		nvme_unquiesce_io_queues(ctrl);
5008		nvme_mpath_update(ctrl);
5009	}
5010
5011	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
5012	set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
5013}
5014EXPORT_SYMBOL_GPL(nvme_start_ctrl);
5015
/*
 * Undo nvme_add_ctrl(): stop keep-alive, tear down hwmon and fault
 * injection, hide the latency tolerance control, delete the controller's
 * character device and drop a controller reference (presumably the one
 * taken at the end of nvme_add_ctrl()).
 */
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_stop_keep_alive(ctrl);
	nvme_hwmon_exit(ctrl);
	nvme_fault_inject_fini(&ctrl->fault_inject);
	dev_pm_qos_hide_latency_tolerance(ctrl->device);
	cdev_device_del(&ctrl->cdev, ctrl->device);
	nvme_put_ctrl(ctrl);
}
5025EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
5026
5027static void nvme_free_cels(struct nvme_ctrl *ctrl)
5028{
5029	struct nvme_effects_log	*cel;
5030	unsigned long i;
5031
5032	xa_for_each(&ctrl->cels, i, cel) {
5033		xa_erase(&ctrl->cels, i);
5034		kfree(cel);
5035	}
5036
5037	xa_destroy(&ctrl->cels);
5038}
5039
/*
 * Release callback of the controller device (installed by nvme_init_ctrl()
 * as ctrl->device->release).  Frees the resources owned by the core, unlinks
 * the controller from its subsystem, hands the controller to the transport's
 * free_ctrl callback and finally drops the subsystem reference.
 */
static void nvme_free_ctrl(struct device *dev)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvme_subsystem *subsys = ctrl->subsys;

	if (ctrl->admin_q)
		blk_put_queue(ctrl->admin_q);
	/* the instance id is not freed here when it equals the subsystem's */
	if (!subsys || ctrl->instance != subsys->instance)
		ida_free(&nvme_instance_ida, ctrl->instance);
	nvme_free_cels(ctrl);
	nvme_mpath_uninit(ctrl);
	cleanup_srcu_struct(&ctrl->srcu);
	nvme_auth_stop(ctrl);
	nvme_auth_free(ctrl);
	__free_page(ctrl->discard_page);
	free_opal_dev(ctrl->opal_dev);

	if (subsys) {
		mutex_lock(&nvme_subsystems_lock);
		list_del(&ctrl->subsys_entry);
		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
		mutex_unlock(&nvme_subsystems_lock);
	}

	ctrl->ops->free_ctrl(ctrl);

	/* subsys was cached above; ctrl is not touched after free_ctrl() */
	if (subsys)
		nvme_put_subsystem(subsys);
}
5070
/*
 * Initialize the NVMe controller structures.  This needs to be called during
 * earliest initialization so that we have the initialized structures around
 * during probing.
 *
 * On success, the caller must use the nvme_put_ctrl() to release this when
 * needed, which also invokes the ops->free_ctrl() callback.
 *
 * Returns 0 on success or a negative errno; on failure everything allocated
 * here (SRCU state, discard page, instance id) has been released again.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
		const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
	int ret;

	WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
	ctrl->passthru_err_log_enabled = false;
	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
	spin_lock_init(&ctrl->lock);
	mutex_init(&ctrl->namespaces_lock);

	ret = init_srcu_struct(&ctrl->srcu);
	if (ret)
		return ret;

	mutex_init(&ctrl->scan_lock);
	INIT_LIST_HEAD(&ctrl->namespaces);
	xa_init(&ctrl->cels);
	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->quirks = quirks;
	ctrl->numa_node = NUMA_NO_NODE;
	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
	ctrl->ka_last_check_time = jiffies;

	/* all DSM ranges of one command must fit into the discard page */
	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
			PAGE_SIZE);
	ctrl->discard_page = alloc_page(GFP_KERNEL);
	if (!ctrl->discard_page) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
	if (ret < 0)
		goto out;
	ctrl->instance = ret;

	ret = nvme_auth_init_ctrl(ctrl);
	if (ret)
		goto out_release_instance;

	nvme_mpath_init_ctrl(ctrl);

	/*
	 * From here on the controller is released through the device's
	 * release callback, nvme_free_ctrl().
	 */
	device_initialize(&ctrl->ctrl_device);
	ctrl->device = &ctrl->ctrl_device;
	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
			ctrl->instance);
	ctrl->device->class = &nvme_class;
	ctrl->device->parent = ctrl->dev;
	if (ops->dev_attr_groups)
		ctrl->device->groups = ops->dev_attr_groups;
	else
		ctrl->device->groups = nvme_dev_attr_groups;
	ctrl->device->release = nvme_free_ctrl;
	dev_set_drvdata(ctrl->device, ctrl);

	return ret;

out_release_instance:
	ida_free(&nvme_instance_ida, ctrl->instance);
out:
	if (ctrl->discard_page)
		__free_page(ctrl->discard_page);
	cleanup_srcu_struct(&ctrl->srcu);
	return ret;
}
5155EXPORT_SYMBOL_GPL(nvme_init_ctrl);
5156
/*
 * Name the controller device "nvme<instance>" and register it together with
 * its character device.
 *
 * On success, returns with an elevated controller reference and caller must
 * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
 * On failure a negative errno is returned and no extra reference is held.
 */
int nvme_add_ctrl(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
	if (ret)
		return ret;

	cdev_init(&ctrl->cdev, &nvme_dev_fops);
	ctrl->cdev.owner = ctrl->ops->module;
	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
	if (ret)
		return ret;

	/*
	 * Initialize latency tolerance controls.  The sysfs files won't
	 * be visible to userspace unless the device actually supports APST.
	 */
	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
		min(default_ps_max_latency_us, (unsigned long)S32_MAX));

	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
	/* the elevated reference mentioned above */
	nvme_get_ctrl(ctrl);

	return 0;
}
5188EXPORT_SYMBOL_GPL(nvme_add_ctrl);
5189
5190/* let I/O to all namespaces fail in preparation for surprise removal */
5191void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
5192{
5193	struct nvme_ns *ns;
5194	int srcu_idx;
5195
5196	srcu_idx = srcu_read_lock(&ctrl->srcu);
5197	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5198				 srcu_read_lock_held(&ctrl->srcu))
5199		blk_mark_disk_dead(ns->disk);
5200	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5201}
5202EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
5203
5204void nvme_unfreeze(struct nvme_ctrl *ctrl)
5205{
5206	struct nvme_ns *ns;
5207	int srcu_idx;
5208
5209	srcu_idx = srcu_read_lock(&ctrl->srcu);
5210	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5211				 srcu_read_lock_held(&ctrl->srcu))
5212		blk_mq_unfreeze_queue_non_owner(ns->queue);
5213	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5214	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
5215}
5216EXPORT_SYMBOL_GPL(nvme_unfreeze);
5217
5218int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
5219{
5220	struct nvme_ns *ns;
5221	int srcu_idx;
5222
5223	srcu_idx = srcu_read_lock(&ctrl->srcu);
5224	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5225				 srcu_read_lock_held(&ctrl->srcu)) {
5226		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
5227		if (timeout <= 0)
5228			break;
5229	}
5230	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5231	return timeout;
5232}
5233EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
5234
5235void nvme_wait_freeze(struct nvme_ctrl *ctrl)
5236{
5237	struct nvme_ns *ns;
5238	int srcu_idx;
5239
5240	srcu_idx = srcu_read_lock(&ctrl->srcu);
5241	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5242				 srcu_read_lock_held(&ctrl->srcu))
5243		blk_mq_freeze_queue_wait(ns->queue);
5244	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5245}
5246EXPORT_SYMBOL_GPL(nvme_wait_freeze);
5247
5248void nvme_start_freeze(struct nvme_ctrl *ctrl)
5249{
5250	struct nvme_ns *ns;
5251	int srcu_idx;
5252
5253	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
5254	srcu_idx = srcu_read_lock(&ctrl->srcu);
5255	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5256				 srcu_read_lock_held(&ctrl->srcu))
5257		/*
5258		 * Typical non_owner use case is from pci driver, in which
5259		 * start_freeze is called from timeout work function, but
5260		 * unfreeze is done in reset work context
5261		 */
5262		blk_freeze_queue_start_non_owner(ns->queue);
5263	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5264}
5265EXPORT_SYMBOL_GPL(nvme_start_freeze);
5266
5267void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
5268{
5269	if (!ctrl->tagset)
5270		return;
5271	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
5272		blk_mq_quiesce_tagset(ctrl->tagset);
5273	else
5274		blk_mq_wait_quiesce_done(ctrl->tagset);
5275}
5276EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
5277
5278void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
5279{
5280	if (!ctrl->tagset)
5281		return;
5282	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
5283		blk_mq_unquiesce_tagset(ctrl->tagset);
5284}
5285EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
5286
5287void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
5288{
5289	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
5290		blk_mq_quiesce_queue(ctrl->admin_q);
5291	else
5292		blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
5293}
5294EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
5295
5296void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
5297{
5298	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
5299		blk_mq_unquiesce_queue(ctrl->admin_q);
5300}
5301EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
5302
5303void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
5304{
5305	struct nvme_ns *ns;
5306	int srcu_idx;
5307
5308	srcu_idx = srcu_read_lock(&ctrl->srcu);
5309	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
5310				 srcu_read_lock_held(&ctrl->srcu))
5311		blk_sync_queue(ns->queue);
5312	srcu_read_unlock(&ctrl->srcu, srcu_idx);
5313}
5314EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
5315
5316void nvme_sync_queues(struct nvme_ctrl *ctrl)
5317{
5318	nvme_sync_io_queues(ctrl);
5319	if (ctrl->admin_q)
5320		blk_sync_queue(ctrl->admin_q);
5321}
5322EXPORT_SYMBOL_GPL(nvme_sync_queues);
5323
5324struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
5325{
5326	if (file->f_op != &nvme_dev_fops)
5327		return NULL;
5328	return file->private_data;
5329}
5330EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, "NVME_TARGET_PASSTHRU");
5331
5332/*
5333 * Check we didn't inadvertently grow the command structure sizes:
5334 */
5335static inline void _nvme_check_size(void)
5336{
5337	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
5338	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
5339	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
5340	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
5341	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
5342	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
5343	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
5344	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
5345	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
5346	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
5347	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
5348	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5349	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5350	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5351			NVME_IDENTIFY_DATA_SIZE);
5352	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5353	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5354	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5355	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5356	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
5357	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
5358	BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
5359	BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
5360	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
5361	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
5362	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
5363}
5364
5365
5366static int __init nvme_core_init(void)
5367{
5368	unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
5369	int result = -ENOMEM;
5370
5371	_nvme_check_size();
5372
5373	nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
5374	if (!nvme_wq)
5375		goto out;
5376
5377	nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
5378	if (!nvme_reset_wq)
5379		goto destroy_wq;
5380
5381	nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
5382	if (!nvme_delete_wq)
5383		goto destroy_reset_wq;
5384
5385	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
5386			NVME_MINORS, "nvme");
5387	if (result < 0)
5388		goto destroy_delete_wq;
5389
5390	result = class_register(&nvme_class);
5391	if (result)
5392		goto unregister_chrdev;
5393
5394	result = class_register(&nvme_subsys_class);
5395	if (result)
5396		goto destroy_class;
5397
5398	result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
5399				     "nvme-generic");
5400	if (result < 0)
5401		goto destroy_subsys_class;
5402
5403	result = class_register(&nvme_ns_chr_class);
5404	if (result)
5405		goto unregister_generic_ns;
5406
5407	result = nvme_init_auth();
5408	if (result)
5409		goto destroy_ns_chr;
5410	return 0;
5411
5412destroy_ns_chr:
5413	class_unregister(&nvme_ns_chr_class);
5414unregister_generic_ns:
5415	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5416destroy_subsys_class:
5417	class_unregister(&nvme_subsys_class);
5418destroy_class:
5419	class_unregister(&nvme_class);
5420unregister_chrdev:
5421	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5422destroy_delete_wq:
5423	destroy_workqueue(nvme_delete_wq);
5424destroy_reset_wq:
5425	destroy_workqueue(nvme_reset_wq);
5426destroy_wq:
5427	destroy_workqueue(nvme_wq);
5428out:
5429	return result;
5430}
5431
/*
 * Module exit: release what nvme_core_init() set up (authentication support,
 * the three device classes, both character device regions and the three
 * workqueues), then destroy the namespace-chardev minor and controller
 * instance IDAs.
 */
static void __exit nvme_core_exit(void)
{
	nvme_exit_auth();
	class_unregister(&nvme_ns_chr_class);
	class_unregister(&nvme_subsys_class);
	class_unregister(&nvme_class);
	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);
	/* The IDAs are not set up in nvme_core_init(); they are released here. */
	ida_destroy(&nvme_ns_chr_minor_ida);
	ida_destroy(&nvme_instance_ida);
}
5446
/*
 * Module metadata, and registration of nvme_core_init()/nvme_core_exit() as
 * the module's init and exit entry points.
 */
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
MODULE_DESCRIPTION("NVMe host core framework");
module_init(nvme_core_init);
module_exit(nvme_core_exit);