block/blk-zoned.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / block / blk-zoned.c
at master 69 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Zoned block device handling
   4 *
   5 * Copyright (c) 2015, Hannes Reinecke
   6 * Copyright (c) 2015, SUSE Linux GmbH
   7 *
   8 * Copyright (c) 2016, Damien Le Moal
   9 * Copyright (c) 2016, Western Digital
  10 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
  11 */
  12
  13#include <linux/kernel.h>
  14#include <linux/blkdev.h>
  15#include <linux/blk-mq.h>
  16#include <linux/spinlock.h>
  17#include <linux/refcount.h>
  18#include <linux/mempool.h>
  19
  20#include <trace/events/block.h>
  21
  22#include "blk.h"
  23#include "blk-mq-sched.h"
  24#include "blk-mq-debugfs.h"
  25
  26#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
  27static const char *const zone_cond_name[] = {
  28	ZONE_COND_NAME(NOT_WP),
  29	ZONE_COND_NAME(EMPTY),
  30	ZONE_COND_NAME(IMP_OPEN),
  31	ZONE_COND_NAME(EXP_OPEN),
  32	ZONE_COND_NAME(CLOSED),
  33	ZONE_COND_NAME(READONLY),
  34	ZONE_COND_NAME(FULL),
  35	ZONE_COND_NAME(OFFLINE),
  36	ZONE_COND_NAME(ACTIVE),
  37};
  38#undef ZONE_COND_NAME
  39
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @flags: Flags indicating the plug state (BLK_ZONE_WPLUG_* bits).
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @cond: Condition of the zone. It is initialized when the plug is inserted
 *        in the disk hash table and recomputed from @wp_offset when the
 *        write pointer offset of the plug is set.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};
  76
  77static inline bool disk_need_zone_resources(struct gendisk *disk)
  78{
  79	/*
  80	 * All request-based zoned devices need zone resources so that the
  81	 * block layer can automatically handle write BIO plugging. BIO-based
  82	 * device drivers (e.g. DM devices) are normally responsible for
  83	 * handling zone write ordering and do not need zone resources, unless
  84	 * the driver requires zone append emulation.
  85	 */
  86	return queue_is_mq(disk->queue) ||
  87		queue_emulates_zone_append(disk->queue);
  88}
  89
  90static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
  91{
  92	return 1U << disk->zone_wplugs_hash_bits;
  93}
  94
/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list not being empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)
 113
 114/**
 115 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 116 * @zone_cond: BLK_ZONE_COND_XXX.
 117 *
 118 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 119 * into string format. Useful in the debugging and tracing zone conditions. For
 120 * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
 121 */
 122const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 123{
 124	static const char *zone_cond_str = "UNKNOWN";
 125
 126	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
 127		zone_cond_str = zone_cond_name[zone_cond];
 128
 129	return zone_cond_str;
 130}
 131EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 132
 133static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
 134			      enum blk_zone_cond cond)
 135{
 136	if (!zones_cond)
 137		return;
 138
 139	switch (cond) {
 140	case BLK_ZONE_COND_IMP_OPEN:
 141	case BLK_ZONE_COND_EXP_OPEN:
 142	case BLK_ZONE_COND_CLOSED:
 143		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
 144		return;
 145	case BLK_ZONE_COND_NOT_WP:
 146	case BLK_ZONE_COND_EMPTY:
 147	case BLK_ZONE_COND_FULL:
 148	case BLK_ZONE_COND_OFFLINE:
 149	case BLK_ZONE_COND_READONLY:
 150	default:
 151		zones_cond[zno] = cond;
 152		return;
 153	}
 154}
 155
 156static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
 157			       enum blk_zone_cond cond)
 158{
 159	u8 *zones_cond;
 160
 161	rcu_read_lock();
 162	zones_cond = rcu_dereference(disk->zones_cond);
 163	if (zones_cond) {
 164		unsigned int zno = disk_zone_no(disk, sector);
 165
 166		/*
 167		 * The condition of a conventional, readonly and offline zones
 168		 * never changes, so do nothing if the target zone is in one of
 169		 * these conditions.
 170		 */
 171		switch (zones_cond[zno]) {
 172		case BLK_ZONE_COND_NOT_WP:
 173		case BLK_ZONE_COND_READONLY:
 174		case BLK_ZONE_COND_OFFLINE:
 175			break;
 176		default:
 177			blk_zone_set_cond(zones_cond, zno, cond);
 178			break;
 179		}
 180	}
 181	rcu_read_unlock();
 182}
 183
 184/**
 185 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
 186 * @bdev:       block device to check
 187 * @sector:     sector number
 188 *
 189 * Check if @sector on @bdev is contained in a sequential write required zone.
 190 */
 191bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
 192{
 193	struct gendisk *disk = bdev->bd_disk;
 194	unsigned int zno = disk_zone_no(disk, sector);
 195	bool is_seq = false;
 196	u8 *zones_cond;
 197
 198	if (!bdev_is_zoned(bdev))
 199		return false;
 200
 201	rcu_read_lock();
 202	zones_cond = rcu_dereference(disk->zones_cond);
 203	if (zones_cond && zno < disk->nr_zones)
 204		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
 205	rcu_read_unlock();
 206
 207	return is_seq;
 208}
 209EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
 210
/*
 * Zone report arguments for block device drivers report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: If true, disk_report_zone() reports the implicit open,
 *                 explicit open and closed zone conditions as
 *                 BLK_ZONE_COND_ACTIVE. This is set for zone reports done as
 *                 a fallback for a cached report.
 */
struct blk_report_zones_args {
	report_zones_cb cb;
	void		*data;
	bool		report_active;
};
 221
 222static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
 223				  unsigned int nr_zones,
 224				  struct blk_report_zones_args *args)
 225{
 226	struct gendisk *disk = bdev->bd_disk;
 227
 228	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
 229		return -EOPNOTSUPP;
 230
 231	if (!nr_zones || sector >= get_capacity(disk))
 232		return 0;
 233
 234	return disk->fops->report_zones(disk, sector, nr_zones, args);
 235}
 236
 237/**
 238 * blkdev_report_zones - Get zones information
 239 * @bdev:	Target block device
 240 * @sector:	Sector from which to report zones
 241 * @nr_zones:	Maximum number of zones to report
 242 * @cb:		Callback function called for each reported zone
 243 * @data:	Private data for the callback
 244 *
 245 * Description:
 246 *    Get zone information starting from the zone containing @sector for at most
 247 *    @nr_zones, and call @cb for each zone reported by the device.
 248 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 249 *    constant can be passed to @nr_zones.
 250 *    Returns the number of zones reported by the device, or a negative errno
 251 *    value in case of failure.
 252 *
 253 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 254 *    memory allocations done within this function.
 255 */
 256int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 257			unsigned int nr_zones, report_zones_cb cb, void *data)
 258{
 259	struct blk_report_zones_args args = {
 260		.cb = cb,
 261		.data = data,
 262	};
 263
 264	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
 265}
 266EXPORT_SYMBOL_GPL(blkdev_report_zones);
 267
 268static int blkdev_zone_reset_all(struct block_device *bdev)
 269{
 270	struct bio bio;
 271
 272	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
 273	trace_blkdev_zone_mgmt(&bio, 0);
 274	return submit_bio_wait(&bio);
 275}
 276
 277/**
 278 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 279 * @bdev:	Target block device
 280 * @op:		Operation to be performed on the zones
 281 * @sector:	Start sector of the first zone to operate on
 282 * @nr_sectors:	Number of sectors, should be at least the length of one zone and
 283 *		must be zone size aligned.
 284 *
 285 * Description:
 286 *    Perform the specified operation on the range of zones specified by
 287 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 288 *    is valid, but the specified range should not contain conventional zones.
 289 *    The operation to execute on each zone can be a zone reset, open, close
 290 *    or finish request.
 291 */
 292int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 293		     sector_t sector, sector_t nr_sectors)
 294{
 295	sector_t zone_sectors = bdev_zone_sectors(bdev);
 296	sector_t capacity = bdev_nr_sectors(bdev);
 297	sector_t end_sector = sector + nr_sectors;
 298	struct bio *bio = NULL;
 299	int ret = 0;
 300
 301	if (!bdev_is_zoned(bdev))
 302		return -EOPNOTSUPP;
 303
 304	if (bdev_read_only(bdev))
 305		return -EPERM;
 306
 307	if (!op_is_zone_mgmt(op))
 308		return -EOPNOTSUPP;
 309
 310	if (end_sector <= sector || end_sector > capacity)
 311		/* Out of range */
 312		return -EINVAL;
 313
 314	/* Check alignment (handle eventual smaller last zone) */
 315	if (!bdev_is_zone_start(bdev, sector))
 316		return -EINVAL;
 317
 318	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
 319		return -EINVAL;
 320
 321	/*
 322	 * In the case of a zone reset operation over all zones, use
 323	 * REQ_OP_ZONE_RESET_ALL.
 324	 */
 325	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
 326		return blkdev_zone_reset_all(bdev);
 327
 328	while (sector < end_sector) {
 329		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
 330		bio->bi_iter.bi_sector = sector;
 331		sector += zone_sectors;
 332
 333		/* This may take a while, so be nice to others */
 334		cond_resched();
 335	}
 336
 337	trace_blkdev_zone_mgmt(bio, nr_sectors);
 338	ret = submit_bio_wait(bio);
 339	bio_put(bio);
 340
 341	return ret;
 342}
 343EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
 344
/*
 * Private data of the blkdev_copy_zone_to_user() report zones callback.
 * @zones: User space array receiving the reported zones, indexed by the zone
 *         index in the report.
 */
struct zone_report_args {
	struct blk_zone __user *zones;
};
 348
 349static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
 350				    void *data)
 351{
 352	struct zone_report_args *args = data;
 353
 354	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
 355		return -EFAULT;
 356	return 0;
 357}
 358
/*
 * Mask of valid input flags for BLKREPORTZONEV2 ioctl. A BLKREPORTZONEV2
 * request with any flag outside of this mask set is rejected with -EINVAL.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
 363
 364/*
 365 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 366 * Called from blkdev_ioctl.
 367 */
 368int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
 369		unsigned long arg)
 370{
 371	void __user *argp = (void __user *)arg;
 372	struct zone_report_args args;
 373	struct blk_zone_report rep;
 374	int ret;
 375
 376	if (!argp)
 377		return -EINVAL;
 378
 379	if (!bdev_is_zoned(bdev))
 380		return -ENOTTY;
 381
 382	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
 383		return -EFAULT;
 384
 385	if (!rep.nr_zones)
 386		return -EINVAL;
 387
 388	args.zones = argp + sizeof(struct blk_zone_report);
 389
 390	switch (cmd) {
 391	case BLKREPORTZONE:
 392		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
 393					  blkdev_copy_zone_to_user, &args);
 394		break;
 395	case BLKREPORTZONEV2:
 396		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
 397			return -EINVAL;
 398		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
 399					 blkdev_copy_zone_to_user, &args);
 400		break;
 401	default:
 402		return -EINVAL;
 403	}
 404
 405	if (ret < 0)
 406		return ret;
 407
 408	rep.nr_zones = ret;
 409	rep.flags = BLK_ZONE_REP_CAPACITY;
 410	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
 411		return -EFAULT;
 412	return 0;
 413}
 414
 415static int blkdev_truncate_zone_range(struct block_device *bdev,
 416		blk_mode_t mode, const struct blk_zone_range *zrange)
 417{
 418	loff_t start, end;
 419
 420	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
 421	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
 422		/* Out of range */
 423		return -EINVAL;
 424
 425	start = zrange->sector << SECTOR_SHIFT;
 426	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
 427
 428	return truncate_bdev_range(bdev, mode, start, end);
 429}
 430
 431/*
 432 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 433 * Called from blkdev_ioctl.
 434 */
 435int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 436			   unsigned int cmd, unsigned long arg)
 437{
 438	void __user *argp = (void __user *)arg;
 439	struct blk_zone_range zrange;
 440	enum req_op op;
 441	int ret;
 442
 443	if (!argp)
 444		return -EINVAL;
 445
 446	if (!bdev_is_zoned(bdev))
 447		return -ENOTTY;
 448
 449	if (!(mode & BLK_OPEN_WRITE))
 450		return -EBADF;
 451
 452	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
 453		return -EFAULT;
 454
 455	switch (cmd) {
 456	case BLKRESETZONE:
 457		op = REQ_OP_ZONE_RESET;
 458
 459		/* Invalidate the page cache, including dirty pages. */
 460		inode_lock(bdev->bd_mapping->host);
 461		filemap_invalidate_lock(bdev->bd_mapping);
 462		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
 463		if (ret)
 464			goto fail;
 465		break;
 466	case BLKOPENZONE:
 467		op = REQ_OP_ZONE_OPEN;
 468		break;
 469	case BLKCLOSEZONE:
 470		op = REQ_OP_ZONE_CLOSE;
 471		break;
 472	case BLKFINISHZONE:
 473		op = REQ_OP_ZONE_FINISH;
 474		break;
 475	default:
 476		return -ENOTTY;
 477	}
 478
 479	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
 480
 481fail:
 482	if (cmd == BLKRESETZONE) {
 483		filemap_invalidate_unlock(bdev->bd_mapping);
 484		inode_unlock(bdev->bd_mapping->host);
 485	}
 486
 487	return ret;
 488}
 489
 490static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
 491{
 492	return zone->start + zone->len >= get_capacity(disk);
 493}
 494
 495static bool disk_zone_is_full(struct gendisk *disk,
 496			      unsigned int zno, unsigned int offset_in_zone)
 497{
 498	if (zno < disk->nr_zones - 1)
 499		return offset_in_zone >= disk->zone_capacity;
 500	return offset_in_zone >= disk->last_zone_capacity;
 501}
 502
 503static bool disk_zone_wplug_is_full(struct gendisk *disk,
 504				    struct blk_zone_wplug *zwplug)
 505{
 506	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
 507}
 508
 509static bool disk_insert_zone_wplug(struct gendisk *disk,
 510				   struct blk_zone_wplug *zwplug)
 511{
 512	struct blk_zone_wplug *zwplg;
 513	unsigned long flags;
 514	u8 *zones_cond;
 515	unsigned int idx =
 516		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
 517
 518	/*
 519	 * Add the new zone write plug to the hash table, but carefully as we
 520	 * are racing with other submission context, so we may already have a
 521	 * zone write plug for the same zone.
 522	 */
 523	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
 524	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
 525		if (zwplg->zone_no == zwplug->zone_no) {
 526			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 527			return false;
 528		}
 529	}
 530
 531	/*
 532	 * Set the zone condition: if we do not yet have a zones_cond array
 533	 * attached to the disk, then this is a zone write plug insert from the
 534	 * first call to blk_revalidate_disk_zones(), in which case the zone is
 535	 * necessarilly in the active condition.
 536	 */
 537	zones_cond = rcu_dereference_check(disk->zones_cond,
 538				lockdep_is_held(&disk->zone_wplugs_lock));
 539	if (zones_cond)
 540		zwplug->cond = zones_cond[zwplug->zone_no];
 541	else
 542		zwplug->cond = BLK_ZONE_COND_ACTIVE;
 543
 544	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
 545	atomic_inc(&disk->nr_zone_wplugs);
 546	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 547
 548	return true;
 549}
 550
 551static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
 552							 sector_t sector)
 553{
 554	unsigned int zno = disk_zone_no(disk, sector);
 555	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
 556	struct blk_zone_wplug *zwplug;
 557
 558	rcu_read_lock();
 559
 560	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
 561		if (zwplug->zone_no == zno &&
 562		    refcount_inc_not_zero(&zwplug->ref)) {
 563			rcu_read_unlock();
 564			return zwplug;
 565		}
 566	}
 567
 568	rcu_read_unlock();
 569
 570	return NULL;
 571}
 572
 573static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
 574							 sector_t sector)
 575{
 576	if (!atomic_read(&disk->nr_zone_wplugs))
 577		return NULL;
 578
 579	return disk_get_hashed_zone_wplug(disk, sector);
 580}
 581
 582static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
 583{
 584	struct blk_zone_wplug *zwplug =
 585		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
 586
 587	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
 588}
 589
 590static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
 591{
 592	if (refcount_dec_and_test(&zwplug->ref)) {
 593		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
 594		WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
 595		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
 596
 597		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
 598	}
 599}
 600
 601static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
 602						 struct blk_zone_wplug *zwplug)
 603{
 604	lockdep_assert_held(&zwplug->lock);
 605
 606	/* If the zone write plug was already removed, we are done. */
 607	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
 608		return false;
 609
 610	/* If the zone write plug is still plugged, it cannot be removed. */
 611	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
 612		return false;
 613
 614	/*
 615	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
 616	 * happen after handling a request completion with
 617	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
 618	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
 619	 * should not attempt to remove the zone write plug until all BIO
 620	 * completions are seen. Check by looking at the zone write plug
 621	 * reference count, which is 2 when the plug is unused (one reference
 622	 * taken when the plug was allocated and another reference taken by the
 623	 * caller context).
 624	 */
 625	if (refcount_read(&zwplug->ref) > 2)
 626		return false;
 627
 628	/* We can remove zone write plugs for zones that are empty or full. */
 629	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
 630}
 631
 632static void disk_remove_zone_wplug(struct gendisk *disk,
 633				   struct blk_zone_wplug *zwplug)
 634{
 635	unsigned long flags;
 636
 637	/* If the zone write plug was already removed, we have nothing to do. */
 638	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
 639		return;
 640
 641	/*
 642	 * Mark the zone write plug as unhashed and drop the extra reference we
 643	 * took when the plug was inserted in the hash table. Also update the
 644	 * disk zone condition array with the current condition of the zone
 645	 * write plug.
 646	 */
 647	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
 648	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
 649	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
 650				lockdep_is_held(&disk->zone_wplugs_lock)),
 651			  zwplug->zone_no, zwplug->cond);
 652	hlist_del_init_rcu(&zwplug->node);
 653	atomic_dec(&disk->nr_zone_wplugs);
 654	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 655	disk_put_zone_wplug(zwplug);
 656}
 657
 658static void blk_zone_wplug_bio_work(struct work_struct *work);
 659
/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 *
 * @disk: The disk owning the zone.
 * @sector: A sector contained in the target zone. For a newly allocated plug,
 *          the offset of @sector from the start of its zone is used as the
 *          initial write pointer offset.
 * @gfp_mask: Allocation flags used if a new plug must be allocated.
 * @flags: Where the interrupt state saved when locking the plug is stored.
 *
 * Returns NULL, with no lock held, if a new plug is needed and its allocation
 * fails.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	/*
	 * The zone condition of the plug is not set here: it is initialized
	 * by disk_insert_zone_wplug() when the plug is hashed.
	 */
	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	/* Lock the plug before hashing it, as it must be returned locked. */
	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case. The new plug was never hashed, so it is freed directly
	 * instead of going through disk_put_zone_wplug().
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}
 724
/*
 * Fail @bio, a BIO that was plugged in @zwplug: clear its
 * BIO_ZONE_WRITE_PLUGGING flag, complete it with an I/O error, and drop both
 * a reference on the zone write plug and a queue usage reference.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	/*
	 * Get the queue first: disk_put_zone_wplug() below may drop the last
	 * reference on the plug, after which it must not be dereferenced.
	 */
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}
 736
 737/*
 738 * Abort (fail) all plugged BIOs of a zone write plug.
 739 */
 740static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
 741{
 742	struct bio *bio;
 743
 744	lockdep_assert_held(&zwplug->lock);
 745
 746	if (bio_list_empty(&zwplug->bio_list))
 747		return;
 748
 749	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
 750			    zwplug->disk->disk_name, zwplug->zone_no);
 751	while ((bio = bio_list_pop(&zwplug->bio_list)))
 752		blk_zone_wplug_bio_io_error(zwplug, bio);
 753
 754	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
 755}
 756
 757/*
 758 * Update a zone write plug condition based on the write pointer offset.
 759 */
 760static void disk_zone_wplug_update_cond(struct gendisk *disk,
 761					struct blk_zone_wplug *zwplug)
 762{
 763	lockdep_assert_held(&zwplug->lock);
 764
 765	if (disk_zone_wplug_is_full(disk, zwplug))
 766		zwplug->cond = BLK_ZONE_COND_FULL;
 767	else if (!zwplug->wp_offset)
 768		zwplug->cond = BLK_ZONE_COND_EMPTY;
 769	else
 770		zwplug->cond = BLK_ZONE_COND_ACTIVE;
 771}
 772
/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 *
 * @disk: The disk owning the zone write plug.
 * @zwplug: The zone write plug to update. Its lock must be held.
 * @wp_offset: The new write pointer offset, relative to the zone start.
 *
 * The BLK_ZONE_WPLUG_NEED_WP_UPDATE flag of the plug is cleared and the zone
 * condition of the plug is recomputed from the new offset.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);
}
 800
 801static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
 802{
 803	switch (zone->cond) {
 804	case BLK_ZONE_COND_IMP_OPEN:
 805	case BLK_ZONE_COND_EXP_OPEN:
 806	case BLK_ZONE_COND_CLOSED:
 807	case BLK_ZONE_COND_ACTIVE:
 808		return zone->wp - zone->start;
 809	case BLK_ZONE_COND_EMPTY:
 810		return 0;
 811	case BLK_ZONE_COND_FULL:
 812	case BLK_ZONE_COND_NOT_WP:
 813	case BLK_ZONE_COND_OFFLINE:
 814	case BLK_ZONE_COND_READONLY:
 815	default:
 816		/*
 817		 * Conventional, full, offline and read-only zones do not have
 818		 * a valid write pointer.
 819		 */
 820		return UINT_MAX;
 821	}
 822}
 823
 824static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
 825						   struct blk_zone *zone)
 826{
 827	struct blk_zone_wplug *zwplug;
 828	unsigned int wp_offset = blk_zone_wp_offset(zone);
 829
 830	zwplug = disk_get_zone_wplug(disk, zone->start);
 831	if (zwplug) {
 832		unsigned long flags;
 833
 834		spin_lock_irqsave(&zwplug->lock, flags);
 835		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
 836			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
 837		spin_unlock_irqrestore(&zwplug->lock, flags);
 838		disk_put_zone_wplug(zwplug);
 839	}
 840
 841	return wp_offset;
 842}
 843
 844/**
 845 * disk_report_zone - Report one zone
 846 * @disk:	Target disk
 847 * @zone:	The zone to report
 848 * @idx:	The index of the zone in the overall zone report
 849 * @args:	report zones callback and data
 850 *
 851 * Description:
 852 *    Helper function for block device drivers to report one zone of a zone
 853 *    report initiated with blkdev_report_zones(). The zone being reported is
 854 *    specified by @zone and used to update, if necessary, the zone write plug
 855 *    information for the zone. If @args specifies a user callback function,
 856 *    this callback is executed.
 857 */
 858int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 859		     unsigned int idx, struct blk_report_zones_args *args)
 860{
 861	if (args && args->report_active) {
 862		/*
 863		 * If we come here, then this is a report zones as a fallback
 864		 * for a cached report. So collapse the implicit open, explicit
 865		 * open and closed conditions into the active zone condition.
 866		 */
 867		switch (zone->cond) {
 868		case BLK_ZONE_COND_IMP_OPEN:
 869		case BLK_ZONE_COND_EXP_OPEN:
 870		case BLK_ZONE_COND_CLOSED:
 871			zone->cond = BLK_ZONE_COND_ACTIVE;
 872			break;
 873		default:
 874			break;
 875		}
 876	}
 877
 878	if (disk->zone_wplugs_hash)
 879		disk_zone_wplug_sync_wp_offset(disk, zone);
 880
 881	if (args && args->cb)
 882		return args->cb(zone, idx, args->data);
 883
 884	return 0;
 885}
 886EXPORT_SYMBOL_GPL(disk_report_zone);
 887
 888static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
 889				 void *data)
 890{
 891	memcpy(data, zone, sizeof(struct blk_zone));
 892	return 0;
 893}
 894
 895static int blkdev_report_zone_fallback(struct block_device *bdev,
 896				       sector_t sector, struct blk_zone *zone)
 897{
 898	struct blk_report_zones_args args = {
 899		.cb = blkdev_report_zone_cb,
 900		.data = zone,
 901		.report_active = true,
 902	};
 903	int error;
 904
 905	error = blkdev_do_report_zones(bdev, sector, 1, &args);
 906	if (error < 0)
 907		return error;
 908	if (error == 0)
 909		return -EIO;
 910	return 0;
 911}
 912
 913/*
 914 * For devices that natively support zone append operations, we do not use zone
 915 * write plugging for zone append writes, which makes the zone condition
 916 * tracking invalid once zone append was used.  In that case fall back to a
 917 * regular report zones to get correct information.
 918 */
 919static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
 920{
 921	return disk_need_zone_resources(bdev->bd_disk) &&
 922		(bdev_emulates_zone_append(bdev) ||
 923		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
 924}
 925
 926/**
 927 * blkdev_get_zone_info - Get a single zone information from cached data
 928 * @bdev:   Target block device
 929 * @sector: Sector contained by the target zone
 930 * @zone:   zone structure to return the zone information
 931 *
 932 * Description:
 933 *    Get the zone information for the zone containing @sector using the zone
 934 *    write plug of the target zone, if one exist, or the disk zone condition
 935 *    array otherwise. The zone condition may be reported as being
 936 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 937 *    open, explicit open or closed condition.
 938 *
 939 *    Returns 0 on success and a negative error code on failure.
 940 */
 941int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 942			 struct blk_zone *zone)
 943{
 944	struct gendisk *disk = bdev->bd_disk;
 945	sector_t zone_sectors = bdev_zone_sectors(bdev);
 946	struct blk_zone_wplug *zwplug;
 947	unsigned long flags;
 948	u8 *zones_cond;
 949
 950	if (!bdev_is_zoned(bdev))
 951		return -EOPNOTSUPP;
 952
 953	if (sector >= get_capacity(disk))
 954		return -EINVAL;
 955
 956	memset(zone, 0, sizeof(*zone));
 957	sector = bdev_zone_start(bdev, sector);
 958
 959	if (!blkdev_has_cached_report_zones(bdev))
 960		return blkdev_report_zone_fallback(bdev, sector, zone);
 961
 962	rcu_read_lock();
 963	zones_cond = rcu_dereference(disk->zones_cond);
 964	if (!disk->zone_wplugs_hash || !zones_cond) {
 965		rcu_read_unlock();
 966		return blkdev_report_zone_fallback(bdev, sector, zone);
 967	}
 968	zone->cond = zones_cond[disk_zone_no(disk, sector)];
 969	rcu_read_unlock();
 970
 971	zone->start = sector;
 972	zone->len = zone_sectors;
 973
 974	/*
 975	 * If this is a conventional zone, we do not have a zone write plug and
 976	 * can report the zone immediately.
 977	 */
 978	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
 979		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
 980		zone->capacity = zone_sectors;
 981		zone->wp = ULLONG_MAX;
 982		return 0;
 983	}
 984
 985	/*
 986	 * This is a sequential write required zone. If the zone is read-only or
 987	 * offline, only set the zone write pointer to an invalid value and
 988	 * report the zone.
 989	 */
 990	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
 991	if (disk_zone_is_last(disk, zone))
 992		zone->capacity = disk->last_zone_capacity;
 993	else
 994		zone->capacity = disk->zone_capacity;
 995
 996	if (zone->cond == BLK_ZONE_COND_READONLY ||
 997	    zone->cond == BLK_ZONE_COND_OFFLINE) {
 998		zone->wp = ULLONG_MAX;
 999		return 0;
1000	}
1001
1002	/*
1003	 * If the zone does not have a zone write plug, it is either full or
1004	 * empty, as we otherwise would have a zone write plug for it. In this
1005	 * case, set the write pointer accordingly and report the zone.
1006	 * Otherwise, if we have a zone write plug, use it.
1007	 */
1008	zwplug = disk_get_zone_wplug(disk, sector);
1009	if (!zwplug) {
1010		if (zone->cond == BLK_ZONE_COND_FULL)
1011			zone->wp = ULLONG_MAX;
1012		else
1013			zone->wp = sector;
1014		return 0;
1015	}
1016
1017	spin_lock_irqsave(&zwplug->lock, flags);
1018	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
1019		spin_unlock_irqrestore(&zwplug->lock, flags);
1020		disk_put_zone_wplug(zwplug);
1021		return blkdev_report_zone_fallback(bdev, sector, zone);
1022	}
1023	zone->cond = zwplug->cond;
1024	zone->wp = sector + zwplug->wp_offset;
1025	spin_unlock_irqrestore(&zwplug->lock, flags);
1026
1027	disk_put_zone_wplug(zwplug);
1028
1029	return 0;
1030}
1031EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
1032
1033/**
1034 * blkdev_report_zones_cached - Get cached zones information
1035 * @bdev:     Target block device
1036 * @sector:   Sector from which to report zones
1037 * @nr_zones: Maximum number of zones to report
1038 * @cb:       Callback function called for each reported zone
1039 * @data:     Private data for the callback function
1040 *
1041 * Description:
1042 *    Similar to blkdev_report_zones() but instead of calling into the low level
1043 *    device driver to get the zone report from the device, use
1044 *    blkdev_get_zone_info() to generate the report from the disk zone write
1045 *    plugs and zones condition array. Since calling this function without a
1046 *    callback does not make sense, @cb must be specified.
1047 */
1048int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1049			unsigned int nr_zones, report_zones_cb cb, void *data)
1050{
1051	struct gendisk *disk = bdev->bd_disk;
1052	sector_t capacity = get_capacity(disk);
1053	sector_t zone_sectors = bdev_zone_sectors(bdev);
1054	unsigned int idx = 0;
1055	struct blk_zone zone;
1056	int ret;
1057
1058	if (!cb || !bdev_is_zoned(bdev) ||
1059	    WARN_ON_ONCE(!disk->fops->report_zones))
1060		return -EOPNOTSUPP;
1061
1062	if (!nr_zones || sector >= capacity)
1063		return 0;
1064
1065	if (!blkdev_has_cached_report_zones(bdev)) {
1066		struct blk_report_zones_args args = {
1067			.cb = cb,
1068			.data = data,
1069			.report_active = true,
1070		};
1071
1072		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
1073	}
1074
1075	for (sector = bdev_zone_start(bdev, sector);
1076	     sector < capacity && idx < nr_zones;
1077	     sector += zone_sectors, idx++) {
1078		ret = blkdev_get_zone_info(bdev, sector, &zone);
1079		if (ret)
1080			return ret;
1081
1082		ret = cb(&zone, idx, data);
1083		if (ret)
1084			return ret;
1085	}
1086
1087	return idx;
1088}
1089EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
1090
1091static void blk_zone_reset_bio_endio(struct bio *bio)
1092{
1093	struct gendisk *disk = bio->bi_bdev->bd_disk;
1094	sector_t sector = bio->bi_iter.bi_sector;
1095	struct blk_zone_wplug *zwplug;
1096
1097	/*
1098	 * If we have a zone write plug, set its write pointer offset to 0.
1099	 * This will abort all BIOs plugged for the target zone. It is fine as
1100	 * resetting zones while writes are still in-flight will result in the
1101	 * writes failing anyway.
1102	 */
1103	zwplug = disk_get_zone_wplug(disk, sector);
1104	if (zwplug) {
1105		unsigned long flags;
1106
1107		spin_lock_irqsave(&zwplug->lock, flags);
1108		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1109		spin_unlock_irqrestore(&zwplug->lock, flags);
1110		disk_put_zone_wplug(zwplug);
1111	} else {
1112		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1113	}
1114}
1115
1116static void blk_zone_reset_all_bio_endio(struct bio *bio)
1117{
1118	struct gendisk *disk = bio->bi_bdev->bd_disk;
1119	sector_t capacity = get_capacity(disk);
1120	struct blk_zone_wplug *zwplug;
1121	unsigned long flags;
1122	sector_t sector;
1123	unsigned int i;
1124
1125	if (atomic_read(&disk->nr_zone_wplugs)) {
1126		/* Update the condition of all zone write plugs. */
1127		rcu_read_lock();
1128		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1129			hlist_for_each_entry_rcu(zwplug,
1130						 &disk->zone_wplugs_hash[i],
1131						 node) {
1132				spin_lock_irqsave(&zwplug->lock, flags);
1133				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1134				spin_unlock_irqrestore(&zwplug->lock, flags);
1135			}
1136		}
1137		rcu_read_unlock();
1138	}
1139
1140	/* Update the cached zone conditions. */
1141	for (sector = 0; sector < capacity;
1142	     sector += bdev_zone_sectors(bio->bi_bdev))
1143		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1144	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
1145}
1146
1147static void blk_zone_finish_bio_endio(struct bio *bio)
1148{
1149	struct block_device *bdev = bio->bi_bdev;
1150	struct gendisk *disk = bdev->bd_disk;
1151	sector_t sector = bio->bi_iter.bi_sector;
1152	struct blk_zone_wplug *zwplug;
1153
1154	/*
1155	 * If we have a zone write plug, set its write pointer offset to the
1156	 * zone size. This will abort all BIOs plugged for the target zone. It
1157	 * is fine as resetting zones while writes are still in-flight will
1158	 * result in the writes failing anyway.
1159	 */
1160	zwplug = disk_get_zone_wplug(disk, sector);
1161	if (zwplug) {
1162		unsigned long flags;
1163
1164		spin_lock_irqsave(&zwplug->lock, flags);
1165		disk_zone_wplug_set_wp_offset(disk, zwplug,
1166					      bdev_zone_sectors(bdev));
1167		spin_unlock_irqrestore(&zwplug->lock, flags);
1168		disk_put_zone_wplug(zwplug);
1169	} else {
1170		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
1171	}
1172}
1173
1174void blk_zone_mgmt_bio_endio(struct bio *bio)
1175{
1176	/* If the BIO failed, we have nothing to do. */
1177	if (bio->bi_status != BLK_STS_OK)
1178		return;
1179
1180	switch (bio_op(bio)) {
1181	case REQ_OP_ZONE_RESET:
1182		blk_zone_reset_bio_endio(bio);
1183		return;
1184	case REQ_OP_ZONE_RESET_ALL:
1185		blk_zone_reset_all_bio_endio(bio);
1186		return;
1187	case REQ_OP_ZONE_FINISH:
1188		blk_zone_finish_bio_endio(bio);
1189		return;
1190	default:
1191		return;
1192	}
1193}
1194
1195static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
1196					      struct blk_zone_wplug *zwplug)
1197{
1198	lockdep_assert_held(&zwplug->lock);
1199
1200	/*
1201	 * Take a reference on the zone write plug and schedule the submission
1202	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
1203	 * reference we take here.
1204	 */
1205	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
1206	refcount_inc(&zwplug->ref);
1207	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
1208}
1209
1210static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
1211				struct blk_zone_wplug *zwplug,
1212				struct bio *bio, unsigned int nr_segs)
1213{
1214	/*
1215	 * Grab an extra reference on the BIO request queue usage counter.
1216	 * This reference will be reused to submit a request for the BIO for
1217	 * blk-mq devices and dropped when the BIO is failed and after
1218	 * it is issued in the case of BIO-based devices.
1219	 */
1220	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
1221
1222	/*
1223	 * The BIO is being plugged and thus will have to wait for the on-going
1224	 * write and for all other writes already plugged. So polling makes
1225	 * no sense.
1226	 */
1227	bio_clear_polled(bio);
1228
1229	/*
1230	 * Reuse the poll cookie field to store the number of segments when
1231	 * split to the hardware limits.
1232	 */
1233	bio->__bi_nr_segments = nr_segs;
1234
1235	/*
1236	 * We always receive BIOs after they are split and ready to be issued.
1237	 * The block layer passes the parts of a split BIO in order, and the
1238	 * user must also issue write sequentially. So simply add the new BIO
1239	 * at the tail of the list to preserve the sequential write order.
1240	 */
1241	bio_list_add(&zwplug->bio_list, bio);
1242	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
1243				      bio->bi_iter.bi_sector, bio_sectors(bio));
1244}
1245
1246/*
1247 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
1248 */
1249void blk_zone_write_plug_bio_merged(struct bio *bio)
1250{
1251	struct gendisk *disk = bio->bi_bdev->bd_disk;
1252	struct blk_zone_wplug *zwplug;
1253	unsigned long flags;
1254
1255	/*
1256	 * If the BIO was already plugged, then we were called through
1257	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
1258	 * For this case, we already hold a reference on the zone write plug for
1259	 * the BIO and blk_zone_write_plug_init_request() will handle the
1260	 * zone write pointer offset update.
1261	 */
1262	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1263		return;
1264
1265	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1266
1267	/*
1268	 * Get a reference on the zone write plug of the target zone and advance
1269	 * the zone write pointer offset. Given that this is a merge, we already
1270	 * have at least one request and one BIO referencing the zone write
1271	 * plug. So this should not fail.
1272	 */
1273	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1274	if (WARN_ON_ONCE(!zwplug))
1275		return;
1276
1277	spin_lock_irqsave(&zwplug->lock, flags);
1278	zwplug->wp_offset += bio_sectors(bio);
1279	disk_zone_wplug_update_cond(disk, zwplug);
1280	spin_unlock_irqrestore(&zwplug->lock, flags);
1281}
1282
1283/*
1284 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
1285 * already went through zone write plugging (either a new BIO or one that was
1286 * unplugged).
1287 */
1288void blk_zone_write_plug_init_request(struct request *req)
1289{
1290	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
1291	struct request_queue *q = req->q;
1292	struct gendisk *disk = q->disk;
1293	struct blk_zone_wplug *zwplug =
1294		disk_get_zone_wplug(disk, blk_rq_pos(req));
1295	unsigned long flags;
1296	struct bio *bio;
1297
1298	if (WARN_ON_ONCE(!zwplug))
1299		return;
1300
1301	/*
1302	 * Indicate that completion of this request needs to be handled with
1303	 * blk_zone_write_plug_finish_request(), which will drop the reference
1304	 * on the zone write plug we took above on entry to this function.
1305	 */
1306	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
1307
1308	if (blk_queue_nomerges(q))
1309		return;
1310
1311	/*
1312	 * Walk through the list of plugged BIOs to check if they can be merged
1313	 * into the back of the request.
1314	 */
1315	spin_lock_irqsave(&zwplug->lock, flags);
1316	while (!disk_zone_wplug_is_full(disk, zwplug)) {
1317		bio = bio_list_peek(&zwplug->bio_list);
1318		if (!bio)
1319			break;
1320
1321		if (bio->bi_iter.bi_sector != req_back_sector ||
1322		    !blk_rq_merge_ok(req, bio))
1323			break;
1324
1325		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
1326			     !bio->__bi_nr_segments);
1327
1328		bio_list_pop(&zwplug->bio_list);
1329		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
1330		    BIO_MERGE_OK) {
1331			bio_list_add_head(&zwplug->bio_list, bio);
1332			break;
1333		}
1334
1335		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
1336		blk_queue_exit(q);
1337		zwplug->wp_offset += bio_sectors(bio);
1338		disk_zone_wplug_update_cond(disk, zwplug);
1339
1340		req_back_sector += bio_sectors(bio);
1341	}
1342	spin_unlock_irqrestore(&zwplug->lock, flags);
1343}
1344
1345/*
1346 * Check and prepare a BIO for submission by incrementing the write pointer
1347 * offset of its zone write plug and changing zone append operations into
1348 * regular write when zone append emulation is needed.
1349 */
1350static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
1351				       struct bio *bio)
1352{
1353	struct gendisk *disk = bio->bi_bdev->bd_disk;
1354
1355	lockdep_assert_held(&zwplug->lock);
1356
1357	/*
1358	 * If we lost track of the zone write pointer due to a write error,
1359	 * the user must either execute a report zones, reset the zone or finish
1360	 * the to recover a reliable write pointer position. Fail BIOs if the
1361	 * user did not do that as we cannot handle emulated zone append
1362	 * otherwise.
1363	 */
1364	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
1365		return false;
1366
1367	/*
1368	 * Check that the user is not attempting to write to a full zone.
1369	 * We know such BIO will fail, and that would potentially overflow our
1370	 * write pointer offset beyond the end of the zone.
1371	 */
1372	if (disk_zone_wplug_is_full(disk, zwplug))
1373		return false;
1374
1375	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1376		/*
1377		 * Use a regular write starting at the current write pointer.
1378		 * Similarly to native zone append operations, do not allow
1379		 * merging.
1380		 */
1381		bio->bi_opf &= ~REQ_OP_MASK;
1382		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
1383		bio->bi_iter.bi_sector += zwplug->wp_offset;
1384
1385		/*
1386		 * Remember that this BIO is in fact a zone append operation
1387		 * so that we can restore its operation code on completion.
1388		 */
1389		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
1390	} else {
1391		/*
1392		 * Check for non-sequential writes early as we know that BIOs
1393		 * with a start sector not unaligned to the zone write pointer
1394		 * will fail.
1395		 */
1396		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
1397			return false;
1398	}
1399
1400	/* Advance the zone write pointer offset. */
1401	zwplug->wp_offset += bio_sectors(bio);
1402	disk_zone_wplug_update_cond(disk, zwplug);
1403
1404	return true;
1405}
1406
1407static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
1408{
1409	struct gendisk *disk = bio->bi_bdev->bd_disk;
1410	sector_t sector = bio->bi_iter.bi_sector;
1411	struct blk_zone_wplug *zwplug;
1412	gfp_t gfp_mask = GFP_NOIO;
1413	unsigned long flags;
1414
1415	/*
1416	 * BIOs must be fully contained within a zone so that we use the correct
1417	 * zone write plug for the entire BIO. For blk-mq devices, the block
1418	 * layer should already have done any splitting required to ensure this
1419	 * and this BIO should thus not be straddling zone boundaries. For
1420	 * BIO-based devices, it is the responsibility of the driver to split
1421	 * the bio before submitting it.
1422	 */
1423	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
1424		bio_io_error(bio);
1425		return true;
1426	}
1427
1428	/* Conventional zones do not need write plugging. */
1429	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
1430		/* Zone append to conventional zones is not allowed. */
1431		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1432			bio_io_error(bio);
1433			return true;
1434		}
1435		return false;
1436	}
1437
1438	if (bio->bi_opf & REQ_NOWAIT)
1439		gfp_mask = GFP_NOWAIT;
1440
1441	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
1442	if (!zwplug) {
1443		if (bio->bi_opf & REQ_NOWAIT)
1444			bio_wouldblock_error(bio);
1445		else
1446			bio_io_error(bio);
1447		return true;
1448	}
1449
1450	/* Indicate that this BIO is being handled using zone write plugging. */
1451	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1452
1453	/*
1454	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
1455	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
1456	 */
1457	if (bio->bi_opf & REQ_NOWAIT) {
1458		bio->bi_opf &= ~REQ_NOWAIT;
1459		goto queue_bio;
1460	}
1461
1462	/* If the zone is already plugged, add the BIO to the BIO plug list. */
1463	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
1464		goto queue_bio;
1465
1466	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1467		spin_unlock_irqrestore(&zwplug->lock, flags);
1468		bio_io_error(bio);
1469		return true;
1470	}
1471
1472	/* Otherwise, plug and let the caller submit the BIO. */
1473	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1474
1475	spin_unlock_irqrestore(&zwplug->lock, flags);
1476
1477	return false;
1478
1479queue_bio:
1480	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
1481
1482	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
1483		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1484		disk_zone_wplug_schedule_bio_work(disk, zwplug);
1485	}
1486
1487	spin_unlock_irqrestore(&zwplug->lock, flags);
1488
1489	return true;
1490}
1491
1492static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1493{
1494	struct gendisk *disk = bio->bi_bdev->bd_disk;
1495	struct blk_zone_wplug *zwplug;
1496	unsigned long flags;
1497
1498	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
1499		set_bit(GD_ZONE_APPEND_USED, &disk->state);
1500
1501	/*
1502	 * We have native support for zone append operations, so we are not
1503	 * going to handle @bio through plugging. However, we may already have a
1504	 * zone write plug for the target zone if that zone was previously
1505	 * partially written using regular writes. In such case, we risk leaving
1506	 * the plug in the disk hash table if the zone is fully written using
1507	 * zone append operations. Avoid this by removing the zone write plug.
1508	 */
1509	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1510	if (likely(!zwplug))
1511		return;
1512
1513	spin_lock_irqsave(&zwplug->lock, flags);
1514
1515	/*
1516	 * We are about to remove the zone write plug. But if the user
1517	 * (mistakenly) has issued regular writes together with native zone
1518	 * append, we must aborts the writes as otherwise the plugged BIOs would
1519	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
1520	 * return NULL after the plug is removed. Aborting the plugged write
1521	 * BIOs is consistent with the fact that these writes will most likely
1522	 * fail anyway as there is no ordering guarantees between zone append
1523	 * operations and regular write operations.
1524	 */
1525	if (!bio_list_empty(&zwplug->bio_list)) {
1526		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1527				    disk->disk_name, zwplug->zone_no);
1528		disk_zone_wplug_abort(zwplug);
1529	}
1530	disk_remove_zone_wplug(disk, zwplug);
1531	spin_unlock_irqrestore(&zwplug->lock, flags);
1532
1533	disk_put_zone_wplug(zwplug);
1534}
1535
1536static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1537{
1538	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1539	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1540		/*
1541		 * Zone reset and zone finish operations do not apply to
1542		 * conventional zones.
1543		 */
1544		bio_io_error(bio);
1545		return true;
1546	}
1547
1548	/*
1549	 * No-wait zone management BIOs do not make much sense as the callers
1550	 * issue these as blocking operations in most cases. To avoid issues
1551	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1552	 * about REQ_NOWAIT being set and ignore that flag.
1553	 */
1554	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1555		bio->bi_opf &= ~REQ_NOWAIT;
1556
1557	return false;
1558}
1559
1560/**
1561 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1562 * @bio: The BIO being submitted
1563 * @nr_segs: The number of physical segments of @bio
1564 *
1565 * Handle write, write zeroes and zone append operations requiring emulation
1566 * using zone write plugging.
1567 *
1568 * Return true whenever @bio execution needs to be delayed through the zone
1569 * write plug. Otherwise, return false to let the submission path process
1570 * @bio normally.
1571 */
1572bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1573{
1574	struct block_device *bdev = bio->bi_bdev;
1575
1576	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1577		return false;
1578
1579	/*
1580	 * Regular writes and write zeroes need to be handled through the target
1581	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1582	 * which may need to go through the flush machinery depending on the
1583	 * target device capabilities. Plugging such writes is fine as the flush
1584	 * machinery operates at the request level, below the plug, and
1585	 * completion of the flush sequence will go through the regular BIO
1586	 * completion, which will handle zone write plugging.
1587	 * Zone append operations for devices that requested emulation must
1588	 * also be plugged so that these BIOs can be changed into regular
1589	 * write BIOs.
1590	 * Zone reset, reset all and finish commands need special treatment
1591	 * to correctly track the write pointer offset of zones. These commands
1592	 * are not plugged as we do not need serialization with write
1593	 * operations. It is the responsibility of the user to not issue reset
1594	 * and finish commands when write operations are in flight.
1595	 */
1596	switch (bio_op(bio)) {
1597	case REQ_OP_ZONE_APPEND:
1598		if (!bdev_emulates_zone_append(bdev)) {
1599			blk_zone_wplug_handle_native_zone_append(bio);
1600			return false;
1601		}
1602		fallthrough;
1603	case REQ_OP_WRITE:
1604	case REQ_OP_WRITE_ZEROES:
1605		return blk_zone_wplug_handle_write(bio, nr_segs);
1606	case REQ_OP_ZONE_RESET:
1607	case REQ_OP_ZONE_FINISH:
1608	case REQ_OP_ZONE_RESET_ALL:
1609		return blk_zone_wplug_handle_zone_mgmt(bio);
1610	default:
1611		return false;
1612	}
1613
1614	return false;
1615}
1616EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1617
1618static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1619				       struct blk_zone_wplug *zwplug)
1620{
1621	unsigned long flags;
1622
1623	spin_lock_irqsave(&zwplug->lock, flags);
1624
1625	/* Schedule submission of the next plugged BIO if we have one. */
1626	if (!bio_list_empty(&zwplug->bio_list)) {
1627		disk_zone_wplug_schedule_bio_work(disk, zwplug);
1628		spin_unlock_irqrestore(&zwplug->lock, flags);
1629		return;
1630	}
1631
1632	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1633
1634	/*
1635	 * If the zone is full (it was fully written or finished, or empty
1636	 * (it was reset), remove its zone write plug from the hash table.
1637	 */
1638	if (disk_should_remove_zone_wplug(disk, zwplug))
1639		disk_remove_zone_wplug(disk, zwplug);
1640
1641	spin_unlock_irqrestore(&zwplug->lock, flags);
1642}
1643
1644void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
1645{
1646	/*
1647	 * For zone append requests, the request sector indicates the location
1648	 * at which the BIO data was written. Return this value to the BIO
1649	 * issuer through the BIO iter sector.
1650	 * For plugged zone writes, which include emulated zone append, we need
1651	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
1652	 * lookup the zone write plug.
1653	 */
1654	bio->bi_iter.bi_sector = rq->__sector;
1655	trace_blk_zone_append_update_request_bio(rq);
1656}
1657
1658void blk_zone_write_plug_bio_endio(struct bio *bio)
1659{
1660	struct gendisk *disk = bio->bi_bdev->bd_disk;
1661	struct blk_zone_wplug *zwplug =
1662		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1663	unsigned long flags;
1664
1665	if (WARN_ON_ONCE(!zwplug))
1666		return;
1667
1668	/* Make sure we do not see this BIO again by clearing the plug flag. */
1669	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1670
1671	/*
1672	 * If this is a regular write emulating a zone append operation,
1673	 * restore the original operation code.
1674	 */
1675	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
1676		bio->bi_opf &= ~REQ_OP_MASK;
1677		bio->bi_opf |= REQ_OP_ZONE_APPEND;
1678		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
1679	}
1680
1681	/*
1682	 * If the BIO failed, abort all plugged BIOs and mark the plug as
1683	 * needing a write pointer update.
1684	 */
1685	if (bio->bi_status != BLK_STS_OK) {
1686		spin_lock_irqsave(&zwplug->lock, flags);
1687		disk_zone_wplug_abort(zwplug);
1688		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1689		spin_unlock_irqrestore(&zwplug->lock, flags);
1690	}
1691
1692	/* Drop the reference we took when the BIO was issued. */
1693	disk_put_zone_wplug(zwplug);
1694
1695	/*
1696	 * For BIO-based devices, blk_zone_write_plug_finish_request()
1697	 * is not called. So we need to schedule execution of the next
1698	 * plugged BIO here.
1699	 */
1700	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1701		disk_zone_wplug_unplug_bio(disk, zwplug);
1702
1703	/* Drop the reference we took when entering this function. */
1704	disk_put_zone_wplug(zwplug);
1705}
1706
1707void blk_zone_write_plug_finish_request(struct request *req)
1708{
1709	struct gendisk *disk = req->q->disk;
1710	struct blk_zone_wplug *zwplug;
1711
1712	zwplug = disk_get_zone_wplug(disk, req->__sector);
1713	if (WARN_ON_ONCE(!zwplug))
1714		return;
1715
1716	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1717
1718	/*
1719	 * Drop the reference we took when the request was initialized in
1720	 * blk_zone_write_plug_init_request().
1721	 */
1722	disk_put_zone_wplug(zwplug);
1723
1724	disk_zone_wplug_unplug_bio(disk, zwplug);
1725
1726	/* Drop the reference we took when entering this function. */
1727	disk_put_zone_wplug(zwplug);
1728}
1729
1730static void blk_zone_wplug_bio_work(struct work_struct *work)
1731{
1732	struct blk_zone_wplug *zwplug =
1733		container_of(work, struct blk_zone_wplug, bio_work);
1734	struct block_device *bdev;
1735	unsigned long flags;
1736	struct bio *bio;
1737	bool prepared;
1738
1739	/*
1740	 * Submit the next plugged BIO. If we do not have any, clear
1741	 * the plugged flag.
1742	 */
1743again:
1744	spin_lock_irqsave(&zwplug->lock, flags);
1745	bio = bio_list_pop(&zwplug->bio_list);
1746	if (!bio) {
1747		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1748		spin_unlock_irqrestore(&zwplug->lock, flags);
1749		goto put_zwplug;
1750	}
1751
1752	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
1753				 bio->bi_iter.bi_sector, bio_sectors(bio));
1754
1755	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
1756	spin_unlock_irqrestore(&zwplug->lock, flags);
1757
1758	if (!prepared) {
1759		blk_zone_wplug_bio_io_error(zwplug, bio);
1760		goto again;
1761	}
1762
1763	bdev = bio->bi_bdev;
1764
1765	/*
1766	 * blk-mq devices will reuse the extra reference on the request queue
1767	 * usage counter we took when the BIO was plugged, but the submission
1768	 * path for BIO-based devices will not do that. So drop this extra
1769	 * reference here.
1770	 */
1771	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
1772		bdev->bd_disk->fops->submit_bio(bio);
1773		blk_queue_exit(bdev->bd_disk->queue);
1774	} else {
1775		blk_mq_submit_bio(bio);
1776	}
1777
1778put_zwplug:
1779	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
1780	disk_put_zone_wplug(zwplug);
1781}
1782
1783void disk_init_zone_resources(struct gendisk *disk)
1784{
1785	spin_lock_init(&disk->zone_wplugs_lock);
1786}
1787
/*
 * The size of a disk zone write plug hash table is derived from the size of
 * the zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. The hash table is however limited to 4KB (512 hlist
 * head entries), that is, 9 bits. For a disk that has no limits, the mempool
 * size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
1796
1797static int disk_alloc_zone_resources(struct gendisk *disk,
1798				     unsigned int pool_size)
1799{
1800	unsigned int i;
1801
1802	atomic_set(&disk->nr_zone_wplugs, 0);
1803	disk->zone_wplugs_hash_bits =
1804		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1805
1806	disk->zone_wplugs_hash =
1807		kcalloc(disk_zone_wplugs_hash_size(disk),
1808			sizeof(struct hlist_head), GFP_KERNEL);
1809	if (!disk->zone_wplugs_hash)
1810		return -ENOMEM;
1811
1812	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1813		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1814
1815	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1816						sizeof(struct blk_zone_wplug));
1817	if (!disk->zone_wplugs_pool)
1818		goto free_hash;
1819
1820	disk->zone_wplugs_wq =
1821		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
1822				pool_size, disk->disk_name);
1823	if (!disk->zone_wplugs_wq)
1824		goto destroy_pool;
1825
1826	return 0;
1827
1828destroy_pool:
1829	mempool_destroy(disk->zone_wplugs_pool);
1830	disk->zone_wplugs_pool = NULL;
1831free_hash:
1832	kfree(disk->zone_wplugs_hash);
1833	disk->zone_wplugs_hash = NULL;
1834	disk->zone_wplugs_hash_bits = 0;
1835	return -ENOMEM;
1836}
1837
1838static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1839{
1840	struct blk_zone_wplug *zwplug;
1841	unsigned int i;
1842
1843	if (!disk->zone_wplugs_hash)
1844		return;
1845
1846	/* Free all the zone write plugs we have. */
1847	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1848		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
1849			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1850					     struct blk_zone_wplug, node);
1851			refcount_inc(&zwplug->ref);
1852			disk_remove_zone_wplug(disk, zwplug);
1853			disk_put_zone_wplug(zwplug);
1854		}
1855	}
1856
1857	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1858	kfree(disk->zone_wplugs_hash);
1859	disk->zone_wplugs_hash = NULL;
1860	disk->zone_wplugs_hash_bits = 0;
1861
1862	/*
1863	 * Wait for the zone write plugs to be RCU-freed before destroying the
1864	 * mempool.
1865	 */
1866	rcu_barrier();
1867	mempool_destroy(disk->zone_wplugs_pool);
1868	disk->zone_wplugs_pool = NULL;
1869}
1870
1871static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
1872{
1873	unsigned long flags;
1874
1875	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
1876	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
1877				lockdep_is_held(&disk->zone_wplugs_lock));
1878	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
1879
1880	kfree_rcu_mightsleep(zones_cond);
1881}
1882
1883void disk_free_zone_resources(struct gendisk *disk)
1884{
1885	if (disk->zone_wplugs_wq) {
1886		destroy_workqueue(disk->zone_wplugs_wq);
1887		disk->zone_wplugs_wq = NULL;
1888	}
1889
1890	disk_destroy_zone_wplugs_hash_table(disk);
1891
1892	disk_set_zones_cond_array(disk, NULL);
1893	disk->zone_capacity = 0;
1894	disk->last_zone_capacity = 0;
1895	disk->nr_zones = 0;
1896}
1897
1898struct blk_revalidate_zone_args {
1899	struct gendisk	*disk;
1900	u8		*zones_cond;
1901	unsigned int	nr_zones;
1902	unsigned int	nr_conv_zones;
1903	unsigned int	zone_capacity;
1904	unsigned int	last_zone_capacity;
1905	sector_t	sector;
1906};
1907
1908static int disk_revalidate_zone_resources(struct gendisk *disk,
1909				struct blk_revalidate_zone_args *args)
1910{
1911	struct queue_limits *lim = &disk->queue->limits;
1912	unsigned int pool_size;
1913
1914	args->disk = disk;
1915	args->nr_zones =
1916		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
1917
1918	/* Cached zone conditions: 1 byte per zone */
1919	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
1920	if (!args->zones_cond)
1921		return -ENOMEM;
1922
1923	if (!disk_need_zone_resources(disk))
1924		return 0;
1925
1926	/*
1927	 * If the device has no limit on the maximum number of open and active
1928	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
1929	 */
1930	pool_size = max(lim->max_open_zones, lim->max_active_zones);
1931	if (!pool_size)
1932		pool_size =
1933			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
1934
1935	if (!disk->zone_wplugs_hash)
1936		return disk_alloc_zone_resources(disk, pool_size);
1937
1938	return 0;
1939}
1940
1941/*
1942 * Update the disk zone resources information and device queue limits.
1943 * The disk queue is frozen when this is executed.
1944 */
1945static int disk_update_zone_resources(struct gendisk *disk,
1946				      struct blk_revalidate_zone_args *args)
1947{
1948	struct request_queue *q = disk->queue;
1949	unsigned int nr_seq_zones;
1950	unsigned int pool_size, memflags;
1951	struct queue_limits lim;
1952	int ret = 0;
1953
1954	lim = queue_limits_start_update(q);
1955
1956	memflags = blk_mq_freeze_queue(q);
1957
1958	disk->nr_zones = args->nr_zones;
1959	if (args->nr_conv_zones >= disk->nr_zones) {
1960		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
1961			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
1962		ret = -ENODEV;
1963		goto unfreeze;
1964	}
1965
1966	disk->zone_capacity = args->zone_capacity;
1967	disk->last_zone_capacity = args->last_zone_capacity;
1968	disk_set_zones_cond_array(disk, args->zones_cond);
1969
1970	/*
1971	 * Some devices can advertise zone resource limits that are larger than
1972	 * the number of sequential zones of the zoned block device, e.g. a
1973	 * small ZNS namespace. For such case, assume that the zoned device has
1974	 * no zone resource limits.
1975	 */
1976	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
1977	if (lim.max_open_zones >= nr_seq_zones)
1978		lim.max_open_zones = 0;
1979	if (lim.max_active_zones >= nr_seq_zones)
1980		lim.max_active_zones = 0;
1981
1982	if (!disk->zone_wplugs_pool)
1983		goto commit;
1984
1985	/*
1986	 * If the device has no limit on the maximum number of open and active
1987	 * zones, set its max open zone limit to the mempool size to indicate
1988	 * to the user that there is a potential performance impact due to
1989	 * dynamic zone write plug allocation when simultaneously writing to
1990	 * more zones than the size of the mempool.
1991	 */
1992	pool_size = max(lim.max_open_zones, lim.max_active_zones);
1993	if (!pool_size)
1994		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
1995
1996	mempool_resize(disk->zone_wplugs_pool, pool_size);
1997
1998	if (!lim.max_open_zones && !lim.max_active_zones) {
1999		if (pool_size < nr_seq_zones)
2000			lim.max_open_zones = pool_size;
2001		else
2002			lim.max_open_zones = 0;
2003	}
2004
2005commit:
2006	ret = queue_limits_commit_update(q, &lim);
2007
2008unfreeze:
2009	if (ret)
2010		disk_free_zone_resources(disk);
2011
2012	blk_mq_unfreeze_queue(q, memflags);
2013
2014	return ret;
2015}
2016
2017static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2018				    struct blk_revalidate_zone_args *args)
2019{
2020	enum blk_zone_cond cond = zone->cond;
2021
2022	/* Check that the zone condition is consistent with the zone type. */
2023	switch (cond) {
2024	case BLK_ZONE_COND_NOT_WP:
2025		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2026			goto invalid_condition;
2027		break;
2028	case BLK_ZONE_COND_IMP_OPEN:
2029	case BLK_ZONE_COND_EXP_OPEN:
2030	case BLK_ZONE_COND_CLOSED:
2031	case BLK_ZONE_COND_EMPTY:
2032	case BLK_ZONE_COND_FULL:
2033	case BLK_ZONE_COND_OFFLINE:
2034	case BLK_ZONE_COND_READONLY:
2035		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2036			goto invalid_condition;
2037		break;
2038	default:
2039		pr_warn("%s: Invalid zone condition 0x%X\n",
2040			args->disk->disk_name, cond);
2041		return -ENODEV;
2042	}
2043
2044	blk_zone_set_cond(args->zones_cond, idx, cond);
2045
2046	return 0;
2047
2048invalid_condition:
2049	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2050		args->disk->disk_name, cond, zone->type);
2051
2052	return -ENODEV;
2053}
2054
2055static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2056				    struct blk_revalidate_zone_args *args)
2057{
2058	struct gendisk *disk = args->disk;
2059
2060	if (zone->capacity != zone->len) {
2061		pr_warn("%s: Invalid conventional zone capacity\n",
2062			disk->disk_name);
2063		return -ENODEV;
2064	}
2065
2066	if (disk_zone_is_last(disk, zone))
2067		args->last_zone_capacity = zone->capacity;
2068
2069	args->nr_conv_zones++;
2070
2071	return 0;
2072}
2073
2074static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
2075				   struct blk_revalidate_zone_args *args)
2076{
2077	struct gendisk *disk = args->disk;
2078	struct blk_zone_wplug *zwplug;
2079	unsigned int wp_offset;
2080	unsigned long flags;
2081
2082	/*
2083	 * Remember the capacity of the first sequential zone and check
2084	 * if it is constant for all zones, ignoring the last zone as it can be
2085	 * smaller.
2086	 */
2087	if (!args->zone_capacity)
2088		args->zone_capacity = zone->capacity;
2089	if (disk_zone_is_last(disk, zone)) {
2090		args->last_zone_capacity = zone->capacity;
2091	} else if (zone->capacity != args->zone_capacity) {
2092		pr_warn("%s: Invalid variable zone capacity\n",
2093			disk->disk_name);
2094		return -ENODEV;
2095	}
2096
2097	/*
2098	 * If the device needs zone append emulation, we need to track the
2099	 * write pointer of all zones that are not empty nor full. So make sure
2100	 * we have a zone write plug for such zone if the device has a zone
2101	 * write plug hash table.
2102	 */
2103	if (!disk->zone_wplugs_hash)
2104		return 0;
2105
2106	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2107	if (!wp_offset || wp_offset >= zone->capacity)
2108		return 0;
2109
2110	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
2111	if (!zwplug)
2112		return -ENOMEM;
2113	spin_unlock_irqrestore(&zwplug->lock, flags);
2114	disk_put_zone_wplug(zwplug);
2115
2116	return 0;
2117}
2118
2119/*
2120 * Helper function to check the validity of zones of a zoned block device.
2121 */
2122static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
2123				  void *data)
2124{
2125	struct blk_revalidate_zone_args *args = data;
2126	struct gendisk *disk = args->disk;
2127	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2128	int ret;
2129
2130	/* Check for bad zones and holes in the zone report */
2131	if (zone->start != args->sector) {
2132		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2133			disk->disk_name, args->sector, zone->start);
2134		return -ENODEV;
2135	}
2136
2137	if (zone->start >= get_capacity(disk) || !zone->len) {
2138		pr_warn("%s: Invalid zone start %llu, length %llu\n",
2139			disk->disk_name, zone->start, zone->len);
2140		return -ENODEV;
2141	}
2142
2143	/*
2144	 * All zones must have the same size, with the exception on an eventual
2145	 * smaller last zone.
2146	 */
2147	if (!disk_zone_is_last(disk, zone)) {
2148		if (zone->len != zone_sectors) {
2149			pr_warn("%s: Invalid zoned device with non constant zone size\n",
2150				disk->disk_name);
2151			return -ENODEV;
2152		}
2153	} else if (zone->len > zone_sectors) {
2154		pr_warn("%s: Invalid zoned device with larger last zone size\n",
2155			disk->disk_name);
2156		return -ENODEV;
2157	}
2158
2159	if (!zone->capacity || zone->capacity > zone->len) {
2160		pr_warn("%s: Invalid zone capacity\n",
2161			disk->disk_name);
2162		return -ENODEV;
2163	}
2164
2165	/* Check zone condition */
2166	ret = blk_revalidate_zone_cond(zone, idx, args);
2167	if (ret)
2168		return ret;
2169
2170	/* Check zone type */
2171	switch (zone->type) {
2172	case BLK_ZONE_TYPE_CONVENTIONAL:
2173		ret = blk_revalidate_conv_zone(zone, idx, args);
2174		break;
2175	case BLK_ZONE_TYPE_SEQWRITE_REQ:
2176		ret = blk_revalidate_seq_zone(zone, idx, args);
2177		break;
2178	case BLK_ZONE_TYPE_SEQWRITE_PREF:
2179	default:
2180		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2181			disk->disk_name, (int)zone->type, zone->start);
2182		ret = -ENODEV;
2183	}
2184
2185	if (!ret)
2186		args->sector += zone->len;
2187
2188	return ret;
2189}
2190
2191/**
2192 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2193 * @disk:	Target disk
2194 *
2195 * Helper function for low-level device drivers to check, (re) allocate and
2196 * initialize resources used for managing zoned disks. This function should
2197 * normally be called by blk-mq based drivers when a zoned gendisk is probed
2198 * and when the zone configuration of the gendisk changes (e.g. after a format).
2199 * Before calling this function, the device driver must already have set the
2200 * device zone size (chunk_sector limit) and the max zone append limit.
2201 * BIO based drivers can also use this function as long as the device queue
2202 * can be safely frozen.
2203 */
2204int blk_revalidate_disk_zones(struct gendisk *disk)
2205{
2206	struct request_queue *q = disk->queue;
2207	sector_t zone_sectors = q->limits.chunk_sectors;
2208	sector_t capacity = get_capacity(disk);
2209	struct blk_revalidate_zone_args args = { };
2210	unsigned int memflags, noio_flag;
2211	struct blk_report_zones_args rep_args = {
2212		.cb = blk_revalidate_zone_cb,
2213		.data = &args,
2214	};
2215	int ret = -ENOMEM;
2216
2217	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2218		return -EIO;
2219
2220	if (!capacity)
2221		return -ENODEV;
2222
2223	/*
2224	 * Checks that the device driver indicated a valid zone size and that
2225	 * the max zone append limit is set.
2226	 */
2227	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
2228		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2229			disk->disk_name, zone_sectors);
2230		return -ENODEV;
2231	}
2232
2233	/*
2234	 * Ensure that all memory allocations in this context are done as if
2235	 * GFP_NOIO was specified.
2236	 */
2237	noio_flag = memalloc_noio_save();
2238	ret = disk_revalidate_zone_resources(disk, &args);
2239	if (ret) {
2240		memalloc_noio_restore(noio_flag);
2241		return ret;
2242	}
2243
2244	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
2245	if (!ret) {
2246		pr_warn("%s: No zones reported\n", disk->disk_name);
2247		ret = -ENODEV;
2248	}
2249	memalloc_noio_restore(noio_flag);
2250
2251	/*
2252	 * If zones where reported, make sure that the entire disk capacity
2253	 * has been checked.
2254	 */
2255	if (ret > 0 && args.sector != capacity) {
2256		pr_warn("%s: Missing zones from sector %llu\n",
2257			disk->disk_name, args.sector);
2258		ret = -ENODEV;
2259	}
2260
2261	if (ret > 0)
2262		return disk_update_zone_resources(disk, &args);
2263
2264	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2265
2266	memflags = blk_mq_freeze_queue(q);
2267	disk_free_zone_resources(disk);
2268	blk_mq_unfreeze_queue(q, memflags);
2269
2270	return ret;
2271}
2272EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
2273
2274/**
2275 * blk_zone_issue_zeroout - zero-fill a block range in a zone
2276 * @bdev:	blockdev to write
2277 * @sector:	start sector
2278 * @nr_sects:	number of sectors to write
2279 * @gfp_mask:	memory allocation flags (for bio_alloc)
2280 *
2281 * Description:
2282 *  Zero-fill a block range in a zone (@sector must be equal to the zone write
2283 *  pointer), handling potential errors due to the (initially unknown) lack of
2284 *  hardware offload (See blkdev_issue_zeroout()).
2285 */
2286int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2287			   sector_t nr_sects, gfp_t gfp_mask)
2288{
2289	struct gendisk *disk = bdev->bd_disk;
2290	int ret;
2291
2292	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2293		return -EIO;
2294
2295	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2296				   BLKDEV_ZERO_NOFALLBACK);
2297	if (ret != -EOPNOTSUPP)
2298		return ret;
2299
2300	/*
2301	 * The failed call to blkdev_issue_zeroout() advanced the zone write
2302	 * pointer. Undo this using a report zone to update the zone write
2303	 * pointer to the correct current value.
2304	 */
2305	ret = disk->fops->report_zones(disk, sector, 1, NULL);
2306	if (ret != 1)
2307		return ret < 0 ? ret : -EIO;
2308
2309	/*
2310	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2311	 * regular write with zero-pages.
2312	 */
2313	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
2314}
2315EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
2316
2317#ifdef CONFIG_BLK_DEBUG_FS
2318static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2319				  struct seq_file *m)
2320{
2321	unsigned int zwp_wp_offset, zwp_flags;
2322	unsigned int zwp_zone_no, zwp_ref;
2323	unsigned int zwp_bio_list_size;
2324	enum blk_zone_cond zwp_cond;
2325	unsigned long flags;
2326
2327	spin_lock_irqsave(&zwplug->lock, flags);
2328	zwp_zone_no = zwplug->zone_no;
2329	zwp_flags = zwplug->flags;
2330	zwp_ref = refcount_read(&zwplug->ref);
2331	zwp_cond = zwplug->cond;
2332	zwp_wp_offset = zwplug->wp_offset;
2333	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2334	spin_unlock_irqrestore(&zwplug->lock, flags);
2335
2336	seq_printf(m,
2337		"Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2338		zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2339		zwp_wp_offset, zwp_bio_list_size);
2340}
2341
2342int queue_zone_wplugs_show(void *data, struct seq_file *m)
2343{
2344	struct request_queue *q = data;
2345	struct gendisk *disk = q->disk;
2346	struct blk_zone_wplug *zwplug;
2347	unsigned int i;
2348
2349	if (!disk->zone_wplugs_hash)
2350		return 0;
2351
2352	rcu_read_lock();
2353	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2354		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2355					 node)
2356			queue_zone_wplug_show(zwplug, m);
2357	rcu_read_unlock();
2358
2359	return 0;
2360}
2361
2362#endif