Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

null_blk: Fix locking in zoned mode

When the zoned mode is enabled in null_blk, serializing read, write
and zone management operations for each zone is necessary to protect
device level information for managing zone resources (zone open and
closed counters) as well as each zone condition and write pointer
position. Commit 35bc10b2eafb ("null_blk: synchronization fix for
zoned device") introduced a spinlock to implement this serialization.
However, when memory backing is also enabled, GFP_NOIO memory
allocations are executed under the spinlock, resulting in might_sleep()
warnings. Furthermore, the zone_lock spinlock is locked/unlocked using
spin_lock_irq/spin_unlock_irq, similarly to the memory backing code with
the nullb->lock spinlock. This nested use of irq locks wrecks the irq
enabled/disabled state.

Fix all this by introducing a bitmap of per-zone locks, with locking
implemented using wait_on_bit_lock_io() and clear_and_wake_up_bit().
This locking mechanism allows keeping a zone locked while executing
null_process_cmd(), serializing all operations to the zone while
allowing to sleep during memory backing allocation with GFP_NOIO.
Device level zone resource management information is protected using
a spinlock which is not held while executing null_process_cmd().

Fixes: 35bc10b2eafb ("null_blk: synchronization fix for zoned device")
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Damien Le Moal and committed by
Jens Axboe
aa1c09cb f9c91042

+82 -25
+2 -1
drivers/block/null_blk.h
··· 47 47 unsigned int nr_zones_closed; 48 48 struct blk_zone *zones; 49 49 sector_t zone_size_sects; 50 - spinlock_t zone_lock; 50 + spinlock_t zone_dev_lock; 51 + unsigned long *zone_locks; 51 52 52 53 unsigned long size; /* device size in MB */ 53 54 unsigned long completion_nsec; /* time in ns to complete a request */
+80 -24
drivers/block/null_blk_zoned.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <linux/vmalloc.h> 3 + #include <linux/bitmap.h> 3 4 #include "null_blk.h" 4 5 5 6 #define CREATE_TRACE_POINTS ··· 46 45 if (!dev->zones) 47 46 return -ENOMEM; 48 47 49 - spin_lock_init(&dev->zone_lock); 48 + spin_lock_init(&dev->zone_dev_lock); 49 + dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); 50 + if (!dev->zone_locks) { 51 + kvfree(dev->zones); 52 + return -ENOMEM; 53 + } 54 + 50 55 if (dev->zone_nr_conv >= dev->nr_zones) { 51 56 dev->zone_nr_conv = dev->nr_zones - 1; 52 57 pr_info("changed the number of conventional zones to %u", ··· 131 124 132 125 void null_free_zoned_dev(struct nullb_device *dev) 133 126 { 127 + bitmap_free(dev->zone_locks); 134 128 kvfree(dev->zones); 129 + } 130 + 131 + static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) 132 + { 133 + wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); 134 + } 135 + 136 + static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) 137 + { 138 + clear_and_wake_up_bit(zno, dev->zone_locks); 135 139 } 136 140 137 141 int null_report_zones(struct gendisk *disk, sector_t sector, ··· 150 132 { 151 133 struct nullb *nullb = disk->private_data; 152 134 struct nullb_device *dev = nullb->dev; 153 - unsigned int first_zone, i; 135 + unsigned int first_zone, i, zno; 154 136 struct blk_zone zone; 155 137 int error; 156 138 ··· 161 143 nr_zones = min(nr_zones, dev->nr_zones - first_zone); 162 144 trace_nullb_report_zones(nullb, nr_zones); 163 145 164 - for (i = 0; i < nr_zones; i++) { 146 + zno = first_zone; 147 + for (i = 0; i < nr_zones; i++, zno++) { 165 148 /* 166 149 * Stacked DM target drivers will remap the zone information by 167 150 * modifying the zone information passed to the report callback. 168 151 * So use a local copy to avoid corruption of the device zone 169 152 * array. 
170 153 */ 171 - spin_lock_irq(&dev->zone_lock); 172 - memcpy(&zone, &dev->zones[first_zone + i], 173 - sizeof(struct blk_zone)); 174 - spin_unlock_irq(&dev->zone_lock); 154 + null_lock_zone(dev, zno); 155 + memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); 156 + null_unlock_zone(dev, zno); 175 157 176 158 error = cb(&zone, i, data); 177 159 if (error) ··· 181 163 return nr_zones; 182 164 } 183 165 166 + /* 167 + * This is called in the case of memory backing from null_process_cmd() 168 + * with the target zone already locked. 169 + */ 184 170 size_t null_zone_valid_read_len(struct nullb *nullb, 185 171 sector_t sector, unsigned int len) 186 172 { ··· 321 299 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 322 300 return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); 323 301 302 + null_lock_zone(dev, zno); 303 + spin_lock(&dev->zone_dev_lock); 304 + 324 305 switch (zone->cond) { 325 306 case BLK_ZONE_COND_FULL: 326 307 /* Cannot write to a full zone */ 327 - return BLK_STS_IOERR; 308 + ret = BLK_STS_IOERR; 309 + goto unlock; 328 310 case BLK_ZONE_COND_EMPTY: 329 311 case BLK_ZONE_COND_CLOSED: 330 312 ret = null_check_zone_resources(dev, zone); 331 313 if (ret != BLK_STS_OK) 332 - return ret; 314 + goto unlock; 333 315 break; 334 316 case BLK_ZONE_COND_IMP_OPEN: 335 317 case BLK_ZONE_COND_EXP_OPEN: 336 318 break; 337 319 default: 338 320 /* Invalid zone condition */ 339 - return BLK_STS_IOERR; 321 + ret = BLK_STS_IOERR; 322 + goto unlock; 340 323 } 341 324 342 325 /* ··· 357 330 else 358 331 cmd->rq->__sector = sector; 359 332 } else if (sector != zone->wp) { 360 - return BLK_STS_IOERR; 333 + ret = BLK_STS_IOERR; 334 + goto unlock; 361 335 } 362 336 363 - if (zone->wp + nr_sectors > zone->start + zone->capacity) 364 - return BLK_STS_IOERR; 337 + if (zone->wp + nr_sectors > zone->start + zone->capacity) { 338 + ret = BLK_STS_IOERR; 339 + goto unlock; 340 + } 365 341 366 342 if (zone->cond == BLK_ZONE_COND_CLOSED) { 367 343 dev->nr_zones_closed--; ··· 
375 345 if (zone->cond != BLK_ZONE_COND_EXP_OPEN) 376 346 zone->cond = BLK_ZONE_COND_IMP_OPEN; 377 347 348 + spin_unlock(&dev->zone_dev_lock); 378 349 ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); 350 + spin_lock(&dev->zone_dev_lock); 379 351 if (ret != BLK_STS_OK) 380 - return ret; 352 + goto unlock; 381 353 382 354 zone->wp += nr_sectors; 383 355 if (zone->wp == zone->start + zone->capacity) { ··· 389 357 dev->nr_zones_imp_open--; 390 358 zone->cond = BLK_ZONE_COND_FULL; 391 359 } 392 - return BLK_STS_OK; 360 + ret = BLK_STS_OK; 361 + 362 + unlock: 363 + spin_unlock(&dev->zone_dev_lock); 364 + null_unlock_zone(dev, zno); 365 + 366 + return ret; 393 367 } 394 368 395 369 static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) ··· 506 468 sector_t sector) 507 469 { 508 470 struct nullb_device *dev = cmd->nq->dev; 509 - unsigned int zone_no = null_zone_no(dev, sector); 510 - struct blk_zone *zone = &dev->zones[zone_no]; 511 - blk_status_t ret = BLK_STS_OK; 471 + unsigned int zone_no; 472 + struct blk_zone *zone; 473 + blk_status_t ret; 512 474 size_t i; 513 475 514 - switch (op) { 515 - case REQ_OP_ZONE_RESET_ALL: 476 + if (op == REQ_OP_ZONE_RESET_ALL) { 516 477 for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { 478 + null_lock_zone(dev, i); 517 479 zone = &dev->zones[i]; 518 480 if (zone->cond != BLK_ZONE_COND_EMPTY) { 481 + spin_lock(&dev->zone_dev_lock); 519 482 null_reset_zone(dev, zone); 483 + spin_unlock(&dev->zone_dev_lock); 520 484 trace_nullb_zone_op(cmd, i, zone->cond); 521 485 } 486 + null_unlock_zone(dev, i); 522 487 } 523 488 return BLK_STS_OK; 489 + } 490 + 491 + zone_no = null_zone_no(dev, sector); 492 + zone = &dev->zones[zone_no]; 493 + 494 + null_lock_zone(dev, zone_no); 495 + spin_lock(&dev->zone_dev_lock); 496 + 497 + switch (op) { 524 498 case REQ_OP_ZONE_RESET: 525 499 ret = null_reset_zone(dev, zone); 526 500 break; ··· 546 496 ret = null_finish_zone(dev, zone); 547 497 break; 548 498 default: 
549 - return BLK_STS_NOTSUPP; 499 + ret = BLK_STS_NOTSUPP; 500 + break; 550 501 } 502 + 503 + spin_unlock(&dev->zone_dev_lock); 551 504 552 505 if (ret == BLK_STS_OK) 553 506 trace_nullb_zone_op(cmd, zone_no, zone->cond); 507 + 508 + null_unlock_zone(dev, zone_no); 554 509 555 510 return ret; 556 511 } ··· 563 508 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, 564 509 sector_t sector, sector_t nr_sectors) 565 510 { 566 - blk_status_t sts; 567 511 struct nullb_device *dev = cmd->nq->dev; 512 + unsigned int zno = null_zone_no(dev, sector); 513 + blk_status_t sts; 568 514 569 - spin_lock_irq(&dev->zone_lock); 570 515 switch (op) { 571 516 case REQ_OP_WRITE: 572 517 sts = null_zone_write(cmd, sector, nr_sectors, false); ··· 582 527 sts = null_zone_mgmt(cmd, op, sector); 583 528 break; 584 529 default: 530 + null_lock_zone(dev, zno); 585 531 sts = null_process_cmd(cmd, op, sector, nr_sectors); 532 + null_unlock_zone(dev, zno); 586 533 } 587 - spin_unlock_irq(&dev->zone_lock); 588 534 589 535 return sts; 590 536 }