Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: Implement support for WRITE SAME

The WRITE SAME command supported on some SCSI devices allows the same
block to be efficiently replicated throughout a block range. Only a
single logical block is transferred from the host and the storage device
writes the same data to all blocks described by the I/O.

This patch implements support for WRITE SAME in the block layer. The
blkdev_issue_write_same() function can be used by filesystems and block
drivers to replicate a buffer across a block range. This can be used to
efficiently initialize software RAID devices, etc.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Martin K. Petersen and committed by Jens Axboe.
4363ac7c f31dc1cd

+181 -6
+14
Documentation/ABI/testing/sysfs-block
··· 206 206 when a discarded area is read the discard_zeroes_data 207 207 parameter will be set to one. Otherwise it will be 0 and 208 208 the result of reading a discarded area is undefined. 209 + 210 + What: /sys/block/<disk>/queue/write_same_max_bytes 211 + Date: January 2012 212 + Contact: Martin K. Petersen <martin.petersen@oracle.com> 213 + Description: 214 + Some devices support a write same operation in which a 215 + single data block can be written to a range of several 216 + contiguous blocks on storage. This can be used to wipe 217 + areas on disk or to initialize drives in a RAID 218 + configuration. write_same_max_bytes indicates how many 219 + bytes can be written in a single write same command. If 220 + write_same_max_bytes is 0, write same is not supported 221 + by the device. 222 +
+12 -2
block/blk-core.c
··· 1704 1704 goto end_io; 1705 1705 } 1706 1706 1707 + if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { 1708 + err = -EOPNOTSUPP; 1709 + goto end_io; 1710 + } 1711 + 1707 1712 /* 1708 1713 * Various block parts want %current->io_context and lazy ioc 1709 1714 * allocation ends up trading a lot of pain for a small amount of ··· 1814 1809 */ 1815 1810 void submit_bio(int rw, struct bio *bio) 1816 1811 { 1817 - int count = bio_sectors(bio); 1818 - 1819 1812 bio->bi_rw |= rw; 1820 1813 1821 1814 /* ··· 1821 1818 * go through the normal accounting stuff before submission. 1822 1819 */ 1823 1820 if (bio_has_data(bio)) { 1821 + unsigned int count; 1822 + 1823 + if (unlikely(rw & REQ_WRITE_SAME)) 1824 + count = bdev_logical_block_size(bio->bi_bdev) >> 9; 1825 + else 1826 + count = bio_sectors(bio); 1827 + 1824 1828 if (rw & WRITE) { 1825 1829 count_vm_events(PGPGOUT, count); 1826 1830 } else {
+74
block/blk-lib.c
··· 130 130 EXPORT_SYMBOL(blkdev_issue_discard); 131 131 132 132 /** 133 + * blkdev_issue_write_same - queue a write same operation 134 + * @bdev: target blockdev 135 + * @sector: start sector 136 + * @nr_sects: number of sectors to write 137 + * @gfp_mask: memory allocation flags (for bio_alloc) 138 + * @page: page containing data to write 139 + * 140 + * Description: 141 + * Issue a write same request for the sectors in question. 142 + */ 143 + int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, 144 + sector_t nr_sects, gfp_t gfp_mask, 145 + struct page *page) 146 + { 147 + DECLARE_COMPLETION_ONSTACK(wait); 148 + struct request_queue *q = bdev_get_queue(bdev); 149 + unsigned int max_write_same_sectors; 150 + struct bio_batch bb; 151 + struct bio *bio; 152 + int ret = 0; 153 + 154 + if (!q) 155 + return -ENXIO; 156 + 157 + max_write_same_sectors = q->limits.max_write_same_sectors; 158 + 159 + if (max_write_same_sectors == 0) 160 + return -EOPNOTSUPP; 161 + 162 + atomic_set(&bb.done, 1); 163 + bb.flags = 1 << BIO_UPTODATE; 164 + bb.wait = &wait; 165 + 166 + while (nr_sects) { 167 + bio = bio_alloc(gfp_mask, 1); 168 + if (!bio) { 169 + ret = -ENOMEM; 170 + break; 171 + } 172 + 173 + bio->bi_sector = sector; 174 + bio->bi_end_io = bio_batch_end_io; 175 + bio->bi_bdev = bdev; 176 + bio->bi_private = &bb; 177 + bio->bi_vcnt = 1; 178 + bio->bi_io_vec->bv_page = page; 179 + bio->bi_io_vec->bv_offset = 0; 180 + bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); 181 + 182 + if (nr_sects > max_write_same_sectors) { 183 + bio->bi_size = max_write_same_sectors << 9; 184 + nr_sects -= max_write_same_sectors; 185 + sector += max_write_same_sectors; 186 + } else { 187 + bio->bi_size = nr_sects << 9; 188 + nr_sects = 0; 189 + } 190 + 191 + atomic_inc(&bb.done); 192 + submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); 193 + } 194 + 195 + /* Wait for bios in-flight */ 196 + if (!atomic_dec_and_test(&bb.done)) 197 + wait_for_completion(&wait); 198 + 199 + if 
(!test_bit(BIO_UPTODATE, &bb.flags)) 200 + ret = -ENOTSUPP; 201 + 202 + return ret; 203 + } 204 + EXPORT_SYMBOL(blkdev_issue_write_same); 205 + 206 + /** 133 207 * blkdev_issue_zeroout - generate number of zero filed write bios 134 208 * @bdev: blockdev to issue 135 209 * @sector: start sector
+9
block/blk-merge.c
··· 419 419 || next->special) 420 420 return 0; 421 421 422 + if (req->cmd_flags & REQ_WRITE_SAME && 423 + !blk_write_same_mergeable(req->bio, next->bio)) 424 + return 0; 425 + 422 426 /* 423 427 * If we are allowed to merge, then append bio list 424 428 * from next to rq and release next. merge_requests_fn ··· 520 516 521 517 /* only merge integrity protected bio into ditto rq */ 522 518 if (bio_integrity(bio) != blk_integrity_rq(rq)) 519 + return false; 520 + 521 + /* must be using the same buffer */ 522 + if (rq->cmd_flags & REQ_WRITE_SAME && 523 + !blk_write_same_mergeable(rq->bio, bio)) 523 524 return false; 524 525 525 526 return true;
+16
block/blk-settings.c
··· 113 113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 114 114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 115 115 lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 116 + lim->max_write_same_sectors = 0; 116 117 lim->max_discard_sectors = 0; 117 118 lim->discard_granularity = 0; 118 119 lim->discard_alignment = 0; ··· 145 144 lim->max_segments = USHRT_MAX; 146 145 lim->max_hw_sectors = UINT_MAX; 147 146 lim->max_sectors = UINT_MAX; 147 + lim->max_write_same_sectors = UINT_MAX; 148 148 } 149 149 EXPORT_SYMBOL(blk_set_stacking_limits); 150 150 ··· 286 284 q->limits.max_discard_sectors = max_discard_sectors; 287 285 } 288 286 EXPORT_SYMBOL(blk_queue_max_discard_sectors); 287 + 288 + /** 289 + * blk_queue_max_write_same_sectors - set max sectors for a single write same 290 + * @q: the request queue for the device 291 + * @max_write_same_sectors: maximum number of sectors to write per command 292 + **/ 293 + void blk_queue_max_write_same_sectors(struct request_queue *q, 294 + unsigned int max_write_same_sectors) 295 + { 296 + q->limits.max_write_same_sectors = max_write_same_sectors; 297 + } 298 + EXPORT_SYMBOL(blk_queue_max_write_same_sectors); 289 299 290 300 /** 291 301 * blk_queue_max_segments - set max hw segments for a request for this queue ··· 524 510 525 511 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 526 512 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 513 + t->max_write_same_sectors = min(t->max_write_same_sectors, 514 + b->max_write_same_sectors); 527 515 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 528 516 529 517 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+13
block/blk-sysfs.c
··· 180 180 return queue_var_show(queue_discard_zeroes_data(q), page); 181 181 } 182 182 183 + static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) 184 + { 185 + return sprintf(page, "%llu\n", 186 + (unsigned long long)q->limits.max_write_same_sectors << 9); 187 + } 188 + 189 + 183 190 static ssize_t 184 191 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 185 192 { ··· 392 385 .show = queue_discard_zeroes_data_show, 393 386 }; 394 387 388 + static struct queue_sysfs_entry queue_write_same_max_entry = { 389 + .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO }, 390 + .show = queue_write_same_max_show, 391 + }; 392 + 395 393 static struct queue_sysfs_entry queue_nonrot_entry = { 396 394 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 397 395 .show = queue_show_nonrot, ··· 444 432 &queue_discard_granularity_entry.attr, 445 433 &queue_discard_max_entry.attr, 446 434 &queue_discard_zeroes_data_entry.attr, 435 + &queue_write_same_max_entry.attr, 447 436 &queue_nonrot_entry.attr, 448 437 &queue_nomerges_entry.attr, 449 438 &queue_rq_affinity_entry.attr,
+1
drivers/md/raid0.c
··· 422 422 if (md_check_no_bitmap(mddev)) 423 423 return -EINVAL; 424 424 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 425 + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); 425 426 426 427 /* if private is not null, we are here after takeover */ 427 428 if (mddev->private == NULL) {
+6 -3
fs/bio.c
··· 1487 1487 1488 1488 bp->bv1 = bi->bi_io_vec[0]; 1489 1489 bp->bv2 = bi->bi_io_vec[0]; 1490 - bp->bv2.bv_offset += first_sectors << 9; 1491 - bp->bv2.bv_len -= first_sectors << 9; 1492 - bp->bv1.bv_len = first_sectors << 9; 1490 + 1491 + if (bio_is_rw(bi)) { 1492 + bp->bv2.bv_offset += first_sectors << 9; 1493 + bp->bv2.bv_len -= first_sectors << 9; 1494 + bp->bv1.bv_len = first_sectors << 9; 1495 + } 1493 1496 1494 1497 bp->bio1.bi_io_vec = &bp->bv1; 1495 1498 bp->bio2.bi_io_vec = &bp->bv2;
+3
include/linux/bio.h
··· 399 399 if (!bio_has_data(bio)) 400 400 return false; 401 401 402 + if (bio->bi_rw & REQ_WRITE_SAME) 403 + return false; 404 + 402 405 return true; 403 406 } 404 407
+4 -1
include/linux/blk_types.h
··· 147 147 __REQ_PRIO, /* boost priority in cfq */ 148 148 __REQ_DISCARD, /* request to discard sectors */ 149 149 __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ 150 + __REQ_WRITE_SAME, /* write same block many times */ 150 151 151 152 __REQ_NOIDLE, /* don't anticipate more IO after this one */ 152 153 __REQ_FUA, /* forced unit access */ ··· 186 185 #define REQ_META (1 << __REQ_META) 187 186 #define REQ_PRIO (1 << __REQ_PRIO) 188 187 #define REQ_DISCARD (1 << __REQ_DISCARD) 188 + #define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME) 189 189 #define REQ_NOIDLE (1 << __REQ_NOIDLE) 190 190 191 191 #define REQ_FAILFAST_MASK \ 192 192 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) 193 193 #define REQ_COMMON_MASK \ 194 194 (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ 195 - REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) 195 + REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \ 196 + REQ_SECURE) 196 197 #define REQ_CLONE_MASK REQ_COMMON_MASK 197 198 198 199 /* This mask is used for both bio and request merge checking */
+29
include/linux/blkdev.h
··· 270 270 unsigned int io_min; 271 271 unsigned int io_opt; 272 272 unsigned int max_discard_sectors; 273 + unsigned int max_write_same_sectors; 273 274 unsigned int discard_granularity; 274 275 unsigned int discard_alignment; 275 276 ··· 615 614 if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) 616 615 return false; 617 616 617 + if ((flags1 & REQ_WRITE_SAME) != (flags2 & REQ_WRITE_SAME)) 618 + return false; 619 + 618 620 return true; 621 + } 622 + 623 + static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) 624 + { 625 + if (bio_data(a) == bio_data(b)) 626 + return true; 627 + 628 + return false; 619 629 } 620 630 621 631 /* ··· 830 818 if (unlikely(cmd_flags & REQ_DISCARD)) 831 819 return q->limits.max_discard_sectors; 832 820 821 + if (unlikely(cmd_flags & REQ_WRITE_SAME)) 822 + return q->limits.max_write_same_sectors; 823 + 833 824 return q->limits.max_sectors; 834 825 } 835 826 ··· 901 886 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); 902 887 extern void blk_queue_max_discard_sectors(struct request_queue *q, 903 888 unsigned int max_discard_sectors); 889 + extern void blk_queue_max_write_same_sectors(struct request_queue *q, 890 + unsigned int max_write_same_sectors); 904 891 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); 905 892 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); 906 893 extern void blk_queue_alignment_offset(struct request_queue *q, ··· 1033 1016 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); 1034 1017 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, 1035 1018 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 1019 + extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, 1020 + sector_t nr_sects, gfp_t gfp_mask, struct page *page); 1036 1021 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 1037 1022 sector_t 
nr_sects, gfp_t gfp_mask); 1038 1023 static inline int sb_issue_discard(struct super_block *sb, sector_t block, ··· 1210 1191 static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) 1211 1192 { 1212 1193 return queue_discard_zeroes_data(bdev_get_queue(bdev)); 1194 + } 1195 + 1196 + static inline unsigned int bdev_write_same(struct block_device *bdev) 1197 + { 1198 + struct request_queue *q = bdev_get_queue(bdev); 1199 + 1200 + if (q) 1201 + return q->limits.max_write_same_sectors; 1202 + 1203 + return 0; 1213 1204 } 1214 1205 1215 1206 static inline int queue_dma_alignment(struct request_queue *q)