Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-3.18/core' of git://git.kernel.dk/linux-block

Pull core block layer changes from Jens Axboe:
"This is the core block IO pull request for 3.18. Apart from the new
and improved flush machinery for blk-mq, this is all mostly bug fixes
and cleanups.

- blk-mq timeout updates and fixes from Christoph.

- Removal of REQ_END, also from Christoph. We pass it through the
->queue_rq() hook for blk-mq instead, freeing up one of the request
bits. The space was overly tight on 32-bit, so Martin also killed
REQ_KERNEL since it's no longer used.

- blk integrity updates and fixes from Martin and Gu Zheng.

- Update to the flush machinery for blk-mq from Ming Lei. Now we
have a per hardware context flush request, which both cleans up the
code and should scale better for flush intensive workloads on blk-mq.

- Improve the error printing, from Rob Elliott.

- Backing device improvements and cleanups from Tejun.

- Fixup of a misplaced rq_complete() tracepoint from Hannes.

- Make blk_get_request() return error pointers, fixing up issues
where we NULL deref when a device goes bad or missing. From Joe
Lawrence.

- Prep work for drastically reducing the memory consumption of dm
devices from Junichi Nomura. This allows creating clone bio sets
without preallocating a lot of memory.

- Fix a blk-mq hang on certain combinations of queue depths and
hardware queues from me.

- Limit memory consumption for blk-mq devices for crash dump
scenarios and drivers that use crazy high depths (certain SCSI
shared tag setups). We now just use a single queue and limited
depth for that"

* 'for-3.18/core' of git://git.kernel.dk/linux-block: (58 commits)
block: Remove REQ_KERNEL
blk-mq: allocate cpumask on the home node
bio-integrity: remove the needless fail handle of bip_slab creating
block: include func name in __get_request prints
block: make blk_update_request print prefix match ratelimited prefix
blk-merge: don't compute bi_phys_segments from bi_vcnt for cloned bio
block: fix alignment_offset math that assumes io_min is a power-of-2
blk-mq: Make bt_clear_tag() easier to read
blk-mq: fix potential hang if rolling wakeup depth is too high
block: add bioset_create_nobvec()
block: use bio_clone_fast() in blk_rq_prep_clone()
block: misplaced rq_complete tracepoint
sd: Honor block layer integrity handling flags
block: Replace strnicmp with strncasecmp
block: Add T10 Protection Information functions
block: Don't merge requests if integrity flags differ
block: Integrity checksum flag
block: Relocate bio integrity flags
block: Add a disk flag to block integrity profile
block: Add prefix to block integrity profile flags
...

+1210 -1172
+8
Documentation/ABI/testing/sysfs-block
···
 		512 bytes of data.
 
 
+What:		/sys/block/<disk>/integrity/device_is_integrity_capable
+Date:		July 2014
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Indicates whether a storage device is capable of storing
+		integrity metadata. Set if the device is T10 PI-capable.
+
+
 What:		/sys/block/<disk>/integrity/write_generate
 Date:		June 2008
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+5 -49
Documentation/block/data-integrity.txt
···
 4.1 BIO
 
 The data integrity patches add a new field to struct bio when
-CONFIG_BLK_DEV_INTEGRITY is enabled.  bio->bi_integrity is a pointer
-to a struct bip which contains the bio integrity payload.  Essentially
-a bip is a trimmed down struct bio which holds a bio_vec containing
-the integrity metadata and the required housekeeping information (bvec
-pool, vector count, etc.)
+CONFIG_BLK_DEV_INTEGRITY is enabled.  bio_integrity(bio) returns a
+pointer to a struct bip which contains the bio integrity payload.
+Essentially a bip is a trimmed down struct bio which holds a bio_vec
+containing the integrity metadata and the required housekeeping
+information (bvec pool, vector count, etc.)
 
 A kernel subsystem can enable data integrity protection on a bio by
 calling bio_integrity_alloc(bio).  This will allocate and attach the
···
       supported by the block device.
 
 
-    int bdev_integrity_enabled(block_device, int rw);
-
-      bdev_integrity_enabled() will return 1 if the block device
-      supports integrity metadata transfer for the data direction
-      specified in 'rw'.
-
-      bdev_integrity_enabled() honors the write_generate and
-      read_verify flags in sysfs and will respond accordingly.
-
-
     int bio_integrity_prep(bio);
 
       To generate IMD for WRITE and to set up buffers for READ, the
···
 
       bio_integrity_prep() should only be called if
       bio_integrity_enabled() returned 1.
-
-
-    int bio_integrity_tag_size(bio);
-
-      If the filesystem wants to use the application tag space it will
-      first have to find out how much storage space is available.
-      Because tag space is generally limited (usually 2 bytes per
-      sector regardless of sector size), the integrity framework
-      supports interleaving the information between the sectors in an
-      I/O.
-
-      Filesystems can call bio_integrity_tag_size(bio) to find out how
-      many bytes of storage are available for that particular bio.
-
-      Another option is bdev_get_tag_size(block_device) which will
-      return the number of available bytes per hardware sector.
-
-
-    int bio_integrity_set_tag(bio, void *tag_buf, len);
-
-      After a successful return from bio_integrity_prep(),
-      bio_integrity_set_tag() can be used to attach an opaque tag
-      buffer to a bio.  Obviously this only makes sense if the I/O is
-      a WRITE.
-
-
-    int bio_integrity_get_tag(bio, void *tag_buf, len);
-
-      Similarly, at READ I/O completion time the filesystem can
-      retrieve the tag buffer using bio_integrity_get_tag().
 
 
 5.3 PASSING EXISTING INTEGRITY METADATA
···
       .name                   = "STANDARDSBODY-TYPE-VARIANT-CSUM",
       .generate_fn            = my_generate_fn,
       .verify_fn              = my_verify_fn,
-      .get_tag_fn             = my_get_tag_fn,
-      .set_tag_fn             = my_set_tag_fn,
       .tuple_size             = sizeof(struct my_tuple_size),
       .tag_size               = <tag bytes per hw sector>,
 };
···
       'tag_size' must be set to identify how many bytes of tag space
       are available per hardware sector.  For DIF this is either 2 or
       0 depending on the value of the Control Mode Page ATO bit.
-
-      See 6.2 for a description of get_tag_fn and set_tag_fn.
 
 ----------------------------------------------------------------------
 2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
+1
block/Kconfig
···
 
 config BLK_DEV_INTEGRITY
 	bool "Block layer data integrity support"
+	select CRC_T10DIF if BLK_DEV_INTEGRITY
 	---help---
 	Some storage devices allow extra information to be
 	stored/retrieved to help protect the data.  The block layer
+2 -2
block/Makefile
···
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= bio-integrity.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= bio-integrity.o blk-integrity.o t10-pi.o
+
+67 -202
block/bio-integrity.c
···
 	bip->bip_slab = idx;
 	bip->bip_bio = bio;
 	bio->bi_integrity = bip;
+	bio->bi_rw |= REQ_INTEGRITY;
 
 	return bip;
 err:
···
  */
 void bio_integrity_free(struct bio *bio)
 {
-	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct bio_set *bs = bio->bi_pool;
 
-	if (bip->bip_owns_buf)
-		kfree(bip->bip_buf);
+	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
+		kfree(page_address(bip->bip_vec->bv_page) +
+		      bip->bip_vec->bv_offset);
 
 	if (bs) {
 		if (bip->bip_slab != BIO_POOL_NONE)
···
 int bio_integrity_add_page(struct bio *bio, struct page *page,
 			   unsigned int len, unsigned int offset)
 {
-	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct bio_vec *iv;
 
 	if (bip->bip_vcnt >= bip->bip_max_vcnt) {
···
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
-static int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bdev);
-
-	if (bi == NULL)
-		return 0;
-
-	if (rw == READ && bi->verify_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_READ))
-		return 1;
-
-	if (rw == WRITE && bi->generate_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_WRITE))
-		return 1;
-
-	return 0;
-}
-
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio:	bio to check
···
  * set prior to calling.  The functions honors the write_generate and
  * read_verify flags in sysfs.
  */
-int bio_integrity_enabled(struct bio *bio)
+bool bio_integrity_enabled(struct bio *bio)
 {
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+
 	if (!bio_is_rw(bio))
-		return 0;
+		return false;
 
 	/* Already protected? */
 	if (bio_integrity(bio))
-		return 0;
+		return false;
 
-	return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+	if (bi == NULL)
+		return false;
+
+	if (bio_data_dir(bio) == READ && bi->verify_fn != NULL &&
+	    (bi->flags & BLK_INTEGRITY_VERIFY))
+		return true;
+
+	if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & BLK_INTEGRITY_GENERATE))
+		return true;
+
+	return false;
 }
 EXPORT_SYMBOL(bio_integrity_enabled);
 
 /**
- * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
  * @bi:		blk_integrity profile for device
- * @sectors:	Number of 512 sectors to convert
+ * @sectors:	Size of the bio in 512-byte sectors
  *
  * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the hardware
- * sector size of the storage device.  Convert the block layer sectors
- * to physical sectors.
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
  */
-static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
-						    unsigned int sectors)
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
 {
-	/* At this point there are only 512b or 4096b DIF/EPP devices */
-	if (bi->sector_size == 4096)
-		return sectors >>= 3;
-
-	return sectors;
+	return sectors >> (ilog2(bi->interval) - 9);
 }
 
 static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
 					       unsigned int sectors)
 {
-	return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
+	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
 }
 
 /**
- * bio_integrity_tag_size - Retrieve integrity tag space
- * @bio:	bio to inspect
- *
- * Description: Returns the maximum number of tag bytes that can be
- * attached to this bio.  Filesystems can use this to determine how
- * much metadata to attach to an I/O.
- */
-unsigned int bio_integrity_tag_size(struct bio *bio)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
-	BUG_ON(bio->bi_iter.bi_size == 0);
-
-	return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
-}
-EXPORT_SYMBOL(bio_integrity_tag_size);
-
-static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
-			     int set)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
-
-	BUG_ON(bip->bip_buf == NULL);
-
-	if (bi->tag_size == 0)
-		return -1;
-
-	nr_sectors = bio_integrity_hw_sectors(bi,
-					DIV_ROUND_UP(len, bi->tag_size));
-
-	if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
-		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
-		       nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
-		return -1;
-	}
-
-	if (set)
-		bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-	else
-		bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-
-	return 0;
-}
-
-/**
- * bio_integrity_set_tag - Attach a tag buffer to a bio
- * @bio:	bio to attach buffer to
- * @tag_buf:	Pointer to a buffer containing tag data
- * @len:	Length of the included buffer
- *
- * Description: Use this function to tag a bio by leveraging the extra
- * space provided by devices formatted with integrity protection.  The
- * size of the integrity buffer must be <= to the size reported by
- * bio_integrity_tag_size().
- */
-int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
-	BUG_ON(bio_data_dir(bio) != WRITE);
-
-	return bio_integrity_tag(bio, tag_buf, len, 1);
-}
-EXPORT_SYMBOL(bio_integrity_set_tag);
-
-/**
- * bio_integrity_get_tag - Retrieve a tag buffer from a bio
- * @bio:	bio to retrieve buffer from
- * @tag_buf:	Pointer to a buffer for the tag data
- * @len:	Length of the target buffer
- *
- * Description: Use this function to retrieve the tag buffer from a
- * completed I/O.  The size of the integrity buffer must be <= to the
- * size reported by bio_integrity_tag_size().
- */
-int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
-	BUG_ON(bio_data_dir(bio) != READ);
-
-	return bio_integrity_tag(bio, tag_buf, len, 0);
-}
-EXPORT_SYMBOL(bio_integrity_get_tag);
-
-/**
- * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
+ * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
- * @operate:	operate number, 1 for generate, 0 for verify
+ * @proc_fn:	Pointer to the relevant processing function
  */
-static int bio_integrity_generate_verify(struct bio *bio, int operate)
+static int bio_integrity_process(struct bio *bio,
+				 integrity_processing_fn *proc_fn)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	struct blk_integrity_exchg bix;
+	struct blk_integrity_iter iter;
 	struct bio_vec *bv;
-	sector_t sector;
-	unsigned int sectors, ret = 0, i;
-	void *prot_buf = bio->bi_integrity->bip_buf;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	unsigned int i, ret = 0;
+	void *prot_buf = page_address(bip->bip_vec->bv_page) +
+		bip->bip_vec->bv_offset;
 
-	if (operate)
-		sector = bio->bi_iter.bi_sector;
-	else
-		sector = bio->bi_integrity->bip_iter.bi_sector;
-
-	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
-	bix.sector_size = bi->sector_size;
+	iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+	iter.interval = bi->interval;
+	iter.seed = bip_get_seed(bip);
+	iter.prot_buf = prot_buf;
 
 	bio_for_each_segment_all(bv, bio, i) {
 		void *kaddr = kmap_atomic(bv->bv_page);
-		bix.data_buf = kaddr + bv->bv_offset;
-		bix.data_size = bv->bv_len;
-		bix.prot_buf = prot_buf;
-		bix.sector = sector;
 
-		if (operate)
-			bi->generate_fn(&bix);
-		else {
-			ret = bi->verify_fn(&bix);
-			if (ret) {
-				kunmap_atomic(kaddr);
-				return ret;
-			}
+		iter.data_buf = kaddr + bv->bv_offset;
+		iter.data_size = bv->bv_len;
+
+		ret = proc_fn(&iter);
+		if (ret) {
+			kunmap_atomic(kaddr);
+			return ret;
 		}
 
-		sectors = bv->bv_len / bi->sector_size;
-		sector += sectors;
-		prot_buf += sectors * bi->tuple_size;
-
 		kunmap_atomic(kaddr);
 	}
 	return ret;
-}
-
-/**
- * bio_integrity_generate - Generate integrity metadata for a bio
- * @bio:	bio to generate integrity metadata for
- *
- * Description: Generates integrity metadata for a bio by calling the
- * block device's generation callback function.  The bio must have a
- * bip attached with enough room to accommodate the generated
- * integrity metadata.
- */
-static void bio_integrity_generate(struct bio *bio)
-{
-	bio_integrity_generate_verify(bio, 1);
-}
-
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
-	if (bi)
-		return bi->tuple_size;
-
-	return 0;
 }
 
 /**
···
 	unsigned long start, end;
 	unsigned int len, nr_pages;
 	unsigned int bytes, offset, i;
-	unsigned int sectors;
+	unsigned int intervals;
 
 	bi = bdev_get_integrity(bio->bi_bdev);
 	q = bdev_get_queue(bio->bi_bdev);
 	BUG_ON(bi == NULL);
 	BUG_ON(bio_integrity(bio));
 
-	sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+	intervals = bio_integrity_intervals(bi, bio_sectors(bio));
 
 	/* Allocate kernel buffer for protection data */
-	len = sectors * blk_integrity_tuple_size(bi);
+	len = intervals * bi->tuple_size;
 	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
···
 		return -EIO;
 	}
 
-	bip->bip_owns_buf = 1;
-	bip->bip_buf = buf;
+	bip->bip_flags |= BIP_BLOCK_INTEGRITY;
 	bip->bip_iter.bi_size = len;
-	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
+	bip_set_seed(bip, bio->bi_iter.bi_sector);
+
+	if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
+		bip->bip_flags |= BIP_IP_CHECKSUM;
 
 	/* Map it */
 	offset = offset_in_page(buf);
···
 
 	/* Auto-generate integrity metadata if this is a write */
 	if (bio_data_dir(bio) == WRITE)
-		bio_integrity_generate(bio);
+		bio_integrity_process(bio, bi->generate_fn);
 
 	return 0;
 }
 EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify - Verify integrity metadata for a bio
- * @bio:	bio to verify
- *
- * Description: This function is called to verify the integrity of a
- * bio.  The data in the bio io_vec is compared to the integrity
- * metadata returned by the HBA.
- */
-static int bio_integrity_verify(struct bio *bio)
-{
-	return bio_integrity_generate_verify(bio, 0);
-}
 
 /**
  * bio_integrity_verify_fn - Integrity I/O completion worker
···
 	struct bio_integrity_payload *bip =
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	int error;
 
-	error = bio_integrity_verify(bio);
+	error = bio_integrity_process(bio, bi->verify_fn);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
···
  */
 void bio_integrity_endio(struct bio *bio, int error)
 {
-	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
 
 	BUG_ON(bip->bip_bio != bio);
 
···
  */
 void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 {
-	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
 
···
 void bio_integrity_trim(struct bio *bio, unsigned int offset,
 			unsigned int sectors)
 {
-	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
 	bio_integrity_advance(bio, offset << 9);
···
 int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 			gfp_t gfp_mask)
 {
-	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
+	struct bio_integrity_payload *bip_src = bio_integrity(bio_src);
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
···
 			   sizeof(struct bio_integrity_payload) +
 			   sizeof(struct bio_vec) * BIP_INLINE_VECS,
 			   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	if (!bip_slab)
-		panic("Failed to create slab\n");
 }
+44 -17
block/bio.c
···
 		front_pad = 0;
 		inline_vecs = nr_iovecs;
 	} else {
+		/* should not use nobvec bioset for nr_iovecs > 0 */
+		if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
+			return NULL;
 		/*
 		 * generic_make_request() converts recursion to iteration; this
 		 * means if we're running beneath it, any bios we allocate and
···
 }
 EXPORT_SYMBOL(bioset_free);
 
-/**
- * bioset_create  - Create a bio_set
- * @pool_size:	Number of bio and bio_vecs to cache in the mempool
- * @front_pad:	Number of bytes to allocate in front of the returned bio
- *
- * Description:
- *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- *    to ask for a number of bytes to be allocated in front of the bio.
- *    Front pad allocation is useful for embedding the bio inside
- *    another structure, to avoid allocating extra data to go with the bio.
- *    Note that the bio must be embedded at the END of that structure always,
- *    or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
+static struct bio_set *__bioset_create(unsigned int pool_size,
+				       unsigned int front_pad,
+				       bool create_bvec_pool)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
 	struct bio_set *bs;
···
 	if (!bs->bio_pool)
 		goto bad;
 
-	bs->bvec_pool = biovec_create_pool(pool_size);
-	if (!bs->bvec_pool)
-		goto bad;
+	if (create_bvec_pool) {
+		bs->bvec_pool = biovec_create_pool(pool_size);
+		if (!bs->bvec_pool)
+			goto bad;
+	}
 
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
···
 	bioset_free(bs);
 	return NULL;
 }
+
+/**
+ * bioset_create  - Create a bio_set
+ * @pool_size:	Number of bio and bio_vecs to cache in the mempool
+ * @front_pad:	Number of bytes to allocate in front of the returned bio
+ *
+ * Description:
+ *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
+ *    to ask for a number of bytes to be allocated in front of the bio.
+ *    Front pad allocation is useful for embedding the bio inside
+ *    another structure, to avoid allocating extra data to go with the bio.
+ *    Note that the bio must be embedded at the END of that structure always,
+ *    or things will break badly.
+ */
+struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, true);
+}
 EXPORT_SYMBOL(bioset_create);
+
+/**
+ * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
+ * @pool_size:	Number of bio to cache in the mempool
+ * @front_pad:	Number of bytes to allocate in front of the returned bio
+ *
+ * Description:
+ *    Same functionality as bioset_create() except that mempool is not
+ *    created for bio_vecs. Saving some memory for bio_clone_fast() users.
+ */
+struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad)
+{
+	return __bioset_create(pool_size, front_pad, false);
+}
+EXPORT_SYMBOL(bioset_create_nobvec);
 
 #ifdef CONFIG_BLK_CGROUP
 /**
-2
block/blk-cgroup.c
···
 static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
-	static atomic64_t id_seq = ATOMIC64_INIT(0);
 	struct blkcg *blkcg;
 
 	if (!parent_css) {
···
 
 	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
 	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
-	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
-3
block/blk-cgroup.h
···
 	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
-	/* for policies to test whether associated blkcg has changed */
-	uint64_t			id;
-
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
 	unsigned int			cfq_leaf_weight;
+31 -36
block/blk-core.c
···
  * @bdev:	device
  *
  * Locates the passed device's request queue and returns the address of its
- * backing_dev_info
- *
- * Will return NULL if the request queue cannot be located.
+ * backing_dev_info.  This function can only be called if @bdev is opened
+ * and the return value is never NULL.
  */
 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 {
-	struct backing_dev_info *ret = NULL;
 	struct request_queue *q = bdev_get_queue(bdev);
 
-	if (q)
-		ret = &q->backing_dev_info;
-	return ret;
+	return &q->backing_dev_info;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
···
 	 * be drained.  Check all the queues and counters.
 	 */
 	if (drain_all) {
+		struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 		drain |= !list_empty(&q->queue_head);
 		for (i = 0; i < 2; i++) {
 			drain |= q->nr_rqs[i];
 			drain |= q->in_flight[i];
-			drain |= !list_empty(&q->flush_queue[i]);
+			if (fq)
+				drain |= !list_empty(&fq->flush_queue[i]);
 		}
 	}
 
···
 #ifdef CONFIG_BLK_CGROUP
 	INIT_LIST_HEAD(&q->blkg_list);
 #endif
-	INIT_LIST_HEAD(&q->flush_queue[0]);
-	INIT_LIST_HEAD(&q->flush_queue[1]);
-	INIT_LIST_HEAD(&q->flush_data_in_flight);
 	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
···
 	if (!q)
 		return NULL;
 
-	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
-	if (!q->flush_rq)
+	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
+	if (!q->fq)
 		return NULL;
 
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
···
 	return q;
 
 fail:
-	kfree(q->flush_rq);
+	blk_free_flush_queue(q->fq);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
···
  * pressure or if @q is dead.
  *
  * Must be called with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *__get_request(struct request_list *rl, int rw_flags,
 				     struct bio *bio, gfp_t gfp_mask)
···
 	int may_queue;
 
 	if (unlikely(blk_queue_dying(q)))
-		return NULL;
+		return ERR_PTR(-ENODEV);
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
···
 				 * process is not a "batcher", and not
 				 * exempted by the IO scheduler
 				 */
-				return NULL;
+				return ERR_PTR(-ENOMEM);
 			}
 		}
 	}
···
 	 * allocated with any setting of ->nr_requests
 	 */
 	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	q->nr_rqs[is_sync]++;
 	rl->count[is_sync]++;
···
 	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
 	 * disturb iosched and blkcg but weird is bettern than dead.
 	 */
-	printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
-			   dev_name(q->backing_dev_info.dev));
+	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
+			   __func__, dev_name(q->backing_dev_info.dev));
 
 	rq->cmd_flags &= ~REQ_ELVPRIV;
 	rq->elv.icq = NULL;
···
 rq_starved:
 	if (unlikely(rl->count[is_sync] == 0))
 		rl->starved[is_sync] = 1;
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 /**
···
  * function keeps retrying under memory pressure and fails iff @q is dead.
  *
  * Must be called with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
···
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
 	rq = __get_request(rl, rw_flags, bio, gfp_mask);
-	if (rq)
+	if (!IS_ERR(rq))
 		return rq;
 
 	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
-		return NULL;
+		return rq;
 	}
 
 	/* wait on @rl and retry */
···
 
 	spin_lock_irq(q->queue_lock);
 	rq = get_request(q, rw, NULL, gfp_mask);
-	if (!rq)
+	if (IS_ERR(rq))
 		spin_unlock_irq(q->queue_lock);
 	/* q->queue_lock is unlocked at this point */
 
···
 {
 	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
 
-	if (unlikely(!rq))
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(rq))
+		return rq;
 
 	blk_rq_set_block_pc(rq);
 
···
 	 * Returns with the queue unlocked.
 	 */
 	req = get_request(q, rw_flags, bio, GFP_NOIO);
-	if (unlikely(!req)) {
-		bio_endio(bio, -ENODEV);	/* @q is dead */
+	if (IS_ERR(req)) {
+		bio_endio(bio, PTR_ERR(req));	/* @q is dead */
 		goto out_unlock;
 	}
 
···
 {
 	int total_bytes;
 
+	trace_block_rq_complete(req->q, req, nr_bytes);
+
 	if (!req->bio)
 		return false;
-
-	trace_block_rq_complete(req->q, req, nr_bytes);
 
 	/*
 	 * For fs requests, rq is just carrier of independent bio's
···
 			error_type = "I/O";
 			break;
 		}
-		printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
-				   error_type, req->rq_disk ?
+		printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
+				   __func__, error_type, req->rq_disk ?
 				   req->rq_disk->disk_name : "?",
 				   (unsigned long long)blk_rq_pos(req));
 
···
 	blk_rq_init(NULL, rq);
 
 	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_bioset(bio_src, gfp_mask, bs);
+		bio = bio_clone_fast(bio_src, gfp_mask, bs);
 		if (!bio)
 			goto free_and_out;
 
+98 -49
block/blk-flush.c
···
 *
 * The actual execution of flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
-* q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
+* fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * flush is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending are proceeded to the next
 * step.  This allows arbitrary merging of different types of FLUSH/FUA
···
	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
 };

-static bool blk_kick_flush(struct request_queue *q);
+static bool blk_kick_flush(struct request_queue *q,
+			   struct blk_flush_queue *fq);

 static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
···
	/* make @rq a normal request */
	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
	rq->end_io = rq->flush.saved_end_io;
-
-	blk_clear_rq_complete(rq);
 }

 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
···
 /**
  * blk_flush_complete_seq - complete flush sequence
  * @rq: FLUSH/FUA request being sequenced
+ * @fq: flush queue
  * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
  * @error: whether an error occurred
  *
···
  * completion and trigger the next step.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
+ * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
  *
  * RETURNS:
  * %true if requests were added to the dispatch queue, %false otherwise.
  */
-static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
-				   int error)
+static bool blk_flush_complete_seq(struct request *rq,
+				   struct blk_flush_queue *fq,
+				   unsigned int seq, int error)
 {
	struct request_queue *q = rq->q;
-	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	bool queued = false, kicked;

	BUG_ON(rq->flush.seq & seq);
···
	case REQ_FSEQ_POSTFLUSH:
		/* queue for flush */
		if (list_empty(pending))
-			q->flush_pending_since = jiffies;
+			fq->flush_pending_since = jiffies;
		list_move_tail(&rq->flush.list, pending);
		break;

	case REQ_FSEQ_DATA:
-		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
		queued = blk_flush_queue_rq(rq, true);
		break;
···
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
		if (q->mq_ops)
-			blk_mq_end_io(rq, error);
+			blk_mq_end_request(rq, error);
		else
			__blk_end_request_all(rq, error);
		break;
···
		BUG();
	}

-	kicked = blk_kick_flush(q);
+	kicked = blk_kick_flush(q, fq);
	return kicked | queued;
 }
···
	bool queued = false;
	struct request *rq, *n;
	unsigned long flags = 0;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

	if (q->mq_ops) {
-		spin_lock_irqsave(&q->mq_flush_lock, flags);
-		q->flush_rq->tag = -1;
+		spin_lock_irqsave(&fq->mq_flush_lock, flags);
+		flush_rq->tag = -1;
	}

-	running = &q->flush_queue[q->flush_running_idx];
-	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
+	running = &fq->flush_queue[fq->flush_running_idx];
+	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

	/* account completion of the flush request */
-	q->flush_running_idx ^= 1;
+	fq->flush_running_idx ^= 1;

	if (!q->mq_ops)
		elv_completed_request(q, flush_rq);
···
		unsigned int seq = blk_flush_cur_seq(rq);

		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
-		queued |= blk_flush_complete_seq(rq, seq, error);
+		queued |= blk_flush_complete_seq(rq, fq, seq, error);
	}
···
	 * directly into request_fn may confuse the driver.  Always use
	 * kblockd.
	 */
-	if (queued || q->flush_queue_delayed) {
+	if (queued || fq->flush_queue_delayed) {
		WARN_ON(q->mq_ops);
		blk_run_queue_async(q);
	}
-	q->flush_queue_delayed = 0;
+	fq->flush_queue_delayed = 0;
	if (q->mq_ops)
-		spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 }

 /**
  * blk_kick_flush - consider issuing flush request
  * @q: request_queue being kicked
+ * @fq: flush queue
  *
  * Flush related states of @q have changed, consider issuing flush request.
  * Please read the comment at the top of this file for more info.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
+ * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
  *
  * RETURNS:
  * %true if flush was issued, %false otherwise.
  */
-static bool blk_kick_flush(struct request_queue *q)
+static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 {
-	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	struct request *first_rq =
		list_first_entry(pending, struct request, flush.list);
+	struct request *flush_rq = fq->flush_rq;

	/* C1 described at the top of this file */
-	if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
+	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
		return false;

	/* C2 and C3 */
-	if (!list_empty(&q->flush_data_in_flight) &&
+	if (!list_empty(&fq->flush_data_in_flight) &&
	    time_before(jiffies,
-			q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
		return false;

	/*
	 * Issue flush and toggle pending_idx.  This makes pending_idx
	 * different from running_idx, which means flush is in flight.
	 */
-	q->flush_pending_idx ^= 1;
+	fq->flush_pending_idx ^= 1;

-	blk_rq_init(q, q->flush_rq);
-	if (q->mq_ops)
-		blk_mq_clone_flush_request(q->flush_rq, first_rq);
+	blk_rq_init(q, flush_rq);

-	q->flush_rq->cmd_type = REQ_TYPE_FS;
-	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
-	q->flush_rq->rq_disk = first_rq->rq_disk;
-	q->flush_rq->end_io = flush_end_io;
+	/*
+	 * Borrow tag from the first request since they can't
+	 * be in flight at the same time.
+	 */
+	if (q->mq_ops) {
+		flush_rq->mq_ctx = first_rq->mq_ctx;
+		flush_rq->tag = first_rq->tag;
+	}

-	return blk_flush_queue_rq(q->flush_rq, false);
+	flush_rq->cmd_type = REQ_TYPE_FS;
+	flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	flush_rq->rq_disk = first_rq->rq_disk;
+	flush_rq->end_io = flush_end_io;
+
+	return blk_flush_queue_rq(flush_rq, false);
 }

 static void flush_data_end_io(struct request *rq, int error)
 {
	struct request_queue *q = rq->q;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);

	/*
	 * After populating an empty queue, kick it to avoid stall.  Read
	 * the comment in flush_end_io().
	 */
-	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+	if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
		blk_run_queue_async(q);
 }
···
 {
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
	unsigned long flags;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);

-	ctx = rq->mq_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * After populating an empty queue, kick it to avoid stall.  Read
	 * the comment in flush_end_io().
	 */
-	spin_lock_irqsave(&q->mq_flush_lock, flags);
-	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
		blk_mq_run_hw_queue(hctx, true);
-	spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 }

 /**
···
	struct request_queue *q = rq->q;
	unsigned int fflags = q->flush_flags;	/* may change, cache */
	unsigned int policy = blk_flush_policy(fflags, rq);
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);

	/*
	 * @policy now records what operations need to be done.  Adjust
···
	 */
	if (!policy) {
		if (q->mq_ops)
-			blk_mq_end_io(rq, 0);
+			blk_mq_end_request(rq, 0);
		else
			__blk_end_bidi_request(rq, 0, 0, 0);
		return;
···
	if (q->mq_ops) {
		rq->end_io = mq_flush_data_end_io;

-		spin_lock_irq(&q->mq_flush_lock);
-		blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
-		spin_unlock_irq(&q->mq_flush_lock);
+		spin_lock_irq(&fq->mq_flush_lock);
+		blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+		spin_unlock_irq(&fq->mq_flush_lock);
		return;
	}
	rq->end_io = flush_data_end_io;

-	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+	blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
 }

 /**
···
 }
 EXPORT_SYMBOL(blkdev_issue_flush);

-void blk_mq_init_flush(struct request_queue *q)
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
+		int node, int cmd_size)
 {
-	spin_lock_init(&q->mq_flush_lock);
+	struct blk_flush_queue *fq;
+	int rq_sz = sizeof(struct request);
+
+	fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
+	if (!fq)
+		goto fail;
+
+	if (q->mq_ops) {
+		spin_lock_init(&fq->mq_flush_lock);
+		rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
+	}
+
+	fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
+	if (!fq->flush_rq)
+		goto fail_rq;
+
+	INIT_LIST_HEAD(&fq->flush_queue[0]);
+	INIT_LIST_HEAD(&fq->flush_queue[1]);
+	INIT_LIST_HEAD(&fq->flush_data_in_flight);
+
+	return fq;
+
+ fail_rq:
+	kfree(fq);
+ fail:
+	return NULL;
+}
+
+void blk_free_flush_queue(struct blk_flush_queue *fq)
+{
+	/* bio based request queue hasn't flush queue */
+	if (!fq)
+		return;
+
+	kfree(fq->flush_rq);
+	kfree(fq);
 }
+65 -38
block/blk-integrity.c
···
	if (!b1 || !b2)
		return -1;

-	if (b1->sector_size != b2->sector_size) {
-		printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
-		       gd1->disk_name, gd2->disk_name,
-		       b1->sector_size, b2->sector_size);
+	if (b1->interval != b2->interval) {
+		pr_err("%s: %s/%s protection interval %u != %u\n",
+		       __func__, gd1->disk_name, gd2->disk_name,
+		       b1->interval, b2->interval);
		return -1;
	}
···
 }
 EXPORT_SYMBOL(blk_integrity_compare);

-int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
-			   struct request *next)
+bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
+			    struct request *next)
 {
-	if (blk_integrity_rq(req) != blk_integrity_rq(next))
-		return -1;
+	if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
+		return true;
+
+	if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
+		return false;
+
+	if (bio_integrity(req->bio)->bip_flags !=
+	    bio_integrity(next->bio)->bip_flags)
+		return false;

	if (req->nr_integrity_segments + next->nr_integrity_segments >
	    q->limits.max_integrity_segments)
-		return -1;
+		return false;

-	return 0;
+	return true;
 }
 EXPORT_SYMBOL(blk_integrity_merge_rq);

-int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
-			    struct bio *bio)
+bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
+			     struct bio *bio)
 {
	int nr_integrity_segs;
	struct bio *next = bio->bi_next;
+
+	if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL)
+		return true;
+
+	if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL)
+		return false;
+
+	if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
+		return false;

	bio->bi_next = NULL;
	nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
···
	if (req->nr_integrity_segments + nr_integrity_segs >
	    q->limits.max_integrity_segments)
-		return -1;
+		return false;

	req->nr_integrity_segments += nr_integrity_segs;

-	return 0;
+	return true;
 }
 EXPORT_SYMBOL(blk_integrity_merge_bio);
···
	return sprintf(page, "0\n");
 }

-static ssize_t integrity_read_store(struct blk_integrity *bi,
-				    const char *page, size_t count)
+static ssize_t integrity_verify_store(struct blk_integrity *bi,
+				      const char *page, size_t count)
 {
	char *p = (char *) page;
	unsigned long val = simple_strtoul(p, &p, 10);

	if (val)
-		bi->flags |= INTEGRITY_FLAG_READ;
+		bi->flags |= BLK_INTEGRITY_VERIFY;
	else
-		bi->flags &= ~INTEGRITY_FLAG_READ;
+		bi->flags &= ~BLK_INTEGRITY_VERIFY;

	return count;
 }

-static ssize_t integrity_read_show(struct blk_integrity *bi, char *page)
+static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page)
 {
-	return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0);
+	return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0);
 }

-static ssize_t integrity_write_store(struct blk_integrity *bi,
-				     const char *page, size_t count)
+static ssize_t integrity_generate_store(struct blk_integrity *bi,
+					const char *page, size_t count)
 {
	char *p = (char *) page;
	unsigned long val = simple_strtoul(p, &p, 10);

	if (val)
-		bi->flags |= INTEGRITY_FLAG_WRITE;
+		bi->flags |= BLK_INTEGRITY_GENERATE;
	else
-		bi->flags &= ~INTEGRITY_FLAG_WRITE;
+		bi->flags &= ~BLK_INTEGRITY_GENERATE;

	return count;
 }

-static ssize_t integrity_write_show(struct blk_integrity *bi, char *page)
+static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page)
 {
-	return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0);
+	return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0);
+}
+
+static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
+{
+	return sprintf(page, "%u\n",
+		       (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0);
 }

 static struct integrity_sysfs_entry integrity_format_entry = {
···
	.show = integrity_tag_size_show,
 };

-static struct integrity_sysfs_entry integrity_read_entry = {
+static struct integrity_sysfs_entry integrity_verify_entry = {
	.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
-	.show = integrity_read_show,
-	.store = integrity_read_store,
+	.show = integrity_verify_show,
+	.store = integrity_verify_store,
 };

-static struct integrity_sysfs_entry integrity_write_entry = {
+static struct integrity_sysfs_entry integrity_generate_entry = {
	.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
-	.show = integrity_write_show,
-	.store = integrity_write_store,
+	.show = integrity_generate_show,
+	.store = integrity_generate_store,
+};
+
+static struct integrity_sysfs_entry integrity_device_entry = {
+	.attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
+	.show = integrity_device_show,
 };

 static struct attribute *integrity_attrs[] = {
	&integrity_format_entry.attr,
	&integrity_tag_size_entry.attr,
-	&integrity_read_entry.attr,
-	&integrity_write_entry.attr,
+	&integrity_verify_entry.attr,
+	&integrity_generate_entry.attr,
+	&integrity_device_entry.attr,
	NULL,
 };
···
	kobject_uevent(&bi->kobj, KOBJ_ADD);

-		bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE;
-		bi->sector_size = queue_logical_block_size(disk->queue);
+		bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE;
+		bi->interval = queue_logical_block_size(disk->queue);
		disk->integrity = bi;
	} else
		bi = disk->integrity;
···
		bi->generate_fn = template->generate_fn;
		bi->verify_fn = template->verify_fn;
		bi->tuple_size = template->tuple_size;
-		bi->set_tag_fn = template->set_tag_fn;
-		bi->get_tag_fn = template->get_tag_fn;
		bi->tag_size = template->tag_size;
+		bi->flags |= template->flags;
	} else
		bi->name = bi_unsupported_name;
+9 -5
block/blk-merge.c
···
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
+	bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
+			&q->queue_flags);
+
+	if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) &&
	    bio->bi_vcnt < queue_max_segments(q))
		bio->bi_phys_segments = bio->bi_vcnt;
	else {
		struct bio *nxt = bio->bi_next;

		bio->bi_next = NULL;
-		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
+		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio,
+				no_sg_merge);
		bio->bi_next = nxt;
	}
···
	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
		goto no_merge;

-	if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
+	if (blk_integrity_merge_bio(q, req, bio) == false)
		goto no_merge;

	/*
···
	if (total_phys_segments > queue_max_segments(q))
		return 0;

-	if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
+	if (blk_integrity_merge_rq(q, req, next) == false)
		return 0;

	/* Merge is OK... */
···
		return false;

	/* only merge integrity protected bio into ditto rq */
-	if (bio_integrity(bio) != blk_integrity_rq(rq))
+	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
		return false;

	/* must be using the same buffer */
+22 -33
block/blk-mq-tag.c
···
		return;

	wait_cnt = atomic_dec_return(&bs->wait_cnt);
+	if (unlikely(wait_cnt < 0))
+		wait_cnt = atomic_inc_return(&bs->wait_cnt);
	if (wait_cnt == 0) {
-wake:
		atomic_add(bt->wake_cnt, &bs->wait_cnt);
		bt_index_atomic_inc(&bt->wake_index);
		wake_up(&bs->wait);
-	} else if (wait_cnt < 0) {
-		wait_cnt = atomic_inc_return(&bs->wait_cnt);
-		if (!wait_cnt)
-			goto wake;
	}
 }
···
	__blk_mq_put_reserved_tag(tags, tag);
 }

-static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
-			     unsigned long *free_map, unsigned int off)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_iter_fn *fn, void *data, bool reserved)
 {
-	int i;
+	struct request *rq;
+	int bit, i;

	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];
-		int bit = 0;

-		do {
-			bit = find_next_zero_bit(&bm->word, bm->depth, bit);
-			if (bit >= bm->depth)
-				break;
-
-			__set_bit(bit + off, free_map);
-			bit++;
-		} while (1);
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
+			if (rq->q == hctx->queue)
+				fn(hctx, rq, data, reserved);
+		}

		off += (1 << bt->bits_per_word);
	}
 }

-void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
-			  void (*fn)(void *, unsigned long *), void *data)
+void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+		void *priv)
 {
-	unsigned long *tag_map;
-	size_t map_size;
+	struct blk_mq_tags *tags = hctx->tags;

-	map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
-	tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
-	if (!tag_map)
-		return;
-
-	bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
	if (tags->nr_reserved_tags)
-		bt_for_each_free(&tags->breserved_tags, tag_map, 0);
-
-	fn(data, tag_map);
-	kfree(tag_map);
+		bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
+	bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+		      false);
 }
 EXPORT_SYMBOL(blk_mq_tag_busy_iter);
···
	}

	bt->wake_cnt = BT_WAIT_BATCH;
-	if (bt->wake_cnt > depth / 4)
-		bt->wake_cnt = max(1U, depth / 4);
+	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
+		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);

	bt->depth = depth;
 }
+189 -179
block/blk-mq.c
···
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
 #include <linux/delay.h>
+#include <linux/crash_dump.h>

 #include <trace/events/block.h>
···
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	struct blk_mq_alloc_data alloc_data;
+	int ret;

-	if (blk_mq_queue_enter(q))
-		return NULL;
+	ret = blk_mq_queue_enter(q);
+	if (ret)
+		return ERR_PTR(ret);

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);
···
		ctx = alloc_data.ctx;
	}
	blk_mq_put_ctx(ctx);
+	if (!rq)
+		return ERR_PTR(-EWOULDBLOCK);
	return rq;
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
···
	__blk_mq_free_request(hctx, ctx, rq);
 }

-/*
- * Clone all relevant state from a request that has been put on hold in
- * the flush state machine into the preallocated flush request that hangs
- * off the request queue.
- *
- * For a driver the flush request should be invisible, that's why we are
- * impersonating the original request here.
- */
-void blk_mq_clone_flush_request(struct request *flush_rq,
-		struct request *orig_rq)
-{
-	struct blk_mq_hw_ctx *hctx =
-		orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
-
-	flush_rq->mq_ctx = orig_rq->mq_ctx;
-	flush_rq->tag = orig_rq->tag;
-	memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
-		hctx->cmd_size);
-}
-
-inline void __blk_mq_end_io(struct request *rq, int error)
+inline void __blk_mq_end_request(struct request *rq, int error)
 {
	blk_account_io_done(rq);
···
		blk_mq_free_request(rq);
	}
 }
-EXPORT_SYMBOL(__blk_mq_end_io);
+EXPORT_SYMBOL(__blk_mq_end_request);

-void blk_mq_end_io(struct request *rq, int error)
+void blk_mq_end_request(struct request *rq, int error)
 {
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
-	__blk_mq_end_io(rq, error);
+	__blk_mq_end_request(rq, error);
 }
-EXPORT_SYMBOL(blk_mq_end_io);
+EXPORT_SYMBOL(blk_mq_end_request);

 static void __blk_mq_complete_request_remote(void *data)
 {
···
	struct request_queue *q = rq->q;

	if (!q->softirq_done_fn)
-		blk_mq_end_io(rq, rq->errors);
+		blk_mq_end_request(rq, rq->errors);
	else
		blk_mq_ipi_complete_request(rq);
 }
···
 }
 EXPORT_SYMBOL(blk_mq_complete_request);

-static void blk_mq_start_request(struct request *rq, bool last)
+void blk_mq_start_request(struct request *rq)
 {
	struct request_queue *q = rq->q;
···
		 */
		rq->nr_phys_segments++;
	}
-
-	/*
-	 * Flag the last request in the series so that drivers know when IO
-	 * should be kicked off, if they don't do it on a per-request basis.
-	 *
-	 * Note: the flag isn't the only condition drivers should do kick off.
-	 * If drive is busy, the last request might not have the bit set.
-	 */
-	if (last)
-		rq->cmd_flags |= REQ_END;
 }
+EXPORT_SYMBOL(blk_mq_start_request);

 static void __blk_mq_requeue_request(struct request *rq)
 {
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
-	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

-	rq->cmd_flags &= ~REQ_END;
-
-	if (q->dma_drain_size && blk_rq_bytes(rq))
-		rq->nr_phys_segments--;
+	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+		if (q->dma_drain_size && blk_rq_bytes(rq))
+			rq->nr_phys_segments--;
+	}
 }

 void blk_mq_requeue_request(struct request *rq)
 {
	__blk_mq_requeue_request(rq);
-	blk_clear_rq_complete(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true);
···
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);

-static inline bool is_flush_request(struct request *rq, unsigned int tag)
+static inline bool is_flush_request(struct request *rq,
+		struct blk_flush_queue *fq, unsigned int tag)
 {
	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
-			rq->q->flush_rq->tag == tag);
+			fq->flush_rq->tag == tag);
 }

 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
	struct request *rq = tags->rqs[tag];
+	/* mq_ctx of flush rq is always cloned from the corresponding req */
+	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);

-	if (!is_flush_request(rq, tag))
+	if (!is_flush_request(rq, fq, tag))
		return rq;

-	return rq->q->flush_rq;
+	return fq->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);

 struct blk_mq_timeout_data {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long *next;
-	unsigned int *next_set;
+	unsigned long next;
+	unsigned int next_set;
 };

-static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
+void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-	struct blk_mq_timeout_data *data = __data;
-	struct blk_mq_hw_ctx *hctx = data->hctx;
-	unsigned int tag;
-
-	/* It may not be in flight yet (this is where
-	 * the REQ_ATOMIC_STARTED flag comes in). The requests are
-	 * statically allocated, so we know it's always safe to access the
-	 * memory associated with a bit offset into ->rqs[].
-	 */
-	tag = 0;
-	do {
-		struct request *rq;
-
-		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
-		if (tag >= hctx->tags->nr_tags)
-			break;
-
-		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
-		if (rq->q != hctx->queue)
-			continue;
-		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-			continue;
-
-		blk_rq_check_expired(rq, data->next, data->next_set);
-	} while (1);
-}
-
-static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
-		unsigned long *next,
-		unsigned int *next_set)
-{
-	struct blk_mq_timeout_data data = {
-		.hctx		= hctx,
-		.next		= next,
-		.next_set	= next_set,
-	};
-
-	/*
-	 * Ask the tagging code to iterate busy requests, so we can
-	 * check them for timeout.
-	 */
-	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
-}
-
-static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
-{
-	struct request_queue *q = rq->q;
+	struct blk_mq_ops *ops = req->q->mq_ops;
+	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	/*
	 * We know that complete is set at this point. If STARTED isn't set
···
	 * we both flags will get cleared. So check here again, and ignore
	 * a timeout event with a request that isn't active.
	 */
+	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
+		return;
+
+	if (ops->timeout)
+		ret = ops->timeout(req, reserved);
+
+	switch (ret) {
+	case BLK_EH_HANDLED:
+		__blk_mq_complete_request(req);
+		break;
+	case BLK_EH_RESET_TIMER:
+		blk_add_timer(req);
+		blk_clear_rq_complete(req);
+		break;
+	case BLK_EH_NOT_HANDLED:
+		break;
+	default:
+		printk(KERN_ERR "block: bad eh return: %d\n", ret);
+		break;
+	}
+}
+
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	struct blk_mq_timeout_data *data = priv;
+
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-		return BLK_EH_NOT_HANDLED;
+		return;

-	if (!q->mq_ops->timeout)
-		return BLK_EH_RESET_TIMER;
-
-	return q->mq_ops->timeout(rq);
+	if (time_after_eq(jiffies, rq->deadline)) {
+		if (!blk_mark_rq_complete(rq))
+			blk_mq_rq_timed_out(rq, reserved);
+	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
+		data->next = rq->deadline;
+		data->next_set = 1;
+	}
 }

-static void blk_mq_rq_timer(unsigned long data)
+static void blk_mq_rq_timer(unsigned long priv)
 {
-	struct request_queue *q = (struct request_queue *) data;
+	struct request_queue *q = (struct request_queue *)priv;
+	struct blk_mq_timeout_data data = {
+		.next		= 0,
+		.next_set	= 0,
+	};
	struct blk_mq_hw_ctx *hctx;
-	unsigned long next = 0;
-	int i, next_set = 0;
+	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
···
		if (!hctx->nr_ctx || !hctx->tags)
			continue;

-		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
	}

-	if (next_set) {
-		next = blk_rq_timeout(round_jiffies_up(next));
-		mod_timer(&q->timeout, next);
+	if (data.next_set) {
+		data.next = blk_rq_timeout(round_jiffies_up(data.next));
+		mod_timer(&q->timeout, data.next);
	} else {
		queue_for_each_hw_ctx(q, hctx, i)
			blk_mq_tag_idle(hctx);
···
		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);

-		blk_mq_start_request(rq, list_empty(&rq_list));
-
-		ret = q->mq_ops->queue_rq(hctx, rq);
+		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
···
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			rq->errors = -EIO;
-			blk_mq_end_io(rq, rq->errors);
+			blk_mq_end_request(rq, rq->errors);
			break;
		}
···
		int ret;

		blk_mq_bio_to_request(rq, bio);
-		blk_mq_start_request(rq, true);

		/*
		 * For OK queue, we are done. For error, kill it. Any other
		 * error (busy), just add it to our list as we previously
		 * would have done
		 */
-		ret = q->mq_ops->queue_rq(data.hctx, rq);
+		ret = q->mq_ops->queue_rq(data.hctx, rq, true);
		if (ret == BLK_MQ_RQ_QUEUE_OK)
			goto done;
		else {
···
		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
			rq->errors = -EIO;
-			blk_mq_end_io(rq, rq->errors);
+			blk_mq_end_request(rq, rq->errors);
			goto done;
		}
	}
···
	return NOTIFY_OK;
 }

+static void blk_mq_exit_hctx(struct request_queue *q,
+		struct blk_mq_tag_set *set,
+		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	unsigned flush_start_tag = set->queue_depth;
+
+	blk_mq_tag_idle(hctx);
+
+	if (set->ops->exit_request)
+		set->ops->exit_request(set->driver_data,
+				       hctx->fq->flush_rq, hctx_idx,
+				       flush_start_tag + hctx_idx);
+
+	if (set->ops->exit_hctx)
+		set->ops->exit_hctx(hctx, hctx_idx);
+
+	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+	blk_free_flush_queue(hctx->fq);
+	kfree(hctx->ctxs);
+	blk_mq_free_bitmap(&hctx->ctx_map);
+}
+
 static void blk_mq_exit_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set, int nr_queue)
 {
···
	queue_for_each_hw_ctx(q, hctx, i) {
		if (i == nr_queue)
			break;
-
-		blk_mq_tag_idle(hctx);
-
-		if (set->ops->exit_hctx)
-			set->ops->exit_hctx(hctx, i);
-
-		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-		kfree(hctx->ctxs);
-		blk_mq_free_bitmap(&hctx->ctx_map);
+		blk_mq_exit_hctx(q, set, hctx, i);
	}
-
 }

 static void blk_mq_free_hw_queues(struct request_queue *q,
···
	}
 }

+static int blk_mq_init_hctx(struct request_queue *q,
+		struct blk_mq_tag_set *set,
+		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
+{
+	int node;
+	unsigned flush_start_tag = set->queue_depth;
+
+	node = hctx->numa_node;
+	if (node == NUMA_NO_NODE)
+		node = hctx->numa_node = set->numa_node;
+
+	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+	spin_lock_init(&hctx->lock);
+	INIT_LIST_HEAD(&hctx->dispatch);
+	hctx->queue = q;
+	hctx->queue_num = hctx_idx;
+	hctx->flags = set->flags;
+	hctx->cmd_size = set->cmd_size;
+
+	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
+					blk_mq_hctx_notify, hctx);
+	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+
+	hctx->tags = set->tags[hctx_idx];
+
+	/*
+	 * Allocate space for all possible cpus to avoid allocation at
+	 * runtime
+	 */
+	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+					GFP_KERNEL, node);
+	if (!hctx->ctxs)
+		goto unregister_cpu_notifier;
+
+	if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+		goto free_ctxs;
+
+	hctx->nr_ctx = 0;
+
+	if (set->ops->init_hctx &&
+	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
+		goto free_bitmap;
+
+	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+	if (!hctx->fq)
+		goto exit_hctx;
+
+	if (set->ops->init_request &&
+	    set->ops->init_request(set->driver_data,
+				   hctx->fq->flush_rq, hctx_idx,
+				   flush_start_tag + hctx_idx, node))
+		goto free_fq;
+
+	return 0;
+
+ free_fq:
+	kfree(hctx->fq);
+ exit_hctx:
+	if (set->ops->exit_hctx)
+		set->ops->exit_hctx(hctx, hctx_idx);
+ free_bitmap:
+	blk_mq_free_bitmap(&hctx->ctx_map);
+ free_ctxs:
+	kfree(hctx->ctxs);
+ unregister_cpu_notifier:
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1635 + 1636 + return -1; 1637 + } 1638 + 1542 1639 static int blk_mq_init_hw_queues(struct request_queue *q, 1543 1640 struct blk_mq_tag_set *set) 1544 1641 { ··· 1620 1575 * Initialize hardware queues 1621 1576 */ 1622 1577 queue_for_each_hw_ctx(q, hctx, i) { 1623 - int node; 1624 - 1625 - node = hctx->numa_node; 1626 - if (node == NUMA_NO_NODE) 1627 - node = hctx->numa_node = set->numa_node; 1628 - 1629 - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1630 - INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1631 - spin_lock_init(&hctx->lock); 1632 - INIT_LIST_HEAD(&hctx->dispatch); 1633 - hctx->queue = q; 1634 - hctx->queue_num = i; 1635 - hctx->flags = set->flags; 1636 - hctx->cmd_size = set->cmd_size; 1637 - 1638 - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1639 - blk_mq_hctx_notify, hctx); 1640 - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1641 - 1642 - hctx->tags = set->tags[i]; 1643 - 1644 - /* 1645 - * Allocate space for all possible cpus to avoid allocation at 1646 - * runtime 1647 - */ 1648 - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1649 - GFP_KERNEL, node); 1650 - if (!hctx->ctxs) 1651 - break; 1652 - 1653 - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1654 - break; 1655 - 1656 - hctx->nr_ctx = 0; 1657 - 1658 - if (set->ops->init_hctx && 1659 - set->ops->init_hctx(hctx, set->driver_data, i)) 1578 + if (blk_mq_init_hctx(q, set, hctx, i)) 1660 1579 break; 1661 1580 } 1662 1581 ··· 1774 1765 if (!ctx) 1775 1766 return ERR_PTR(-ENOMEM); 1776 1767 1768 + /* 1769 + * If a crashdump is active, then we are potentially in a very 1770 + * memory constrained environment. Limit us to 1 queue and 1771 + * 64 tags to prevent using too much memory. 
1772 + */ 1773 + if (is_kdump_kernel()) { 1774 + set->nr_hw_queues = 1; 1775 + set->queue_depth = min(64U, set->queue_depth); 1776 + } 1777 + 1777 1778 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1778 1779 set->numa_node); 1779 1780 ··· 1802 1783 if (!hctxs[i]) 1803 1784 goto err_hctxs; 1804 1785 1805 - if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) 1786 + if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 1787 + node)) 1806 1788 goto err_hctxs; 1807 1789 1808 1790 atomic_set(&hctxs[i]->nr_active, 0); ··· 1850 1830 else 1851 1831 blk_queue_make_request(q, blk_sq_make_request); 1852 1832 1853 - blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); 1854 1833 if (set->timeout) 1855 1834 blk_queue_rq_timeout(q, set->timeout); 1856 1835 ··· 1861 1842 if (set->ops->complete) 1862 1843 blk_queue_softirq_done(q, set->ops->complete); 1863 1844 1864 - blk_mq_init_flush(q); 1865 1845 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 1866 1846 1867 - q->flush_rq = kzalloc(round_up(sizeof(struct request) + 1868 - set->cmd_size, cache_line_size()), 1869 - GFP_KERNEL); 1870 - if (!q->flush_rq) 1871 - goto err_hw; 1872 - 1873 1847 if (blk_mq_init_hw_queues(q, set)) 1874 - goto err_flush_rq; 1848 + goto err_hw; 1875 1849 1876 1850 mutex_lock(&all_q_mutex); 1877 1851 list_add_tail(&q->all_q_node, &all_q_list); ··· 1876 1864 1877 1865 return q; 1878 1866 1879 - err_flush_rq: 1880 - kfree(q->flush_rq); 1881 1867 err_hw: 1882 1868 blk_cleanup_queue(q); 1883 1869 err_hctxs:
+2 -1
block/blk-mq.h
···
 
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
-void blk_mq_init_flush(struct request_queue *q);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 void blk_mq_clone_flush_request(struct request *flush_rq,
···
  */
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
+
+extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 /*
  * Basic implementation of sparser bitmap, allowing the user to spread
+2 -2
block/blk-settings.c
···
 	bottom = max(b->physical_block_size, b->io_min) + alignment;
 
 	/* Verify that top and bottom intervals line up */
-	if (max(top, bottom) & (min(top, bottom) - 1)) {
+	if (max(top, bottom) % min(top, bottom)) {
 		t->misaligned = 1;
 		ret = -1;
 	}
···
 
 	/* Find lowest common alignment_offset */
 	t->alignment_offset = lcm(t->alignment_offset, alignment)
-		& (max(t->physical_block_size, t->io_min) - 1);
+		% max(t->physical_block_size, t->io_min);
 
 	/* Verify that new alignment_offset is on a logical block boundary */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
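The blk-settings.c hunks above replace a power-of-two mask trick with a plain modulo: `x & (y - 1)` only equals `x % y` when `y` is a power of two, and queue granularities like `io_min` need not be one. A small userspace check of the difference (the helper names are ours, not the kernel's):

```c
#include <assert.h>

static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Old check: only valid when umin(top, bottom) is a power of two. */
static int misaligned_mask(unsigned int top, unsigned int bottom)
{
	return (umax(top, bottom) & (umin(top, bottom) - 1)) != 0;
}

/* New check: correct for any granularity. */
static int misaligned_mod(unsigned int top, unsigned int bottom)
{
	return umax(top, bottom) % umin(top, bottom) != 0;
}
```

With a 3072-byte granularity (not a power of two) the mask form falsely reports misalignment even when the intervals line up exactly.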
+2 -2
block/blk-sysfs.c
···
 
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
-
-	kfree(q->flush_rq);
+	else
+		blk_free_flush_queue(q->fq);
 
 	blk_trace_shutdown(q);
 
+8 -7
block/blk-timeout.c
···
 	switch (ret) {
 	case BLK_EH_HANDLED:
 		/* Can we use req->errors here? */
-		if (q->mq_ops)
-			__blk_mq_complete_request(req);
-		else
-			__blk_complete_request(req);
+		__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
 		blk_add_timer(req);
···
 	}
 }
 
-void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set)
 {
 	if (time_after_eq(jiffies, rq->deadline)) {
···
 	if (blk_mark_rq_complete(req))
 		return;
 	blk_delete_timer(req);
-	blk_rq_timed_out(req);
+	if (req->q->mq_ops)
+		blk_mq_rq_timed_out(req, false);
+	else
+		blk_rq_timed_out(req);
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
···
 	struct request_queue *q = req->q;
 	unsigned long expiry;
 
-	if (!q->rq_timed_out_fn)
+	/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
+	if (!q->mq_ops && !q->rq_timed_out_fn)
 		return;
 
 	BUG_ON(!list_empty(&req->timeout_list));
+33 -4
block/blk.h
···
 #define BLK_INTERNAL_H
 
 #include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
 
 /* Amount of time in which a process may batch requests */
 #define BLK_BATCH_TIME	(HZ/50UL)
···
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
+struct blk_flush_queue {
+	unsigned int		flush_queue_delayed:1;
+	unsigned int		flush_pending_idx:1;
+	unsigned int		flush_running_idx:1;
+	unsigned long		flush_pending_since;
+	struct list_head	flush_queue[2];
+	struct list_head	flush_data_in_flight;
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
+};
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
+static inline struct blk_flush_queue *blk_get_flush_queue(
+		struct request_queue *q, struct blk_mq_ctx *ctx)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	if (!q->mq_ops)
+		return q->fq;
+
+	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+	return hctx->fq;
+}
+
 static inline void __blk_get_queue(struct request_queue *q)
 {
 	kobject_get(&q->kobj);
 }
+
+struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
+		int node, int cmd_size);
+void blk_free_flush_queue(struct blk_flush_queue *q);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
···
 			unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
-void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
-			  unsigned int *next_set);
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
···
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
+	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
···
 		 * should be restarted later. Please see flush_end_io() for
 		 * details.
 		 */
-		if (q->flush_pending_idx != q->flush_running_idx &&
+		if (fq->flush_pending_idx != fq->flush_running_idx &&
 		    !queue_flush_queueable(q)) {
-			q->flush_queue_delayed = 1;
+			fq->flush_queue_delayed = 1;
 			return NULL;
 		}
 		if (unlikely(blk_queue_bypass(q)) ||
+5 -4
block/bsg.c
···
 	 * map scatter-gather elements separately and string them to request
 	 */
 	rq = blk_get_request(q, rw, GFP_KERNEL);
-	if (!rq)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(rq))
+		return rq;
 	blk_rq_set_block_pc(rq);
 
 	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
···
 	}
 
 		next_rq = blk_get_request(q, READ, GFP_KERNEL);
-		if (!next_rq) {
-			ret = -ENOMEM;
+		if (IS_ERR(next_rq)) {
+			ret = PTR_ERR(next_rq);
+			next_rq = NULL;
 			goto out;
 		}
 		rq->next_rq = next_rq;
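Many hunks in this series switch callers of `blk_get_request()` from `NULL` checks to the kernel's `ERR_PTR` convention, which encodes an errno in the pointer value itself so the caller learns *why* the allocation failed. A self-contained userspace model of the pattern (the `ERR_PTR`/`IS_ERR`/`PTR_ERR` helpers are re-implemented here to match their `include/linux/err.h` semantics; `fake_get_request` is a hypothetical stand-in):

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>

#define MAX_ERRNO 4095

/* Userspace re-implementation of the kernel's err.h helpers. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errno values live in the top 4095 bytes of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int the_request; /* placeholder object for the success path */

/* Hypothetical allocator: returns a valid pointer or an encoded errno. */
static void *fake_get_request(int out_of_memory)
{
	if (out_of_memory)
		return ERR_PTR(-ENOMEM);
	return &the_request;
}
```

This is why the bsg.c fix also sets `next_rq = NULL` after `PTR_ERR()`: an error-encoded pointer must never be passed on to code that treats non-NULL as valid.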
+5 -5
block/cfq-iosched.c
···
 	struct cfq_ttime ttime;
 	int ioprio;		/* the current ioprio */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	uint64_t blkcg_id;	/* the current blkcg ID */
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
 #endif
 };
 
···
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *sync_cfqq;
-	uint64_t id;
+	uint64_t serial_nr;
 
 	rcu_read_lock();
-	id = bio_blkcg(bio)->id;
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
 	rcu_read_unlock();
 
 	/*
 	 * Check whether blkcg has changed.  The condition may trigger
 	 * spuriously on a newly created cic but there's no harm.
 	 */
-	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
+	if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
 		return;
 
 	sync_cfqq = cic_to_cfqq(cic, 1);
···
 		cfq_put_queue(sync_cfqq);
 	}
 
-	cic->blkcg_id = id;
+	cic->blkcg_serial_nr = serial_nr;
 }
 #else
 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
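The cfq change above stops caching `blkcg->id` and caches `css.serial_nr` instead: an id can be recycled by a later cgroup, so a stale cached id can falsely look current, whereas serial numbers are handed out monotonically and never reused. A toy illustration of the difference (`toy_css` and its allocator are ours, modeling only the comparison semantics):

```c
#include <assert.h>

/* Monotonic serial source, as css serial numbers are in the kernel. */
static unsigned long long next_serial = 1;

struct toy_css {
	int id;				/* may be recycled after release */
	unsigned long long serial_nr;	/* never reused */
};

/* "Create" a group; the id may be one a dead group previously held. */
static void toy_css_init(struct toy_css *css, int possibly_recycled_id)
{
	css->id = possibly_recycled_id;
	css->serial_nr = next_serial++;
}
```

Comparing cached serials instead of cached ids means a recycled id can no longer masquerade as "no change".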
-4
block/compat_ioctl.c
···
 		if (!arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		return compat_put_long(arg,
 				       (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
 	case BLKROGET: /* compatible */
···
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
 		return 0;
 	case BLKGETSIZE:
-4
block/ioctl.c
···
 		if (!arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
 	case BLKROGET:
 		return put_int(arg, bdev_read_only(bdev) != 0);
···
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			return -ENOTTY;
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
 		return 0;
 	case BLKBSZSET:
+4 -4
block/partitions/mac.c
···
 			be32_to_cpu(part->start_block) * (secsize/512),
 			be32_to_cpu(part->block_count) * (secsize/512));
 
-		if (!strnicmp(part->type, "Linux_RAID", 10))
+		if (!strncasecmp(part->type, "Linux_RAID", 10))
 			state->parts[slot].flags = ADDPART_FLAG_RAID;
 #ifdef CONFIG_PPC_PMAC
 		/*
···
 				goodness++;
 
 			if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
-			    || (strnicmp(part->type, "Linux", 5) == 0
+			    || (strncasecmp(part->type, "Linux", 5) == 0
 			        && strcasecmp(part->type, "Linux_swap") != 0)) {
 				int i, l;
 
···
 				if (strcmp(part->name, "/") == 0)
 					goodness++;
 				for (i = 0; i <= l - 4; ++i) {
-					if (strnicmp(part->name + i, "root",
+					if (strncasecmp(part->name + i, "root",
 						     4) == 0) {
 						goodness += 2;
 						break;
 					}
 				}
-				if (strnicmp(part->name, "swap", 4) == 0)
+				if (strncasecmp(part->name, "swap", 4) == 0)
 					goodness--;
 			}
 
+6 -5
block/scsi_ioctl.c
···
 
 	ret = -ENOMEM;
 	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
-	if (!rq)
-		goto out;
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 	blk_rq_set_block_pc(rq);
 
 	if (hdr->cmd_len > BLK_MAX_CDB) {
···
 	kfree(rq->cmd);
 out_put_request:
 	blk_put_request(rq);
-out:
 	return ret;
 }
 
···
 	}
 
 	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
-	if (!rq) {
-		err = -ENOMEM;
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
 		goto error;
 	}
 	blk_rq_set_block_pc(rq);
···
 	int err;
 
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 	blk_rq_set_block_pc(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 	rq->cmd[0] = cmd;
+197
block/t10-pi.c
··· 1 + /* 2 + * t10_pi.c - Functions for generating and verifying T10 Protection 3 + * Information. 4 + * 5 + * Copyright (C) 2007, 2008, 2014 Oracle Corporation 6 + * Written by: Martin K. Petersen <martin.petersen@oracle.com> 7 + * 8 + * This program is free software; you can redistribute it and/or 9 + * modify it under the terms of the GNU General Public License version 10 + * 2 as published by the Free Software Foundation. 11 + * 12 + * This program is distributed in the hope that it will be useful, but 13 + * WITHOUT ANY WARRANTY; without even the implied warranty of 14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 + * General Public License for more details. 16 + * 17 + * You should have received a copy of the GNU General Public License 18 + * along with this program; see the file COPYING. If not, write to 19 + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, 20 + * USA. 21 + * 22 + */ 23 + 24 + #include <linux/t10-pi.h> 25 + #include <linux/blkdev.h> 26 + #include <linux/crc-t10dif.h> 27 + #include <net/checksum.h> 28 + 29 + typedef __be16 (csum_fn) (void *, unsigned int); 30 + 31 + static const __be16 APP_ESCAPE = (__force __be16) 0xffff; 32 + static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff; 33 + 34 + static __be16 t10_pi_crc_fn(void *data, unsigned int len) 35 + { 36 + return cpu_to_be16(crc_t10dif(data, len)); 37 + } 38 + 39 + static __be16 t10_pi_ip_fn(void *data, unsigned int len) 40 + { 41 + return (__force __be16)ip_compute_csum(data, len); 42 + } 43 + 44 + /* 45 + * Type 1 and Type 2 protection use the same format: 16 bit guard tag, 46 + * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref 47 + * tag. 
48 + */ 49 + static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, 50 + unsigned int type) 51 + { 52 + unsigned int i; 53 + 54 + for (i = 0 ; i < iter->data_size ; i += iter->interval) { 55 + struct t10_pi_tuple *pi = iter->prot_buf; 56 + 57 + pi->guard_tag = fn(iter->data_buf, iter->interval); 58 + pi->app_tag = 0; 59 + 60 + if (type == 1) 61 + pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); 62 + else 63 + pi->ref_tag = 0; 64 + 65 + iter->data_buf += iter->interval; 66 + iter->prot_buf += sizeof(struct t10_pi_tuple); 67 + iter->seed++; 68 + } 69 + 70 + return 0; 71 + } 72 + 73 + static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, 74 + unsigned int type) 75 + { 76 + unsigned int i; 77 + 78 + for (i = 0 ; i < iter->data_size ; i += iter->interval) { 79 + struct t10_pi_tuple *pi = iter->prot_buf; 80 + __be16 csum; 81 + 82 + switch (type) { 83 + case 1: 84 + case 2: 85 + if (pi->app_tag == APP_ESCAPE) 86 + goto next; 87 + 88 + if (be32_to_cpu(pi->ref_tag) != 89 + lower_32_bits(iter->seed)) { 90 + pr_err("%s: ref tag error at location %llu " \ 91 + "(rcvd %u)\n", iter->disk_name, 92 + (unsigned long long) 93 + iter->seed, be32_to_cpu(pi->ref_tag)); 94 + return -EILSEQ; 95 + } 96 + break; 97 + case 3: 98 + if (pi->app_tag == APP_ESCAPE && 99 + pi->ref_tag == REF_ESCAPE) 100 + goto next; 101 + break; 102 + } 103 + 104 + csum = fn(iter->data_buf, iter->interval); 105 + 106 + if (pi->guard_tag != csum) { 107 + pr_err("%s: guard tag error at sector %llu " \ 108 + "(rcvd %04x, want %04x)\n", iter->disk_name, 109 + (unsigned long long)iter->seed, 110 + be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); 111 + return -EILSEQ; 112 + } 113 + 114 + next: 115 + iter->data_buf += iter->interval; 116 + iter->prot_buf += sizeof(struct t10_pi_tuple); 117 + iter->seed++; 118 + } 119 + 120 + return 0; 121 + } 122 + 123 + static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) 124 + { 125 + return t10_pi_generate(iter, t10_pi_crc_fn, 
1); 126 + } 127 + 128 + static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) 129 + { 130 + return t10_pi_generate(iter, t10_pi_ip_fn, 1); 131 + } 132 + 133 + static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) 134 + { 135 + return t10_pi_verify(iter, t10_pi_crc_fn, 1); 136 + } 137 + 138 + static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) 139 + { 140 + return t10_pi_verify(iter, t10_pi_ip_fn, 1); 141 + } 142 + 143 + static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) 144 + { 145 + return t10_pi_generate(iter, t10_pi_crc_fn, 3); 146 + } 147 + 148 + static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) 149 + { 150 + return t10_pi_generate(iter, t10_pi_ip_fn, 3); 151 + } 152 + 153 + static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) 154 + { 155 + return t10_pi_verify(iter, t10_pi_crc_fn, 3); 156 + } 157 + 158 + static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) 159 + { 160 + return t10_pi_verify(iter, t10_pi_ip_fn, 3); 161 + } 162 + 163 + struct blk_integrity t10_pi_type1_crc = { 164 + .name = "T10-DIF-TYPE1-CRC", 165 + .generate_fn = t10_pi_type1_generate_crc, 166 + .verify_fn = t10_pi_type1_verify_crc, 167 + .tuple_size = sizeof(struct t10_pi_tuple), 168 + .tag_size = 0, 169 + }; 170 + EXPORT_SYMBOL(t10_pi_type1_crc); 171 + 172 + struct blk_integrity t10_pi_type1_ip = { 173 + .name = "T10-DIF-TYPE1-IP", 174 + .generate_fn = t10_pi_type1_generate_ip, 175 + .verify_fn = t10_pi_type1_verify_ip, 176 + .tuple_size = sizeof(struct t10_pi_tuple), 177 + .tag_size = 0, 178 + }; 179 + EXPORT_SYMBOL(t10_pi_type1_ip); 180 + 181 + struct blk_integrity t10_pi_type3_crc = { 182 + .name = "T10-DIF-TYPE3-CRC", 183 + .generate_fn = t10_pi_type3_generate_crc, 184 + .verify_fn = t10_pi_type3_verify_crc, 185 + .tuple_size = sizeof(struct t10_pi_tuple), 186 + .tag_size = 0, 187 + }; 188 + EXPORT_SYMBOL(t10_pi_type3_crc); 189 + 190 + struct blk_integrity t10_pi_type3_ip = { 
191 + .name = "T10-DIF-TYPE3-IP", 192 + .generate_fn = t10_pi_type3_generate_ip, 193 + .verify_fn = t10_pi_type3_verify_ip, 194 + .tuple_size = sizeof(struct t10_pi_tuple), 195 + .tag_size = 0, 196 + }; 197 + EXPORT_SYMBOL(t10_pi_type3_ip);
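The new t10-pi.c above computes one guard tag per protection interval, using either a CRC16 or an IP-style ones'-complement checksum, and verification simply recomputes and compares. A simplified userspace round trip of the Type 3 generate/verify logic (no app/ref tag handling; the checksum is re-implemented here in IP style, so this is a sketch of the scheme, not the kernel code):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* 16-bit ones'-complement sum, folding carries back in (IP-style). */
static uint16_t ip_style_csum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)data[i] << 8) | data[i + 1];
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Generate: store one guard tag per interval of the data buffer. */
static void pi_generate(const uint8_t *data, size_t len, size_t interval,
			uint16_t *guards)
{
	size_t i, n = 0;

	for (i = 0; i < len; i += interval)
		guards[n++] = ip_style_csum(data + i, interval);
}

/* Verify: recompute and compare; return -1 on the first mismatch. */
static int pi_verify(const uint8_t *data, size_t len, size_t interval,
		     const uint16_t *guards)
{
	size_t i, n = 0;

	for (i = 0; i < len; i += interval)
		if (ip_style_csum(data + i, interval) != guards[n++])
			return -1;
	return 0;
}
```

Flipping a single data byte changes that interval's checksum, so corruption is caught at verify time, which is exactly the contract `t10_pi_verify()` enforces with `-EILSEQ`.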
+6 -3
drivers/block/mtip32xx/mtip32xx.c
···
 	if (unlikely(cmd->unaligned))
 		up(&port->cmd_slot_unal);
 
-	blk_mq_end_io(rq, status ? -EIO : 0);
+	blk_mq_end_request(rq, status ? -EIO : 0);
 }
 
 /*
···
 	int err;
 
 	err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
-	blk_mq_end_io(rq, err);
+	blk_mq_end_request(rq, err);
 	return 0;
 }
 
···
 	return false;
 }
 
-static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			 bool last)
 {
 	int ret;
 
 	if (unlikely(mtip_check_unal_depth(hctx, rq)))
 		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	blk_mq_start_request(rq);
 
 	ret = mtip_submit_request(hctx, rq);
 	if (likely(!ret))
+5 -2
drivers/block/null_blk.c
···
 {
 	switch (queue_mode)  {
 	case NULL_Q_MQ:
-		blk_mq_end_io(cmd->rq, 0);
+		blk_mq_end_request(cmd->rq, 0);
 		return;
 	case NULL_Q_RQ:
 		INIT_LIST_HEAD(&cmd->rq->queuelist);
···
 	}
 }
 
-static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			 bool last)
 {
 	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
 	cmd->rq = rq;
 	cmd->nq = hctx->driver_data;
+
+	blk_mq_start_request(rq);
 
 	null_handle_cmd(cmd);
 	return BLK_MQ_RQ_QUEUE_OK;
+2
drivers/block/paride/pd.c
···
 	int err = 0;
 
 	rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->special = func;
+2
drivers/block/pktcdvd.c
···
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
 			     WRITE : READ, __GFP_WAIT);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 	blk_rq_set_block_pc(rq);
 
 	if (cgc->buflen) {
+1 -1
drivers/block/sx8.c
···
 		return NULL;
 
 	rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL);
-	if (!rq) {
+	if (IS_ERR(rq)) {
 		spin_lock_irqsave(&host->lock, flags);
 		carm_put_request(host, crq);
 		spin_unlock_irqrestore(&host->lock, flags);
+5 -3
drivers/block/virtio_blk.c
···
 		req->errors = (error != 0);
 	}
 
-	blk_mq_end_io(req, error);
+	blk_mq_end_request(req, error);
 }
 
 static void virtblk_done(struct virtqueue *vq)
···
 	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
-static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+			   bool last)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
 	int qid = hctx->queue_num;
-	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
 	bool notify = false;
 
···
 			BUG();
 		}
 	}
+
+	blk_mq_start_request(req);
 
 	num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
 	if (num) {
+2 -2
drivers/cdrom/cdrom.c
···
 		len = nr * CD_FRAMESIZE_RAW;
 
 		rq = blk_get_request(q, READ, GFP_KERNEL);
-		if (!rq) {
-			ret = -ENOMEM;
+		if (IS_ERR(rq)) {
+			ret = PTR_ERR(rq);
 			break;
 		}
 		blk_rq_set_block_pc(rq);
+1 -1
drivers/ide/ide-park.c
···
 	 * timeout has expired, so power management will be reenabled.
 	 */
 	rq = blk_get_request(q, READ, GFP_NOWAIT);
-	if (unlikely(!rq))
+	if (IS_ERR(rq))
 		goto out;
 
 	rq->cmd[0] = REQ_UNPARK_HEADS;
-1
drivers/scsi/Kconfig
···
 config BLK_DEV_SD
 	tristate "SCSI disk support"
 	depends on SCSI
-	select CRC_T10DIF if BLK_DEV_INTEGRITY
 	---help---
 	  If you want to use SCSI hard disks, Fibre Channel disks,
 	  Serial ATA (SATA) or Parallel ATA (PATA) hard disks,
+1 -1
drivers/scsi/device_handler/scsi_dh_alua.c
···
 
 	rq = blk_get_request(q, rw, GFP_NOIO);
 
-	if (!rq) {
+	if (IS_ERR(rq)) {
 		sdev_printk(KERN_INFO, sdev,
 			    "%s: blk_get_request failed\n", __func__);
 		return NULL;
+1 -1
drivers/scsi/device_handler/scsi_dh_emc.c
···
 
 	rq = blk_get_request(sdev->request_queue,
 			(cmd != INQUIRY) ? WRITE : READ, GFP_NOIO);
-	if (!rq) {
+	if (IS_ERR(rq)) {
 		sdev_printk(KERN_INFO, sdev, "get_req: blk_get_request failed");
 		return NULL;
 	}
+2 -2
drivers/scsi/device_handler/scsi_dh_hp_sw.c
···
 
 retry:
 	req = blk_get_request(sdev->request_queue, WRITE, GFP_NOIO);
-	if (!req)
+	if (IS_ERR(req))
 		return SCSI_DH_RES_TEMP_UNAVAIL;
 
 	blk_rq_set_block_pc(req);
···
 	struct request *req;
 
 	req = blk_get_request(h->sdev->request_queue, WRITE, GFP_ATOMIC);
-	if (!req)
+	if (IS_ERR(req))
 		return SCSI_DH_RES_TEMP_UNAVAIL;
 
 	blk_rq_set_block_pc(req);
+1 -1
drivers/scsi/device_handler/scsi_dh_rdac.c
···
 
 	rq = blk_get_request(q, rw, GFP_NOIO);
 
-	if (!rq) {
+	if (IS_ERR(rq)) {
 		sdev_printk(KERN_INFO, sdev,
 				"get_rdac_req: blk_get_request failed.\n");
 		return NULL;
+2 -2
drivers/scsi/osd/osd_initiator.c
···
 	struct request *req;
 
 	req = blk_get_request(q, has_write ? WRITE : READ, flags);
-	if (unlikely(!req))
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(req))
+		return req;
 
 	blk_rq_set_block_pc(req);
 	return req;
+1 -1
drivers/scsi/osst.c
···
 	int write = (data_direction == DMA_TO_DEVICE);
 
 	req = blk_get_request(SRpnt->stp->device->request_queue, write, GFP_KERNEL);
-	if (!req)
+	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 
 	blk_rq_set_block_pc(req);
+2
drivers/scsi/scsi_error.c
···
 	 * request becomes available
 	 */
 	req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
+	if (IS_ERR(req))
+		return;
 
 	blk_rq_set_block_pc(req);
 
+19 -5
drivers/scsi/scsi_lib.c
···
 	int ret = DRIVER_ERROR << 24;
 
 	req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);
-	if (!req)
+	if (IS_ERR(req))
 		return ret;
 	blk_rq_set_block_pc(req);
 
···
 
 	if (req->mq_ctx) {
 		/*
-		 * In the MQ case the command gets freed by __blk_mq_end_io,
+		 * In the MQ case the command gets freed by __blk_mq_end_request,
 		 * so we have to do all cleanup that depends on it earlier.
 		 *
 		 * We also can't kick the queues from irq context, so we
···
 		 */
 		scsi_mq_uninit_cmd(cmd);
 
-		__blk_mq_end_io(req, error);
+		__blk_mq_end_request(req, error);
 
 		if (scsi_target(sdev)->single_lun ||
 		    !list_empty(&sdev->host->starved_list))
···
 		next_rq->special = bidi_sdb;
 	}
 
+	blk_mq_start_request(req);
+
 	return scsi_setup_cmnd(sdev, req);
 }
 
···
 	blk_mq_complete_request(cmd->request);
 }
 
-static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+			 bool last)
 {
 	struct request_queue *q = req->q;
 	struct scsi_device *sdev = q->queuedata;
···
 	if (!scsi_host_queue_ready(q, shost, sdev))
 		goto out_dec_target_busy;
 
+
 	if (!(req->cmd_flags & REQ_DONTPREP)) {
 		ret = prep_to_mq(scsi_mq_prep_fn(req));
 		if (ret)
 			goto out_dec_host_busy;
 		req->cmd_flags |= REQ_DONTPREP;
+	} else {
+		blk_mq_start_request(req);
 	}
 
 	scsi_init_cmd_errh(cmd);
···
 		break;
 	}
 	return ret;
+}
+
+static enum blk_eh_timer_return scsi_timeout(struct request *req,
+		bool reserved)
+{
+	if (reserved)
+		return BLK_EH_RESET_TIMER;
+	return scsi_times_out(req);
 }
 
 static int scsi_init_request(void *data, struct request *rq,
···
 	.map_queue	= blk_mq_map_queue,
 	.queue_rq	= scsi_queue_rq,
 	.complete	= scsi_softirq_done,
-	.timeout	= scsi_times_out,
+	.timeout	= scsi_timeout,
 	.init_request	= scsi_init_request,
 	.exit_request	= scsi_exit_request,
 };
+42 -29
drivers/scsi/sd.c
··· 610 610 mutex_unlock(&sd_ref_mutex); 611 611 } 612 612 613 - static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif) 614 - { 615 - unsigned int prot_op = SCSI_PROT_NORMAL; 616 - unsigned int dix = scsi_prot_sg_count(scmd); 617 613 618 - if (scmd->sc_data_direction == DMA_FROM_DEVICE) { 619 - if (dif && dix) 620 - prot_op = SCSI_PROT_READ_PASS; 621 - else if (dif && !dix) 622 - prot_op = SCSI_PROT_READ_STRIP; 623 - else if (!dif && dix) 624 - prot_op = SCSI_PROT_READ_INSERT; 625 - } else { 626 - if (dif && dix) 627 - prot_op = SCSI_PROT_WRITE_PASS; 628 - else if (dif && !dix) 629 - prot_op = SCSI_PROT_WRITE_INSERT; 630 - else if (!dif && dix) 631 - prot_op = SCSI_PROT_WRITE_STRIP; 614 + 615 + static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, 616 + unsigned int dix, unsigned int dif) 617 + { 618 + struct bio *bio = scmd->request->bio; 619 + unsigned int prot_op = sd_prot_op(rq_data_dir(scmd->request), dix, dif); 620 + unsigned int protect = 0; 621 + 622 + if (dix) { /* DIX Type 0, 1, 2, 3 */ 623 + if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) 624 + scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; 625 + 626 + if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) 627 + scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; 628 + } 629 + 630 + if (dif != SD_DIF_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ 631 + scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; 632 + 633 + if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) 634 + scmd->prot_flags |= SCSI_PROT_REF_CHECK; 635 + } 636 + 637 + if (dif) { /* DIX/DIF Type 1, 2, 3 */ 638 + scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; 639 + 640 + if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) 641 + protect = 3 << 5; /* Disable target PI checking */ 642 + else 643 + protect = 1 << 5; /* Enable target PI checking */ 632 644 } 633 645 634 646 scsi_set_prot_op(scmd, prot_op); 635 647 scsi_set_prot_type(scmd, dif); 648 + scmd->prot_flags &= sd_prot_flag_mask(prot_op); 649 + 650 + return protect; 636 651 } 637 
652 638 653 static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) ··· 908 893 sector_t block = blk_rq_pos(rq); 909 894 sector_t threshold; 910 895 unsigned int this_count = blk_rq_sectors(rq); 911 - int ret, host_dif; 896 + unsigned int dif, dix; 897 + int ret; 912 898 unsigned char protect; 913 899 914 900 ret = scsi_init_io(SCpnt, GFP_ATOMIC); ··· 1011 995 SCpnt->cmnd[0] = WRITE_6; 1012 996 1013 997 if (blk_integrity_rq(rq)) 1014 - sd_dif_prepare(rq, block, sdp->sector_size); 998 + sd_dif_prepare(SCpnt); 1015 999 1016 1000 } else if (rq_data_dir(rq) == READ) { 1017 1001 SCpnt->cmnd[0] = READ_6; ··· 1026 1010 "writing" : "reading", this_count, 1027 1011 blk_rq_sectors(rq))); 1028 1012 1029 - /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */ 1030 - host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); 1031 - if (host_dif) 1032 - protect = 1 << 5; 1013 + dix = scsi_prot_sg_count(SCpnt); 1014 + dif = scsi_host_dif_capable(SCpnt->device->host, sdkp->protection_type); 1015 + 1016 + if (dif || dix) 1017 + protect = sd_setup_protect_cmnd(SCpnt, dix, dif); 1033 1018 else 1034 1019 protect = 0; 1035 1020 1036 - if (host_dif == SD_DIF_TYPE2_PROTECTION) { 1021 + if (protect && sdkp->protection_type == SD_DIF_TYPE2_PROTECTION) { 1037 1022 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); 1038 1023 1039 1024 if (unlikely(SCpnt->cmnd == NULL)) { ··· 1118 1101 SCpnt->cmnd[5] = 0; 1119 1102 } 1120 1103 SCpnt->sdb.length = this_count * sdp->sector_size; 1121 - 1122 - /* If DIF or DIX is enabled, tell HBA how to handle request */ 1123 - if (host_dif || scsi_prot_sg_count(SCpnt)) 1124 - sd_prot_op(SCpnt, host_dif); 1125 1104 1126 1105 /* 1127 1106 * We shouldn't disconnect in the middle of a sector, so with a dumb
+64 -2
drivers/scsi/sd.h
··· 167 167 }; 168 168 169 169 /* 170 + * Look up the DIX operation based on whether the command is read or 171 + * write and whether dix and dif are enabled. 172 + */ 173 + static inline unsigned int sd_prot_op(bool write, bool dix, bool dif) 174 + { 175 + /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ 176 + const unsigned int ops[] = { /* wrt dix dif */ 177 + SCSI_PROT_NORMAL, /* 0 0 0 */ 178 + SCSI_PROT_READ_STRIP, /* 0 0 1 */ 179 + SCSI_PROT_READ_INSERT, /* 0 1 0 */ 180 + SCSI_PROT_READ_PASS, /* 0 1 1 */ 181 + SCSI_PROT_NORMAL, /* 1 0 0 */ 182 + SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ 183 + SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ 184 + SCSI_PROT_WRITE_PASS, /* 1 1 1 */ 185 + }; 186 + 187 + return ops[write << 2 | dix << 1 | dif]; 188 + } 189 + 190 + /* 191 + * Returns a mask of the protection flags that are valid for a given DIX 192 + * operation. 193 + */ 194 + static inline unsigned int sd_prot_flag_mask(unsigned int prot_op) 195 + { 196 + const unsigned int flag_mask[] = { 197 + [SCSI_PROT_NORMAL] = 0, 198 + 199 + [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | 200 + SCSI_PROT_GUARD_CHECK | 201 + SCSI_PROT_REF_CHECK | 202 + SCSI_PROT_REF_INCREMENT, 203 + 204 + [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | 205 + SCSI_PROT_IP_CHECKSUM, 206 + 207 + [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | 208 + SCSI_PROT_GUARD_CHECK | 209 + SCSI_PROT_REF_CHECK | 210 + SCSI_PROT_REF_INCREMENT | 211 + SCSI_PROT_IP_CHECKSUM, 212 + 213 + [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | 214 + SCSI_PROT_REF_INCREMENT, 215 + 216 + [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | 217 + SCSI_PROT_REF_CHECK | 218 + SCSI_PROT_REF_INCREMENT | 219 + SCSI_PROT_IP_CHECKSUM, 220 + 221 + [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | 222 + SCSI_PROT_GUARD_CHECK | 223 + SCSI_PROT_REF_CHECK | 224 + SCSI_PROT_REF_INCREMENT | 225 + SCSI_PROT_IP_CHECKSUM, 226 + }; 227 + 228 + return flag_mask[prot_op]; 229 + } 230 + 231 + /* 170 232 * Data Integrity Field tuple. 
171 233 */ 172 234 struct sd_dif_tuple { ··· 240 178 #ifdef CONFIG_BLK_DEV_INTEGRITY 241 179 242 180 extern void sd_dif_config_host(struct scsi_disk *); 243 - extern void sd_dif_prepare(struct request *rq, sector_t, unsigned int); 181 + extern void sd_dif_prepare(struct scsi_cmnd *scmd); 244 182 extern void sd_dif_complete(struct scsi_cmnd *, unsigned int); 245 183 246 184 #else /* CONFIG_BLK_DEV_INTEGRITY */ ··· 248 186 static inline void sd_dif_config_host(struct scsi_disk *disk) 249 187 { 250 188 } 251 - static inline int sd_dif_prepare(struct request *rq, sector_t s, unsigned int a) 189 + static inline int sd_dif_prepare(struct scsi_cmnd *scmd) 252 190 { 253 191 return 0; 254 192 }
+48 -305
drivers/scsi/sd_dif.c
··· 21 21 */ 22 22 23 23 #include <linux/blkdev.h> 24 - #include <linux/crc-t10dif.h> 24 + #include <linux/t10-pi.h> 25 25 26 26 #include <scsi/scsi.h> 27 27 #include <scsi/scsi_cmnd.h> ··· 33 33 #include <scsi/scsi_ioctl.h> 34 34 #include <scsi/scsicam.h> 35 35 36 - #include <net/checksum.h> 37 - 38 36 #include "sd.h" 39 - 40 - typedef __u16 (csum_fn) (void *, unsigned int); 41 - 42 - static __u16 sd_dif_crc_fn(void *data, unsigned int len) 43 - { 44 - return cpu_to_be16(crc_t10dif(data, len)); 45 - } 46 - 47 - static __u16 sd_dif_ip_fn(void *data, unsigned int len) 48 - { 49 - return ip_compute_csum(data, len); 50 - } 51 - 52 - /* 53 - * Type 1 and Type 2 protection use the same format: 16 bit guard tag, 54 - * 16 bit app tag, 32 bit reference tag. 55 - */ 56 - static void sd_dif_type1_generate(struct blk_integrity_exchg *bix, csum_fn *fn) 57 - { 58 - void *buf = bix->data_buf; 59 - struct sd_dif_tuple *sdt = bix->prot_buf; 60 - sector_t sector = bix->sector; 61 - unsigned int i; 62 - 63 - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { 64 - sdt->guard_tag = fn(buf, bix->sector_size); 65 - sdt->ref_tag = cpu_to_be32(sector & 0xffffffff); 66 - sdt->app_tag = 0; 67 - 68 - buf += bix->sector_size; 69 - sector++; 70 - } 71 - } 72 - 73 - static void sd_dif_type1_generate_crc(struct blk_integrity_exchg *bix) 74 - { 75 - sd_dif_type1_generate(bix, sd_dif_crc_fn); 76 - } 77 - 78 - static void sd_dif_type1_generate_ip(struct blk_integrity_exchg *bix) 79 - { 80 - sd_dif_type1_generate(bix, sd_dif_ip_fn); 81 - } 82 - 83 - static int sd_dif_type1_verify(struct blk_integrity_exchg *bix, csum_fn *fn) 84 - { 85 - void *buf = bix->data_buf; 86 - struct sd_dif_tuple *sdt = bix->prot_buf; 87 - sector_t sector = bix->sector; 88 - unsigned int i; 89 - __u16 csum; 90 - 91 - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { 92 - /* Unwritten sectors */ 93 - if (sdt->app_tag == 0xffff) 94 - return 0; 95 - 96 - if (be32_to_cpu(sdt->ref_tag) != (sector & 
0xffffffff)) { 97 - printk(KERN_ERR 98 - "%s: ref tag error on sector %lu (rcvd %u)\n", 99 - bix->disk_name, (unsigned long)sector, 100 - be32_to_cpu(sdt->ref_tag)); 101 - return -EIO; 102 - } 103 - 104 - csum = fn(buf, bix->sector_size); 105 - 106 - if (sdt->guard_tag != csum) { 107 - printk(KERN_ERR "%s: guard tag error on sector %lu " \ 108 - "(rcvd %04x, data %04x)\n", bix->disk_name, 109 - (unsigned long)sector, 110 - be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); 111 - return -EIO; 112 - } 113 - 114 - buf += bix->sector_size; 115 - sector++; 116 - } 117 - 118 - return 0; 119 - } 120 - 121 - static int sd_dif_type1_verify_crc(struct blk_integrity_exchg *bix) 122 - { 123 - return sd_dif_type1_verify(bix, sd_dif_crc_fn); 124 - } 125 - 126 - static int sd_dif_type1_verify_ip(struct blk_integrity_exchg *bix) 127 - { 128 - return sd_dif_type1_verify(bix, sd_dif_ip_fn); 129 - } 130 - 131 - /* 132 - * Functions for interleaving and deinterleaving application tags 133 - */ 134 - static void sd_dif_type1_set_tag(void *prot, void *tag_buf, unsigned int sectors) 135 - { 136 - struct sd_dif_tuple *sdt = prot; 137 - u8 *tag = tag_buf; 138 - unsigned int i, j; 139 - 140 - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { 141 - sdt->app_tag = tag[j] << 8 | tag[j+1]; 142 - BUG_ON(sdt->app_tag == 0xffff); 143 - } 144 - } 145 - 146 - static void sd_dif_type1_get_tag(void *prot, void *tag_buf, unsigned int sectors) 147 - { 148 - struct sd_dif_tuple *sdt = prot; 149 - u8 *tag = tag_buf; 150 - unsigned int i, j; 151 - 152 - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { 153 - tag[j] = (sdt->app_tag & 0xff00) >> 8; 154 - tag[j+1] = sdt->app_tag & 0xff; 155 - } 156 - } 157 - 158 - static struct blk_integrity dif_type1_integrity_crc = { 159 - .name = "T10-DIF-TYPE1-CRC", 160 - .generate_fn = sd_dif_type1_generate_crc, 161 - .verify_fn = sd_dif_type1_verify_crc, 162 - .get_tag_fn = sd_dif_type1_get_tag, 163 - .set_tag_fn = sd_dif_type1_set_tag, 164 - .tuple_size = 
sizeof(struct sd_dif_tuple), 165 - .tag_size = 0, 166 - }; 167 - 168 - static struct blk_integrity dif_type1_integrity_ip = { 169 - .name = "T10-DIF-TYPE1-IP", 170 - .generate_fn = sd_dif_type1_generate_ip, 171 - .verify_fn = sd_dif_type1_verify_ip, 172 - .get_tag_fn = sd_dif_type1_get_tag, 173 - .set_tag_fn = sd_dif_type1_set_tag, 174 - .tuple_size = sizeof(struct sd_dif_tuple), 175 - .tag_size = 0, 176 - }; 177 - 178 - 179 - /* 180 - * Type 3 protection has a 16-bit guard tag and 16 + 32 bits of opaque 181 - * tag space. 182 - */ 183 - static void sd_dif_type3_generate(struct blk_integrity_exchg *bix, csum_fn *fn) 184 - { 185 - void *buf = bix->data_buf; 186 - struct sd_dif_tuple *sdt = bix->prot_buf; 187 - unsigned int i; 188 - 189 - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { 190 - sdt->guard_tag = fn(buf, bix->sector_size); 191 - sdt->ref_tag = 0; 192 - sdt->app_tag = 0; 193 - 194 - buf += bix->sector_size; 195 - } 196 - } 197 - 198 - static void sd_dif_type3_generate_crc(struct blk_integrity_exchg *bix) 199 - { 200 - sd_dif_type3_generate(bix, sd_dif_crc_fn); 201 - } 202 - 203 - static void sd_dif_type3_generate_ip(struct blk_integrity_exchg *bix) 204 - { 205 - sd_dif_type3_generate(bix, sd_dif_ip_fn); 206 - } 207 - 208 - static int sd_dif_type3_verify(struct blk_integrity_exchg *bix, csum_fn *fn) 209 - { 210 - void *buf = bix->data_buf; 211 - struct sd_dif_tuple *sdt = bix->prot_buf; 212 - sector_t sector = bix->sector; 213 - unsigned int i; 214 - __u16 csum; 215 - 216 - for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) { 217 - /* Unwritten sectors */ 218 - if (sdt->app_tag == 0xffff && sdt->ref_tag == 0xffffffff) 219 - return 0; 220 - 221 - csum = fn(buf, bix->sector_size); 222 - 223 - if (sdt->guard_tag != csum) { 224 - printk(KERN_ERR "%s: guard tag error on sector %lu " \ 225 - "(rcvd %04x, data %04x)\n", bix->disk_name, 226 - (unsigned long)sector, 227 - be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum)); 228 - return 
-EIO; 229 - } 230 - 231 - buf += bix->sector_size; 232 - sector++; 233 - } 234 - 235 - return 0; 236 - } 237 - 238 - static int sd_dif_type3_verify_crc(struct blk_integrity_exchg *bix) 239 - { 240 - return sd_dif_type3_verify(bix, sd_dif_crc_fn); 241 - } 242 - 243 - static int sd_dif_type3_verify_ip(struct blk_integrity_exchg *bix) 244 - { 245 - return sd_dif_type3_verify(bix, sd_dif_ip_fn); 246 - } 247 - 248 - static void sd_dif_type3_set_tag(void *prot, void *tag_buf, unsigned int sectors) 249 - { 250 - struct sd_dif_tuple *sdt = prot; 251 - u8 *tag = tag_buf; 252 - unsigned int i, j; 253 - 254 - for (i = 0, j = 0 ; i < sectors ; i++, j += 6, sdt++) { 255 - sdt->app_tag = tag[j] << 8 | tag[j+1]; 256 - sdt->ref_tag = tag[j+2] << 24 | tag[j+3] << 16 | 257 - tag[j+4] << 8 | tag[j+5]; 258 - } 259 - } 260 - 261 - static void sd_dif_type3_get_tag(void *prot, void *tag_buf, unsigned int sectors) 262 - { 263 - struct sd_dif_tuple *sdt = prot; 264 - u8 *tag = tag_buf; 265 - unsigned int i, j; 266 - 267 - for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) { 268 - tag[j] = (sdt->app_tag & 0xff00) >> 8; 269 - tag[j+1] = sdt->app_tag & 0xff; 270 - tag[j+2] = (sdt->ref_tag & 0xff000000) >> 24; 271 - tag[j+3] = (sdt->ref_tag & 0xff0000) >> 16; 272 - tag[j+4] = (sdt->ref_tag & 0xff00) >> 8; 273 - tag[j+5] = sdt->ref_tag & 0xff; 274 - BUG_ON(sdt->app_tag == 0xffff || sdt->ref_tag == 0xffffffff); 275 - } 276 - } 277 - 278 - static struct blk_integrity dif_type3_integrity_crc = { 279 - .name = "T10-DIF-TYPE3-CRC", 280 - .generate_fn = sd_dif_type3_generate_crc, 281 - .verify_fn = sd_dif_type3_verify_crc, 282 - .get_tag_fn = sd_dif_type3_get_tag, 283 - .set_tag_fn = sd_dif_type3_set_tag, 284 - .tuple_size = sizeof(struct sd_dif_tuple), 285 - .tag_size = 0, 286 - }; 287 - 288 - static struct blk_integrity dif_type3_integrity_ip = { 289 - .name = "T10-DIF-TYPE3-IP", 290 - .generate_fn = sd_dif_type3_generate_ip, 291 - .verify_fn = sd_dif_type3_verify_ip, 292 - .get_tag_fn = 
sd_dif_type3_get_tag, 293 - .set_tag_fn = sd_dif_type3_set_tag, 294 - .tuple_size = sizeof(struct sd_dif_tuple), 295 - .tag_size = 0, 296 - }; 297 37 298 38 /* 299 39 * Configure exchange of protection information between OS and HBA. ··· 56 316 return; 57 317 58 318 /* Enable DMA of protection information */ 59 - if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) 319 + if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { 60 320 if (type == SD_DIF_TYPE3_PROTECTION) 61 - blk_integrity_register(disk, &dif_type3_integrity_ip); 321 + blk_integrity_register(disk, &t10_pi_type3_ip); 62 322 else 63 - blk_integrity_register(disk, &dif_type1_integrity_ip); 64 - else 323 + blk_integrity_register(disk, &t10_pi_type1_ip); 324 + 325 + disk->integrity->flags |= BLK_INTEGRITY_IP_CHECKSUM; 326 + } else 65 327 if (type == SD_DIF_TYPE3_PROTECTION) 66 - blk_integrity_register(disk, &dif_type3_integrity_crc); 328 + blk_integrity_register(disk, &t10_pi_type3_crc); 67 329 else 68 - blk_integrity_register(disk, &dif_type1_integrity_crc); 330 + blk_integrity_register(disk, &t10_pi_type1_crc); 69 331 70 332 sd_printk(KERN_NOTICE, sdkp, 71 333 "Enabling DIX %s protection\n", disk->integrity->name); 72 334 73 335 /* Signal to block layer that we support sector tagging */ 74 - if (dif && type && sdkp->ATO) { 336 + if (dif && type) { 337 + 338 + disk->integrity->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 339 + 340 + if (!sdkp) 341 + return; 342 + 75 343 if (type == SD_DIF_TYPE3_PROTECTION) 76 344 disk->integrity->tag_size = sizeof(u16) + sizeof(u32); 77 345 else ··· 106 358 * 107 359 * Type 3 does not have a reference tag so no remapping is required. 
108 360 */ 109 - void sd_dif_prepare(struct request *rq, sector_t hw_sector, 110 - unsigned int sector_sz) 361 + void sd_dif_prepare(struct scsi_cmnd *scmd) 111 362 { 112 - const int tuple_sz = sizeof(struct sd_dif_tuple); 363 + const int tuple_sz = sizeof(struct t10_pi_tuple); 113 364 struct bio *bio; 114 365 struct scsi_disk *sdkp; 115 - struct sd_dif_tuple *sdt; 366 + struct t10_pi_tuple *pi; 116 367 u32 phys, virt; 117 368 118 - sdkp = rq->bio->bi_bdev->bd_disk->private_data; 369 + sdkp = scsi_disk(scmd->request->rq_disk); 119 370 120 371 if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) 121 372 return; 122 373 123 - phys = hw_sector & 0xffffffff; 374 + phys = scsi_prot_ref_tag(scmd); 124 375 125 - __rq_for_each_bio(bio, rq) { 376 + __rq_for_each_bio(bio, scmd->request) { 377 + struct bio_integrity_payload *bip = bio_integrity(bio); 126 378 struct bio_vec iv; 127 379 struct bvec_iter iter; 128 380 unsigned int j; 129 381 130 382 /* Already remapped? */ 131 - if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) 383 + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) 132 384 break; 133 385 134 - virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; 386 + virt = bip_get_seed(bip) & 0xffffffff; 135 387 136 - bip_for_each_vec(iv, bio->bi_integrity, iter) { 137 - sdt = kmap_atomic(iv.bv_page) 138 - + iv.bv_offset; 388 + bip_for_each_vec(iv, bip, iter) { 389 + pi = kmap_atomic(iv.bv_page) + iv.bv_offset; 139 390 140 - for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { 391 + for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { 141 392 142 - if (be32_to_cpu(sdt->ref_tag) == virt) 143 - sdt->ref_tag = cpu_to_be32(phys); 393 + if (be32_to_cpu(pi->ref_tag) == virt) 394 + pi->ref_tag = cpu_to_be32(phys); 144 395 145 396 virt++; 146 397 phys++; 147 398 } 148 399 149 - kunmap_atomic(sdt); 400 + kunmap_atomic(pi); 150 401 } 151 402 152 - bio->bi_flags |= (1 << BIO_MAPPED_INTEGRITY); 403 + bip->bip_flags |= BIP_MAPPED_INTEGRITY; 153 404 } 154 405 } 155 406 ··· 158 411 */ 159 412 void 
sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) 160 413 { 161 - const int tuple_sz = sizeof(struct sd_dif_tuple); 414 + const int tuple_sz = sizeof(struct t10_pi_tuple); 162 415 struct scsi_disk *sdkp; 163 416 struct bio *bio; 164 - struct sd_dif_tuple *sdt; 165 - unsigned int j, sectors, sector_sz; 417 + struct t10_pi_tuple *pi; 418 + unsigned int j, intervals; 166 419 u32 phys, virt; 167 420 168 421 sdkp = scsi_disk(scmd->request->rq_disk); ··· 170 423 if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION || good_bytes == 0) 171 424 return; 172 425 173 - sector_sz = scmd->device->sector_size; 174 - sectors = good_bytes / sector_sz; 175 - 176 - phys = blk_rq_pos(scmd->request) & 0xffffffff; 177 - if (sector_sz == 4096) 178 - phys >>= 3; 426 + intervals = good_bytes / scsi_prot_interval(scmd); 427 + phys = scsi_prot_ref_tag(scmd); 179 428 180 429 __rq_for_each_bio(bio, scmd->request) { 430 + struct bio_integrity_payload *bip = bio_integrity(bio); 181 431 struct bio_vec iv; 182 432 struct bvec_iter iter; 183 433 184 - virt = bio->bi_integrity->bip_iter.bi_sector & 0xffffffff; 434 + virt = bip_get_seed(bip) & 0xffffffff; 185 435 186 - bip_for_each_vec(iv, bio->bi_integrity, iter) { 187 - sdt = kmap_atomic(iv.bv_page) 188 - + iv.bv_offset; 436 + bip_for_each_vec(iv, bip, iter) { 437 + pi = kmap_atomic(iv.bv_page) + iv.bv_offset; 189 438 190 - for (j = 0; j < iv.bv_len; j += tuple_sz, sdt++) { 439 + for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { 191 440 192 - if (sectors == 0) { 193 - kunmap_atomic(sdt); 441 + if (intervals == 0) { 442 + kunmap_atomic(pi); 194 443 return; 195 444 } 196 445 197 - if (be32_to_cpu(sdt->ref_tag) == phys) 198 - sdt->ref_tag = cpu_to_be32(virt); 446 + if (be32_to_cpu(pi->ref_tag) == phys) 447 + pi->ref_tag = cpu_to_be32(virt); 199 448 200 449 virt++; 201 450 phys++; 202 - sectors--; 451 + intervals--; 203 452 } 204 453 205 - kunmap_atomic(sdt); 454 + kunmap_atomic(pi); 206 455 } 207 456 } 208 457 }
+2 -2
drivers/scsi/sg.c
··· 1711 1711 } 1712 1712 1713 1713 rq = blk_get_request(q, rw, GFP_ATOMIC); 1714 - if (!rq) { 1714 + if (IS_ERR(rq)) { 1715 1715 kfree(long_cmdp); 1716 - return -ENOMEM; 1716 + return PTR_ERR(rq); 1717 1717 } 1718 1718 1719 1719 blk_rq_set_block_pc(rq);
+1 -1
drivers/scsi/st.c
··· 490 490 491 491 req = blk_get_request(SRpnt->stp->device->request_queue, write, 492 492 GFP_KERNEL); 493 - if (!req) 493 + if (IS_ERR(req)) 494 494 return DRIVER_ERROR << 24; 495 495 496 496 blk_rq_set_block_pc(req);
+1 -1
drivers/target/target_core_pscsi.c
··· 1050 1050 req = blk_get_request(pdv->pdv_sd->request_queue, 1051 1051 (data_direction == DMA_TO_DEVICE), 1052 1052 GFP_KERNEL); 1053 - if (!req) { 1053 + if (IS_ERR(req)) { 1054 1054 pr_err("PSCSI: blk_get_request() failed\n"); 1055 1055 ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; 1056 1056 goto fail;
+11 -23
fs/block_dev.c
··· 50 50 EXPORT_SYMBOL(I_BDEV); 51 51 52 52 /* 53 - * Move the inode from its current bdi to a new bdi. If the inode is dirty we 54 - * need to move it onto the dirty list of @dst so that the inode is always on 55 - * the right list. 53 + * Move the inode from its current bdi to a new bdi. Make sure the inode 54 + * is clean before moving so that it doesn't linger on the old bdi. 56 55 */ 57 56 static void bdev_inode_switch_bdi(struct inode *inode, 58 57 struct backing_dev_info *dst) 59 58 { 60 - struct backing_dev_info *old = inode->i_data.backing_dev_info; 61 - bool wakeup_bdi = false; 62 - 63 - if (unlikely(dst == old)) /* deadlock avoidance */ 64 - return; 65 - bdi_lock_two(&old->wb, &dst->wb); 66 - spin_lock(&inode->i_lock); 67 - inode->i_data.backing_dev_info = dst; 68 - if (inode->i_state & I_DIRTY) { 69 - if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) 70 - wakeup_bdi = true; 71 - list_move(&inode->i_wb_list, &dst->wb.b_dirty); 59 + while (true) { 60 + spin_lock(&inode->i_lock); 61 + if (!(inode->i_state & I_DIRTY)) { 62 + inode->i_data.backing_dev_info = dst; 63 + spin_unlock(&inode->i_lock); 64 + return; 65 + } 66 + spin_unlock(&inode->i_lock); 67 + WARN_ON_ONCE(write_inode_now(inode, true)); 72 68 } 73 - spin_unlock(&inode->i_lock); 74 - spin_unlock(&old->wb.list_lock); 75 - spin_unlock(&dst->wb.list_lock); 76 - 77 - if (wakeup_bdi) 78 - bdi_wakeup_thread_delayed(dst); 79 69 } 80 70 81 71 /* Kill _all_ buffers and pagecache , dirty or not.. */ ··· 1169 1179 if (!ret) { 1170 1180 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1171 1181 bdi = blk_get_backing_dev_info(bdev); 1172 - if (bdi == NULL) 1173 - bdi = &default_backing_dev_info; 1174 1182 bdev_inode_switch_bdi(bdev->bd_inode, bdi); 1175 1183 } 1176 1184
+1 -1
fs/btrfs/disk-io.c
··· 1702 1702 if (!device->bdev) 1703 1703 continue; 1704 1704 bdi = blk_get_backing_dev_info(device->bdev); 1705 - if (bdi && bdi_congested(bdi, bdi_bits)) { 1705 + if (bdi_congested(bdi, bdi_bits)) { 1706 1706 ret = 1; 1707 1707 break; 1708 1708 }
+5 -7
fs/nfs/direct.c
··· 220 220 #else 221 221 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 222 222 223 - if (rw == READ || rw == KERNEL_READ) 224 - return nfs_file_direct_read(iocb, iter, pos, 225 - rw == READ ? true : false); 226 - return nfs_file_direct_write(iocb, iter, pos, 227 - rw == WRITE ? true : false); 223 + if (rw == READ) 224 + return nfs_file_direct_read(iocb, iter, pos); 225 + return nfs_file_direct_write(iocb, iter, pos); 228 226 #endif /* CONFIG_NFS_SWAP */ 229 227 } 230 228 ··· 508 510 * cache. 509 511 */ 510 512 ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, 511 - loff_t pos, bool uio) 513 + loff_t pos) 512 514 { 513 515 struct file *file = iocb->ki_filp; 514 516 struct address_space *mapping = file->f_mapping; ··· 877 879 * is no atomic O_APPEND write facility in the NFS protocol. 878 880 */ 879 881 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, 880 - loff_t pos, bool uio) 882 + loff_t pos) 881 883 { 882 884 ssize_t result = -EINVAL; 883 885 struct file *file = iocb->ki_filp;
+2 -2
fs/nfs/file.c
··· 172 172 ssize_t result; 173 173 174 174 if (iocb->ki_filp->f_flags & O_DIRECT) 175 - return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); 175 + return nfs_file_direct_read(iocb, to, iocb->ki_pos); 176 176 177 177 dprintk("NFS: read(%pD2, %zu@%lu)\n", 178 178 iocb->ki_filp, ··· 676 676 return result; 677 677 678 678 if (file->f_flags & O_DIRECT) 679 - return nfs_file_direct_write(iocb, from, pos, true); 679 + return nfs_file_direct_write(iocb, from, pos); 680 680 681 681 dprintk("NFS: write(%pD2, %zu@%Ld)\n", 682 682 file, count, (long long) pos);
-2
fs/xfs/xfs_buf.c
··· 1670 1670 btp->bt_dev = bdev->bd_dev; 1671 1671 btp->bt_bdev = bdev; 1672 1672 btp->bt_bdi = blk_get_backing_dev_info(bdev); 1673 - if (!btp->bt_bdi) 1674 - goto error; 1675 1673 1676 1674 if (xfs_setsize_buftarg_early(btp, bdev)) 1677 1675 goto error;
-4
include/linux/backing-dev.h
··· 28 28 * Bits in backing_dev_info.state 29 29 */ 30 30 enum bdi_state { 31 - BDI_wb_alloc, /* Default embedded wb allocated */ 32 31 BDI_async_congested, /* The async (write) queue is getting full */ 33 32 BDI_sync_congested, /* The sync queue is getting full */ 34 33 BDI_registered, /* bdi_register() was done */ 35 34 BDI_writeback_running, /* Writeback is in progress */ 36 - BDI_unused, /* Available bits start here */ 37 35 }; 38 36 39 37 typedef int (congested_fn)(void *, int); ··· 48 50 49 51 struct bdi_writeback { 50 52 struct backing_dev_info *bdi; /* our parent bdi */ 51 - unsigned int nr; 52 53 53 54 unsigned long last_old_flush; /* last old data flush */ 54 55 ··· 121 124 void bdi_writeback_workfn(struct work_struct *work); 122 125 int bdi_has_dirty_io(struct backing_dev_info *bdi); 123 126 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 124 - void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); 125 127 126 128 extern spinlock_t bdi_lock; 127 129 extern struct list_head bdi_list;
+51 -14
include/linux/bio.h
··· 292 292 */ 293 293 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) 294 294 295 + enum bip_flags { 296 + BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ 297 + BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ 298 + BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ 299 + BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ 300 + BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ 301 + }; 302 + 295 303 #if defined(CONFIG_BLK_DEV_INTEGRITY) 304 + 305 + static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) 306 + { 307 + if (bio->bi_rw & REQ_INTEGRITY) 308 + return bio->bi_integrity; 309 + 310 + return NULL; 311 + } 312 + 296 313 /* 297 314 * bio integrity payload 298 315 */ ··· 318 301 319 302 struct bvec_iter bip_iter; 320 303 321 - /* kill - should just use bip_vec */ 322 - void *bip_buf; /* generated integrity data */ 323 - 324 304 bio_end_io_t *bip_end_io; /* saved I/O completion fn */ 325 305 326 306 unsigned short bip_slab; /* slab the bip came from */ 327 307 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 328 308 unsigned short bip_max_vcnt; /* integrity bio_vec slots */ 329 - unsigned bip_owns_buf:1; /* should free bip_buf */ 309 + unsigned short bip_flags; /* control flags */ 330 310 331 311 struct work_struct bip_work; /* I/O completion */ 332 312 333 313 struct bio_vec *bip_vec; 334 314 struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ 335 315 }; 316 + 317 + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) 318 + { 319 + struct bio_integrity_payload *bip = bio_integrity(bio); 320 + 321 + if (bip) 322 + return bip->bip_flags & flag; 323 + 324 + return false; 325 + } 326 + 327 + static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) 328 + { 329 + return bip->bip_iter.bi_sector; 330 + } 331 + 332 + static inline void bip_set_seed(struct bio_integrity_payload *bip, 333 + sector_t seed) 334 + { 335 + bip->bip_iter.bi_sector = 
seed; 336 + } 337 + 336 338 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 337 339 338 340 extern void bio_trim(struct bio *bio, int offset, int size); ··· 378 342 } 379 343 380 344 extern struct bio_set *bioset_create(unsigned int, unsigned int); 345 + extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); 381 346 extern void bioset_free(struct bio_set *); 382 347 extern mempool_t *biovec_create_pool(int pool_entries); 383 348 ··· 390 353 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); 391 354 392 355 extern struct bio_set *fs_bio_set; 393 - unsigned int bio_integrity_tag_size(struct bio *bio); 394 356 395 357 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 396 358 { ··· 697 661 for_each_bio(_bio) \ 698 662 bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) 699 663 700 - #define bio_integrity(bio) (bio->bi_integrity != NULL) 701 - 702 664 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); 703 665 extern void bio_integrity_free(struct bio *); 704 666 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); 705 - extern int bio_integrity_enabled(struct bio *bio); 706 - extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); 707 - extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); 667 + extern bool bio_integrity_enabled(struct bio *bio); 708 668 extern int bio_integrity_prep(struct bio *); 709 669 extern void bio_integrity_endio(struct bio *, int); 710 670 extern void bio_integrity_advance(struct bio *, unsigned int); ··· 712 680 713 681 #else /* CONFIG_BLK_DEV_INTEGRITY */ 714 682 715 - static inline int bio_integrity(struct bio *bio) 683 + static inline void *bio_integrity(struct bio *bio) 716 684 { 717 - return 0; 685 + return NULL; 718 686 } 719 687 720 - static inline int bio_integrity_enabled(struct bio *bio) 688 + static inline bool bio_integrity_enabled(struct bio *bio) 721 689 { 722 - 
return 0; 690 + return false; 723 691 } 724 692 725 693 static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) ··· 763 731 static inline void bio_integrity_init(void) 764 732 { 765 733 return; 734 + } 735 + 736 + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) 737 + { 738 + return false; 766 739 } 767 740 768 741 #endif /* CONFIG_BLK_DEV_INTEGRITY */
include/linux/blk-mq.h  (+17 -5)

 #include <linux/blkdev.h>
 
 struct blk_mq_tags;
+struct blk_flush_queue;
 
 struct blk_mq_cpu_notifier {
 	struct list_head list;
···
 
 	struct request_queue	*queue;
 	unsigned int		queue_num;
+	struct blk_flush_queue	*fq;
 
 	void			*driver_data;
···
 	struct list_head	tag_list;
 };
 
-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
+typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
+typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_request_fn)(void *, struct request *, unsigned int,
 		unsigned int, unsigned int);
 typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 		unsigned int);
+
+typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+		bool);
 
 struct blk_mq_ops {
 	/*
···
 	/*
 	 * Called on request timeout
 	 */
-	rq_timed_out_fn		*timeout;
+	timeout_fn		*timeout;
 
 	softirq_done_fn		*complete;
···
 	/*
 	 * Called for every command allocated by the block layer to allow
 	 * the driver to set up driver specific data.
+	 *
+	 * Tag greater than or equal to queue_depth is for setting up
+	 * flush request.
+	 *
 	 * Ditto for exit/teardown.
 	 */
 	init_request_fn		*init_request;
···
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
-void blk_mq_end_io(struct request *rq, int error);
-void __blk_mq_end_io(struct request *rq, int error);
+void blk_mq_start_request(struct request *rq);
+void blk_mq_end_request(struct request *rq, int error);
+void __blk_mq_end_request(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
···
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+		void *priv);
 
 /*
  * Driver command data is immediately after the request. So subtract request
include/linux/blk_types.h  (+8 -10)

 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
 #endif
+	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-	struct bio_integrity_payload *bi_integrity;  /* data integrity */
+		struct bio_integrity_payload *bi_integrity; /* data integrity */
 #endif
+	};
 
 	unsigned short		bi_vcnt;	/* how many bio_vec's */
···
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
 #define BIO_NULL_MAPPED 8	/* contains invalid user pages */
-#define BIO_FS_INTEGRITY 9	/* fs owns integrity data, not block layer */
-#define BIO_QUIET	10	/* Make BIO Quiet */
-#define BIO_MAPPED_INTEGRITY 11	/* integrity metadata has been remapped */
-#define BIO_SNAP_STABLE	12	/* bio data must be snapshotted during write */
+#define BIO_QUIET	9	/* Make BIO Quiet */
+#define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
···
 	__REQ_WRITE_SAME,	/* write same block many times */
 
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
+	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
 	__REQ_FLUSH,		/* request for cache flush */
···
 	__REQ_FLUSH_SEQ,	/* request for flush sequence */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
-	__REQ_KERNEL,		/* direct IO to kernel pages */
 	__REQ_PM,		/* runtime pm request */
-	__REQ_END,		/* last of chain of requests */
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
···
 #define REQ_DISCARD		(1ULL << __REQ_DISCARD)
 #define REQ_WRITE_SAME		(1ULL << __REQ_WRITE_SAME)
 #define REQ_NOIDLE		(1ULL << __REQ_NOIDLE)
+#define REQ_INTEGRITY		(1ULL << __REQ_INTEGRITY)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
 	 REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
-	 REQ_SECURE)
+	 REQ_SECURE | REQ_INTEGRITY)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 #define BIO_NO_ADVANCE_ITER_MASK	(REQ_DISCARD|REQ_WRITE_SAME)
···
 #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
 #define REQ_SECURE		(1ULL << __REQ_SECURE)
-#define REQ_KERNEL		(1ULL << __REQ_KERNEL)
 #define REQ_PM			(1ULL << __REQ_PM)
-#define REQ_END			(1ULL << __REQ_END)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
include/linux/blkdev.h  (+30 -41)

 struct sg_io_hdr;
 struct bsg_job;
 struct blkcg_gq;
+struct blk_flush_queue;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
···
 	 */
 	unsigned int		flush_flags;
 	unsigned int		flush_not_queueable:1;
-	unsigned int		flush_queue_delayed:1;
-	unsigned int		flush_pending_idx:1;
-	unsigned int		flush_running_idx:1;
-	unsigned long		flush_pending_since;
-	struct list_head	flush_queue[2];
-	struct list_head	flush_data_in_flight;
-	struct request		*flush_rq;
-	spinlock_t		mq_flush_lock;
+	struct blk_flush_queue	*fq;
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
···
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-	return bdev->bd_disk->queue;
+	return bdev->bd_disk->queue;	/* this is never NULL */
 }
 
 /*
···
 static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
 {
 	unsigned int granularity = max(lim->physical_block_size, lim->io_min);
-	unsigned int alignment = (sector << 9) & (granularity - 1);
+	unsigned int alignment = sector_div(sector, granularity >> 9) << 9;
 
-	return (granularity + lim->alignment_offset - alignment)
-		& (granularity - 1);
+	return (granularity + lim->alignment_offset - alignment) % granularity;
 }
 
 static inline int bdev_alignment_offset(struct block_device *bdev)
···
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
-#define INTEGRITY_FLAG_READ	2	/* verify data integrity on read */
-#define INTEGRITY_FLAG_WRITE	4	/* generate data integrity on write */
+enum blk_integrity_flags {
+	BLK_INTEGRITY_VERIFY		= 1 << 0,
+	BLK_INTEGRITY_GENERATE		= 1 << 1,
+	BLK_INTEGRITY_DEVICE_CAPABLE	= 1 << 2,
+	BLK_INTEGRITY_IP_CHECKSUM	= 1 << 3,
+};
 
-struct blk_integrity_exchg {
+struct blk_integrity_iter {
 	void			*prot_buf;
 	void			*data_buf;
-	sector_t		sector;
+	sector_t		seed;
 	unsigned int		data_size;
-	unsigned short		sector_size;
+	unsigned short		interval;
 	const char		*disk_name;
 };
 
-typedef void (integrity_gen_fn) (struct blk_integrity_exchg *);
-typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *);
-typedef void (integrity_set_tag_fn) (void *, void *, unsigned int);
-typedef void (integrity_get_tag_fn) (void *, void *, unsigned int);
+typedef int (integrity_processing_fn) (struct blk_integrity_iter *);
 
 struct blk_integrity {
-	integrity_gen_fn	*generate_fn;
-	integrity_vrfy_fn	*verify_fn;
-	integrity_set_tag_fn	*set_tag_fn;
-	integrity_get_tag_fn	*get_tag_fn;
+	integrity_processing_fn	*generate_fn;
+	integrity_processing_fn	*verify_fn;
 
 	unsigned short		flags;
 	unsigned short		tuple_size;
-	unsigned short		sector_size;
+	unsigned short		interval;
 	unsigned short		tag_size;
 
 	const char		*name;
···
 extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 				   struct scatterlist *);
 extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
-				  struct request *);
-extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
-				   struct bio *);
+extern bool blk_integrity_merge_rq(struct request_queue *, struct request *,
+				   struct request *);
+extern bool blk_integrity_merge_bio(struct request_queue *, struct request *,
+				    struct bio *);
 
 static inline
 struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
···
 	return disk->integrity;
 }
 
-static inline int blk_integrity_rq(struct request *rq)
+static inline bool blk_integrity_rq(struct request *rq)
 {
-	if (rq->bio == NULL)
-		return 0;
-
-	return bio_integrity(rq->bio);
+	return rq->cmd_flags & REQ_INTEGRITY;
 }
 
 static inline void blk_queue_max_integrity_segments(struct request_queue *q,
···
 {
 	return 0;
 }
-static inline int blk_integrity_merge_rq(struct request_queue *rq,
-					 struct request *r1,
-					 struct request *r2)
+static inline bool blk_integrity_merge_rq(struct request_queue *rq,
+					  struct request *r1,
+					  struct request *r2)
 {
 	return 0;
 }
-static inline int blk_integrity_merge_bio(struct request_queue *rq,
-					  struct request *r,
-					  struct bio *b)
+static inline bool blk_integrity_merge_bio(struct request_queue *rq,
+					   struct request *r,
+					   struct bio *b)
 {
 	return 0;
 }
include/linux/crc-t10dif.h  (+3 -2)

 #define CRC_T10DIF_DIGEST_SIZE 2
 #define CRC_T10DIF_BLOCK_SIZE 1
 
-__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len);
-__u16 crc_t10dif(unsigned char const *, size_t);
+extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer,
+				size_t len);
+extern __u16 crc_t10dif(unsigned char const *, size_t);
 
 #endif
include/linux/fs.h  (-2)

 #define READ			0
 #define WRITE			RW_MASK
 #define READA			RWA_MASK
-#define KERNEL_READ		(READ|REQ_KERNEL)
-#define KERNEL_WRITE		(WRITE|REQ_KERNEL)
 
 #define READ_SYNC		(READ | REQ_SYNC)
 #define WRITE_SYNC		(WRITE | REQ_SYNC | REQ_NOIDLE)
include/linux/nfs_fs.h  (+2 -2)

 extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
 			struct iov_iter *iter,
-			loff_t pos, bool uio);
+			loff_t pos);
 extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
 			struct iov_iter *iter,
-			loff_t pos, bool uio);
+			loff_t pos);
 
 /*
  * linux/fs/nfs/dir.c
include/linux/t10-pi.h  (new file, +22)

+#ifndef _LINUX_T10_PI_H
+#define _LINUX_T10_PI_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+
+/*
+ * T10 Protection Information tuple.
+ */
+struct t10_pi_tuple {
+	__be16 guard_tag;	/* Checksum */
+	__be16 app_tag;		/* Opaque storage */
+	__be32 ref_tag;		/* Target LBA or indirect LBA */
+};
+
+
+extern struct blk_integrity t10_pi_type1_crc;
+extern struct blk_integrity t10_pi_type1_ip;
+extern struct blk_integrity t10_pi_type3_crc;
+extern struct blk_integrity t10_pi_type3_ip;
+
+#endif
include/scsi/scsi_cmnd.h  (+26 -10)

 #include <scsi/scsi_device.h>
 
 struct Scsi_Host;
-struct scsi_device;
 struct scsi_driver;
+
+#include <scsi/scsi_device.h>
 
 /*
  * MAX_COMMAND_SIZE is:
···
 
 	unsigned char prot_op;
 	unsigned char prot_type;
+	unsigned char prot_flags;
 
 	unsigned short cmd_len;
 	enum dma_data_direction sc_data_direction;
···
 	return scmd->prot_op;
 }
 
+enum scsi_prot_flags {
+	SCSI_PROT_TRANSFER_PI		= 1 << 0,
+	SCSI_PROT_GUARD_CHECK		= 1 << 1,
+	SCSI_PROT_REF_CHECK		= 1 << 2,
+	SCSI_PROT_REF_INCREMENT		= 1 << 3,
+	SCSI_PROT_IP_CHECKSUM		= 1 << 4,
+};
+
 /*
  * The controller usually does not know anything about the target it
  * is communicating with.  However, when DIX is enabled the controller
···
 static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
 {
 	return blk_rq_pos(scmd->request);
+}
+
+static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
+{
+	return scmd->device->sector_size;
+}
+
+static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
+{
+	return blk_rq_pos(scmd->request) >>
+		(ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
 }
 
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
···
 static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
 {
 	unsigned int xfer_len = scsi_out(scmd)->length;
-	unsigned int prot_op = scsi_get_prot_op(scmd);
-	unsigned int sector_size = scmd->device->sector_size;
+	unsigned int prot_interval = scsi_prot_interval(scmd);
 
-	switch (prot_op) {
-	case SCSI_PROT_NORMAL:
-	case SCSI_PROT_WRITE_STRIP:
-	case SCSI_PROT_READ_INSERT:
-		return xfer_len;
-	}
+	if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI)
+		xfer_len += (xfer_len >> ilog2(prot_interval)) * 8;
 
-	return xfer_len + (xfer_len >> ilog2(sector_size)) * 8;
+	return xfer_len;
 }
 
 #endif /* _SCSI_SCSI_CMND_H */
mm/backing-dev.c  (+16 -24)

 /* bdi_wq serves all asynchronous writeback tasks */
 struct workqueue_struct *bdi_wq;
 
-void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
 	if (wb1 < wb2) {
 		spin_lock(&wb1->list_lock);
···
 	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 	flush_delayed_work(&bdi->wb.dwork);
 	WARN_ON(!list_empty(&bdi->work_list));
-
-	/*
-	 * This shouldn't be necessary unless @bdi for some reason has
-	 * unflushed dirty IO after work_list is drained.  Do it anyway
-	 * just in case.
-	 */
-	cancel_delayed_work_sync(&bdi->wb.dwork);
+	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 }
 
 /*
···
 void bdi_unregister(struct backing_dev_info *bdi)
 {
-	struct device *dev = bdi->dev;
-
-	if (dev) {
+	if (bdi->dev) {
 		bdi_set_min_ratio(bdi, 0);
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
 
 		bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
-
-		spin_lock_bh(&bdi->wb_lock);
+		device_unregister(bdi->dev);
 		bdi->dev = NULL;
-		spin_unlock_bh(&bdi->wb_lock);
-
-		device_unregister(dev);
 	}
 }
 EXPORT_SYMBOL(bdi_unregister);
···
 	int i;
 
 	/*
-	 * Splice our entries to the default_backing_dev_info, if this
-	 * bdi disappears
+	 * Splice our entries to the default_backing_dev_info.  This
+	 * condition shouldn't happen.  @wb must be empty at this point and
+	 * dirty inodes on it might cause other issues.  This workaround is
+	 * added by ce5f8e779519 ("writeback: splice dirty inode entries to
+	 * default bdi on bdi_destroy()") without root-causing the issue.
+	 *
+	 * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
+	 * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
+	 *
+	 * We should probably add WARN_ON() to find out whether it still
+	 * happens and track it down if so.
 	 */
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
···
 
 	bdi_unregister(bdi);
 
-	/*
-	 * If bdi_unregister() had already been called earlier, the dwork
-	 * could still be pending because bdi_prune_sb() can race with the
-	 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
-	 */
-	cancel_delayed_work_sync(&bdi->wb.dwork);
+	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);