
Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:

- NVMe pull requests via Christoph:
- add support for In-Band authentication (Hannes Reinecke)
- handle the persistent internal error AER (Michael Kelley)
- use in-capsule data for TCP I/O queue connect (Caleb Sander)
- remove timeout for getting RDMA-CM established event (Israel
Rukshin)
- misc cleanups (Joel Granados, Sagi Grimberg, Chaitanya Kulkarni,
Guixin Liu, Xiang wangx)
- use command_id instead of req->tag in trace_nvme_complete_rq()
(Bean Huo)
- various fixes for the new authentication code (Lukas Bulwahn,
Dan Carpenter, Colin Ian King, Chaitanya Kulkarni, Hannes
Reinecke)
- small cleanups (Liu Song, Christoph Hellwig)
- restore compat_ioctl support (Nick Bowler)
- make an nvmet-tcp workqueue lockdep-safe (Sagi Grimberg)
- enable generic interface (/dev/ngXnY) for unknown command sets
(Joel Granados, Christoph Hellwig)
- don't always build constants.o (Christoph Hellwig)
- print the command name of aborted commands (Christoph Hellwig)

- MD pull requests via Song:
- Improve raid5 lock contention, by Logan Gunthorpe.
- Misc fixes to raid5, by Logan Gunthorpe.
- Fix race condition with md_reap_sync_thread(), by Guoqing Jiang.
- Fix potential deadlock with raid5_quiesce and
raid5_get_active_stripe, by Logan Gunthorpe.
- Refactoring md_alloc(), by Christoph.
- Fix md disk_name lifetime problems, by Christoph Hellwig
- Convert prepare_to_wait() to wait_woken() API, by Logan
Gunthorpe.
- Fix sectors_to_do bitmap issue, by Logan Gunthorpe.

- Work on unifying the null_blk module parameters and configfs API
(Vincent)

- drbd bitmap IO error fix (Lars)

- Set of rnbd fixes (Guoqing, Md Haris)

- Remove experimental marker on bcache async device registration (Coly)

- Series from cleaning up the bio splitting (Christoph)

- Removal of the sx8 block driver. This hardware was never really
widespread, and it didn't receive a lot of attention after its
initial merge back in 2005 (Christoph)

- A few fixes for s390 dasd (Eric, Jiang)

- Followup set of fixes for ublk (Ming)

- Support for UBLK_IO_NEED_GET_DATA for ublk (ZiyangZhang)

- Fixes for the dio dma alignment (Keith)

- Misc fixes and cleanups (Ming, Yu, Dan, Christophe)

* tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block: (136 commits)
s390/dasd: Establish DMA alignment
s390/dasd: drop unexpected word 'for' in comments
ublk_drv: add support for UBLK_IO_NEED_GET_DATA
ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA
ublk_drv: cleanup ublksrv_ctrl_dev_info
ublk_drv: add SET_PARAMS/GET_PARAMS control command
ublk_drv: fix ublk device leak in case that add_disk fails
ublk_drv: cancel device even though disk isn't up
block: fix leaking page ref on truncated direct io
block: ensure bio_iov_add_page can't fail
block: ensure iov_iter advances for added pages
drivers:md:fix a potential use-after-free bug
md/raid5: Ensure batch_last is released before sleeping for quiesce
md/raid5: Move stripe_request_ctx up
md/raid5: Drop unnecessary call to r5c_check_stripe_cache_usage()
md/raid5: Make is_inactive_blocked() helper
md/raid5: Refactor raid5_get_active_stripe()
block: pass struct queue_limits to the bio splitting helpers
block: move bio_allowed_max_sectors to blk-merge.c
block: move the call to get_max_io_size out of blk_bio_segment_split
...

+5659 -2875
+22
Documentation/block/null_blk.rst
@@
 hw_queue_depth=[0..qdepth]: Default: 64
   The hardware queue depth of the device.
 
+memory_backed=[0/1]: Default: 0
+  Whether or not to use a memory buffer to respond to IO requests
+
+  = =============================================
+  0 Transfer no data in response to IO requests
+  1 Use a memory buffer to respond to IO requests
+  = =============================================
+
+discard=[0/1]: Default: 0
+  Support discard operations (requires memory-backed null_blk device).
+
+  = =====================================
+  0 Do not support discard operations
+  1 Enable support for discard operations
+  = =====================================
+
+cache_size=[Size in MB]: Default: 0
+  Cache size in MB for memory-backed device.
+
+mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
+  Bandwidth limit for device performance.
+
 Multi-queue specific parameters
 -------------------------------
 
+3 -1
MAINTAINERS
@@
 W:	http://git.infradead.org/nvme.git
 T:	git://git.infradead.org/nvme.git
 F:	drivers/nvme/host/
-F:	include/linux/nvme.h
+F:	drivers/nvme/common/
+F:	include/linux/nvme*
 F:	include/uapi/linux/nvme_ioctl.h
 
 NVM EXPRESS FC TRANSPORT DRIVERS
@@
 M:	Song Liu <song@kernel.org>
 L:	linux-raid@vger.kernel.org
 S:	Supported
+Q:	https://patchwork.kernel.org/project/linux-raid/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
 F:	drivers/md/Kconfig
 F:	drivers/md/Makefile
+1 -1
block/bio-integrity.c
@@
 	iv = bip->bip_vec + bip->bip_vcnt;
 
 	if (bip->bip_vcnt &&
-	    bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+	    bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
 			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
 		return 0;
 
+25 -26
block/bio.c
@@
 	 * would create a gap, disallow it.
 	 */
 	bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-	if (bvec_gap_to_prev(q, bvec, offset))
+	if (bvec_gap_to_prev(&q->limits, bvec, offset))
 		return 0;
 	}
@@
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
-	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
-	for (i = 0; i < nr; i++)
-		put_page(pages[i]);
-}
-
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
 	bool same_page = false;
 
 	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-		if (WARN_ON_ONCE(bio_full(bio, len)))
-			return -EINVAL;
 		__bio_add_page(bio, page, len, offset);
 		return 0;
 	}
@@
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	ssize_t size, left;
-	unsigned len, i;
+	unsigned len, i = 0;
 	size_t offset;
+	int ret = 0;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@
 	 * result to ensure the bio's total size is correct. The remainder of
 	 * the iov data will be picked up in the next bio iteration.
 	 */
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
+	size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
+				  nr_pages, &offset);
+	if (size > 0) {
+		nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
+	} else
+		nr_pages = 0;
+
+	if (unlikely(size <= 0)) {
+		ret = size ? size : -EFAULT;
+		goto out;
+	}
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
-		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
 			ret = bio_iov_add_zone_append_page(bio, page, len,
 					offset);
-		else
-			ret = bio_iov_add_page(bio, page, len, offset);
+			if (ret)
+				break;
+		} else
+			bio_iov_add_page(bio, page, len, offset);
 
-		if (ret) {
-			bio_put_pages(pages + i, left, offset);
-			return ret;
-		}
 		offset = 0;
 	}
 
-	iov_iter_advance(iter, size);
-	return 0;
+	iov_iter_advance(iter, size - left);
+out:
+	while (i < nr_pages)
+		put_page(pages[i++]);
+
+	return ret;
 }
 
 /**
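The reworked error path above takes all page references up front and drops any unconsumed ones at a single exit label, instead of computing a partial release inside the loop. A minimal userspace sketch of that ownership pattern (the helpers and the refcount array are stand-ins for `get_page()`/`put_page()`, not kernel API):

```c
#include <assert.h>

/* Hypothetical stand-in for per-page reference counts. */
static int refcount[8];

static void get_page_ref(int idx) { refcount[idx]++; }
static void put_page_ref(int idx) { refcount[idx]--; }

/*
 * Mimics the new cleanup in __bio_iov_iter_get_pages(): all nr_pages
 * references are taken up front; if an error occurs after consuming i
 * pages, the remaining [i, nr_pages) references are dropped in one
 * place at the exit label.
 */
static int add_pages(int nr_pages, int fail_at, int *consumed)
{
	int i, ret = 0;

	for (i = 0; i < nr_pages; i++)
		get_page_ref(i);		/* like iov_iter_get_pages() */

	for (i = 0; i < nr_pages; i++) {
		if (i == fail_at) {		/* e.g. zone-append add fails */
			ret = -1;
			break;
		}
		/* page i is now owned by the bio; its reference is kept */
	}
	*consumed = i;

	while (i < nr_pages)			/* drop unconsumed references */
		put_page_ref(i++);
	return ret;
}
```

The same shape shows up in the kernel hunk: `i` stops at the failed page, and the trailing `while (i < nr_pages) put_page(pages[i++]);` runs on both the success and error paths.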
+1 -8
block/blk-core.c
@@
 struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
 			GFP_KERNEL | __GFP_ZERO, node_id);
@@
 	if (q->id < 0)
 		goto fail_srcu;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
-	if (ret)
-		goto fail_id;
-
 	q->stats = blk_alloc_queue_stats();
 	if (!q->stats)
-		goto fail_split;
+		goto fail_id;
 
 	q->node = node_id;
@@
 fail_stats:
 	blk_free_queue_stats(q->stats);
-fail_split:
-	bioset_exit(&q->bio_split);
 fail_id:
 	ida_free(&blk_queue_ida, q->id);
 fail_srcu:
+94 -91
block/blk-merge.c
@@
 	bio_get_first_bvec(next, &nb);
 	if (biovec_phys_mergeable(q, &pb, &nb))
 		return false;
-	return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+	return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
 }
 
 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@@
 	return bio_will_gap(req->q, NULL, bio, req->bio);
 }
 
-static struct bio *blk_bio_discard_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *nsegs)
+/*
+ * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
+ * is defined as 'unsigned int', meantime it has to be aligned to with the
+ * logical block size, which is the minimum accepted unit by hardware.
+ */
+static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+{
+	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
+}
+
+static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
+		unsigned *nsegs, struct bio_set *bs)
 {
 	unsigned int max_discard_sectors, granularity;
-	int alignment;
 	sector_t tmp;
 	unsigned split_sectors;
 
 	*nsegs = 1;
 
 	/* Zero-sector (unknown) and one-sector granularities are the same. */
-	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	granularity = max(lim->discard_granularity >> 9, 1U);
 
-	max_discard_sectors = min(q->limits.max_discard_sectors,
-			bio_allowed_max_sectors(q));
+	max_discard_sectors =
+		min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
 	max_discard_sectors -= max_discard_sectors % granularity;
 
 	if (unlikely(!max_discard_sectors)) {
@@
 	 * If the next starting sector would be misaligned, stop the discard at
 	 * the previous aligned sector.
 	 */
-	alignment = (q->limits.discard_alignment >> 9) % granularity;
-
-	tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+	tmp = bio->bi_iter.bi_sector + split_sectors -
+		((lim->discard_alignment >> 9) % granularity);
 	tmp = sector_div(tmp, granularity);
 
 	if (split_sectors > tmp)
@@
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
-		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+		struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
 {
 	*nsegs = 0;
-
-	if (!q->limits.max_write_zeroes_sectors)
+	if (!lim->max_write_zeroes_sectors)
 		return NULL;
-
-	if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+	if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
 		return NULL;
-
-	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
 /*
@@
  * requests that are submitted to a block device if the start of a bio is not
  * aligned to a physical block boundary.
  */
-static inline unsigned get_max_io_size(struct request_queue *q,
-				       struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+		struct queue_limits *lim)
 {
-	unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
-	unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
-	unsigned max_sectors = queue_max_sectors(q), start, end;
+	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
+	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
+	unsigned max_sectors = lim->max_sectors, start, end;
 
-	if (q->limits.chunk_sectors) {
+	if (lim->chunk_sectors) {
 		max_sectors = min(max_sectors,
 			blk_chunk_sectors_left(bio->bi_iter.bi_sector,
-					       q->limits.chunk_sectors));
+					       lim->chunk_sectors));
 	}
 
 	start = bio->bi_iter.bi_sector & (pbs - 1);
@@
 	return max_sectors & ~(lbs - 1);
 }
 
-static inline unsigned get_max_segment_size(const struct request_queue *q,
-					    struct page *start_page,
-					    unsigned long offset)
+static inline unsigned get_max_segment_size(struct queue_limits *lim,
+		struct page *start_page, unsigned long offset)
 {
-	unsigned long mask = queue_segment_boundary(q);
+	unsigned long mask = lim->seg_boundary_mask;
 
 	offset = mask & (page_to_phys(start_page) + offset);
 
@@
 	 * on 32bit arch, use queue's max segment size when that happens.
 	 */
 	return min_not_zero(mask - offset + 1,
-			(unsigned long)queue_max_segment_size(q));
+			(unsigned long)lim->max_segment_size);
 }
 
 /**
  * bvec_split_segs - verify whether or not a bvec should be split in the middle
- * @q:        [in] request queue associated with the bio associated with @bv
+ * @lim:      [in] queue limits to split based on
  * @bv:       [in] bvec to examine
  * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
  *            by the number of segments from @bv that may be appended to that
@@
  * *@nsegs segments and *@sectors sectors would make that bio unacceptable for
  * the block driver.
  */
-static bool bvec_split_segs(const struct request_queue *q,
-			    const struct bio_vec *bv, unsigned *nsegs,
-			    unsigned *bytes, unsigned max_segs,
-			    unsigned max_bytes)
+static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
+		unsigned *nsegs, unsigned *bytes, unsigned max_segs,
+		unsigned max_bytes)
 {
 	unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
 	unsigned len = min(bv->bv_len, max_len);
@@
 	unsigned seg_size = 0;
 
 	while (len && *nsegs < max_segs) {
-		seg_size = get_max_segment_size(q, bv->bv_page,
+		seg_size = get_max_segment_size(lim, bv->bv_page,
 						bv->bv_offset + total_len);
 		seg_size = min(seg_size, len);
 
@@
 		total_len += seg_size;
 		len -= seg_size;
 
-		if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+		if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
 			break;
 	}
 
@@
 }
 
 /**
- * blk_bio_segment_split - split a bio in two bios
- * @q:    [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
- * @bs:	  [in] bio set to allocate the clone from
+ * @lim:  [in] queue limits to split based on
  * @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs:	  [in] bio set to allocate the clone from
+ * @max_bytes: [in] maximum number of bytes per bio
  *
  * Clone @bio, update the bi_iter of the clone to represent the first sectors
  * of @bio and update @bio->bi_iter to represent the remaining sectors. The
  * following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most @max_bytes worth of data
  * - That it has at most queue_max_segments(@q) segments.
  *
  * Except for discard requests the cloned bio will point at the bi_io_vec of
@@
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
-static struct bio *blk_bio_segment_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
-	const unsigned max_bytes = get_max_io_size(q, bio) << 9;
-	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
 		/*
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
 			goto split;
 
-		if (nsegs < max_segs &&
+		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
 		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
 			nsegs++;
 			bytes += bv.bv_len;
-		} else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
-					   max_bytes)) {
-			goto split;
+		} else {
+			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
+					lim->max_segments, max_bytes))
+				goto split;
 		}
 
 		bvprv = bv;
@@
 	 * split size so that each bio is properly block size aligned, even if
 	 * we do not use the full hardware limits.
 	 */
-	bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
+	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
 
 	/*
 	 * Bio splitting may cause subtle trouble such as hang when doing sync
@@
 }
 
 /**
- * __blk_queue_split - split a bio and submit the second half
- * @q:       [in] request_queue new bio is being queued at
- * @bio:     [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ * @lim:     queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
  *
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
-		       unsigned int *nr_segs)
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
+		       unsigned int *nr_segs)
 {
-	struct bio *split = NULL;
+	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+	struct bio *split;
 
-	switch (bio_op(*bio)) {
+	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_discard(bio, lim, nr_segs, bs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
-				nr_segs);
+		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_rw(bio, lim, nr_segs, bs,
+				get_max_io_size(bio, lim) << SECTOR_SHIFT);
 		break;
 	}
 
@@
 		split->bi_opf |= REQ_NOMERGE;
 
 		blkcg_bio_issue_init(split);
-		bio_chain(split, *bio);
-		trace_block_split(split, (*bio)->bi_iter.bi_sector);
-		submit_bio_noacct(*bio);
-		*bio = split;
+		bio_chain(split, bio);
+		trace_block_split(split, bio->bi_iter.bi_sector);
+		submit_bio_noacct(bio);
+		return split;
 	}
+	return bio;
 }
 
 /**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
  *
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it. @bio is shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+	struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
 	unsigned int nr_segs;
 
-	if (blk_may_split(q, *bio))
-		__blk_queue_split(q, bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, lim))
+		return __bio_split_to_limits(bio, lim, &nr_segs);
+	return bio;
 }
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
 
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
@@
 	}
 
 	rq_for_each_bvec(bv, rq, iter)
-		bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
+		bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
 				UINT_MAX, UINT_MAX);
 	return nr_phys_segs;
 }
@@
 
 	while (nbytes > 0) {
 		unsigned offset = bvec->bv_offset + total;
-		unsigned len = min(get_max_segment_size(q, bvec->bv_page,
-					offset), nbytes);
+		unsigned len = min(get_max_segment_size(&q->limits,
+					bvec->bv_page, offset), nbytes);
 		struct page *page = bvec->bv_page;
 
 		/*
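The discard-split arithmetic in bio_split_discard() can be modeled in userspace. This sketch follows the kernel's order of operations but is an illustrative model, not the kernel code; `alignment` is assumed to already be reduced modulo the granularity, as the kernel does with `(discard_alignment >> 9) % granularity`:

```c
#include <assert.h>
#include <stdint.h>

/*
 * Model of bio_split_discard()'s split-point computation: cap the
 * discard at max_discard_sectors (rounded down to whole granules),
 * then shorten it so the *next* split starts on a
 * discard_granularity-aligned sector.
 */
static unsigned int split_discard_sectors(uint64_t bi_sector,
		unsigned int bio_sectors, unsigned int granularity,
		unsigned int alignment, unsigned int max_discard_sectors)
{
	unsigned int split_sectors;
	uint64_t tmp;

	if (granularity == 0)
		granularity = 1;	/* unknown granularity == 1 sector */

	/* round the limit down to a whole number of granules */
	max_discard_sectors -= max_discard_sectors % granularity;

	if (bio_sectors <= max_discard_sectors)
		return bio_sectors;	/* no split needed */

	split_sectors = max_discard_sectors;

	/*
	 * If the next starting sector would be misaligned, stop the
	 * discard at the previous aligned sector (sector_div in the
	 * kernel; plain modulo here).
	 */
	tmp = (bi_sector + split_sectors - alignment) % granularity;
	if (split_sectors > tmp)
		split_sectors -= (unsigned int)tmp;

	return split_sectors;
}
```

For example, with granularity 8 and a limit of 100 sectors, a 200-sector discard starting at sector 4 is cut at 92 sectors, so the remainder begins at sector 96, a granule boundary.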
+3 -3
block/blk-mq.c
@@
 	unsigned int nr_segs = 1;
 	blk_status_t ret;
 
-	blk_queue_bounce(q, &bio);
-	if (blk_may_split(q, bio))
-		__blk_queue_split(q, &bio, &nr_segs);
+	bio = blk_queue_bounce(bio, q);
+	if (bio_may_exceed_limits(bio, &q->limits))
+		bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
 
 	if (!bio_integrity_prep(bio))
 		return;
-2
block/blk-sysfs.c
@@
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
-	bioset_exit(&q->bio_split);
-
 	if (blk_queue_has_srcu(q))
 		cleanup_srcu_struct(q->srcu);
 
+21 -26
block/blk.h
@@
 	return true;
 }
 
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
+static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
 		struct bio_vec *bprv, unsigned int offset)
 {
-	return (offset & queue_virt_boundary(q)) ||
-		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+	return (offset & lim->virt_boundary_mask) ||
+		((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
 }
 
 /*
  * Check if adding a bio_vec after bprv with offset would create a gap in
  * the SG list. Most drivers don't care about this, but some do.
  */
-static inline bool bvec_gap_to_prev(struct request_queue *q,
+static inline bool bvec_gap_to_prev(struct queue_limits *lim,
 		struct bio_vec *bprv, unsigned int offset)
 {
-	if (!queue_virt_boundary(q))
+	if (!lim->virt_boundary_mask)
 		return false;
-	return __bvec_gap_to_prev(q, bprv, offset);
+	return __bvec_gap_to_prev(lim, bprv, offset);
 }
 
 static inline bool rq_mergeable(struct request *rq)
@@
 	struct bio_integrity_payload *bip = bio_integrity(req->bio);
 	struct bio_integrity_payload *bip_next = bio_integrity(next);
 
-	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+	return bvec_gap_to_prev(&req->q->limits,
+				&bip->bip_vec[bip->bip_vcnt - 1],
 				bip_next->bip_vec[0].bv_offset);
 }
 
@@
 	struct bio_integrity_payload *bip = bio_integrity(bio);
 	struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
 
-	return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+	return bvec_gap_to_prev(&req->q->limits,
+				&bip->bip_vec[bip->bip_vcnt - 1],
 				bip_next->bip_vec[0].bv_offset);
 }
 
@@
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
 				const char *, size_t);
 
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio,
+					 struct queue_limits *lim)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
@@
 	 * to the performance impact of cloned bios themselves the loop below
 	 * doesn't matter anyway.
 	 */
-	return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+	return lim->chunk_sectors || bio->bi_vcnt != 1 ||
 		bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
 }
 
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
-			unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
+			unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@
 }
 
 /*
- * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
- * is defined as 'unsigned int', meantime it has to aligned to with logical
- * block size which is the minimum accepted unit by hardware.
- */
-static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
-{
-	return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
-}
-
-/*
  * Internal io_context interface
  */
 struct io_cq *ioc_find_get_icq(struct request_queue *q);
@@
 static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
+struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
 
 static inline bool blk_queue_may_bounce(struct request_queue *q)
 {
@@
 		max_low_pfn >= max_pfn;
 }
 
-static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+static inline struct bio *blk_queue_bounce(struct bio *bio,
+		struct request_queue *q)
 {
-	if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
-		__blk_queue_bounce(q, bio);
+	if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
+		return __blk_queue_bounce(bio, q);
+	return bio;
 }
 
 #ifdef CONFIG_BLK_CGROUP_IOLATENCY
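The gap check that now takes a `struct queue_limits` directly can be sketched standalone. The structs here are minimal stand-ins containing only the fields the check reads (e.g. NVMe sets a 4 KiB virt boundary for its PRP lists); this is a userspace model of the logic, not kernel code:

```c
#include <assert.h>
#include <stdbool.h>

/* Minimal models of struct queue_limits and struct bio_vec. */
struct limits { unsigned long virt_boundary_mask; };
struct vec    { unsigned int bv_offset, bv_len; };

/*
 * A gap exists if the new segment does not start exactly where the
 * previous one ended relative to the device's virtual boundary, i.e.
 * either end-of-previous or start-of-next is misaligned to the mask.
 */
static bool bvec_gap_to_prev(const struct limits *lim,
		const struct vec *bprv, unsigned int offset)
{
	if (!lim->virt_boundary_mask)
		return false;	/* device has no SG gap constraint */
	return (offset & lim->virt_boundary_mask) ||
	       ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}
```

A previous segment ending on the boundary followed by a segment starting on the boundary merges cleanly; anything else forces a split.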
+13 -13
block/bounce.c
@@
 	return NULL;
 }
 
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
 {
 	struct bio *bio;
-	int rw = bio_data_dir(*bio_orig);
+	int rw = bio_data_dir(bio_orig);
 	struct bio_vec *to, from;
 	struct bvec_iter iter;
 	unsigned i = 0, bytes = 0;
 	bool bounce = false;
 	int sectors;
 
-	bio_for_each_segment(from, *bio_orig, iter) {
+	bio_for_each_segment(from, bio_orig, iter) {
 		if (i++ < BIO_MAX_VECS)
 			bytes += from.bv_len;
 		if (PageHighMem(from.bv_page))
 			bounce = true;
 	}
 	if (!bounce)
-		return;
+		return bio_orig;
 
 	/*
 	 * Individual bvecs might not be logical block aligned. Round down
@@
 	 */
 	sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
 			SECTOR_SHIFT;
-	if (sectors < bio_sectors(*bio_orig)) {
-		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
-		bio_chain(bio, *bio_orig);
-		submit_bio_noacct(*bio_orig);
-		*bio_orig = bio;
+	if (sectors < bio_sectors(bio_orig)) {
+		bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
+		bio_chain(bio, bio_orig);
+		submit_bio_noacct(bio_orig);
+		bio_orig = bio;
 	}
-	bio = bounce_clone_bio(*bio_orig);
+	bio = bounce_clone_bio(bio_orig);
 
 	/*
 	 * Bvec table can't be updated by bio_for_each_segment_all(),
@@
 		to->bv_page = bounce_page;
 	}
 
-	trace_block_bio_bounce(*bio_orig);
+	trace_block_bio_bounce(bio_orig);
 
 	bio->bi_flags |= (1 << BIO_BOUNCED);
 
@@
 	else
 		bio->bi_end_io = bounce_end_io_write;
 
-	bio->bi_private = *bio_orig;
-	*bio_orig = bio;
+	bio->bi_private = bio_orig;
+	return bio;
 }
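The bounce rework also changes the calling convention from an in/out `struct bio **` parameter to plain take-and-return. A toy model of the two styles (the struct and function names here are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stddef.h>

struct bio { int id; };

/* Old style: the callee updates the caller's pointer through a
 * pointer-to-pointer output parameter. */
static void bounce_old(struct bio **bio, struct bio *clone, int need_bounce)
{
	if (need_bounce)
		*bio = clone;
}

/* New style (as in the reworked blk_queue_bounce()): take the bio by
 * value and return the bio to use, possibly a replacement clone. */
static struct bio *bounce_new(struct bio *bio, struct bio *clone,
		int need_bounce)
{
	return need_bounce ? clone : bio;
}
```

The return-value style composes naturally in the submission path, as seen in blk-mq: `bio = blk_queue_bounce(bio, q);` followed by `bio = __bio_split_to_limits(...)`.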
+7 -1
block/genhd.c
@@
 	blk_mq_exit_queue(disk->queue);
 
 	blkcg_exit_queue(disk->queue);
+	bioset_exit(&disk->bio_split);
 
 	disk_release_events(disk);
 	kfree(disk->random);
@@
 	if (!disk)
 		goto out_put_queue;
 
+	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+		goto out_free_disk;
+
 	disk->bdi = bdi_alloc(node_id);
 	if (!disk->bdi)
-		goto out_free_disk;
+		goto out_free_bioset;
 
 	/* bdev_alloc() might need the queue, set before the first call */
 	disk->queue = q;
@@
 	iput(disk->part0->bd_inode);
 out_free_bdi:
 	bdi_put(disk->bdi);
+out_free_bioset:
+	bioset_exit(&disk->bio_split);
 out_free_disk:
 	kfree(disk);
 out_put_queue:
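The genhd change above follows the kernel's usual goto-unwind convention: each newly acquired resource gets a matching cleanup label, and the labels run in reverse acquisition order, so a failure jumps to the label that frees exactly what has been acquired so far. A userspace model (booleans stand in for the real bioset and bdi resources):

```c
#include <assert.h>
#include <stdbool.h>

/* Track whether each modeled resource is currently held. */
static bool bioset_held, bdi_held;

/* Model of __alloc_disk_node()'s unwinding after the bio_split bioset
 * moved into the gendisk: bioset_init() comes first, so its cleanup
 * label comes last before freeing the disk itself. */
static int alloc_disk_model(bool bioset_fails, bool bdi_fails)
{
	bioset_held = bdi_held = false;

	if (bioset_fails)
		goto out;		/* bioset_init() failed */
	bioset_held = true;

	if (bdi_fails)
		goto out_free_bioset;	/* bdi_alloc() failed */
	bdi_held = true;

	return 0;			/* success: caller owns both */

out_free_bioset:
	bioset_held = false;		/* bioset_exit() */
out:
	return -1;
}
```

On any failure path, nothing acquired earlier is leaked and nothing not yet acquired is touched.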
+6
crypto/kpp.c
@@
 }
 EXPORT_SYMBOL_GPL(crypto_grab_kpp);
 
+int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_kpp);
+
 static void kpp_prepare_alg(struct kpp_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
+6
crypto/shash.c
@@
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_shash);
 
+int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_has_shash);
+
 static int shash_prepare_alg(struct shash_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
-9
drivers/block/Kconfig
···
 
 	  If unsure, say N.
 
-config BLK_DEV_SX8
-	tristate "Promise SATA SX8 support"
-	depends on PCI
-	help
-	  Saying Y or M here will enable support for the
-	  Promise SATA SX8 controllers.
-
-	  Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
-
 config BLK_DEV_RAM
 	tristate "RAM block device support"
 	help
-2
drivers/block/Makefile
···
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
 
-obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
-
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
 obj-$(CONFIG_BLK_DEV_DRBD)	+= drbd/
+42 -7
drivers/block/drbd/drbd_bitmap.c
···
 	}
 }
 
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.al_offset - 1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect - 1;
+	}
+}
+
 static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 {
 	struct drbd_device *device = ctx->device;
 	enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
-	struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
-			GFP_NOIO, &drbd_md_io_bio_set);
 	struct drbd_bitmap *b = device->bitmap;
+	struct bio *bio;
 	struct page *page;
+	sector_t last_bm_sect;
+	sector_t first_bm_sect;
+	sector_t on_disk_sector;
 	unsigned int len;
 
-	sector_t on_disk_sector =
-		device->ldev->md.md_offset + device->ldev->md.bm_offset;
-	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+	first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
+	on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
 
 	/* this might happen with very small
 	 * flexible external meta data device,
 	 * or with PAGE_SIZE > 4k */
-	len = min_t(unsigned int, PAGE_SIZE,
-		(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+	last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
+	if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
+		sector_t len_sect = last_bm_sect - on_disk_sector + 1;
+
+		if (len_sect < PAGE_SIZE/SECTOR_SIZE)
+			len = (unsigned int)len_sect*SECTOR_SIZE;
+		else
+			len = PAGE_SIZE;
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state)) {
+			drbd_err(device, "Invalid offset during on-disk bitmap access: "
+				 "page idx %u, sector %llu\n", page_nr, on_disk_sector);
+		}
+		ctx->error = -EIO;
+		bm_set_page_io_err(b->bm_pages[page_nr]);
+		if (atomic_dec_and_test(&ctx->in_flight)) {
+			ctx->done = 1;
+			wake_up(&device->misc_wait);
+			kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+		}
+		return;
+	}
 
 	/* serialize IO on this page */
 	bm_page_lock_io(device, page_nr);
···
 		bm_store_page_idx(page, page_nr);
 	} else
 		page = b->bm_pages[page_nr];
+	bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
+			&drbd_md_io_bio_set);
 	bio->bi_iter.bi_sector = on_disk_sector;
 	/* bio_add_page of a single page to an empty bio will always succeed,
 	 * according to api. Do we want to assert that? */
+1 -1
drivers/block/drbd/drbd_req.c
···
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	/*
 	 * what we "blindly" assume:
+4 -2
drivers/block/nbd.c
···
  * (part of code stolen from loop.c)
  */
 
+#define pr_fmt(fmt) "nbd: " fmt
+
 #include <linux/major.h>
 
 #include <linux/blkdev.h>
···
 	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
 	    !refcount_inc_not_zero(&nbd->refs)) {
 		mutex_unlock(&nbd_index_mutex);
-		pr_err("nbd: device at index %d is going down\n",
+		pr_err("device at index %d is going down\n",
 			index);
 		return -EINVAL;
 	}
···
 	if (!nbd) {
 		nbd = nbd_dev_add(index, 2);
 		if (IS_ERR(nbd)) {
-			pr_err("nbd: failed to add new device\n");
+			pr_err("failed to add new device\n");
 			return PTR_ERR(nbd);
 		}
 	}
+86 -30
drivers/block/null_blk/main.c
···
 module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
 
+static bool g_memory_backed;
+module_param_named(memory_backed, g_memory_backed, bool, 0444);
+MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
+
+static bool g_discard;
+module_param_named(discard, g_discard, bool, 0444);
+MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
+
+static unsigned long g_cache_size;
+module_param_named(cache_size, g_cache_size, ulong, 0444);
+MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+
+static unsigned int g_mbps;
+module_param_named(mbps, g_mbps, uint, 0444);
+MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
+
 static bool g_zoned;
 module_param_named(zoned, g_zoned, bool, S_IRUGO);
 MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
···
 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
+NULLB_DEVICE_ATTR(no_sched, bool, NULL);
+NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
···
 	&nullb_device_attr_zone_max_open,
 	&nullb_device_attr_zone_max_active,
 	&nullb_device_attr_virt_boundary,
+	&nullb_device_attr_no_sched,
+	&nullb_device_attr_shared_tag_bitmap,
 	NULL,
 };
 
···
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
 	return snprintf(page, PAGE_SIZE,
-			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n");
+			"badblocks,blocking,blocksize,cache_size,"
+			"completion_nsec,discard,home_node,hw_queue_depth,"
+			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
+			"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
+			"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
+			"zone_capacity,zone_max_active,zone_max_open,"
+			"zone_nr_conv,zone_size\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
···
 	dev->irqmode = g_irqmode;
 	dev->hw_queue_depth = g_hw_queue_depth;
 	dev->blocking = g_blocking;
+	dev->memory_backed = g_memory_backed;
+	dev->discard = g_discard;
+	dev->cache_size = g_cache_size;
+	dev->mbps = g_mbps;
 	dev->use_per_node_hctx = g_use_per_node_hctx;
 	dev->zoned = g_zoned;
 	dev->zone_size = g_zone_size;
···
 	dev->zone_max_open = g_zone_max_open;
 	dev->zone_max_active = g_zone_max_active;
 	dev->virt_boundary = g_virt_boundary;
+	dev->no_sched = g_no_sched;
+	dev->shared_tag_bitmap = g_shared_tag_bitmap;
 	return dev;
 }
 
···
 
 static void cleanup_queue(struct nullb_queue *nq)
 {
-	kfree(nq->tag_map);
+	bitmap_free(nq->tag_map);
 	kfree(nq->cmds);
 }
 
···
 static int setup_commands(struct nullb_queue *nq)
 {
 	struct nullb_cmd *cmd;
-	int i, tag_size;
+	int i;
 
 	nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
 	if (!nq->cmds)
 		return -ENOMEM;
 
-	tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
-	nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
+	nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
 	if (!nq->tag_map) {
 		kfree(nq->cmds);
 		return -ENOMEM;
···
 
 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
 {
+	unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
+	int hw_queues, numa_node;
+	unsigned int queue_depth;
 	int poll_queues;
 
-	set->ops = &null_mq_ops;
-	set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
-						g_submit_queues;
-	poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
-	if (poll_queues)
-		set->nr_hw_queues += poll_queues;
-	set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
-						g_hw_queue_depth;
-	set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
-	set->cmd_size = sizeof(struct nullb_cmd);
-	set->flags = BLK_MQ_F_SHOULD_MERGE;
-	if (g_no_sched)
-		set->flags |= BLK_MQ_F_NO_SCHED;
-	if (g_shared_tag_bitmap)
-		set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
-	set->driver_data = nullb;
-	if (poll_queues)
-		set->nr_maps = 3;
-	else
-		set->nr_maps = 1;
+	if (nullb) {
+		hw_queues = nullb->dev->submit_queues;
+		poll_queues = nullb->dev->poll_queues;
+		queue_depth = nullb->dev->hw_queue_depth;
+		numa_node = nullb->dev->home_node;
+		if (nullb->dev->no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (nullb->dev->shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (nullb->dev->blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	} else {
+		hw_queues = g_submit_queues;
+		poll_queues = g_poll_queues;
+		queue_depth = g_hw_queue_depth;
+		numa_node = g_home_node;
+		if (g_no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (g_shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (g_blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	}
 
-	if ((nullb && nullb->dev->blocking) || g_blocking)
-		set->flags |= BLK_MQ_F_BLOCKING;
+	set->ops = &null_mq_ops;
+	set->cmd_size = sizeof(struct nullb_cmd);
+	set->flags = flags;
+	set->driver_data = nullb;
+	set->nr_hw_queues = hw_queues;
+	set->queue_depth = queue_depth;
+	set->numa_node = numa_node;
+	if (poll_queues) {
+		set->nr_hw_queues += poll_queues;
+		set->nr_maps = 3;
+	} else {
+		set->nr_maps = 1;
+	}
 
 	return blk_mq_alloc_tag_set(set);
 }
···
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
 	mutex_lock(&lock);
-	nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
-	dev->index = nullb->index;
+	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+	if (rv < 0) {
+		mutex_unlock(&lock);
+		goto out_cleanup_zone;
+	}
+	nullb->index = rv;
+	dev->index = rv;
 	mutex_unlock(&lock);
 
 	blk_queue_logical_block_size(nullb->q, dev->blocksize);
···
 
 	rv = null_gendisk_register(nullb);
 	if (rv)
-		goto out_cleanup_zone;
+		goto out_ida_free;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
···
 	pr_info("disk %s created\n", nullb->disk_name);
 
 	return 0;
+
+out_ida_free:
+	ida_free(&nullb_indexes, nullb->index);
 out_cleanup_zone:
 	null_free_zoned_dev(dev);
 out_cleanup_disk:
+2
drivers/block/null_blk/null_blk.h
···
 	bool discard; /* if support discard */
 	bool zoned; /* if device is zoned */
 	bool virt_boundary; /* virtual boundary on/off for the device */
+	bool no_sched; /* no IO scheduler for the device */
+	bool shared_tag_bitmap; /* use hostwide shared tags */
 };
 
 struct nullb {
+1 -1
drivers/block/pktcdvd.c
···
 	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
 	struct bio *split;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
 		(unsigned long long)bio->bi_iter.bi_sector,
+1 -1
drivers/block/ps3vram.c
···
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
+1 -1
drivers/block/rnbd/rnbd-clt-sysfs.c
···
 	if (ret)
 		return ret;
 
-	ret = rnbd_clt_resize_disk(dev, (size_t)sectors);
+	ret = rnbd_clt_resize_disk(dev, sectors);
 	if (ret)
 		return ret;
 
+119 -82
drivers/block/rnbd/rnbd-clt.c
···
 	return refcount_inc_not_zero(&dev->refcount);
 }
 
-static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
-				 const struct rnbd_msg_open_rsp *rsp)
+static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
+				     sector_t new_nsectors)
 {
-	struct rnbd_clt_session *sess = dev->sess;
+	if (get_capacity(dev->gd) == new_nsectors)
+		return;
 
-	if (!rsp->logical_block_size)
-		return -EINVAL;
-
-	dev->device_id = le32_to_cpu(rsp->device_id);
-	dev->nsectors = le64_to_cpu(rsp->nsectors);
-	dev->logical_block_size = le16_to_cpu(rsp->logical_block_size);
-	dev->physical_block_size = le16_to_cpu(rsp->physical_block_size);
-	dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors);
-	dev->discard_granularity = le32_to_cpu(rsp->discard_granularity);
-	dev->discard_alignment = le32_to_cpu(rsp->discard_alignment);
-	dev->secure_discard = le16_to_cpu(rsp->secure_discard);
-	dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK);
-	dev->fua = !!(rsp->cache_policy & RNBD_FUA);
-
-	dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
-	dev->max_segments = sess->max_segments;
-
-	return 0;
-}
-
-static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
-				    size_t new_nsectors)
-{
-	rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
-		      dev->nsectors, new_nsectors);
-	dev->nsectors = new_nsectors;
-	set_capacity_and_notify(dev->gd, dev->nsectors);
-	return 0;
+	/*
+	 * If the size changed, we need to revalidate it
+	 */
+	rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
+		      get_capacity(dev->gd), new_nsectors);
+	set_capacity_and_notify(dev->gd, new_nsectors);
 }
 
 static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
···
 	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
 		u64 nsectors = le64_to_cpu(rsp->nsectors);
 
-		/*
-		 * If the device was remapped and the size changed in the
-		 * meantime we need to revalidate it
-		 */
-		if (dev->nsectors != nsectors)
-			rnbd_clt_change_capacity(dev, nsectors);
+		rnbd_clt_change_capacity(dev, nsectors);
 		gd_kobj = &disk_to_dev(dev->gd)->kobj;
 		kobject_uevent(gd_kobj, KOBJ_ONLINE);
 		rnbd_clt_info(dev, "Device online, device remapped successfully\n");
 	}
-	err = rnbd_clt_set_dev_attr(dev, rsp);
-	if (err)
+	if (!rsp->logical_block_size) {
+		err = -EINVAL;
 		goto out;
+	}
+	dev->device_id = le32_to_cpu(rsp->device_id);
 	dev->dev_state = DEV_STATE_MAPPED;
 
 out:
···
 	return err;
 }
 
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
 {
 	int ret = 0;
 
···
 		ret = -ENOENT;
 		goto out;
 	}
-	ret = rnbd_clt_change_capacity(dev, newsize);
+	rnbd_clt_change_capacity(dev, newsize);
 
 out:
 	mutex_unlock(&dev->lock);
···
 	struct rnbd_msg_open_rsp *rsp = iu->buf;
 	struct rnbd_clt_dev *dev = iu->dev;
 	int errno = iu->errno;
+	bool from_map = false;
+
+	/* INIT state is only triggered from rnbd_clt_map_device */
+	if (dev->dev_state == DEV_STATE_INIT)
+		from_map = true;
 
 	if (errno) {
 		rnbd_clt_err(dev,
···
 			send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
 		}
 	}
-	kfree(rsp);
+	/* We free rsp in rnbd_clt_map_device for map scenario */
+	if (!from_map)
+		kfree(rsp);
 	wake_up_iu_comp(iu, errno);
 	rnbd_put_iu(dev->sess, iu);
 	rnbd_clt_put_dev(dev);
···
 {
 	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
 
-	if (dev->read_only && (mode & FMODE_WRITE))
+	if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
 		return -EPERM;
 
 	if (dev->dev_state == DEV_STATE_UNMAPPED ||
···
 			struct hd_geometry *geo)
 {
 	u64 size;
-	struct rnbd_clt_dev *dev;
+	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+	struct queue_limits *limit = &dev->queue->limits;
 
-	dev = block_device->bd_disk->private_data;
-	size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
+	size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
 	geo->cylinders = size >> 6;	/* size/64 */
 	geo->heads = 4;
 	geo->sectors = 16;
···
 	}
 }
 
-static void setup_request_queue(struct rnbd_clt_dev *dev)
+static void setup_request_queue(struct rnbd_clt_dev *dev,
+				struct rnbd_msg_open_rsp *rsp)
 {
-	blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
-	blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
-	blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
+	blk_queue_logical_block_size(dev->queue,
+				     le16_to_cpu(rsp->logical_block_size));
+	blk_queue_physical_block_size(dev->queue,
+				      le16_to_cpu(rsp->physical_block_size));
+	blk_queue_max_hw_sectors(dev->queue,
+				 dev->sess->max_io_size / SECTOR_SIZE);
 
 	/*
 	 * we don't support discards to "discontiguous" segments
···
 	 */
 	blk_queue_max_discard_segments(dev->queue, 1);
 
-	blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
-	dev->queue->limits.discard_granularity = dev->discard_granularity;
-	dev->queue->limits.discard_alignment = dev->discard_alignment;
-	if (dev->secure_discard)
+	blk_queue_max_discard_sectors(dev->queue,
+				      le32_to_cpu(rsp->max_discard_sectors));
+	dev->queue->limits.discard_granularity =
+		le32_to_cpu(rsp->discard_granularity);
+	dev->queue->limits.discard_alignment =
+		le32_to_cpu(rsp->discard_alignment);
+	if (le16_to_cpu(rsp->secure_discard))
 		blk_queue_max_secure_erase_sectors(dev->queue,
-				dev->max_discard_sectors);
+				le32_to_cpu(rsp->max_discard_sectors));
 	blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
-	blk_queue_max_segments(dev->queue, dev->max_segments);
+	blk_queue_max_segments(dev->queue, dev->sess->max_segments);
 	blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
 	blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
-	blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
+	blk_queue_write_cache(dev->queue,
+			      !!(rsp->cache_policy & RNBD_WRITEBACK),
+			      !!(rsp->cache_policy & RNBD_FUA));
 }
 
-static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
+				   struct rnbd_msg_open_rsp *rsp, int idx)
 {
 	int err;
 
···
 	dev->gd->private_data = dev;
 	snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
 		 idx);
-	pr_debug("disk_name=%s, capacity=%zu\n",
+	pr_debug("disk_name=%s, capacity=%llu\n",
 		 dev->gd->disk_name,
-		 dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
-		 );
+		 le64_to_cpu(rsp->nsectors) *
+		 (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
 
-	set_capacity(dev->gd, dev->nsectors);
+	set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
 
-	if (dev->access_mode == RNBD_ACCESS_RO) {
-		dev->read_only = true;
+	if (dev->access_mode == RNBD_ACCESS_RO)
 		set_disk_ro(dev->gd, true);
-	} else {
-		dev->read_only = false;
-	}
 
 	/*
 	 * Network device does not need rotational
···
 	return err;
 }
 
-static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
+static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
+				    struct rnbd_msg_open_rsp *rsp)
 {
 	int idx = dev->clt_device_id;
 
-	dev->size = dev->nsectors * dev->logical_block_size;
+	dev->size = le64_to_cpu(rsp->nsectors) *
+			le16_to_cpu(rsp->logical_block_size);
 
 	dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
 	if (IS_ERR(dev->gd))
···
 	dev->queue = dev->gd->queue;
 	rnbd_init_mq_hw_queues(dev);
 
-	setup_request_queue(dev);
-	return rnbd_clt_setup_gen_disk(dev, idx);
+	setup_request_queue(dev, rsp);
+	return rnbd_clt_setup_gen_disk(dev, rsp, idx);
 }
 
 static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
···
 {
 	struct rnbd_clt_session *sess;
 	struct rnbd_clt_dev *dev;
-	int ret;
+	int ret, errno;
+	struct rnbd_msg_open_rsp *rsp;
+	struct rnbd_msg_open msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
 
 	if (exists_devpath(pathname, sessname))
 		return ERR_PTR(-EEXIST);
···
 		ret = -EEXIST;
 		goto put_dev;
 	}
-	ret = send_msg_open(dev, RTRS_PERMIT_WAIT);
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		ret = -ENOMEM;
+		goto del_dev;
+	}
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu) {
+		ret = -ENOMEM;
+		kfree(rsp);
+		goto del_dev;
+	}
+	iu->buf = rsp;
+	iu->dev = dev;
+	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+	msg.hdr.type	= cpu_to_le16(RNBD_MSG_OPEN);
+	msg.access_mode	= dev->access_mode;
+	strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+	WARN_ON(!rnbd_clt_get_dev(dev));
+	ret = send_usr_msg(sess->rtrs, READ, iu,
+			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+			   msg_open_conf, &errno, RTRS_PERMIT_WAIT);
+	if (ret) {
+		rnbd_clt_put_dev(dev);
+		rnbd_put_iu(sess, iu);
+	} else {
+		ret = errno;
+	}
 	if (ret) {
 		rnbd_clt_err(dev,
 			     "map_device: failed, can't open remote device, err: %d\n",
 			     ret);
-		goto del_dev;
+		goto put_iu;
 	}
 	mutex_lock(&dev->lock);
 	pr_debug("Opened remote device: session=%s, path='%s'\n",
 		 sess->sessname, pathname);
-	ret = rnbd_client_setup_device(dev);
+	ret = rnbd_client_setup_device(dev, rsp);
 	if (ret) {
 		rnbd_clt_err(dev,
 			     "map_device: Failed to configure device, err: %d\n",
···
 	}
 
 	rnbd_clt_info(dev,
-		      "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
-		      dev->gd->disk_name, dev->nsectors,
-		      dev->logical_block_size, dev->physical_block_size,
-		      dev->max_discard_sectors,
-		      dev->discard_granularity, dev->discard_alignment,
-		      dev->secure_discard, dev->max_segments,
-		      dev->max_hw_sectors, dev->wc, dev->fua);
+		      "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
+		      dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
+		      le16_to_cpu(rsp->logical_block_size),
+		      le16_to_cpu(rsp->physical_block_size),
+		      le32_to_cpu(rsp->max_discard_sectors),
+		      le32_to_cpu(rsp->discard_granularity),
+		      le32_to_cpu(rsp->discard_alignment),
+		      le16_to_cpu(rsp->secure_discard),
+		      sess->max_segments, sess->max_io_size / SECTOR_SIZE,
+		      !!(rsp->cache_policy & RNBD_WRITEBACK),
+		      !!(rsp->cache_policy & RNBD_FUA));
 
 	mutex_unlock(&dev->lock);
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
 	rnbd_clt_put_sess(sess);
 
 	return dev;
 
 send_close:
 	send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
+put_iu:
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
 del_dev:
 	delete_dev(dev);
 put_dev:
+3 -15
drivers/block/rnbd/rnbd-clt.h
···
 };
 
 struct rnbd_clt_dev {
+	struct kobject kobj;
 	struct rnbd_clt_session *sess;
 	struct request_queue *queue;
 	struct rnbd_queue *hw_queues;
···
 	u32 clt_device_id;
 	struct mutex lock;
 	enum rnbd_clt_dev_state dev_state;
+	refcount_t refcount;
 	char *pathname;
 	enum rnbd_access_mode access_mode;
 	u32 nr_poll_queues;
-	bool read_only;
-	bool wc;
-	bool fua;
-	u32 max_hw_sectors;
-	u32 max_discard_sectors;
-	u32 discard_granularity;
-	u32 discard_alignment;
-	u16 secure_discard;
-	u16 physical_block_size;
-	u16 logical_block_size;
-	u16 max_segments;
-	size_t nsectors;
 	u64 size;		/* device size in bytes */
 	struct list_head list;
 	struct gendisk *gd;
-	struct kobject kobj;
 	char *blk_symlink_name;
-	refcount_t refcount;
 	struct work_struct unmap_on_rmmod_work;
 };
 
···
 			  const struct attribute *sysfs_self);
 
 int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
-int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize);
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
 
 /* rnbd-clt-sysfs.c */
 
+9 -11
drivers/block/rnbd/rnbd-srv.c
···
 	wait_for_completion(&dc); /* wait for inflights to drop to zero */
 
 	rnbd_dev_close(sess_dev->rnbd_dev);
-	list_del(&sess_dev->sess_list);
 	mutex_lock(&sess_dev->dev->lock);
 	list_del(&sess_dev->dev_list);
 	if (sess_dev->open_flags & FMODE_WRITE)
···
 
 static void destroy_sess(struct rnbd_srv_session *srv_sess)
 {
-	struct rnbd_srv_sess_dev *sess_dev, *tmp;
+	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
 
-	if (list_empty(&srv_sess->sess_dev_list))
+	if (xa_empty(&srv_sess->index_idr))
 		goto out;
 
 	mutex_lock(&srv_sess->lock);
-	list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
-				 sess_list)
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
 		rnbd_srv_destroy_dev_session_sysfs(sess_dev);
 	mutex_unlock(&srv_sess->lock);
 
···
 
 	srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
 	xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
-	INIT_LIST_HEAD(&srv_sess->sess_dev_list);
 	mutex_init(&srv_sess->lock);
 	mutex_lock(&sess_lock);
 	list_add(&srv_sess->list, &sess_list);
···
 {
 	struct rnbd_srv_session *sess = sess_dev->sess;
 
-	sess_dev->keep_id = true;
 	/* It is already started to close by client's close message. */
 	if (!mutex_trylock(&sess->lock))
 		return;
+
+	sess_dev->keep_id = true;
 	/* first remove sysfs itself to avoid deadlock */
 	sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
 	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
···
 find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
 {
 	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
 
-	if (list_empty(&srv_sess->sess_dev_list))
+	if (xa_empty(&srv_sess->index_idr))
 		return NULL;
 
-	list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
 		if (!strcmp(sess_dev->pathname, dev_name))
 			return sess_dev;
 
···
 
 	list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
 	mutex_unlock(&srv_dev->lock);
-
-	list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
 
 	rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
 
-4
drivers/block/rnbd/rnbd-srv.h
···
 	int queue_depth;
 
 	struct xarray index_idr;
-	/* List of struct rnbd_srv_sess_dev */
-	struct list_head sess_dev_list;
 	struct mutex lock;
 	u8 ver;
 };
···
 struct rnbd_srv_sess_dev {
 	/* Entry inside rnbd_srv_dev struct */
 	struct list_head dev_list;
-	/* Entry inside rnbd_srv_session struct */
-	struct list_head sess_list;
 	struct rnbd_dev *rnbd_dev;
 	struct rnbd_srv_session *sess;
 	struct rnbd_srv_dev *dev;
-1582
drivers/block/sx8.c
···
-/*
- *  sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware
- *
- *  Copyright 2004-2005 Red Hat, Inc.
- *
- *  Author/maintainer:  Jeff Garzik <jgarzik@pobox.com>
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file "COPYING" in the main directory of this archive
- *  for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/blk-mq.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/workqueue.h>
-#include <linux/bitops.h>
-#include <linux/delay.h>
-#include <linux/ktime.h>
-#include <linux/hdreg.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#if 0
-#define CARM_DEBUG
-#define CARM_VERBOSE_DEBUG
-#else
-#undef CARM_DEBUG
-#undef CARM_VERBOSE_DEBUG
-#endif
-#undef CARM_NDEBUG
-
-#define DRV_NAME "sx8"
-#define DRV_VERSION "1.0"
-#define PFX DRV_NAME ": "
-
-MODULE_AUTHOR("Jeff Garzik");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Promise SATA SX8 block driver");
-MODULE_VERSION(DRV_VERSION);
-
-/*
- * SX8 hardware has a single message queue for all ATA ports.
- * When this driver was written, the hardware (firmware?) would
- * corrupt data eventually, if more than one request was outstanding.
- * As one can imagine, having 8 ports bottlenecking on a single
- * command hurts performance.
- *
- * Based on user reports, later versions of the hardware (firmware?)
- * seem to be able to survive with more than one command queued.
- *
- * Therefore, we default to the safe option -- 1 command -- but
- * allow the user to increase this.
- *
- * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ),
- * but problems seem to occur when you exceed ~30, even on newer hardware.
- */
-static int max_queue = 1;
-module_param(max_queue, int, 0444);
-MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)");
-
-
-#define NEXT_RESP(idx)	((idx + 1) % RMSG_Q_LEN)
-
-/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */
-#define TAG_ENCODE(tag)	(((tag) << 16) | 0xf)
-#define TAG_DECODE(tag)	(((tag) >> 16) & 0x1f)
-#define TAG_VALID(tag)	((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32))
-
-/* note: prints function name for you */
-#ifdef CARM_DEBUG
-#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#ifdef CARM_VERBOSE_DEBUG
-#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
-#else
-#define VPRINTK(fmt, args...)
-#endif	/* CARM_VERBOSE_DEBUG */
-#else
-#define DPRINTK(fmt, args...)
-#define VPRINTK(fmt, args...)
-#endif	/* CARM_DEBUG */
-
-#ifdef CARM_NDEBUG
-#define assert(expr)
-#else
-#define assert(expr) \
-	if(unlikely(!(expr))) {                                   \
-		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
-		#expr, __FILE__, __func__, __LINE__);          \
-	}
-#endif
-
-/* defines only for the constants which don't work well as enums */
-struct carm_host;
-
-enum {
-	/* adapter-wide limits */
-	CARM_MAX_PORTS		= 8,
-	CARM_SHM_SIZE		= (4096 << 7),
-	CARM_MINORS_PER_MAJOR	= 256 / CARM_MAX_PORTS,
-	CARM_MAX_WAIT_Q		= CARM_MAX_PORTS + 1,
-
-	/* command message queue limits */
-	CARM_MAX_REQ		= 64,	       /* max command msgs per host */
-	CARM_MSG_LOW_WATER	= (CARM_MAX_REQ / 4),	     /* refill mark */
-
-	/* S/G limits, host-wide and per-request */
-	CARM_MAX_REQ_SG		= 32,	     /* max s/g entries per request */
-	CARM_MAX_HOST_SG	= 600,		/* max s/g entries per host */
-	CARM_SG_LOW_WATER	= (CARM_MAX_HOST_SG / 4),   /* re-fill mark */
-
-	/* hardware registers */
-	CARM_IHQP		= 0x1c,
-	CARM_INT_STAT		= 0x10, /* interrupt status */
-	CARM_INT_MASK		= 0x14, /* interrupt mask */
-	CARM_HMUC		= 0x18, /* host message unit control */
-	RBUF_ADDR_LO		= 0x20, /* response msg DMA buf low 32 bits */
-	RBUF_ADDR_HI		= 0x24, /* response msg DMA buf high 32 bits */
-	RBUF_BYTE_SZ		= 0x28,
-	CARM_RESP_IDX		= 0x2c,
-	CARM_CMS0		= 0x30, /* command message size reg 0 */
-	CARM_LMUC		= 0x48,
-	CARM_HMPHA		= 0x6c,
-	CARM_INITC		= 0xb5,
-
-	/* bits in CARM_INT_{STAT,MASK} */
-	INT_RESERVED		= 0xfffffff0,
-	INT_WATCHDOG		= (1 << 3),	/* watchdog timer */
-	INT_Q_OVERFLOW		= (1 << 2),	/* cmd msg q overflow */
-	INT_Q_AVAILABLE		= (1 << 1),	/* cmd msg q has free space */
-	INT_RESPONSE		= (1 << 0),	/* response msg available */
-	INT_ACK_MASK		= INT_WATCHDOG | INT_Q_OVERFLOW,
-	INT_DEF_MASK		= INT_RESERVED | INT_Q_OVERFLOW |
-				  INT_RESPONSE,
-
-	/* command messages, and related register bits */
-	CARM_HAVE_RESP		= 0x01,
-	CARM_MSG_READ		= 1,
-	CARM_MSG_WRITE		= 2,
-	CARM_MSG_VERIFY		= 3,
-	CARM_MSG_GET_CAPACITY	= 4,
- CARM_MSG_FLUSH = 5, 153 - CARM_MSG_IOCTL = 6, 154 - CARM_MSG_ARRAY = 8, 155 - CARM_MSG_MISC = 9, 156 - CARM_CME = (1 << 2), 157 - CARM_RME = (1 << 1), 158 - CARM_WZBC = (1 << 0), 159 - CARM_RMI = (1 << 0), 160 - CARM_Q_FULL = (1 << 3), 161 - CARM_MSG_SIZE = 288, 162 - CARM_Q_LEN = 48, 163 - 164 - /* CARM_MSG_IOCTL messages */ 165 - CARM_IOC_SCAN_CHAN = 5, /* scan channels for devices */ 166 - CARM_IOC_GET_TCQ = 13, /* get tcq/ncq depth */ 167 - CARM_IOC_SET_TCQ = 14, /* set tcq/ncq depth */ 168 - 169 - IOC_SCAN_CHAN_NODEV = 0x1f, 170 - IOC_SCAN_CHAN_OFFSET = 0x40, 171 - 172 - /* CARM_MSG_ARRAY messages */ 173 - CARM_ARRAY_INFO = 0, 174 - 175 - ARRAY_NO_EXIST = (1 << 31), 176 - 177 - /* response messages */ 178 - RMSG_SZ = 8, /* sizeof(struct carm_response) */ 179 - RMSG_Q_LEN = 48, /* resp. msg list length */ 180 - RMSG_OK = 1, /* bit indicating msg was successful */ 181 - /* length of entire resp. msg buffer */ 182 - RBUF_LEN = RMSG_SZ * RMSG_Q_LEN, 183 - 184 - PDC_SHM_SIZE = (4096 << 7), /* length of entire h/w buffer */ 185 - 186 - /* CARM_MSG_MISC messages */ 187 - MISC_GET_FW_VER = 2, 188 - MISC_ALLOC_MEM = 3, 189 - MISC_SET_TIME = 5, 190 - 191 - /* MISC_GET_FW_VER feature bits */ 192 - FW_VER_4PORT = (1 << 2), /* 1=4 ports, 0=8 ports */ 193 - FW_VER_NON_RAID = (1 << 1), /* 1=non-RAID firmware, 0=RAID */ 194 - FW_VER_ZCR = (1 << 0), /* zero channel RAID (whatever that is) */ 195 - 196 - /* carm_host flags */ 197 - FL_NON_RAID = FW_VER_NON_RAID, 198 - FL_4PORT = FW_VER_4PORT, 199 - FL_FW_VER_MASK = (FW_VER_NON_RAID | FW_VER_4PORT), 200 - FL_DYN_MAJOR = (1 << 17), 201 - }; 202 - 203 - enum { 204 - CARM_SG_BOUNDARY = 0xffffUL, /* s/g segment boundary */ 205 - }; 206 - 207 - enum scatter_gather_types { 208 - SGT_32BIT = 0, 209 - SGT_64BIT = 1, 210 - }; 211 - 212 - enum host_states { 213 - HST_INVALID, /* invalid state; never used */ 214 - HST_ALLOC_BUF, /* setting up master SHM area */ 215 - HST_ERROR, /* we never leave here */ 216 - HST_PORT_SCAN, /* start dev 
scan */ 217 - HST_DEV_SCAN_START, /* start per-device probe */ 218 - HST_DEV_SCAN, /* continue per-device probe */ 219 - HST_DEV_ACTIVATE, /* activate devices we found */ 220 - HST_PROBE_FINISHED, /* probe is complete */ 221 - HST_PROBE_START, /* initiate probe */ 222 - HST_SYNC_TIME, /* tell firmware what time it is */ 223 - HST_GET_FW_VER, /* get firmware version, adapter port cnt */ 224 - }; 225 - 226 - #ifdef CARM_DEBUG 227 - static const char *state_name[] = { 228 - "HST_INVALID", 229 - "HST_ALLOC_BUF", 230 - "HST_ERROR", 231 - "HST_PORT_SCAN", 232 - "HST_DEV_SCAN_START", 233 - "HST_DEV_SCAN", 234 - "HST_DEV_ACTIVATE", 235 - "HST_PROBE_FINISHED", 236 - "HST_PROBE_START", 237 - "HST_SYNC_TIME", 238 - "HST_GET_FW_VER", 239 - }; 240 - #endif 241 - 242 - struct carm_port { 243 - unsigned int port_no; 244 - struct gendisk *disk; 245 - struct carm_host *host; 246 - 247 - /* attached device characteristics */ 248 - u64 capacity; 249 - char name[41]; 250 - u16 dev_geom_head; 251 - u16 dev_geom_sect; 252 - u16 dev_geom_cyl; 253 - }; 254 - 255 - struct carm_request { 256 - int n_elem; 257 - unsigned int msg_type; 258 - unsigned int msg_subtype; 259 - unsigned int msg_bucket; 260 - struct scatterlist sg[CARM_MAX_REQ_SG]; 261 - }; 262 - 263 - struct carm_host { 264 - unsigned long flags; 265 - void __iomem *mmio; 266 - void *shm; 267 - dma_addr_t shm_dma; 268 - 269 - int major; 270 - int id; 271 - char name[32]; 272 - 273 - spinlock_t lock; 274 - struct pci_dev *pdev; 275 - unsigned int state; 276 - u32 fw_ver; 277 - 278 - struct blk_mq_tag_set tag_set; 279 - struct request_queue *oob_q; 280 - unsigned int n_oob; 281 - 282 - unsigned int hw_sg_used; 283 - 284 - unsigned int resp_idx; 285 - 286 - unsigned int wait_q_prod; 287 - unsigned int wait_q_cons; 288 - struct request_queue *wait_q[CARM_MAX_WAIT_Q]; 289 - 290 - void *msg_base; 291 - dma_addr_t msg_dma; 292 - 293 - int cur_scan_dev; 294 - unsigned long dev_active; 295 - unsigned long dev_present; 296 - struct 
carm_port port[CARM_MAX_PORTS]; 297 - 298 - struct work_struct fsm_task; 299 - 300 - int probe_err; 301 - struct completion probe_comp; 302 - }; 303 - 304 - struct carm_response { 305 - __le32 ret_handle; 306 - __le32 status; 307 - } __attribute__((packed)); 308 - 309 - struct carm_msg_sg { 310 - __le32 start; 311 - __le32 len; 312 - } __attribute__((packed)); 313 - 314 - struct carm_msg_rw { 315 - u8 type; 316 - u8 id; 317 - u8 sg_count; 318 - u8 sg_type; 319 - __le32 handle; 320 - __le32 lba; 321 - __le16 lba_count; 322 - __le16 lba_high; 323 - struct carm_msg_sg sg[32]; 324 - } __attribute__((packed)); 325 - 326 - struct carm_msg_allocbuf { 327 - u8 type; 328 - u8 subtype; 329 - u8 n_sg; 330 - u8 sg_type; 331 - __le32 handle; 332 - __le32 addr; 333 - __le32 len; 334 - __le32 evt_pool; 335 - __le32 n_evt; 336 - __le32 rbuf_pool; 337 - __le32 n_rbuf; 338 - __le32 msg_pool; 339 - __le32 n_msg; 340 - struct carm_msg_sg sg[8]; 341 - } __attribute__((packed)); 342 - 343 - struct carm_msg_ioctl { 344 - u8 type; 345 - u8 subtype; 346 - u8 array_id; 347 - u8 reserved1; 348 - __le32 handle; 349 - __le32 data_addr; 350 - u32 reserved2; 351 - } __attribute__((packed)); 352 - 353 - struct carm_msg_sync_time { 354 - u8 type; 355 - u8 subtype; 356 - u16 reserved1; 357 - __le32 handle; 358 - u32 reserved2; 359 - __le32 timestamp; 360 - } __attribute__((packed)); 361 - 362 - struct carm_msg_get_fw_ver { 363 - u8 type; 364 - u8 subtype; 365 - u16 reserved1; 366 - __le32 handle; 367 - __le32 data_addr; 368 - u32 reserved2; 369 - } __attribute__((packed)); 370 - 371 - struct carm_fw_ver { 372 - __le32 version; 373 - u8 features; 374 - u8 reserved1; 375 - u16 reserved2; 376 - } __attribute__((packed)); 377 - 378 - struct carm_array_info { 379 - __le32 size; 380 - 381 - __le16 size_hi; 382 - __le16 stripe_size; 383 - 384 - __le32 mode; 385 - 386 - __le16 stripe_blk_sz; 387 - __le16 reserved1; 388 - 389 - __le16 cyl; 390 - __le16 head; 391 - 392 - __le16 sect; 393 - u8 array_id; 394 - 
u8 reserved2; 395 - 396 - char name[40]; 397 - 398 - __le32 array_status; 399 - 400 - /* device list continues beyond this point? */ 401 - } __attribute__((packed)); 402 - 403 - static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent); 404 - static void carm_remove_one (struct pci_dev *pdev); 405 - static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); 406 - 407 - static const struct pci_device_id carm_pci_tbl[] = { 408 - { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, 409 - { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, 410 - { } /* terminate list */ 411 - }; 412 - MODULE_DEVICE_TABLE(pci, carm_pci_tbl); 413 - 414 - static struct pci_driver carm_driver = { 415 - .name = DRV_NAME, 416 - .id_table = carm_pci_tbl, 417 - .probe = carm_init_one, 418 - .remove = carm_remove_one, 419 - }; 420 - 421 - static const struct block_device_operations carm_bd_ops = { 422 - .owner = THIS_MODULE, 423 - .getgeo = carm_bdev_getgeo, 424 - }; 425 - 426 - static unsigned int carm_host_id; 427 - static unsigned long carm_major_alloc; 428 - 429 - 430 - 431 - static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) 432 - { 433 - struct carm_port *port = bdev->bd_disk->private_data; 434 - 435 - geo->heads = (u8) port->dev_geom_head; 436 - geo->sectors = (u8) port->dev_geom_sect; 437 - geo->cylinders = port->dev_geom_cyl; 438 - return 0; 439 - } 440 - 441 - static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE }; 442 - 443 - static inline int carm_lookup_bucket(u32 msg_size) 444 - { 445 - int i; 446 - 447 - for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) 448 - if (msg_size <= msg_sizes[i]) 449 - return i; 450 - 451 - return -ENOENT; 452 - } 453 - 454 - static void carm_init_buckets(void __iomem *mmio) 455 - { 456 - unsigned int i; 457 - 458 - for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) 459 - writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i)); 460 - } 461 - 462 - static inline void 
*carm_ref_msg(struct carm_host *host, 463 - unsigned int msg_idx) 464 - { 465 - return host->msg_base + (msg_idx * CARM_MSG_SIZE); 466 - } 467 - 468 - static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host, 469 - unsigned int msg_idx) 470 - { 471 - return host->msg_dma + (msg_idx * CARM_MSG_SIZE); 472 - } 473 - 474 - static int carm_send_msg(struct carm_host *host, 475 - struct carm_request *crq, unsigned tag) 476 - { 477 - void __iomem *mmio = host->mmio; 478 - u32 msg = (u32) carm_ref_msg_dma(host, tag); 479 - u32 cm_bucket = crq->msg_bucket; 480 - u32 tmp; 481 - int rc = 0; 482 - 483 - VPRINTK("ENTER\n"); 484 - 485 - tmp = readl(mmio + CARM_HMUC); 486 - if (tmp & CARM_Q_FULL) { 487 - #if 0 488 - tmp = readl(mmio + CARM_INT_MASK); 489 - tmp |= INT_Q_AVAILABLE; 490 - writel(tmp, mmio + CARM_INT_MASK); 491 - readl(mmio + CARM_INT_MASK); /* flush */ 492 - #endif 493 - DPRINTK("host msg queue full\n"); 494 - rc = -EBUSY; 495 - } else { 496 - writel(msg | (cm_bucket << 1), mmio + CARM_IHQP); 497 - readl(mmio + CARM_IHQP); /* flush */ 498 - } 499 - 500 - return rc; 501 - } 502 - 503 - static int carm_array_info (struct carm_host *host, unsigned int array_idx) 504 - { 505 - struct carm_msg_ioctl *ioc; 506 - u32 msg_data; 507 - dma_addr_t msg_dma; 508 - struct carm_request *crq; 509 - struct request *rq; 510 - int rc; 511 - 512 - rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); 513 - if (IS_ERR(rq)) { 514 - rc = -ENOMEM; 515 - goto err_out; 516 - } 517 - crq = blk_mq_rq_to_pdu(rq); 518 - 519 - ioc = carm_ref_msg(host, rq->tag); 520 - msg_dma = carm_ref_msg_dma(host, rq->tag); 521 - msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); 522 - 523 - crq->msg_type = CARM_MSG_ARRAY; 524 - crq->msg_subtype = CARM_ARRAY_INFO; 525 - rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) + 526 - sizeof(struct carm_array_info)); 527 - BUG_ON(rc < 0); 528 - crq->msg_bucket = (u32) rc; 529 - 530 - memset(ioc, 0, sizeof(*ioc)); 531 - ioc->type = 
CARM_MSG_ARRAY; 532 - ioc->subtype = CARM_ARRAY_INFO; 533 - ioc->array_id = (u8) array_idx; 534 - ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); 535 - ioc->data_addr = cpu_to_le32(msg_data); 536 - 537 - spin_lock_irq(&host->lock); 538 - assert(host->state == HST_DEV_SCAN_START || 539 - host->state == HST_DEV_SCAN); 540 - spin_unlock_irq(&host->lock); 541 - 542 - DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); 543 - blk_execute_rq_nowait(rq, true); 544 - 545 - return 0; 546 - 547 - err_out: 548 - spin_lock_irq(&host->lock); 549 - host->state = HST_ERROR; 550 - spin_unlock_irq(&host->lock); 551 - return rc; 552 - } 553 - 554 - typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *); 555 - 556 - static int carm_send_special (struct carm_host *host, carm_sspc_t func) 557 - { 558 - struct request *rq; 559 - struct carm_request *crq; 560 - struct carm_msg_ioctl *ioc; 561 - void *mem; 562 - unsigned int msg_size; 563 - int rc; 564 - 565 - rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0); 566 - if (IS_ERR(rq)) 567 - return -ENOMEM; 568 - crq = blk_mq_rq_to_pdu(rq); 569 - 570 - mem = carm_ref_msg(host, rq->tag); 571 - 572 - msg_size = func(host, rq->tag, mem); 573 - 574 - ioc = mem; 575 - crq->msg_type = ioc->type; 576 - crq->msg_subtype = ioc->subtype; 577 - rc = carm_lookup_bucket(msg_size); 578 - BUG_ON(rc < 0); 579 - crq->msg_bucket = (u32) rc; 580 - 581 - DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); 582 - blk_execute_rq_nowait(rq, true); 583 - 584 - return 0; 585 - } 586 - 587 - static unsigned int carm_fill_sync_time(struct carm_host *host, 588 - unsigned int idx, void *mem) 589 - { 590 - struct carm_msg_sync_time *st = mem; 591 - 592 - time64_t tv = ktime_get_real_seconds(); 593 - 594 - memset(st, 0, sizeof(*st)); 595 - st->type = CARM_MSG_MISC; 596 - st->subtype = MISC_SET_TIME; 597 - st->handle = cpu_to_le32(TAG_ENCODE(idx)); 598 - st->timestamp = cpu_to_le32(tv); 599 - 600 - return sizeof(struct 
carm_msg_sync_time); 601 - } 602 - 603 - static unsigned int carm_fill_alloc_buf(struct carm_host *host, 604 - unsigned int idx, void *mem) 605 - { 606 - struct carm_msg_allocbuf *ab = mem; 607 - 608 - memset(ab, 0, sizeof(*ab)); 609 - ab->type = CARM_MSG_MISC; 610 - ab->subtype = MISC_ALLOC_MEM; 611 - ab->handle = cpu_to_le32(TAG_ENCODE(idx)); 612 - ab->n_sg = 1; 613 - ab->sg_type = SGT_32BIT; 614 - ab->addr = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); 615 - ab->len = cpu_to_le32(PDC_SHM_SIZE >> 1); 616 - ab->evt_pool = cpu_to_le32(host->shm_dma + (16 * 1024)); 617 - ab->n_evt = cpu_to_le32(1024); 618 - ab->rbuf_pool = cpu_to_le32(host->shm_dma); 619 - ab->n_rbuf = cpu_to_le32(RMSG_Q_LEN); 620 - ab->msg_pool = cpu_to_le32(host->shm_dma + RBUF_LEN); 621 - ab->n_msg = cpu_to_le32(CARM_Q_LEN); 622 - ab->sg[0].start = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); 623 - ab->sg[0].len = cpu_to_le32(65536); 624 - 625 - return sizeof(struct carm_msg_allocbuf); 626 - } 627 - 628 - static unsigned int carm_fill_scan_channels(struct carm_host *host, 629 - unsigned int idx, void *mem) 630 - { 631 - struct carm_msg_ioctl *ioc = mem; 632 - u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + 633 - IOC_SCAN_CHAN_OFFSET); 634 - 635 - memset(ioc, 0, sizeof(*ioc)); 636 - ioc->type = CARM_MSG_IOCTL; 637 - ioc->subtype = CARM_IOC_SCAN_CHAN; 638 - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); 639 - ioc->data_addr = cpu_to_le32(msg_data); 640 - 641 - /* fill output data area with "no device" default values */ 642 - mem += IOC_SCAN_CHAN_OFFSET; 643 - memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS); 644 - 645 - return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS; 646 - } 647 - 648 - static unsigned int carm_fill_get_fw_ver(struct carm_host *host, 649 - unsigned int idx, void *mem) 650 - { 651 - struct carm_msg_get_fw_ver *ioc = mem; 652 - u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc)); 653 - 654 - memset(ioc, 0, sizeof(*ioc)); 655 - ioc->type = CARM_MSG_MISC; 656 - 
ioc->subtype = MISC_GET_FW_VER; 657 - ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); 658 - ioc->data_addr = cpu_to_le32(msg_data); 659 - 660 - return sizeof(struct carm_msg_get_fw_ver) + 661 - sizeof(struct carm_fw_ver); 662 - } 663 - 664 - static inline void carm_push_q (struct carm_host *host, struct request_queue *q) 665 - { 666 - unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; 667 - 668 - blk_mq_stop_hw_queues(q); 669 - VPRINTK("STOPPED QUEUE %p\n", q); 670 - 671 - host->wait_q[idx] = q; 672 - host->wait_q_prod++; 673 - BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */ 674 - } 675 - 676 - static inline struct request_queue *carm_pop_q(struct carm_host *host) 677 - { 678 - unsigned int idx; 679 - 680 - if (host->wait_q_prod == host->wait_q_cons) 681 - return NULL; 682 - 683 - idx = host->wait_q_cons % CARM_MAX_WAIT_Q; 684 - host->wait_q_cons++; 685 - 686 - return host->wait_q[idx]; 687 - } 688 - 689 - static inline void carm_round_robin(struct carm_host *host) 690 - { 691 - struct request_queue *q = carm_pop_q(host); 692 - if (q) { 693 - blk_mq_start_hw_queues(q); 694 - VPRINTK("STARTED QUEUE %p\n", q); 695 - } 696 - } 697 - 698 - static inline enum dma_data_direction carm_rq_dir(struct request *rq) 699 - { 700 - return op_is_write(req_op(rq)) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; 701 - } 702 - 703 - static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, 704 - const struct blk_mq_queue_data *bd) 705 - { 706 - struct request_queue *q = hctx->queue; 707 - struct request *rq = bd->rq; 708 - struct carm_port *port = q->queuedata; 709 - struct carm_host *host = port->host; 710 - struct carm_request *crq = blk_mq_rq_to_pdu(rq); 711 - struct carm_msg_rw *msg; 712 - struct scatterlist *sg; 713 - int i, n_elem = 0, rc; 714 - unsigned int msg_size; 715 - u32 tmp; 716 - 717 - crq->n_elem = 0; 718 - sg_init_table(crq->sg, CARM_MAX_REQ_SG); 719 - 720 - blk_mq_start_request(rq); 721 - 722 - spin_lock_irq(&host->lock); 723 - if (req_op(rq) == REQ_OP_DRV_OUT) 724 - goto send_msg; 725 - 726 - /* get scatterlist from block layer */ 727 - sg = &crq->sg[0]; 728 - n_elem = blk_rq_map_sg(q, rq, sg); 729 - if (n_elem <= 0) 730 - goto out_ioerr; 731 - 732 - /* map scatterlist to PCI bus addresses */ 733 - n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq)); 734 - if (n_elem <= 0) 735 - goto out_ioerr; 736 - 737 - /* obey global hardware limit on S/G entries */ 738 - if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem) 739 - goto out_resource; 740 - 741 - crq->n_elem = n_elem; 742 - host->hw_sg_used += n_elem; 743 - 744 - /* 745 - * build read/write message 746 - */ 747 - 748 - VPRINTK("build msg\n"); 749 - msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag); 750 - 751 - if (rq_data_dir(rq) == WRITE) { 752 - msg->type = CARM_MSG_WRITE; 753 - crq->msg_type = CARM_MSG_WRITE; 754 - } else { 755 - msg->type = CARM_MSG_READ; 756 - crq->msg_type = CARM_MSG_READ; 757 - } 758 - 759 - msg->id = port->port_no; 760 - msg->sg_count = n_elem; 761 - msg->sg_type = SGT_32BIT; 762 - msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag)); 763 - msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); 764 - tmp = (blk_rq_pos(rq) >> 16) >> 16; 765 - msg->lba_high = cpu_to_le16( (u16) tmp ); 766 - msg->lba_count = 
cpu_to_le16(blk_rq_sectors(rq)); 767 - 768 - msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg); 769 - for (i = 0; i < n_elem; i++) { 770 - struct carm_msg_sg *carm_sg = &msg->sg[i]; 771 - carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i])); 772 - carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i])); 773 - msg_size += sizeof(struct carm_msg_sg); 774 - } 775 - 776 - rc = carm_lookup_bucket(msg_size); 777 - BUG_ON(rc < 0); 778 - crq->msg_bucket = (u32) rc; 779 - send_msg: 780 - /* 781 - * queue read/write message to hardware 782 - */ 783 - VPRINTK("send msg, tag == %u\n", rq->tag); 784 - rc = carm_send_msg(host, crq, rq->tag); 785 - if (rc) { 786 - host->hw_sg_used -= n_elem; 787 - goto out_resource; 788 - } 789 - 790 - spin_unlock_irq(&host->lock); 791 - return BLK_STS_OK; 792 - out_resource: 793 - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq)); 794 - carm_push_q(host, q); 795 - spin_unlock_irq(&host->lock); 796 - return BLK_STS_DEV_RESOURCE; 797 - out_ioerr: 798 - carm_round_robin(host); 799 - spin_unlock_irq(&host->lock); 800 - return BLK_STS_IOERR; 801 - } 802 - 803 - static void carm_handle_array_info(struct carm_host *host, 804 - struct carm_request *crq, u8 *mem, 805 - blk_status_t error) 806 - { 807 - struct carm_port *port; 808 - u8 *msg_data = mem + sizeof(struct carm_array_info); 809 - struct carm_array_info *desc = (struct carm_array_info *) msg_data; 810 - u64 lo, hi; 811 - int cur_port; 812 - size_t slen; 813 - 814 - DPRINTK("ENTER\n"); 815 - 816 - if (error) 817 - goto out; 818 - if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) 819 - goto out; 820 - 821 - cur_port = host->cur_scan_dev; 822 - 823 - /* should never occur */ 824 - if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) { 825 - printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n", 826 - cur_port, (int) desc->array_id); 827 - goto out; 828 - } 829 - 830 - port = &host->port[cur_port]; 831 - 832 - lo = (u64) le32_to_cpu(desc->size); 833 - hi = (u64) 
le16_to_cpu(desc->size_hi); 834 - 835 - port->capacity = lo | (hi << 32); 836 - port->dev_geom_head = le16_to_cpu(desc->head); 837 - port->dev_geom_sect = le16_to_cpu(desc->sect); 838 - port->dev_geom_cyl = le16_to_cpu(desc->cyl); 839 - 840 - host->dev_active |= (1 << cur_port); 841 - 842 - strncpy(port->name, desc->name, sizeof(port->name)); 843 - port->name[sizeof(port->name) - 1] = 0; 844 - slen = strlen(port->name); 845 - while (slen && (port->name[slen - 1] == ' ')) { 846 - port->name[slen - 1] = 0; 847 - slen--; 848 - } 849 - 850 - printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n", 851 - pci_name(host->pdev), port->port_no, 852 - (unsigned long long) port->capacity); 853 - printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n", 854 - pci_name(host->pdev), port->port_no, port->name); 855 - 856 - out: 857 - assert(host->state == HST_DEV_SCAN); 858 - schedule_work(&host->fsm_task); 859 - } 860 - 861 - static void carm_handle_scan_chan(struct carm_host *host, 862 - struct carm_request *crq, u8 *mem, 863 - blk_status_t error) 864 - { 865 - u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET; 866 - unsigned int i, dev_count = 0; 867 - int new_state = HST_DEV_SCAN_START; 868 - 869 - DPRINTK("ENTER\n"); 870 - 871 - if (error) { 872 - new_state = HST_ERROR; 873 - goto out; 874 - } 875 - 876 - /* TODO: scan and support non-disk devices */ 877 - for (i = 0; i < 8; i++) 878 - if (msg_data[i] == 0) { /* direct-access device (disk) */ 879 - host->dev_present |= (1 << i); 880 - dev_count++; 881 - } 882 - 883 - printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n", 884 - pci_name(host->pdev), dev_count); 885 - 886 - out: 887 - assert(host->state == HST_PORT_SCAN); 888 - host->state = new_state; 889 - schedule_work(&host->fsm_task); 890 - } 891 - 892 - static void carm_handle_generic(struct carm_host *host, 893 - struct carm_request *crq, blk_status_t error, 894 - int cur_state, int next_state) 895 - { 896 - DPRINTK("ENTER\n"); 897 - 898 - assert(host->state 
== cur_state); 899 - if (error) 900 - host->state = HST_ERROR; 901 - else 902 - host->state = next_state; 903 - schedule_work(&host->fsm_task); 904 - } 905 - 906 - static inline void carm_handle_resp(struct carm_host *host, 907 - __le32 ret_handle_le, u32 status) 908 - { 909 - u32 handle = le32_to_cpu(ret_handle_le); 910 - unsigned int msg_idx; 911 - struct request *rq; 912 - struct carm_request *crq; 913 - blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR; 914 - u8 *mem; 915 - 916 - VPRINTK("ENTER, handle == 0x%x\n", handle); 917 - 918 - if (unlikely(!TAG_VALID(handle))) { 919 - printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n", 920 - pci_name(host->pdev), handle); 921 - return; 922 - } 923 - 924 - msg_idx = TAG_DECODE(handle); 925 - VPRINTK("tag == %u\n", msg_idx); 926 - 927 - rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx); 928 - crq = blk_mq_rq_to_pdu(rq); 929 - 930 - /* fast path */ 931 - if (likely(crq->msg_type == CARM_MSG_READ || 932 - crq->msg_type == CARM_MSG_WRITE)) { 933 - dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, 934 - carm_rq_dir(rq)); 935 - goto done; 936 - } 937 - 938 - mem = carm_ref_msg(host, msg_idx); 939 - 940 - switch (crq->msg_type) { 941 - case CARM_MSG_IOCTL: { 942 - switch (crq->msg_subtype) { 943 - case CARM_IOC_SCAN_CHAN: 944 - carm_handle_scan_chan(host, crq, mem, error); 945 - goto done; 946 - default: 947 - /* unknown / invalid response */ 948 - goto err_out; 949 - } 950 - break; 951 - } 952 - 953 - case CARM_MSG_MISC: { 954 - switch (crq->msg_subtype) { 955 - case MISC_ALLOC_MEM: 956 - carm_handle_generic(host, crq, error, 957 - HST_ALLOC_BUF, HST_SYNC_TIME); 958 - goto done; 959 - case MISC_SET_TIME: 960 - carm_handle_generic(host, crq, error, 961 - HST_SYNC_TIME, HST_GET_FW_VER); 962 - goto done; 963 - case MISC_GET_FW_VER: { 964 - struct carm_fw_ver *ver = (struct carm_fw_ver *) 965 - (mem + sizeof(struct carm_msg_get_fw_ver)); 966 - if (!error) { 967 - host->fw_ver = 
le32_to_cpu(ver->version); 968 - host->flags |= (ver->features & FL_FW_VER_MASK); 969 - } 970 - carm_handle_generic(host, crq, error, 971 - HST_GET_FW_VER, HST_PORT_SCAN); 972 - goto done; 973 - } 974 - default: 975 - /* unknown / invalid response */ 976 - goto err_out; 977 - } 978 - break; 979 - } 980 - 981 - case CARM_MSG_ARRAY: { 982 - switch (crq->msg_subtype) { 983 - case CARM_ARRAY_INFO: 984 - carm_handle_array_info(host, crq, mem, error); 985 - break; 986 - default: 987 - /* unknown / invalid response */ 988 - goto err_out; 989 - } 990 - break; 991 - } 992 - 993 - default: 994 - /* unknown / invalid response */ 995 - goto err_out; 996 - } 997 - 998 - return; 999 - 1000 - err_out: 1001 - printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", 1002 - pci_name(host->pdev), crq->msg_type, crq->msg_subtype); 1003 - error = BLK_STS_IOERR; 1004 - done: 1005 - host->hw_sg_used -= crq->n_elem; 1006 - blk_mq_end_request(blk_mq_rq_from_pdu(crq), error); 1007 - 1008 - if (host->hw_sg_used <= CARM_SG_LOW_WATER) 1009 - carm_round_robin(host); 1010 - } 1011 - 1012 - static inline void carm_handle_responses(struct carm_host *host) 1013 - { 1014 - void __iomem *mmio = host->mmio; 1015 - struct carm_response *resp = (struct carm_response *) host->shm; 1016 - unsigned int work = 0; 1017 - unsigned int idx = host->resp_idx % RMSG_Q_LEN; 1018 - 1019 - while (1) { 1020 - u32 status = le32_to_cpu(resp[idx].status); 1021 - 1022 - if (status == 0xffffffff) { 1023 - VPRINTK("ending response on index %u\n", idx); 1024 - writel(idx << 3, mmio + CARM_RESP_IDX); 1025 - break; 1026 - } 1027 - 1028 - /* response to a message we sent */ 1029 - else if ((status & (1 << 31)) == 0) { 1030 - VPRINTK("handling msg response on index %u\n", idx); 1031 - carm_handle_resp(host, resp[idx].ret_handle, status); 1032 - resp[idx].status = cpu_to_le32(0xffffffff); 1033 - } 1034 - 1035 - /* asynchronous events the hardware throws our way */ 1036 - else if ((status & 0xff000000) == (1 << 
31)) { 1037 - u8 *evt_type_ptr = (u8 *) &resp[idx]; 1038 - u8 evt_type = *evt_type_ptr; 1039 - printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n", 1040 - pci_name(host->pdev), (int) evt_type); 1041 - resp[idx].status = cpu_to_le32(0xffffffff); 1042 - } 1043 - 1044 - idx = NEXT_RESP(idx); 1045 - work++; 1046 - } 1047 - 1048 - VPRINTK("EXIT, work==%u\n", work); 1049 - host->resp_idx += work; 1050 - } 1051 - 1052 - static irqreturn_t carm_interrupt(int irq, void *__host) 1053 - { 1054 - struct carm_host *host = __host; 1055 - void __iomem *mmio; 1056 - u32 mask; 1057 - int handled = 0; 1058 - unsigned long flags; 1059 - 1060 - if (!host) { 1061 - VPRINTK("no host\n"); 1062 - return IRQ_NONE; 1063 - } 1064 - 1065 - spin_lock_irqsave(&host->lock, flags); 1066 - 1067 - mmio = host->mmio; 1068 - 1069 - /* reading should also clear interrupts */ 1070 - mask = readl(mmio + CARM_INT_STAT); 1071 - 1072 - if (mask == 0 || mask == 0xffffffff) { 1073 - VPRINTK("no work, mask == 0x%x\n", mask); 1074 - goto out; 1075 - } 1076 - 1077 - if (mask & INT_ACK_MASK) 1078 - writel(mask, mmio + CARM_INT_STAT); 1079 - 1080 - if (unlikely(host->state == HST_INVALID)) { 1081 - VPRINTK("not initialized yet, mask = 0x%x\n", mask); 1082 - goto out; 1083 - } 1084 - 1085 - if (mask & CARM_HAVE_RESP) { 1086 - handled = 1; 1087 - carm_handle_responses(host); 1088 - } 1089 - 1090 - out: 1091 - spin_unlock_irqrestore(&host->lock, flags); 1092 - VPRINTK("EXIT\n"); 1093 - return IRQ_RETVAL(handled); 1094 - } 1095 - 1096 - static void carm_fsm_task (struct work_struct *work) 1097 - { 1098 - struct carm_host *host = 1099 - container_of(work, struct carm_host, fsm_task); 1100 - unsigned long flags; 1101 - unsigned int state; 1102 - int rc, i, next_dev; 1103 - int reschedule = 0; 1104 - int new_state = HST_INVALID; 1105 - 1106 - spin_lock_irqsave(&host->lock, flags); 1107 - state = host->state; 1108 - spin_unlock_irqrestore(&host->lock, flags); 1109 - 1110 - DPRINTK("ENTER, state == %s\n", 
state_name[state]); 1111 - 1112 - switch (state) { 1113 - case HST_PROBE_START: 1114 - new_state = HST_ALLOC_BUF; 1115 - reschedule = 1; 1116 - break; 1117 - 1118 - case HST_ALLOC_BUF: 1119 - rc = carm_send_special(host, carm_fill_alloc_buf); 1120 - if (rc) { 1121 - new_state = HST_ERROR; 1122 - reschedule = 1; 1123 - } 1124 - break; 1125 - 1126 - case HST_SYNC_TIME: 1127 - rc = carm_send_special(host, carm_fill_sync_time); 1128 - if (rc) { 1129 - new_state = HST_ERROR; 1130 - reschedule = 1; 1131 - } 1132 - break; 1133 - 1134 - case HST_GET_FW_VER: 1135 - rc = carm_send_special(host, carm_fill_get_fw_ver); 1136 - if (rc) { 1137 - new_state = HST_ERROR; 1138 - reschedule = 1; 1139 - } 1140 - break; 1141 - 1142 - case HST_PORT_SCAN: 1143 - rc = carm_send_special(host, carm_fill_scan_channels); 1144 - if (rc) { 1145 - new_state = HST_ERROR; 1146 - reschedule = 1; 1147 - } 1148 - break; 1149 - 1150 - case HST_DEV_SCAN_START: 1151 - host->cur_scan_dev = -1; 1152 - new_state = HST_DEV_SCAN; 1153 - reschedule = 1; 1154 - break; 1155 - 1156 - case HST_DEV_SCAN: 1157 - next_dev = -1; 1158 - for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++) 1159 - if (host->dev_present & (1 << i)) { 1160 - next_dev = i; 1161 - break; 1162 - } 1163 - 1164 - if (next_dev >= 0) { 1165 - host->cur_scan_dev = next_dev; 1166 - rc = carm_array_info(host, next_dev); 1167 - if (rc) { 1168 - new_state = HST_ERROR; 1169 - reschedule = 1; 1170 - } 1171 - } else { 1172 - new_state = HST_DEV_ACTIVATE; 1173 - reschedule = 1; 1174 - } 1175 - break; 1176 - 1177 - case HST_DEV_ACTIVATE: { 1178 - int activated = 0; 1179 - for (i = 0; i < CARM_MAX_PORTS; i++) 1180 - if (host->dev_active & (1 << i)) { 1181 - struct carm_port *port = &host->port[i]; 1182 - struct gendisk *disk = port->disk; 1183 - 1184 - set_capacity(disk, port->capacity); 1185 - host->probe_err = add_disk(disk); 1186 - if (!host->probe_err) 1187 - activated++; 1188 - else 1189 - break; 1190 - } 1191 - 1192 - printk(KERN_INFO DRV_NAME 
"(%s): %d ports activated\n", 1193 - pci_name(host->pdev), activated); 1194 - 1195 - new_state = HST_PROBE_FINISHED; 1196 - reschedule = 1; 1197 - break; 1198 - } 1199 - case HST_PROBE_FINISHED: 1200 - complete(&host->probe_comp); 1201 - break; 1202 - case HST_ERROR: 1203 - /* FIXME: TODO */ 1204 - break; 1205 - 1206 - default: 1207 - /* should never occur */ 1208 - printk(KERN_ERR PFX "BUG: unknown state %d\n", state); 1209 - assert(0); 1210 - break; 1211 - } 1212 - 1213 - if (new_state != HST_INVALID) { 1214 - spin_lock_irqsave(&host->lock, flags); 1215 - host->state = new_state; 1216 - spin_unlock_irqrestore(&host->lock, flags); 1217 - } 1218 - if (reschedule) 1219 - schedule_work(&host->fsm_task); 1220 - } 1221 - 1222 - static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit) 1223 - { 1224 - unsigned int i; 1225 - 1226 - for (i = 0; i < 50000; i++) { 1227 - u32 tmp = readl(mmio + CARM_LMUC); 1228 - udelay(100); 1229 - 1230 - if (test_bit) { 1231 - if ((tmp & bits) == bits) 1232 - return 0; 1233 - } else { 1234 - if ((tmp & bits) == 0) 1235 - return 0; 1236 - } 1237 - 1238 - cond_resched(); 1239 - } 1240 - 1241 - printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n", 1242 - bits, test_bit ? 
"yes" : "no"); 1243 - return -EBUSY; 1244 - } 1245 - 1246 - static void carm_init_responses(struct carm_host *host) 1247 - { 1248 - void __iomem *mmio = host->mmio; 1249 - unsigned int i; 1250 - struct carm_response *resp = (struct carm_response *) host->shm; 1251 - 1252 - for (i = 0; i < RMSG_Q_LEN; i++) 1253 - resp[i].status = cpu_to_le32(0xffffffff); 1254 - 1255 - writel(0, mmio + CARM_RESP_IDX); 1256 - } 1257 - 1258 - static int carm_init_host(struct carm_host *host) 1259 - { 1260 - void __iomem *mmio = host->mmio; 1261 - u32 tmp; 1262 - u8 tmp8; 1263 - int rc; 1264 - 1265 - DPRINTK("ENTER\n"); 1266 - 1267 - writel(0, mmio + CARM_INT_MASK); 1268 - 1269 - tmp8 = readb(mmio + CARM_INITC); 1270 - if (tmp8 & 0x01) { 1271 - tmp8 &= ~0x01; 1272 - writeb(tmp8, mmio + CARM_INITC); 1273 - readb(mmio + CARM_INITC); /* flush */ 1274 - 1275 - DPRINTK("snooze...\n"); 1276 - msleep(5000); 1277 - } 1278 - 1279 - tmp = readl(mmio + CARM_HMUC); 1280 - if (tmp & CARM_CME) { 1281 - DPRINTK("CME bit present, waiting\n"); 1282 - rc = carm_init_wait(mmio, CARM_CME, 1); 1283 - if (rc) { 1284 - DPRINTK("EXIT, carm_init_wait 1 failed\n"); 1285 - return rc; 1286 - } 1287 - } 1288 - if (tmp & CARM_RME) { 1289 - DPRINTK("RME bit present, waiting\n"); 1290 - rc = carm_init_wait(mmio, CARM_RME, 1); 1291 - if (rc) { 1292 - DPRINTK("EXIT, carm_init_wait 2 failed\n"); 1293 - return rc; 1294 - } 1295 - } 1296 - 1297 - tmp &= ~(CARM_RME | CARM_CME); 1298 - writel(tmp, mmio + CARM_HMUC); 1299 - readl(mmio + CARM_HMUC); /* flush */ 1300 - 1301 - rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0); 1302 - if (rc) { 1303 - DPRINTK("EXIT, carm_init_wait 3 failed\n"); 1304 - return rc; 1305 - } 1306 - 1307 - carm_init_buckets(mmio); 1308 - 1309 - writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO); 1310 - writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI); 1311 - writel(RBUF_LEN, mmio + RBUF_BYTE_SZ); 1312 - 1313 - tmp = readl(mmio + CARM_HMUC); 1314 - tmp |= (CARM_RME | CARM_CME | CARM_WZBC); 
1315 - writel(tmp, mmio + CARM_HMUC); 1316 - readl(mmio + CARM_HMUC); /* flush */ 1317 - 1318 - rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1); 1319 - if (rc) { 1320 - DPRINTK("EXIT, carm_init_wait 4 failed\n"); 1321 - return rc; 1322 - } 1323 - 1324 - writel(0, mmio + CARM_HMPHA); 1325 - writel(INT_DEF_MASK, mmio + CARM_INT_MASK); 1326 - 1327 - carm_init_responses(host); 1328 - 1329 - /* start initialization, probing state machine */ 1330 - spin_lock_irq(&host->lock); 1331 - assert(host->state == HST_INVALID); 1332 - host->state = HST_PROBE_START; 1333 - spin_unlock_irq(&host->lock); 1334 - schedule_work(&host->fsm_task); 1335 - 1336 - DPRINTK("EXIT\n"); 1337 - return 0; 1338 - } 1339 - 1340 - static const struct blk_mq_ops carm_mq_ops = { 1341 - .queue_rq = carm_queue_rq, 1342 - }; 1343 - 1344 - static int carm_init_disk(struct carm_host *host, unsigned int port_no) 1345 - { 1346 - struct carm_port *port = &host->port[port_no]; 1347 - struct gendisk *disk; 1348 - 1349 - port->host = host; 1350 - port->port_no = port_no; 1351 - 1352 - disk = blk_mq_alloc_disk(&host->tag_set, port); 1353 - if (IS_ERR(disk)) 1354 - return PTR_ERR(disk); 1355 - 1356 - port->disk = disk; 1357 - sprintf(disk->disk_name, DRV_NAME "/%u", 1358 - (unsigned int)host->id * CARM_MAX_PORTS + port_no); 1359 - disk->major = host->major; 1360 - disk->first_minor = port_no * CARM_MINORS_PER_MAJOR; 1361 - disk->minors = CARM_MINORS_PER_MAJOR; 1362 - disk->fops = &carm_bd_ops; 1363 - disk->private_data = port; 1364 - 1365 - blk_queue_max_segments(disk->queue, CARM_MAX_REQ_SG); 1366 - blk_queue_segment_boundary(disk->queue, CARM_SG_BOUNDARY); 1367 - return 0; 1368 - } 1369 - 1370 - static void carm_free_disk(struct carm_host *host, unsigned int port_no) 1371 - { 1372 - struct carm_port *port = &host->port[port_no]; 1373 - struct gendisk *disk = port->disk; 1374 - 1375 - if (!disk) 1376 - return; 1377 - 1378 - if (host->state > HST_DEV_ACTIVATE) 1379 - del_gendisk(disk); 1380 - put_disk(disk); 1381 
- } 1382 - 1383 - static int carm_init_shm(struct carm_host *host) 1384 - { 1385 - host->shm = dma_alloc_coherent(&host->pdev->dev, CARM_SHM_SIZE, 1386 - &host->shm_dma, GFP_KERNEL); 1387 - if (!host->shm) 1388 - return -ENOMEM; 1389 - 1390 - host->msg_base = host->shm + RBUF_LEN; 1391 - host->msg_dma = host->shm_dma + RBUF_LEN; 1392 - 1393 - memset(host->shm, 0xff, RBUF_LEN); 1394 - memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN); 1395 - 1396 - return 0; 1397 - } 1398 - 1399 - static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) 1400 - { 1401 - struct carm_host *host; 1402 - int rc; 1403 - struct request_queue *q; 1404 - unsigned int i; 1405 - 1406 - printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); 1407 - 1408 - rc = pci_enable_device(pdev); 1409 - if (rc) 1410 - return rc; 1411 - 1412 - rc = pci_request_regions(pdev, DRV_NAME); 1413 - if (rc) 1414 - goto err_out; 1415 - 1416 - rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); 1417 - if (rc) { 1418 - printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n", 1419 - pci_name(pdev)); 1420 - goto err_out_regions; 1421 - } 1422 - 1423 - host = kzalloc(sizeof(*host), GFP_KERNEL); 1424 - if (!host) { 1425 - rc = -ENOMEM; 1426 - goto err_out_regions; 1427 - } 1428 - 1429 - host->pdev = pdev; 1430 - spin_lock_init(&host->lock); 1431 - INIT_WORK(&host->fsm_task, carm_fsm_task); 1432 - init_completion(&host->probe_comp); 1433 - 1434 - host->mmio = ioremap(pci_resource_start(pdev, 0), 1435 - pci_resource_len(pdev, 0)); 1436 - if (!host->mmio) { 1437 - printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n", 1438 - pci_name(pdev)); 1439 - rc = -ENOMEM; 1440 - goto err_out_kfree; 1441 - } 1442 - 1443 - rc = carm_init_shm(host); 1444 - if (rc) { 1445 - printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n", 1446 - pci_name(pdev)); 1447 - goto err_out_iounmap; 1448 - } 1449 - 1450 - memset(&host->tag_set, 0, sizeof(host->tag_set)); 1451 - host->tag_set.ops = &carm_mq_ops; 1452 - 
host->tag_set.cmd_size = sizeof(struct carm_request); 1453 - host->tag_set.nr_hw_queues = 1; 1454 - host->tag_set.nr_maps = 1; 1455 - host->tag_set.queue_depth = max_queue; 1456 - host->tag_set.numa_node = NUMA_NO_NODE; 1457 - host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 1458 - 1459 - rc = blk_mq_alloc_tag_set(&host->tag_set); 1460 - if (rc) 1461 - goto err_out_dma_free; 1462 - 1463 - q = blk_mq_init_queue(&host->tag_set); 1464 - if (IS_ERR(q)) { 1465 - rc = PTR_ERR(q); 1466 - blk_mq_free_tag_set(&host->tag_set); 1467 - goto err_out_dma_free; 1468 - } 1469 - 1470 - host->oob_q = q; 1471 - q->queuedata = host; 1472 - 1473 - /* 1474 - * Figure out which major to use: 160, 161, or dynamic 1475 - */ 1476 - if (!test_and_set_bit(0, &carm_major_alloc)) 1477 - host->major = 160; 1478 - else if (!test_and_set_bit(1, &carm_major_alloc)) 1479 - host->major = 161; 1480 - else 1481 - host->flags |= FL_DYN_MAJOR; 1482 - 1483 - host->id = carm_host_id; 1484 - sprintf(host->name, DRV_NAME "%d", carm_host_id); 1485 - 1486 - rc = register_blkdev(host->major, host->name); 1487 - if (rc < 0) 1488 - goto err_out_free_majors; 1489 - if (host->flags & FL_DYN_MAJOR) 1490 - host->major = rc; 1491 - 1492 - for (i = 0; i < CARM_MAX_PORTS; i++) { 1493 - rc = carm_init_disk(host, i); 1494 - if (rc) 1495 - goto err_out_blkdev_disks; 1496 - } 1497 - 1498 - pci_set_master(pdev); 1499 - 1500 - rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host); 1501 - if (rc) { 1502 - printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n", 1503 - pci_name(pdev)); 1504 - goto err_out_blkdev_disks; 1505 - } 1506 - 1507 - rc = carm_init_host(host); 1508 - if (rc) 1509 - goto err_out_free_irq; 1510 - 1511 - DPRINTK("waiting for probe_comp\n"); 1512 - host->probe_err = -ENODEV; 1513 - wait_for_completion(&host->probe_comp); 1514 - if (host->probe_err) { 1515 - rc = host->probe_err; 1516 - goto err_out_free_irq; 1517 - } 1518 - 1519 - printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major 
%d\n", 1520 - host->name, pci_name(pdev), (int) CARM_MAX_PORTS, 1521 - (unsigned long long)pci_resource_start(pdev, 0), 1522 - pdev->irq, host->major); 1523 - 1524 - carm_host_id++; 1525 - pci_set_drvdata(pdev, host); 1526 - return 0; 1527 - 1528 - err_out_free_irq: 1529 - free_irq(pdev->irq, host); 1530 - err_out_blkdev_disks: 1531 - for (i = 0; i < CARM_MAX_PORTS; i++) 1532 - carm_free_disk(host, i); 1533 - unregister_blkdev(host->major, host->name); 1534 - err_out_free_majors: 1535 - if (host->major == 160) 1536 - clear_bit(0, &carm_major_alloc); 1537 - else if (host->major == 161) 1538 - clear_bit(1, &carm_major_alloc); 1539 - blk_mq_destroy_queue(host->oob_q); 1540 - blk_mq_free_tag_set(&host->tag_set); 1541 - err_out_dma_free: 1542 - dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); 1543 - err_out_iounmap: 1544 - iounmap(host->mmio); 1545 - err_out_kfree: 1546 - kfree(host); 1547 - err_out_regions: 1548 - pci_release_regions(pdev); 1549 - err_out: 1550 - pci_disable_device(pdev); 1551 - return rc; 1552 - } 1553 - 1554 - static void carm_remove_one (struct pci_dev *pdev) 1555 - { 1556 - struct carm_host *host = pci_get_drvdata(pdev); 1557 - unsigned int i; 1558 - 1559 - if (!host) { 1560 - printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", 1561 - pci_name(pdev)); 1562 - return; 1563 - } 1564 - 1565 - free_irq(pdev->irq, host); 1566 - for (i = 0; i < CARM_MAX_PORTS; i++) 1567 - carm_free_disk(host, i); 1568 - unregister_blkdev(host->major, host->name); 1569 - if (host->major == 160) 1570 - clear_bit(0, &carm_major_alloc); 1571 - else if (host->major == 161) 1572 - clear_bit(1, &carm_major_alloc); 1573 - blk_mq_destroy_queue(host->oob_q); 1574 - blk_mq_free_tag_set(&host->tag_set); 1575 - dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); 1576 - iounmap(host->mmio); 1577 - kfree(host); 1578 - pci_release_regions(pdev); 1579 - pci_disable_device(pdev); 1580 - } 1581 - 1582 - module_pci_driver(carm_driver);
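The removed carm_init_wait() above follows a common bounded-poll pattern: read a status register, test whether the requested bits are all set or all clear, and give up after a fixed number of iterations. A minimal standalone model of that pattern (poll_bits, fake_reg, and the callback shape are illustrative names, not from the driver):

```c
#include <stdint.h>

/* Model of the carm_init_wait() pattern: poll a "register" until the
 * given bits are all set (test_set != 0) or all clear (test_set == 0),
 * giving up after max_iters polls.  read_reg and ctx stand in for
 * readl() and the MMIO base. */
static int poll_bits(uint32_t (*read_reg)(void *), void *ctx,
                     uint32_t bits, int test_set, int max_iters)
{
    for (int i = 0; i < max_iters; i++) {
        uint32_t tmp = read_reg(ctx);

        if (test_set ? (tmp & bits) == bits : (tmp & bits) == 0)
            return 0;           /* condition met */
        /* a real driver would udelay()/cond_resched() between polls */
    }
    return -1;                  /* timed out, like -EBUSY above */
}

/* Fake register: reads 0 for the first *countdown reads, then 0x3. */
static uint32_t fake_reg(void *ctx)
{
    int *countdown = ctx;

    return (*countdown)-- > 0 ? 0 : 0x3;
}
```

The driver variant simply hard-codes the register offset (CARM_LMUC) and iteration budget (50000 polls with a 100us delay each).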
+303 -45
drivers/block/ublk_drv.c
··· 47 47 #define UBLK_MINORS (1U << MINORBITS) 48 48 49 49 /* All UBLK_F_* have to be included into UBLK_F_ALL */ 50 - #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK) 50 + #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 51 + | UBLK_F_URING_CMD_COMP_IN_TASK \ 52 + | UBLK_F_NEED_GET_DATA) 53 + 54 + /* All UBLK_PARAM_TYPE_* should be included here */ 55 + #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) 51 56 52 57 struct ublk_rq_data { 53 58 struct callback_head work; ··· 91 86 */ 92 87 #define UBLK_IO_FLAG_ABORTED 0x04 93 88 89 + /* 90 + * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires 91 + * get data buffer address from ublksrv. 92 + * 93 + * Then, bio data could be copied into this data buffer for a WRITE request 94 + * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset. 95 + */ 96 + #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 97 + 94 98 struct ublk_io { 95 99 /* userspace buffer address from io cmd */ 96 100 __u64 addr; ··· 133 119 char *__queues; 134 120 135 121 unsigned short queue_size; 136 - unsigned short bs_shift; 137 122 struct ublksrv_ctrl_dev_info dev_info; 138 123 139 124 struct blk_mq_tag_set tag_set; ··· 150 137 spinlock_t mm_lock; 151 138 struct mm_struct *mm; 152 139 140 + struct ublk_params params; 141 + 153 142 struct completion completion; 154 143 unsigned int nr_queues_ready; 155 144 atomic_t nr_aborted_queues; ··· 162 147 */ 163 148 struct delayed_work monitor_work; 164 149 struct work_struct stop_work; 150 + }; 151 + 152 + /* header of ublk_params */ 153 + struct ublk_params_header { 154 + __u32 len; 155 + __u32 types; 165 156 }; 166 157 167 158 static dev_t ublk_chr_devt; ··· 181 160 182 161 static struct miscdevice ublk_misc; 183 162 163 + static void ublk_dev_param_basic_apply(struct ublk_device *ub) 164 + { 165 + struct request_queue *q = ub->ub_disk->queue; 166 + const struct ublk_param_basic *p = &ub->params.basic; 167 + 168 + 
blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); 169 + blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); 170 + blk_queue_io_min(q, 1 << p->io_min_shift); 171 + blk_queue_io_opt(q, 1 << p->io_opt_shift); 172 + 173 + blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, 174 + p->attrs & UBLK_ATTR_FUA); 175 + if (p->attrs & UBLK_ATTR_ROTATIONAL) 176 + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 177 + else 178 + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 179 + 180 + blk_queue_max_hw_sectors(q, p->max_sectors); 181 + blk_queue_chunk_sectors(q, p->chunk_sectors); 182 + blk_queue_virt_boundary(q, p->virt_boundary_mask); 183 + 184 + if (p->attrs & UBLK_ATTR_READ_ONLY) 185 + set_disk_ro(ub->ub_disk, true); 186 + 187 + set_capacity(ub->ub_disk, p->dev_sectors); 188 + } 189 + 190 + static void ublk_dev_param_discard_apply(struct ublk_device *ub) 191 + { 192 + struct request_queue *q = ub->ub_disk->queue; 193 + const struct ublk_param_discard *p = &ub->params.discard; 194 + 195 + q->limits.discard_alignment = p->discard_alignment; 196 + q->limits.discard_granularity = p->discard_granularity; 197 + blk_queue_max_discard_sectors(q, p->max_discard_sectors); 198 + blk_queue_max_write_zeroes_sectors(q, 199 + p->max_write_zeroes_sectors); 200 + blk_queue_max_discard_segments(q, p->max_discard_segments); 201 + } 202 + 203 + static int ublk_validate_params(const struct ublk_device *ub) 204 + { 205 + /* basic param is the only one which must be set */ 206 + if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { 207 + const struct ublk_param_basic *p = &ub->params.basic; 208 + 209 + if (p->logical_bs_shift > PAGE_SHIFT) 210 + return -EINVAL; 211 + 212 + if (p->logical_bs_shift > p->physical_bs_shift) 213 + return -EINVAL; 214 + 215 + if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) 216 + return -EINVAL; 217 + } else 218 + return -EINVAL; 219 + 220 + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 221 + const struct ublk_param_discard *p = 
&ub->params.discard; 222 + 223 + /* So far, only support single segment discard */ 224 + if (p->max_discard_sectors && p->max_discard_segments != 1) 225 + return -EINVAL; 226 + 227 + if (!p->discard_granularity) 228 + return -EINVAL; 229 + } 230 + 231 + return 0; 232 + } 233 + 234 + static int ublk_apply_params(struct ublk_device *ub) 235 + { 236 + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 237 + return -EINVAL; 238 + 239 + ublk_dev_param_basic_apply(ub); 240 + 241 + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) 242 + ublk_dev_param_discard_apply(ub); 243 + 244 + return 0; 245 + } 246 + 184 247 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) 185 248 { 186 249 if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && 187 250 !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) 251 + return true; 252 + return false; 253 + } 254 + 255 + static inline bool ublk_need_get_data(const struct ublk_queue *ubq) 256 + { 257 + if (ubq->flags & UBLK_F_NEED_GET_DATA) 188 258 return true; 189 259 return false; 190 260 } ··· 621 509 } 622 510 } 623 511 512 + static void ubq_complete_io_cmd(struct ublk_io *io, int res) 513 + { 514 + /* mark this cmd owned by ublksrv */ 515 + io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; 516 + 517 + /* 518 + * clear ACTIVE since we are done with this sqe/cmd slot 519 + * We can only accept io cmd in case of being not active. 
520 + */ 521 + io->flags &= ~UBLK_IO_FLAG_ACTIVE; 522 + 523 + /* tell ublksrv one io request is coming */ 524 + io_uring_cmd_done(io->cmd, res, 0); 525 + } 526 + 624 527 #define UBLK_REQUEUE_DELAY_MS 3 625 528 626 529 static inline void __ublk_rq_task_work(struct request *req) ··· 656 529 blk_mq_end_request(req, BLK_STS_IOERR); 657 530 mod_delayed_work(system_wq, &ub->monitor_work, 0); 658 531 return; 532 + } 533 + 534 + if (ublk_need_get_data(ubq) && 535 + (req_op(req) == REQ_OP_WRITE || 536 + req_op(req) == REQ_OP_FLUSH)) { 537 + /* 538 + * We have not handled UBLK_IO_NEED_GET_DATA command yet, 539 + * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv 540 + * and notify it. 541 + */ 542 + if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { 543 + io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; 544 + pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", 545 + __func__, io->cmd->cmd_op, ubq->q_id, 546 + req->tag, io->flags); 547 + ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA); 548 + return; 549 + } 550 + /* 551 + * We have handled UBLK_IO_NEED_GET_DATA command, 552 + * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just 553 + * do the copy work. 554 + */ 555 + io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; 659 556 } 660 557 661 558 mapped_bytes = ublk_map_io(ubq, req, io); ··· 704 553 mapped_bytes >> 9; 705 554 } 706 555 707 - /* mark this cmd owned by ublksrv */ 708 - io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; 709 - 710 - /* 711 - * clear ACTIVE since we are done with this sqe/cmd slot 712 - * We can only accept io cmd in case of being not active. 
713 - */ 714 - io->flags &= ~UBLK_IO_FLAG_ACTIVE; 715 - 716 - /* tell ublksrv one io request is coming */ 717 - io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0); 556 + ubq_complete_io_cmd(io, UBLK_IO_RES_OK); 718 557 } 719 558 720 559 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) ··· 929 788 UBLK_DAEMON_MONITOR_PERIOD); 930 789 } 931 790 791 + static inline bool ublk_queue_ready(struct ublk_queue *ubq) 792 + { 793 + return ubq->nr_io_ready == ubq->q_depth; 794 + } 795 + 932 796 static void ublk_cancel_queue(struct ublk_queue *ubq) 933 797 { 934 798 int i; 799 + 800 + if (!ublk_queue_ready(ubq)) 801 + return; 935 802 936 803 for (i = 0; i < ubq->q_depth; i++) { 937 804 struct ublk_io *io = &ubq->ios[i]; ··· 947 798 if (io->flags & UBLK_IO_FLAG_ACTIVE) 948 799 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); 949 800 } 801 + 802 + /* all io commands are canceled */ 803 + ubq->nr_io_ready = 0; 950 804 } 951 805 952 806 /* Cancel all pending commands, must be called after del_gendisk() returns */ ··· 970 818 del_gendisk(ub->ub_disk); 971 819 ub->dev_info.state = UBLK_S_DEV_DEAD; 972 820 ub->dev_info.ublksrv_pid = -1; 973 - ublk_cancel_dev(ub); 974 821 put_disk(ub->ub_disk); 975 822 ub->ub_disk = NULL; 976 823 unlock: 824 + ublk_cancel_dev(ub); 977 825 mutex_unlock(&ub->mutex); 978 826 cancel_delayed_work_sync(&ub->monitor_work); 979 - } 980 - 981 - static inline bool ublk_queue_ready(struct ublk_queue *ubq) 982 - { 983 - return ubq->nr_io_ready == ubq->q_depth; 984 827 } 985 828 986 829 /* device can only be started after all IOs are ready */ ··· 991 844 if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) 992 845 complete_all(&ub->completion); 993 846 mutex_unlock(&ub->mutex); 847 + } 848 + 849 + static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, 850 + int tag, struct io_uring_cmd *cmd) 851 + { 852 + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 853 + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); 854 + 855 + 
if (ublk_can_use_task_work(ubq)) { 856 + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); 857 + 858 + /* should not fail since we call it just in ubq->ubq_daemon */ 859 + task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI); 860 + } else { 861 + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 862 + 863 + pdu->req = req; 864 + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); 865 + } 994 866 } 995 867 996 868 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) ··· 1050 884 goto out; 1051 885 } 1052 886 887 + /* 888 + * ensure that the user issues UBLK_IO_NEED_GET_DATA 889 + * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA. 890 + */ 891 + if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) 892 + ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) 893 + goto out; 894 + 1053 895 switch (cmd_op) { 1054 896 case UBLK_IO_FETCH_REQ: 1055 897 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ ··· 1090 916 io->flags |= UBLK_IO_FLAG_ACTIVE; 1091 917 io->cmd = cmd; 1092 918 ublk_commit_completion(ub, ub_cmd); 919 + break; 920 + case UBLK_IO_NEED_GET_DATA: 921 + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) 922 + goto out; 923 + io->addr = ub_cmd->addr; 924 + io->cmd = cmd; 925 + io->flags |= UBLK_IO_FLAG_ACTIVE; 926 + ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd); 1093 927 break; 1094 928 default: 1095 929 goto out; ··· 1265 1083 ublk_stop_dev(ub); 1266 1084 } 1267 1085 1268 - /* align maximum I/O size to PAGE_SIZE */ 1086 + /* align max io buffer size with PAGE_SIZE */ 1269 1087 static void ublk_align_max_io_size(struct ublk_device *ub) 1270 1088 { 1271 - unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift; 1089 + unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; 1272 1090 1273 - ub->dev_info.rq_max_blocks = 1274 - round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift; 1091 + ub->dev_info.max_io_buf_bytes = 1092 + round_down(max_io_bytes, PAGE_SIZE); 1275 1093 } 1276 1094 
1277 1095 static int ublk_add_tag_set(struct ublk_device *ub) ··· 1314 1132 { 1315 1133 struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; 1316 1134 int ublksrv_pid = (int)header->data[0]; 1317 - unsigned long dev_blocks = header->data[1]; 1318 1135 struct ublk_device *ub; 1319 1136 struct gendisk *disk; 1320 1137 int ret = -EINVAL; ··· 1336 1155 goto out_unlock; 1337 1156 } 1338 1157 1339 - /* We may get disk size updated */ 1340 - if (dev_blocks) 1341 - ub->dev_info.dev_blocks = dev_blocks; 1342 - 1343 1158 disk = blk_mq_alloc_disk(&ub->tag_set, ub); 1344 1159 if (IS_ERR(disk)) { 1345 1160 ret = PTR_ERR(disk); ··· 1345 1168 disk->fops = &ub_fops; 1346 1169 disk->private_data = ub; 1347 1170 1348 - blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size); 1349 - blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size); 1350 - blk_queue_io_min(disk->queue, ub->dev_info.block_size); 1351 - blk_queue_max_hw_sectors(disk->queue, 1352 - ub->dev_info.rq_max_blocks << (ub->bs_shift - 9)); 1353 - disk->queue->limits.discard_granularity = PAGE_SIZE; 1354 - blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9); 1355 - blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9); 1356 - 1357 - set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9)); 1358 - 1359 1171 ub->dev_info.ublksrv_pid = ublksrv_pid; 1360 1172 ub->ub_disk = disk; 1173 + 1174 + ret = ublk_apply_params(ub); 1175 + if (ret) 1176 + goto out_put_disk; 1177 + 1361 1178 get_device(&ub->cdev_dev); 1362 1179 ret = add_disk(disk); 1363 1180 if (ret) { 1364 - put_disk(disk); 1365 - goto out_unlock; 1181 + /* 1182 + * Has to drop the reference since ->free_disk won't be 1183 + * called in case of add_disk failure. 
1184 + */ 1185 + ublk_put_device(ub); 1186 + goto out_put_disk; 1366 1187 } 1367 1188 set_bit(UB_STATE_USED, &ub->state); 1368 1189 ub->dev_info.state = UBLK_S_DEV_LIVE; 1190 + out_put_disk: 1191 + if (ret) 1192 + put_disk(disk); 1369 1193 out_unlock: 1370 1194 mutex_unlock(&ub->mutex); 1371 1195 ublk_put_device(ub); ··· 1428 1250 { 1429 1251 pr_devel("%s: dev id %d flags %llx\n", __func__, 1430 1252 info->dev_id, info->flags); 1431 - pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", 1432 - info->nr_hw_queues, info->queue_depth, 1433 - info->block_size, info->dev_blocks); 1253 + pr_devel("\t nr_hw_queues %d queue_depth %d\n", 1254 + info->nr_hw_queues, info->queue_depth); 1434 1255 } 1435 1256 1436 1257 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) ··· 1489 1312 /* We are not ready to support zero copy */ 1490 1313 ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; 1491 1314 1492 - ub->bs_shift = ilog2(ub->dev_info.block_size); 1493 1315 ub->dev_info.nr_hw_queues = min_t(unsigned int, 1494 1316 ub->dev_info.nr_hw_queues, nr_cpu_ids); 1495 1317 ublk_align_max_io_size(ub); ··· 1612 1436 return ret; 1613 1437 } 1614 1438 1439 + static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) 1440 + { 1441 + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; 1442 + void __user *argp = (void __user *)(unsigned long)header->addr; 1443 + struct ublk_params_header ph; 1444 + struct ublk_device *ub; 1445 + int ret; 1446 + 1447 + if (header->len <= sizeof(ph) || !header->addr) 1448 + return -EINVAL; 1449 + 1450 + if (copy_from_user(&ph, argp, sizeof(ph))) 1451 + return -EFAULT; 1452 + 1453 + if (ph.len > header->len || !ph.len) 1454 + return -EINVAL; 1455 + 1456 + if (ph.len > sizeof(struct ublk_params)) 1457 + ph.len = sizeof(struct ublk_params); 1458 + 1459 + ub = ublk_get_device_from_id(header->dev_id); 1460 + if (!ub) 1461 + return -EINVAL; 1462 + 1463 + mutex_lock(&ub->mutex); 1464 + if (copy_to_user(argp, &ub->params, 
ph.len)) 1465 + ret = -EFAULT; 1466 + else 1467 + ret = 0; 1468 + mutex_unlock(&ub->mutex); 1469 + 1470 + ublk_put_device(ub); 1471 + return ret; 1472 + } 1473 + 1474 + static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) 1475 + { 1476 + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; 1477 + void __user *argp = (void __user *)(unsigned long)header->addr; 1478 + struct ublk_params_header ph; 1479 + struct ublk_device *ub; 1480 + int ret = -EFAULT; 1481 + 1482 + if (header->len <= sizeof(ph) || !header->addr) 1483 + return -EINVAL; 1484 + 1485 + if (copy_from_user(&ph, argp, sizeof(ph))) 1486 + return -EFAULT; 1487 + 1488 + if (ph.len > header->len || !ph.len || !ph.types) 1489 + return -EINVAL; 1490 + 1491 + if (ph.len > sizeof(struct ublk_params)) 1492 + ph.len = sizeof(struct ublk_params); 1493 + 1494 + ub = ublk_get_device_from_id(header->dev_id); 1495 + if (!ub) 1496 + return -EINVAL; 1497 + 1498 + /* parameters can only be changed when device isn't live */ 1499 + mutex_lock(&ub->mutex); 1500 + if (ub->dev_info.state == UBLK_S_DEV_LIVE) { 1501 + ret = -EACCES; 1502 + } else if (copy_from_user(&ub->params, argp, ph.len)) { 1503 + ret = -EFAULT; 1504 + } else { 1505 + /* clear all we don't support yet */ 1506 + ub->params.types &= UBLK_PARAM_TYPE_ALL; 1507 + ret = ublk_validate_params(ub); 1508 + } 1509 + mutex_unlock(&ub->mutex); 1510 + ublk_put_device(ub); 1511 + 1512 + return ret; 1513 + } 1514 + 1615 1515 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 1616 1516 unsigned int issue_flags) 1617 1517 { ··· 1722 1470 break; 1723 1471 case UBLK_CMD_GET_QUEUE_AFFINITY: 1724 1472 ret = ublk_ctrl_get_queue_affinity(cmd); 1473 + break; 1474 + case UBLK_CMD_GET_PARAMS: 1475 + ret = ublk_ctrl_get_params(cmd); 1476 + break; 1477 + case UBLK_CMD_SET_PARAMS: 1478 + ret = ublk_ctrl_set_params(cmd); 1725 1479 break; 1726 1480 default: 1727 1481 break;
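The new UBLK_CMD_SET_PARAMS path above funnels everything through ublk_validate_params() before a device can go live. A standalone sketch of its basic-parameter checks, using a mock struct rather than the real UAPI layout (basic_model and MODEL_PAGE_SHIFT are illustrative; 4K pages assumed):

```c
#include <stdint.h>

/* Standalone model of the basic-parameter checks that
 * ublk_validate_params() applies to UBLK_CMD_SET_PARAMS input. */
#define MODEL_PAGE_SHIFT 12

struct basic_model {
    uint8_t  logical_bs_shift;   /* logical block size = 1 << shift */
    uint8_t  physical_bs_shift;
    uint32_t max_sectors;        /* in 512-byte sectors */
    uint32_t max_io_buf_bytes;   /* from the device info */
};

static int validate_basic(const struct basic_model *p)
{
    if (p->logical_bs_shift > MODEL_PAGE_SHIFT)
        return -22;              /* -EINVAL: block exceeds a page */
    if (p->logical_bs_shift > p->physical_bs_shift)
        return -22;              /* logical must not exceed physical */
    if (p->max_sectors > (p->max_io_buf_bytes >> 9))
        return -22;              /* one IO must fit the shared buffer */
    return 0;
}
```

The discard checks in the hunk follow the same shape: max_discard_sectors requires exactly one discard segment, and discard_granularity must be non-zero.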
+1 -1
drivers/md/bcache/Kconfig
··· 29 29 operations that get stuck. 30 30 31 31 config BCACHE_ASYNC_REGISTRATION 32 - bool "Asynchronous device registration (EXPERIMENTAL)" 32 + bool "Asynchronous device registration" 33 33 depends on BCACHE 34 34 help 35 35 Add a sysfs file /sys/fs/bcache/register_async. Writing registering
+1
drivers/md/dm-raid.c
··· 3728 3728 if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { 3729 3729 if (mddev->sync_thread) { 3730 3730 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3731 + md_unregister_thread(&mddev->sync_thread); 3731 3732 md_reap_sync_thread(mddev); 3732 3733 } 3733 3734 } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
+4 -4
drivers/md/dm.c
··· 1016 1016 while (io) { 1017 1017 struct dm_io *next = io->next; 1018 1018 1019 - dm_io_rewind(io, &md->queue->bio_split); 1019 + dm_io_rewind(io, &md->disk->bio_split); 1020 1020 1021 1021 io->next = NULL; 1022 1022 __dm_io_complete(io, false); ··· 1181 1181 * Does the target need to split IO even further? 1182 1182 * - varied (per target) IO splitting is a tenet of DM; this 1183 1183 * explains why stacked chunk_sectors based splitting via 1184 - * blk_queue_split() isn't possible here. 1184 + * bio_split_to_limits() isn't possible here. 1185 1185 */ 1186 1186 if (!ti->max_io_len) 1187 1187 return len; ··· 1751 1751 is_abnormal = is_abnormal_io(bio); 1752 1752 if (unlikely(is_abnormal)) { 1753 1753 /* 1754 - * Use blk_queue_split() for abnormal IO (e.g. discard, etc) 1754 + * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) 1755 1755 * otherwise associated queue_limits won't be imposed. 1756 1756 */ 1757 - blk_queue_split(&bio); 1757 + bio = bio_split_to_limits(bio); 1758 1758 } 1759 1759 1760 1760 init_clone_info(&ci, md, map, bio, is_abnormal);
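The dm.c comment above notes that per-target splitting hinges on ti->max_io_len: an IO must not cross a max_io_len boundary. Assuming a power-of-two boundary (as the block layer's fast path does), the clamp reduces to mask arithmetic; clamp_to_boundary below is an illustrative name, not the kernel helper:

```c
#include <stdint.h>

/* Model of per-target IO splitting: clamp a request's length so it
 * never crosses a max_io_len boundary.  Assumes max_io_len is a
 * power of two. */
static uint64_t clamp_to_boundary(uint64_t sector, uint64_t len,
                                  uint64_t max_io_len)
{
    /* sectors remaining before the next max_io_len boundary */
    uint64_t room = max_io_len - (sector & (max_io_len - 1));

    return len < room ? len : room;
}
```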
+6 -15
drivers/md/md-autodetect.c
··· 125 125 char *devname = args->device_names; 126 126 dev_t devices[MD_SB_DISKS + 1], mdev; 127 127 struct mdu_array_info_s ainfo = { }; 128 - struct block_device *bdev; 129 128 struct mddev *mddev; 130 129 int err = 0, i; 131 130 char name[16]; ··· 168 169 169 170 pr_info("md: Loading %s: %s\n", name, args->device_names); 170 171 171 - bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); 172 - if (IS_ERR(bdev)) { 173 - pr_err("md: open failed - cannot start array %s\n", name); 172 + mddev = md_alloc(mdev, name); 173 + if (IS_ERR(mddev)) { 174 + pr_err("md: md_alloc failed - cannot start array %s\n", name); 174 175 return; 175 176 } 176 - 177 - err = -EIO; 178 - if (WARN(bdev->bd_disk->fops != &md_fops, 179 - "Opening block device %x resulted in non-md device\n", 180 - mdev)) 181 - goto out_blkdev_put; 182 - 183 - mddev = bdev->bd_disk->private_data; 184 177 185 178 err = mddev_lock(mddev); 186 179 if (err) { 187 180 pr_err("md: failed to lock array %s\n", name); 188 - goto out_blkdev_put; 181 + goto out_mddev_put; 189 182 } 190 183 191 184 if (!list_empty(&mddev->disks) || mddev->raid_disks) { ··· 221 230 pr_warn("md: starting %s failed\n", name); 222 231 out_unlock: 223 232 mddev_unlock(mddev); 224 - out_blkdev_put: 225 - blkdev_put(bdev, FMODE_READ); 233 + out_mddev_put: 234 + mddev_put(mddev); 226 235 } 227 236 228 237 static int __init raid_setup(char *str)
+2 -2
drivers/md/md-cluster.c
··· 40 40 41 41 /* Lock the send communication. This is done through 42 42 * bit manipulation as opposed to a mutex in order to 43 - * accomodate lock and hold. See next comment. 43 + * accommodate lock and hold. See next comment. 44 44 */ 45 45 #define MD_CLUSTER_SEND_LOCK 4 46 46 /* If cluster operations (such as adding a disk) must lock the ··· 689 689 /* 690 690 * If resync thread run after raid1d thread, then process_metadata_update 691 691 * could not continue if raid1d held reconfig_mutex (and raid1d is blocked 692 - * since another node already got EX on Token and waitting the EX of Ack), 692 + * since another node already got EX on Token and waiting the EX of Ack), 693 693 * so let resync wake up thread in case flag is set. 694 694 */ 695 695 if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+229 -199
drivers/md/md.c
··· 368 368 static LIST_HEAD(all_mddevs); 369 369 static DEFINE_SPINLOCK(all_mddevs_lock); 370 370 371 - /* 372 - * iterates through all used mddevs in the system. 373 - * We take care to grab the all_mddevs_lock whenever navigating 374 - * the list, and to always hold a refcount when unlocked. 375 - * Any code which breaks out of this loop while own 376 - * a reference to the current mddev and must mddev_put it. 377 - */ 378 - #define for_each_mddev(_mddev,_tmp) \ 379 - \ 380 - for (({ spin_lock(&all_mddevs_lock); \ 381 - _tmp = all_mddevs.next; \ 382 - _mddev = NULL;}); \ 383 - ({ if (_tmp != &all_mddevs) \ 384 - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 385 - spin_unlock(&all_mddevs_lock); \ 386 - if (_mddev) mddev_put(_mddev); \ 387 - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 388 - _tmp != &all_mddevs;}); \ 389 - ({ spin_lock(&all_mddevs_lock); \ 390 - _tmp = _tmp->next;}) \ 391 - ) 392 - 393 371 /* Rather than calling directly into the personality make_request function, 394 372 * IO requests come here first so that we can check if the device is 395 373 * being suspended pending a reconfiguration. 
··· 442 464 return; 443 465 } 444 466 445 - blk_queue_split(&bio); 467 + bio = bio_split_to_limits(bio); 446 468 447 469 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 448 470 if (bio_sectors(bio) != 0) ··· 625 647 626 648 static inline struct mddev *mddev_get(struct mddev *mddev) 627 649 { 650 + lockdep_assert_held(&all_mddevs_lock); 651 + 652 + if (test_bit(MD_DELETED, &mddev->flags)) 653 + return NULL; 628 654 atomic_inc(&mddev->active); 629 655 return mddev; 630 656 } 631 657 632 658 static void mddev_delayed_delete(struct work_struct *ws); 633 659 634 - static void mddev_put(struct mddev *mddev) 660 + void mddev_put(struct mddev *mddev) 635 661 { 636 662 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 637 663 return; ··· 643 661 mddev->ctime == 0 && !mddev->hold_active) { 644 662 /* Array is not configured at all, and not held active, 645 663 * so destroy it */ 646 - list_del_init(&mddev->all_mddevs); 664 + set_bit(MD_DELETED, &mddev->flags); 647 665 648 666 /* 649 667 * Call queue_work inside the spinlock so that ··· 660 678 661 679 void mddev_init(struct mddev *mddev) 662 680 { 663 - kobject_init(&mddev->kobj, &md_ktype); 664 681 mutex_init(&mddev->open_mutex); 665 682 mutex_init(&mddev->reconfig_mutex); 666 683 mutex_init(&mddev->bitmap_info.mutex); ··· 714 733 return dev; 715 734 } 716 735 717 - static struct mddev *mddev_find(dev_t unit) 718 - { 719 - struct mddev *mddev; 720 - 721 - if (MAJOR(unit) != MD_MAJOR) 722 - unit &= ~((1 << MdpMinorShift) - 1); 723 - 724 - spin_lock(&all_mddevs_lock); 725 - mddev = mddev_find_locked(unit); 726 - if (mddev) 727 - mddev_get(mddev); 728 - spin_unlock(&all_mddevs_lock); 729 - 730 - return mddev; 731 - } 732 - 733 736 static struct mddev *mddev_alloc(dev_t unit) 734 737 { 735 738 struct mddev *new; ··· 754 789 spin_unlock(&all_mddevs_lock); 755 790 kfree(new); 756 791 return ERR_PTR(error); 792 + } 793 + 794 + static void mddev_free(struct mddev *mddev) 795 + { 796 + spin_lock(&all_mddevs_lock); 797 + 
list_del(&mddev->all_mddevs); 798 + spin_unlock(&all_mddevs_lock); 799 + 800 + kfree(mddev); 757 801 } 758 802 759 803 static const struct attribute_group md_redundancy_group; ··· 3309 3335 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3310 3336 } 3311 3337 3312 - static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 3338 + static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3313 3339 { 3314 3340 /* check if two start/length pairs overlap */ 3315 - if (s1+l1 <= s2) 3316 - return 0; 3317 - if (s2+l2 <= s1) 3318 - return 0; 3319 - return 1; 3341 + if (a->data_offset + a->sectors <= b->data_offset) 3342 + return false; 3343 + if (b->data_offset + b->sectors <= a->data_offset) 3344 + return false; 3345 + return true; 3346 + } 3347 + 3348 + static bool md_rdev_overlaps(struct md_rdev *rdev) 3349 + { 3350 + struct mddev *mddev; 3351 + struct md_rdev *rdev2; 3352 + 3353 + spin_lock(&all_mddevs_lock); 3354 + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3355 + if (test_bit(MD_DELETED, &mddev->flags)) 3356 + continue; 3357 + rdev_for_each(rdev2, mddev) { 3358 + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3359 + md_rdevs_overlap(rdev, rdev2)) { 3360 + spin_unlock(&all_mddevs_lock); 3361 + return true; 3362 + } 3363 + } 3364 + } 3365 + spin_unlock(&all_mddevs_lock); 3366 + return false; 3320 3367 } 3321 3368 3322 3369 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) ··· 3389 3394 return -EINVAL; /* component must fit device */ 3390 3395 3391 3396 rdev->sectors = sectors; 3392 - if (sectors > oldsectors && my_mddev->external) { 3393 - /* Need to check that all other rdevs with the same 3394 - * ->bdev do not overlap. 'rcu' is sufficient to walk 3395 - * the rdev lists safely. 3396 - * This check does not provide a hard guarantee, it 3397 - * just helps avoid dangerous mistakes. 3397 + 3398 + /* 3399 + * Check that all other rdevs with the same bdev do not overlap. 
This 3400 + * check does not provide a hard guarantee, it just helps avoid 3401 + * dangerous mistakes. 3402 + */ 3403 + if (sectors > oldsectors && my_mddev->external && 3404 + md_rdev_overlaps(rdev)) { 3405 + /* 3406 + * Someone else could have slipped in a size change here, but 3407 + * doing so is just silly. We put oldsectors back because we 3408 + * know it is safe, and trust userspace not to race with itself. 3398 3409 */ 3399 - struct mddev *mddev; 3400 - int overlap = 0; 3401 - struct list_head *tmp; 3402 - 3403 - rcu_read_lock(); 3404 - for_each_mddev(mddev, tmp) { 3405 - struct md_rdev *rdev2; 3406 - 3407 - rdev_for_each(rdev2, mddev) 3408 - if (rdev->bdev == rdev2->bdev && 3409 - rdev != rdev2 && 3410 - overlaps(rdev->data_offset, rdev->sectors, 3411 - rdev2->data_offset, 3412 - rdev2->sectors)) { 3413 - overlap = 1; 3414 - break; 3415 - } 3416 - if (overlap) { 3417 - mddev_put(mddev); 3418 - break; 3419 - } 3420 - } 3421 - rcu_read_unlock(); 3422 - if (overlap) { 3423 - /* Someone else could have slipped in a size 3424 - * change here, but doing so is just silly. 3425 - * We put oldsectors back because we *know* it is 3426 - * safe, and trust userspace not to race with 3427 - * itself 3428 - */ 3429 - rdev->sectors = oldsectors; 3430 - return -EBUSY; 3431 - } 3410 + rdev->sectors = oldsectors; 3411 + return -EBUSY; 3432 3412 } 3433 3413 return len; 3434 3414 } ··· 4800 4830 if (work_pending(&mddev->del_work)) 4801 4831 flush_workqueue(md_misc_wq); 4802 4832 if (mddev->sync_thread) { 4833 + sector_t save_rp = mddev->reshape_position; 4834 + 4835 + mddev_unlock(mddev); 4836 + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4837 + md_unregister_thread(&mddev->sync_thread); 4838 + mddev_lock_nointr(mddev); 4839 + /* 4840 + * set RECOVERY_INTR again and restore reshape 4841 + * position in case others changed them after 4842 + * got lock, eg, reshape_position_store and 4843 + * md_check_recovery. 
4844 + */ 4845 + mddev->reshape_position = save_rp; 4803 4846 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4804 4847 md_reap_sync_thread(mddev); 4805 4848 } ··· 4984 5001 sync_speed_show(struct mddev *mddev, char *page) 4985 5002 { 4986 5003 unsigned long resync, dt, db; 4987 - if (mddev->curr_resync == 0) 5004 + if (mddev->curr_resync == MD_RESYNC_NONE) 4988 5005 return sprintf(page, "none\n"); 4989 5006 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4990 5007 dt = (jiffies - mddev->resync_mark) / HZ; ··· 5003 5020 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5004 5021 return sprintf(page, "none\n"); 5005 5022 5006 - if (mddev->curr_resync == 1 || 5007 - mddev->curr_resync == 2) 5023 + if (mddev->curr_resync == MD_RESYNC_YIELDED || 5024 + mddev->curr_resync == MD_RESYNC_DELAYED) 5008 5025 return sprintf(page, "delayed\n"); 5009 5026 5010 5027 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || ··· 5515 5532 if (!entry->show) 5516 5533 return -EIO; 5517 5534 spin_lock(&all_mddevs_lock); 5518 - if (list_empty(&mddev->all_mddevs)) { 5535 + if (!mddev_get(mddev)) { 5519 5536 spin_unlock(&all_mddevs_lock); 5520 5537 return -EBUSY; 5521 5538 } 5522 - mddev_get(mddev); 5523 5539 spin_unlock(&all_mddevs_lock); 5524 5540 5525 5541 rv = entry->show(mddev, page); ··· 5539 5557 if (!capable(CAP_SYS_ADMIN)) 5540 5558 return -EACCES; 5541 5559 spin_lock(&all_mddevs_lock); 5542 - if (list_empty(&mddev->all_mddevs)) { 5560 + if (!mddev_get(mddev)) { 5543 5561 spin_unlock(&all_mddevs_lock); 5544 5562 return -EBUSY; 5545 5563 } 5546 - mddev_get(mddev); 5547 5564 spin_unlock(&all_mddevs_lock); 5548 5565 rv = entry->store(mddev, page, length); 5549 5566 mddev_put(mddev); 5550 5567 return rv; 5551 5568 } 5552 5569 5553 - static void md_free(struct kobject *ko) 5570 + static void md_kobj_release(struct kobject *ko) 5554 5571 { 5555 5572 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5556 5573 ··· 5558 5577 if (mddev->sysfs_level) 5559 5578 
sysfs_put(mddev->sysfs_level); 5560 5579 5561 - if (mddev->gendisk) { 5562 - del_gendisk(mddev->gendisk); 5563 - put_disk(mddev->gendisk); 5564 - } 5565 - percpu_ref_exit(&mddev->writes_pending); 5566 - 5567 - bioset_exit(&mddev->bio_set); 5568 - bioset_exit(&mddev->sync_set); 5569 - kfree(mddev); 5580 + del_gendisk(mddev->gendisk); 5581 + put_disk(mddev->gendisk); 5570 5582 } 5571 5583 5572 5584 static const struct sysfs_ops md_sysfs_ops = { ··· 5567 5593 .store = md_attr_store, 5568 5594 }; 5569 5595 static struct kobj_type md_ktype = { 5570 - .release = md_free, 5596 + .release = md_kobj_release, 5571 5597 .sysfs_ops = &md_sysfs_ops, 5572 5598 .default_groups = md_attr_groups, 5573 5599 }; ··· 5578 5604 { 5579 5605 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5580 5606 5581 - kobject_del(&mddev->kobj); 5582 5607 kobject_put(&mddev->kobj); 5583 5608 } 5584 5609 ··· 5596 5623 } 5597 5624 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5598 5625 5599 - static int md_alloc(dev_t dev, char *name) 5626 + struct mddev *md_alloc(dev_t dev, char *name) 5600 5627 { 5601 5628 /* 5602 5629 * If dev is zero, name is the name of a device to allocate with ··· 5624 5651 mutex_lock(&disks_mutex); 5625 5652 mddev = mddev_alloc(dev); 5626 5653 if (IS_ERR(mddev)) { 5627 - mutex_unlock(&disks_mutex); 5628 - return PTR_ERR(mddev); 5654 + error = PTR_ERR(mddev); 5655 + goto out_unlock; 5629 5656 } 5630 5657 5631 5658 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); ··· 5643 5670 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5644 5671 spin_unlock(&all_mddevs_lock); 5645 5672 error = -EEXIST; 5646 - goto out_unlock_disks_mutex; 5673 + goto out_free_mddev; 5647 5674 } 5648 5675 spin_unlock(&all_mddevs_lock); 5649 5676 } ··· 5656 5683 error = -ENOMEM; 5657 5684 disk = blk_alloc_disk(NUMA_NO_NODE); 5658 5685 if (!disk) 5659 - goto out_unlock_disks_mutex; 5686 + goto out_free_mddev; 5660 5687 5661 5688 disk->major = MAJOR(mddev->unit); 5662 5689 disk->first_minor = unit 
<< shift; ··· 5677 5704 mddev->gendisk = disk; 5678 5705 error = add_disk(disk); 5679 5706 if (error) 5680 - goto out_cleanup_disk; 5707 + goto out_put_disk; 5681 5708 5709 + kobject_init(&mddev->kobj, &md_ktype); 5682 5710 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5683 - if (error) 5684 - goto out_del_gendisk; 5711 + if (error) { 5712 + /* 5713 + * The disk is already live at this point. Clear the hold flag 5714 + * and let mddev_put take care of the deletion, as it isn't any 5715 + * different from a normal close on last release now. 5716 + */ 5717 + mddev->hold_active = 0; 5718 + mutex_unlock(&disks_mutex); 5719 + mddev_put(mddev); 5720 + return ERR_PTR(error); 5721 + } 5685 5722 5686 5723 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5687 5724 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5688 5725 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5689 - goto out_unlock_disks_mutex; 5690 - 5691 - out_del_gendisk: 5692 - del_gendisk(disk); 5693 - out_cleanup_disk: 5694 - put_disk(disk); 5695 - out_unlock_disks_mutex: 5696 5726 mutex_unlock(&disks_mutex); 5727 + return mddev; 5728 + 5729 + out_put_disk: 5730 + put_disk(disk); 5731 + out_free_mddev: 5732 + mddev_free(mddev); 5733 + out_unlock: 5734 + mutex_unlock(&disks_mutex); 5735 + return ERR_PTR(error); 5736 + } 5737 + 5738 + static int md_alloc_and_put(dev_t dev, char *name) 5739 + { 5740 + struct mddev *mddev = md_alloc(dev, name); 5741 + 5742 + if (IS_ERR(mddev)) 5743 + return PTR_ERR(mddev); 5697 5744 mddev_put(mddev); 5698 - return error; 5745 + return 0; 5699 5746 } 5700 5747 5701 5748 static void md_probe(dev_t dev) ··· 5723 5730 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5724 5731 return; 5725 5732 if (create_on_open) 5726 - md_alloc(dev, NULL); 5733 + md_alloc_and_put(dev, NULL); 5727 5734 } 5728 5735 5729 5736 static int add_named_array(const char *val, const struct kernel_param *kp) ··· 5745 5752 return -E2BIG; 5746 5753 
strscpy(buf, val, len+1); 5747 5754 if (strncmp(buf, "md_", 3) == 0) 5748 - return md_alloc(0, buf); 5755 + return md_alloc_and_put(0, buf); 5749 5756 if (strncmp(buf, "md", 2) == 0 && 5750 5757 isdigit(buf[2]) && 5751 5758 kstrtoul(buf+2, 10, &devnum) == 0 && 5752 5759 devnum <= MINORMASK) 5753 - return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); 5760 + return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5754 5761 5755 5762 return -EINVAL; 5756 5763 } ··· 6190 6197 flush_workqueue(md_misc_wq); 6191 6198 if (mddev->sync_thread) { 6192 6199 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6200 + md_unregister_thread(&mddev->sync_thread); 6193 6201 md_reap_sync_thread(mddev); 6194 6202 } 6195 6203 ··· 6238 6244 static void __md_stop(struct mddev *mddev) 6239 6245 { 6240 6246 struct md_personality *pers = mddev->pers; 6241 - md_bitmap_destroy(mddev); 6242 6247 mddev_detach(mddev); 6243 6248 /* Ensure ->event_work is done */ 6244 6249 if (mddev->event_work.func) 6245 6250 flush_workqueue(md_misc_wq); 6251 + md_bitmap_destroy(mddev); 6246 6252 spin_lock(&mddev->lock); 6247 6253 mddev->pers = NULL; 6248 6254 spin_unlock(&mddev->lock); ··· 6491 6497 break; 6492 6498 } 6493 6499 6494 - md_probe(dev); 6495 - mddev = mddev_find(dev); 6496 - if (!mddev) 6500 + mddev = md_alloc(dev, NULL); 6501 + if (IS_ERR(mddev)) 6497 6502 break; 6498 6503 6499 6504 if (mddev_lock(mddev)) ··· 7775 7782 7776 7783 static int md_open(struct block_device *bdev, fmode_t mode) 7777 7784 { 7778 - /* 7779 - * Succeed if we can lock the mddev, which confirms that 7780 - * it isn't being stopped right now. 
7781 - */ 7782 - struct mddev *mddev = mddev_find(bdev->bd_dev); 7785 + struct mddev *mddev; 7783 7786 int err; 7784 7787 7788 + spin_lock(&all_mddevs_lock); 7789 + mddev = mddev_get(bdev->bd_disk->private_data); 7790 + spin_unlock(&all_mddevs_lock); 7785 7791 if (!mddev) 7786 7792 return -ENODEV; 7787 7793 7788 - if (mddev->gendisk != bdev->bd_disk) { 7789 - /* we are racing with mddev_put which is discarding this 7790 - * bd_disk. 7791 - */ 7792 - mddev_put(mddev); 7793 - /* Wait until bdev->bd_disk is definitely gone */ 7794 - if (work_pending(&mddev->del_work)) 7795 - flush_workqueue(md_misc_wq); 7796 - return -EBUSY; 7797 - } 7798 - BUG_ON(mddev != bdev->bd_disk->private_data); 7799 - 7800 - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7794 + err = mutex_lock_interruptible(&mddev->open_mutex); 7795 + if (err) 7801 7796 goto out; 7802 7797 7803 - if (test_bit(MD_CLOSING, &mddev->flags)) { 7804 - mutex_unlock(&mddev->open_mutex); 7805 - err = -ENODEV; 7806 - goto out; 7807 - } 7798 + err = -ENODEV; 7799 + if (test_bit(MD_CLOSING, &mddev->flags)) 7800 + goto out_unlock; 7808 7801 7809 - err = 0; 7810 7802 atomic_inc(&mddev->openers); 7811 7803 mutex_unlock(&mddev->open_mutex); 7812 7804 7813 7805 bdev_check_media_change(bdev); 7814 - out: 7815 - if (err) 7816 - mddev_put(mddev); 7806 + return 0; 7807 + 7808 + out_unlock: 7809 + mutex_unlock(&mddev->open_mutex); 7810 + out: 7811 + mddev_put(mddev); 7817 7812 return err; 7818 7813 } 7819 7814 ··· 7825 7844 return ret; 7826 7845 } 7827 7846 7847 + static void md_free_disk(struct gendisk *disk) 7848 + { 7849 + struct mddev *mddev = disk->private_data; 7850 + 7851 + percpu_ref_exit(&mddev->writes_pending); 7852 + bioset_exit(&mddev->bio_set); 7853 + bioset_exit(&mddev->sync_set); 7854 + 7855 + mddev_free(mddev); 7856 + } 7857 + 7828 7858 const struct block_device_operations md_fops = 7829 7859 { 7830 7860 .owner = THIS_MODULE, ··· 7849 7857 .getgeo = md_getgeo, 7850 7858 .check_events = md_check_events, 
7851 7859 .set_read_only = md_set_read_only, 7860 + .free_disk = md_free_disk, 7852 7861 }; 7853 7862 7854 7863 static int md_thread(void *arg) ··· 8011 8018 max_sectors = mddev->dev_sectors; 8012 8019 8013 8020 resync = mddev->curr_resync; 8014 - if (resync <= 3) { 8021 + if (resync < MD_RESYNC_ACTIVE) { 8015 8022 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8016 8023 /* Still cleaning up */ 8017 8024 resync = max_sectors; 8018 - } else if (resync > max_sectors) 8025 + } else if (resync > max_sectors) { 8019 8026 resync = max_sectors; 8020 - else 8027 + } else { 8021 8028 resync -= atomic_read(&mddev->recovery_active); 8029 + if (resync < MD_RESYNC_ACTIVE) { 8030 + /* 8031 + * Resync has started, but the subtraction has 8032 + * yielded one of the special values. Force it 8033 + * to active to ensure the status reports an 8034 + * active resync. 8035 + */ 8036 + resync = MD_RESYNC_ACTIVE; 8037 + } 8038 + } 8022 8039 8023 - if (resync == 0) { 8040 + if (resync == MD_RESYNC_NONE) { 8024 8041 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8025 8042 struct md_rdev *rdev; 8026 8043 ··· 8054 8051 } 8055 8052 return 0; 8056 8053 } 8057 - if (resync < 3) { 8054 + if (resync < MD_RESYNC_ACTIVE) { 8058 8055 seq_printf(seq, "\tresync=DELAYED"); 8059 8056 return 1; 8060 8057 } ··· 8155 8152 if (!l--) { 8156 8153 mddev = list_entry(tmp, struct mddev, all_mddevs); 8157 - mddev_get(mddev); 8155 + if (!mddev_get(mddev)) 8156 + continue; 8158 8157 spin_unlock(&all_mddevs_lock); 8159 8158 return mddev; 8160 8159 } ··· 8170 8165 { 8171 8166 struct list_head *tmp; 8172 8167 struct mddev *next_mddev, *mddev = v; 8168 + struct mddev *to_put = NULL; 8173 8169 8174 8170 ++*pos; 8175 8171 if (v == (void*)2) 8176 8172 return NULL; 8177 8173 8178 8174 spin_lock(&all_mddevs_lock); 8179 - if (v == (void*)1) 8175 + if (v == (void*)1) { 8180 8176 tmp = all_mddevs.next; 8181 - else 8177 + } else { 8178 + to_put = mddev; 8182 8179 tmp = mddev->all_mddevs.next; 8183 - if (tmp !=
&all_mddevs) 8184 - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 8185 - else { 8186 - next_mddev = (void*)2; 8187 - *pos = 0x10000; 8180 + } 8181 + 8182 + for (;;) { 8183 + if (tmp == &all_mddevs) { 8184 + next_mddev = (void*)2; 8185 + *pos = 0x10000; 8186 + break; 8187 + } 8188 + next_mddev = list_entry(tmp, struct mddev, all_mddevs); 8189 + if (mddev_get(next_mddev)) 8190 + break; 8191 + mddev = next_mddev; 8192 + tmp = mddev->all_mddevs.next; 8188 8193 } 8189 8194 spin_unlock(&all_mddevs_lock); 8190 8195 8191 - if (v != (void*)1) 8196 + if (to_put) 8192 8197 mddev_put(mddev); 8193 8198 return next_mddev; 8194 8199 ··· 8697 8682 unsigned long update_time; 8698 8683 sector_t mark_cnt[SYNC_MARKS]; 8699 8684 int last_mark,m; 8700 - struct list_head *tmp; 8701 8685 sector_t last_check; 8702 8686 int skipped = 0; 8703 8687 struct md_rdev *rdev; ··· 8743 8729 8744 8730 mddev->last_sync_action = action ?: desc; 8745 8731 8746 - /* we overload curr_resync somewhat here. 8747 - * 0 == not engaged in resync at all 8748 - * 2 == checking that there is no conflict with another sync 8749 - * 1 == like 2, but have yielded to allow conflicting resync to 8750 - * commence 8751 - * other == active in resync - this many blocks 8752 - * 8732 + /* 8753 8733 * Before starting a resync we must have set curr_resync to 8754 8734 * 2, and then checked that every "conflicting" array has curr_resync 8755 8735 * less than ours. 
When we find one that is the same or higher ··· 8755 8747 8756 8748 do { 8757 8749 int mddev2_minor = -1; 8758 - mddev->curr_resync = 2; 8750 + mddev->curr_resync = MD_RESYNC_DELAYED; 8759 8751 8760 8752 try_again: 8761 8753 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8762 8754 goto skip; 8763 - for_each_mddev(mddev2, tmp) { 8755 + spin_lock(&all_mddevs_lock); 8756 + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8757 + if (test_bit(MD_DELETED, &mddev2->flags)) 8758 + continue; 8764 8759 if (mddev2 == mddev) 8765 8760 continue; 8766 8761 if (!mddev->parallel_resync 8767 8762 && mddev2->curr_resync 8768 8763 && match_mddev_units(mddev, mddev2)) { 8769 8764 DEFINE_WAIT(wq); 8770 - if (mddev < mddev2 && mddev->curr_resync == 2) { 8765 + if (mddev < mddev2 && 8766 + mddev->curr_resync == MD_RESYNC_DELAYED) { 8771 8767 /* arbitrarily yield */ 8772 - mddev->curr_resync = 1; 8768 + mddev->curr_resync = MD_RESYNC_YIELDED; 8773 8769 wake_up(&resync_wait); 8774 8770 } 8775 - if (mddev > mddev2 && mddev->curr_resync == 1) 8771 + if (mddev > mddev2 && 8772 + mddev->curr_resync == MD_RESYNC_YIELDED) 8776 8773 /* no need to wait here, we can wait the next 8777 8774 * time 'round when curr_resync == 2 8778 8775 */ ··· 8795 8782 desc, mdname(mddev), 8796 8783 mdname(mddev2)); 8797 8784 } 8798 - mddev_put(mddev2); 8785 + spin_unlock(&all_mddevs_lock); 8786 + 8799 8787 if (signal_pending(current)) 8800 8788 flush_signals(current); 8801 8789 schedule(); ··· 8806 8792 finish_wait(&resync_wait, &wq); 8807 8793 } 8808 8794 } 8809 - } while (mddev->curr_resync < 2); 8795 + spin_unlock(&all_mddevs_lock); 8796 + } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8810 8797 8811 8798 j = 0; 8812 8799 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { ··· 8891 8876 desc, mdname(mddev)); 8892 8877 mddev->curr_resync = j; 8893 8878 } else 8894 - mddev->curr_resync = 3; /* no longer delayed */ 8879 + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8895 8880 
mddev->curr_resync_completed = j; 8896 8881 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8897 8882 md_new_event(); ··· 9026 9011 9027 9012 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9028 9013 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9029 - mddev->curr_resync > 3) { 9014 + mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9030 9015 mddev->curr_resync_completed = mddev->curr_resync; 9031 9016 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9032 9017 } 9033 9018 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9034 9019 9035 9020 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9036 - mddev->curr_resync > 3) { 9021 + mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9037 9022 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9038 9023 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9039 9024 if (mddev->curr_resync >= mddev->recovery_cp) { ··· 9097 9082 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9098 9083 mddev->resync_min = mddev->curr_resync_completed; 9099 9084 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9100 - mddev->curr_resync = 0; 9085 + mddev->curr_resync = MD_RESYNC_NONE; 9101 9086 spin_unlock(&mddev->lock); 9102 9087 9103 9088 wake_up(&resync_wait); ··· 9318 9303 * ->spare_active and clear saved_raid_disk 9319 9304 */ 9320 9305 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9306 + md_unregister_thread(&mddev->sync_thread); 9321 9307 md_reap_sync_thread(mddev); 9322 9308 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9323 9309 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ··· 9354 9338 goto unlock; 9355 9339 } 9356 9340 if (mddev->sync_thread) { 9341 + md_unregister_thread(&mddev->sync_thread); 9357 9342 md_reap_sync_thread(mddev); 9358 9343 goto unlock; 9359 9344 } ··· 9434 9417 sector_t old_dev_sectors = mddev->dev_sectors; 9435 9418 bool is_reshaped = false; 9436 9419 9437 - /* resync has finished, collect result */ 9438 - md_unregister_thread(&mddev->sync_thread); 9420 + /* sync_thread should be 
unregistered, collect result */ 9439 9421 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9440 9422 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9441 9423 mddev->degraded != mddev->raid_disks) { ··· 9482 9466 wake_up(&resync_wait); 9483 9467 /* flag recovery needed just to double check */ 9484 9468 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9469 + sysfs_notify_dirent_safe(mddev->sysfs_completed); 9485 9470 sysfs_notify_dirent_safe(mddev->sysfs_action); 9486 9471 md_new_event(); 9487 9472 if (mddev->event_work.func) ··· 9561 9544 static int md_notify_reboot(struct notifier_block *this, 9562 9545 unsigned long code, void *x) 9563 9546 { 9564 - struct list_head *tmp; 9565 - struct mddev *mddev; 9547 + struct mddev *mddev, *n; 9566 9548 int need_delay = 0; 9567 9549 9568 - for_each_mddev(mddev, tmp) { 9550 + spin_lock(&all_mddevs_lock); 9551 + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9552 + if (!mddev_get(mddev)) 9553 + continue; 9554 + spin_unlock(&all_mddevs_lock); 9569 9555 if (mddev_trylock(mddev)) { 9570 9556 if (mddev->pers) 9571 9557 __md_stop_writes(mddev); ··· 9577 9557 mddev_unlock(mddev); 9578 9558 } 9579 9559 need_delay = 1; 9560 + mddev_put(mddev); 9561 + spin_lock(&all_mddevs_lock); 9580 9562 } 9563 + spin_unlock(&all_mddevs_lock); 9564 + 9581 9565 /* 9582 9566 * certain more exotic SCSI devices are known to be 9583 9567 * volatile wrt too early system reboots. 
While the ··· 9900 9876 9901 9877 static __exit void md_exit(void) 9902 9878 { 9903 - struct mddev *mddev; 9904 - struct list_head *tmp; 9879 + struct mddev *mddev, *n; 9905 9880 int delay = 1; 9906 9881 9907 9882 unregister_blkdev(MD_MAJOR,"md"); ··· 9920 9897 } 9921 9898 remove_proc_entry("mdstat", NULL); 9922 9899 9923 - for_each_mddev(mddev, tmp) { 9900 + spin_lock(&all_mddevs_lock); 9901 + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9902 + if (!mddev_get(mddev)) 9903 + continue; 9904 + spin_unlock(&all_mddevs_lock); 9924 9905 export_array(mddev); 9925 9906 mddev->ctime = 0; 9926 9907 mddev->hold_active = 0; 9927 9908 /* 9928 - * for_each_mddev() will call mddev_put() at the end of each 9929 - * iteration. As the mddev is now fully clear, this will 9930 - * schedule the mddev for destruction by a workqueue, and the 9909 + * As the mddev is now fully clear, mddev_put will schedule 9910 + * the mddev for destruction by a workqueue, and the 9931 9911 * destroy_workqueue() below will wait for that to complete. 9932 9912 */ 9913 + mddev_put(mddev); 9914 + spin_lock(&all_mddevs_lock); 9933 9915 } 9916 + spin_unlock(&all_mddevs_lock); 9917 + 9934 9918 destroy_workqueue(md_rdev_misc_wq); 9935 9919 destroy_workqueue(md_misc_wq); 9936 9920 destroy_workqueue(md_wq);
+19
drivers/md/md.h
··· 254 254 * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that 255 255 * array is ready yet. 256 256 * @MD_BROKEN: This is used to stop writes and mark array as failed. 257 + * @MD_DELETED: This device is being deleted 257 258 * 258 259 * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added 259 260 */ ··· 271 270 MD_UPDATING_SB, 272 271 MD_NOT_READY, 273 272 MD_BROKEN, 273 + MD_DELETED, 274 274 }; 275 275 276 276 enum mddev_sb_flags { ··· 288 286 sector_t start; /* start sector of rb node */ 289 287 sector_t last; /* end sector of rb node */ 290 288 sector_t _subtree_last; /* highest sector in subtree of rb node */ 289 + }; 290 + 291 + /* 292 + * mddev->curr_resync stores the current sector of the resync but 293 + * also has some overloaded values. 294 + */ 295 + enum { 296 + /* No resync in progress */ 297 + MD_RESYNC_NONE = 0, 298 + /* Yielded to allow another conflicting resync to commence */ 299 + MD_RESYNC_YIELDED = 1, 300 + /* Delayed to check that there is no conflict with another sync */ 301 + MD_RESYNC_DELAYED = 2, 302 + /* Any value greater than or equal to this is in an active resync */ 303 + MD_RESYNC_ACTIVE = 3, 291 304 }; 292 305 293 306 struct mddev { ··· 767 750 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 768 751 769 752 extern void mddev_init(struct mddev *mddev); 753 + struct mddev *md_alloc(dev_t dev, char *name); 754 + void mddev_put(struct mddev *mddev); 770 755 extern int md_run(struct mddev *mddev); 771 756 extern int md_start(struct mddev *mddev); 772 757 extern void md_stop(struct mddev *mddev);
+4 -1
drivers/md/raid10.c
··· 2167 2167 int err = 0; 2168 2168 int number = rdev->raid_disk; 2169 2169 struct md_rdev **rdevp; 2170 - struct raid10_info *p = conf->mirrors + number; 2170 + struct raid10_info *p; 2171 2171 2172 2172 print_conf(conf); 2173 + if (unlikely(number >= mddev->raid_disks)) 2174 + return 0; 2175 + p = conf->mirrors + number; 2173 2176 if (rdev == p->rdev) 2174 2177 rdevp = &p->rdev; 2175 2178 else if (rdev == p->replacement)
+19 -21
drivers/md/raid5-cache.c
··· 1590 1590 1591 1591 bool r5l_log_disk_error(struct r5conf *conf) 1592 1592 { 1593 - struct r5l_log *log; 1594 - bool ret; 1595 - /* don't allow write if journal disk is missing */ 1596 - rcu_read_lock(); 1597 - log = rcu_dereference(conf->log); 1593 + struct r5l_log *log = conf->log; 1598 1594 1595 + /* don't allow write if journal disk is missing */ 1599 1596 if (!log) 1600 - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1597 + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1601 1598 else 1602 - ret = test_bit(Faulty, &log->rdev->flags); 1603 - rcu_read_unlock(); 1604 - return ret; 1599 + return test_bit(Faulty, &log->rdev->flags); 1605 1600 } 1606 1601 1607 1602 #define R5L_RECOVERY_PAGE_POOL_SIZE 256 ··· 2529 2534 struct r5conf *conf; 2530 2535 int ret; 2531 2536 2532 - spin_lock(&mddev->lock); 2537 + ret = mddev_lock(mddev); 2538 + if (ret) 2539 + return ret; 2540 + 2533 2541 conf = mddev->private; 2534 - if (!conf || !conf->log) { 2535 - spin_unlock(&mddev->lock); 2536 - return 0; 2537 - } 2542 + if (!conf || !conf->log) 2543 + goto out_unlock; 2538 2544 2539 2545 switch (conf->log->r5c_journal_mode) { 2540 2546 case R5C_JOURNAL_MODE_WRITE_THROUGH: ··· 2553 2557 default: 2554 2558 ret = 0; 2555 2559 } 2556 - spin_unlock(&mddev->lock); 2560 + 2561 + out_unlock: 2562 + mddev_unlock(mddev); 2557 2563 return ret; 2558 2564 } 2559 2565 ··· 2637 2639 int i; 2638 2640 struct r5dev *dev; 2639 2641 int to_cache = 0; 2640 - void **pslot; 2642 + void __rcu **pslot; 2641 2643 sector_t tree_index; 2642 2644 int ret; 2643 2645 uintptr_t refcount; ··· 2804 2806 int i; 2805 2807 int do_wakeup = 0; 2806 2808 sector_t tree_index; 2807 - void **pslot; 2809 + void __rcu **pslot; 2808 2810 uintptr_t refcount; 2809 2811 2810 2812 if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) ··· 3143 3145 spin_lock_init(&log->stripe_in_journal_lock); 3144 3146 atomic_set(&log->stripe_in_journal_count, 0); 3145 3147 3146 - rcu_assign_pointer(conf->log, log); 3148 
+ conf->log = log; 3147 3149 3148 3150 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 3149 3151 return 0; ··· 3165 3167 { 3166 3168 struct r5l_log *log = conf->log; 3167 3169 3168 - conf->log = NULL; 3169 - synchronize_rcu(); 3170 - 3171 3170 /* Ensure disable_writeback_work wakes up and exits */ 3172 3171 wake_up(&conf->mddev->sb_wait); 3173 3172 flush_work(&log->disable_writeback_work); 3174 3173 md_unregister_thread(&log->reclaim_thread); 3174 + 3175 + conf->log = NULL; 3176 + 3175 3177 mempool_exit(&log->meta_pool); 3176 3178 bioset_exit(&log->bs); 3177 3179 mempool_exit(&log->io_pool);
+37 -40
drivers/md/raid5-log.h
···
 #ifndef _RAID5_LOG_H
 #define _RAID5_LOG_H

-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5conf *conf);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int quiesce);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
-		      struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
-			    struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-					 struct stripe_head *sh, int disks);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+void r5l_exit_log(struct r5conf *conf);
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+void r5l_write_stripe_run(struct r5l_log *log);
+void r5l_flush_stripe_to_raid(struct r5l_log *log);
+void r5l_stripe_write_finished(struct stripe_head *sh);
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+void r5l_quiesce(struct r5l_log *log, int quiesce);
+bool r5l_log_disk_error(struct r5conf *conf);
+bool r5c_is_writeback(struct r5l_log *log);
+int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+			  struct stripe_head_state *s, int disks);
+void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+				 struct stripe_head_state *s);
+void r5c_release_extra_page(struct stripe_head *sh);
+void r5c_use_extra_page(struct stripe_head *sh);
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+				  struct stripe_head *sh, int disks);
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+void r5c_make_stripe_write_out(struct stripe_head *sh);
+void r5c_flush_cache(struct r5conf *conf, int num);
+void r5c_check_stripe_cache_usage(struct r5conf *conf);
+void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev,
-				     struct md_rdev *rdev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
-extern int r5l_start(struct r5l_log *log);
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev);
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+int r5l_start(struct r5l_log *log);

-extern struct dma_async_tx_descriptor *
+struct dma_async_tx_descriptor *
 ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
 		       struct dma_async_tx_descriptor *tx);
-extern int ppl_init_log(struct r5conf *conf);
-extern void ppl_exit_log(struct r5conf *conf);
-extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
-extern void ppl_write_stripe_run(struct r5conf *conf);
-extern void ppl_stripe_write_finished(struct stripe_head *sh);
-extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
-extern void ppl_quiesce(struct r5conf *conf, int quiesce);
-extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+int ppl_init_log(struct r5conf *conf);
+void ppl_exit_log(struct r5conf *conf);
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+void ppl_write_stripe_run(struct r5conf *conf);
+void ppl_stripe_write_finished(struct stripe_head *sh);
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+void ppl_quiesce(struct r5conf *conf, int quiesce);
+int ppl_handle_flush_request(struct bio *bio);
 extern struct md_sysfs_entry ppl_write_hint;

 static inline bool raid5_has_log(struct r5conf *conf)
···
 	if (conf->log)
 		ret = r5l_handle_flush_request(conf->log, bio);
 	else if (raid5_has_ppl(conf))
-		ret = ppl_handle_flush_request(conf->log, bio);
+		ret = ppl_handle_flush_request(bio);

 	return ret;
 }
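The bulk of this hunk simply drops the `extern` keyword from the function prototypes (plus the unused `r5l_log` argument of `ppl_handle_flush_request()`). A minimal standalone sketch of why that is purely cosmetic — in C, function declarations have external linkage by default, so the two prototype styles below are equivalent (the function name here is made up for illustration):

```c
#include <stddef.h>

/* Both declarations mean exactly the same thing to the compiler. */
extern int r5l_start_example(int x);	/* old style, redundant extern */
int r5l_start_example(int x);		/* new style preferred by the patch */

int r5l_start_example(int x)
{
	return x + 1;	/* trivial body so the sketch is runnable */
}
```

The only behavioural change in the hunk is the narrowed `ppl_handle_flush_request()` signature; everything else is declaration cleanup.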
+1 -1
drivers/md/raid5-ppl.c
···
 	}
 }

-int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+int ppl_handle_flush_request(struct bio *bio)
 {
 	if (bio->bi_iter.bi_size == 0) {
 		bio_endio(bio);
+486 -249
drivers/md/raid5.c
···
 #define cpu_to_group(cpu) cpu_to_node(cpu)
 #define ANY_GROUP NUMA_NO_NODE

+#define RAID5_MAX_REQ_STRIPES 256
+
 static bool devices_handle_discard_safely = false;
 module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
···
 	return NULL;
 }

+static struct stripe_head *find_get_stripe(struct r5conf *conf,
+		sector_t sector, short generation, int hash)
+{
+	int inc_empty_inactive_list_flag;
+	struct stripe_head *sh;
+
+	sh = __find_stripe(conf, sector, generation);
+	if (!sh)
+		return NULL;
+
+	if (atomic_inc_not_zero(&sh->count))
+		return sh;
+
+	/*
+	 * Slow path. The reference count is zero which means the stripe must
+	 * be on a list (sh->lru). Must remove the stripe from the list that
+	 * references it with the device_lock held.
+	 */
+
+	spin_lock(&conf->device_lock);
+	if (!atomic_read(&sh->count)) {
+		if (!test_bit(STRIPE_HANDLE, &sh->state))
+			atomic_inc(&conf->active_stripes);
+		BUG_ON(list_empty(&sh->lru) &&
+		       !test_bit(STRIPE_EXPANDING, &sh->state));
+		inc_empty_inactive_list_flag = 0;
+		if (!list_empty(conf->inactive_list + hash))
+			inc_empty_inactive_list_flag = 1;
+		list_del_init(&sh->lru);
+		if (list_empty(conf->inactive_list + hash) &&
+		    inc_empty_inactive_list_flag)
+			atomic_inc(&conf->empty_inactive_list_nr);
+		if (sh->group) {
+			sh->group->stripes_cnt--;
+			sh->group = NULL;
+		}
+	}
+	atomic_inc(&sh->count);
+	spin_unlock(&conf->device_lock);
+
+	return sh;
+}
+
 /*
  * Need to check if array has failed when deciding whether to:
  *  - start an array
···
 	return degraded > conf->max_degraded;
 }

-struct stripe_head *
-raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-			int previous, int noblock, int noquiesce)
+enum stripe_result {
+	STRIPE_SUCCESS = 0,
+	STRIPE_RETRY,
+	STRIPE_SCHEDULE_AND_RETRY,
+	STRIPE_FAIL,
+};
+
+struct stripe_request_ctx {
+	/* a reference to the last stripe_head for batching */
+	struct stripe_head *batch_last;
+
+	/* first sector in the request */
+	sector_t first_sector;
+
+	/* last sector in the request */
+	sector_t last_sector;
+
+	/*
+	 * bitmap to track stripe sectors that have been added to stripes
+	 * add one to account for unaligned requests
+	 */
+	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
+
+	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+	bool do_flush;
+};
+
+/*
+ * Block until another thread clears R5_INACTIVE_BLOCKED or
+ * there are fewer than 3/4 the maximum number of active stripes
+ * and there is an inactive stripe available.
+ */
+static bool is_inactive_blocked(struct r5conf *conf, int hash)
+{
+	int active = atomic_read(&conf->active_stripes);
+
+	if (list_empty(conf->inactive_list + hash))
+		return false;
+
+	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		return true;
+
+	return active < (conf->max_nr_stripes * 3 / 4);
+}
+
+static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
+		struct stripe_request_ctx *ctx, sector_t sector,
+		bool previous, bool noblock, bool noquiesce)
 {
 	struct stripe_head *sh;
 	int hash = stripe_hash_locks_hash(conf, sector);
-	int inc_empty_inactive_list_flag;

 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

 	spin_lock_irq(conf->hash_locks + hash);

-	do {
-		wait_event_lock_irq(conf->wait_for_quiescent,
-				    conf->quiesce == 0 || noquiesce,
-				    *(conf->hash_locks + hash));
-		sh = __find_stripe(conf, sector, conf->generation - previous);
-		if (!sh) {
-			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
-				sh = get_free_stripe(conf, hash);
-				if (!sh && !test_bit(R5_DID_ALLOC,
-						     &conf->cache_state))
-					set_bit(R5_ALLOC_MORE,
-						&conf->cache_state);
-			}
-			if (noblock && sh == NULL)
-				break;
-
-			r5c_check_stripe_cache_usage(conf);
-			if (!sh) {
-				set_bit(R5_INACTIVE_BLOCKED,
-					&conf->cache_state);
-				r5l_wake_reclaim(conf->log, 0);
-				wait_event_lock_irq(
-					conf->wait_for_stripe,
-					!list_empty(conf->inactive_list + hash) &&
-					(atomic_read(&conf->active_stripes)
-						< (conf->max_nr_stripes * 3 / 4)
-					 || !test_bit(R5_INACTIVE_BLOCKED,
-						      &conf->cache_state)),
-					*(conf->hash_locks + hash));
-				clear_bit(R5_INACTIVE_BLOCKED,
-					  &conf->cache_state);
-			} else {
-				init_stripe(sh, sector, previous);
-				atomic_inc(&sh->count);
-			}
-		} else if (!atomic_inc_not_zero(&sh->count)) {
-			spin_lock(&conf->device_lock);
-			if (!atomic_read(&sh->count)) {
-				if (!test_bit(STRIPE_HANDLE, &sh->state))
-					atomic_inc(&conf->active_stripes);
-				BUG_ON(list_empty(&sh->lru) &&
-				       !test_bit(STRIPE_EXPANDING, &sh->state));
-				inc_empty_inactive_list_flag = 0;
-				if (!list_empty(conf->inactive_list + hash))
-					inc_empty_inactive_list_flag = 1;
-				list_del_init(&sh->lru);
-				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
-					atomic_inc(&conf->empty_inactive_list_nr);
-				if (sh->group) {
-					sh->group->stripes_cnt--;
-					sh->group = NULL;
-				}
-			}
-			atomic_inc(&sh->count);
-			spin_unlock(&conf->device_lock);
-		}
-	} while (sh == NULL);
+retry:
+	if (!noquiesce && conf->quiesce) {
+		/*
+		 * Must release the reference to batch_last before waiting,
+		 * on quiesce, otherwise the batch_last will hold a reference
+		 * to a stripe and raid5_quiesce() will deadlock waiting for
+		 * active_stripes to go to zero.
+		 */
+		if (ctx && ctx->batch_last) {
+			raid5_release_stripe(ctx->batch_last);
+			ctx->batch_last = NULL;
+		}

+		wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
+				    *(conf->hash_locks + hash));
+	}
+
+	sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
+	if (sh)
+		goto out;
+
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		goto wait_for_stripe;
+
+	sh = get_free_stripe(conf, hash);
+	if (sh) {
+		r5c_check_stripe_cache_usage(conf);
+		init_stripe(sh, sector, previous);
+		atomic_inc(&sh->count);
+		goto out;
+	}
+
+	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
+		set_bit(R5_ALLOC_MORE, &conf->cache_state);
+
+wait_for_stripe:
+	if (noblock)
+		goto out;
+
+	set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+	r5l_wake_reclaim(conf->log, 0);
+	wait_event_lock_irq(conf->wait_for_stripe,
+			    is_inactive_blocked(conf, hash),
+			    *(conf->hash_locks + hash));
+	clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+	goto retry;
+
+out:
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
+}
+
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
+		sector_t sector, bool previous, bool noblock, bool noquiesce)
+{
+	return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
+					 noquiesce);
 }

 static bool is_full_stripe_write(struct stripe_head *sh)
···
 }

 /* we only do back search */
-static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+static void stripe_add_to_batch_list(struct r5conf *conf,
+		struct stripe_head *sh, struct stripe_head *last_sh)
 {
 	struct stripe_head *head;
 	sector_t head_sector, tmp_sec;
 	int hash;
 	int dd_idx;
-	int inc_empty_inactive_list_flag;

 	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 	tmp_sec = sh->sector;
···
 		return;
 	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);

-	hash = stripe_hash_locks_hash(conf, head_sector);
-	spin_lock_irq(conf->hash_locks + hash);
-	head = __find_stripe(conf, head_sector, conf->generation);
-	if (head && !atomic_inc_not_zero(&head->count)) {
-		spin_lock(&conf->device_lock);
-		if (!atomic_read(&head->count)) {
-			if (!test_bit(STRIPE_HANDLE, &head->state))
-				atomic_inc(&conf->active_stripes);
-			BUG_ON(list_empty(&head->lru) &&
-			       !test_bit(STRIPE_EXPANDING, &head->state));
-			inc_empty_inactive_list_flag = 0;
-			if (!list_empty(conf->inactive_list + hash))
-				inc_empty_inactive_list_flag = 1;
-			list_del_init(&head->lru);
-			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
-				atomic_inc(&conf->empty_inactive_list_nr);
-			if (head->group) {
-				head->group->stripes_cnt--;
-				head->group = NULL;
-			}
-		}
+	if (last_sh && head_sector == last_sh->sector) {
+		head = last_sh;
 		atomic_inc(&head->count);
-		spin_unlock(&conf->device_lock);
+	} else {
+		hash = stripe_hash_locks_hash(conf, head_sector);
+		spin_lock_irq(conf->hash_locks + hash);
+		head = find_get_stripe(conf, head_sector, conf->generation,
+				       hash);
+		spin_unlock_irq(conf->hash_locks + hash);
+		if (!head)
+			return;
+		if (!stripe_can_batch(head))
+			goto out;
 	}
-	spin_unlock_irq(conf->hash_locks + hash);
-
-	if (!head)
-		return;
-	if (!stripe_can_batch(head))
-		goto out;

 	lock_two_stripes(head, sh);
 	/* clear_batch_ready clear the flag */
···
 		if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	raid5_release_stripe(sh);

 	if (sh->batch_head && sh != sh->batch_head)
 		raid5_release_stripe(sh->batch_head);
+	raid5_release_stripe(sh);
 }

 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
···
 		 s->locked, s->ops_request);
 }

-/*
- * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
-			  int forwrite, int previous)
+static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
+				int dd_idx, int forwrite)
 {
-	struct bio **bip;
 	struct r5conf *conf = sh->raid_conf;
-	int firstwrite=0;
+	struct bio **bip;

-	pr_debug("adding bi b#%llu to stripe s#%llu\n",
-		(unsigned long long)bi->bi_iter.bi_sector,
-		(unsigned long long)sh->sector);
+	pr_debug("checking bi b#%llu to stripe s#%llu\n",
+		 bi->bi_iter.bi_sector, sh->sector);

-	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
-		goto overlap;
-	if (forwrite) {
+		return true;
+
+	if (forwrite)
 		bip = &sh->dev[dd_idx].towrite;
-		if (*bip == NULL)
-			firstwrite = 1;
-	} else
+	else
 		bip = &sh->dev[dd_idx].toread;
+
 	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
 		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
-			goto overlap;
-		bip = & (*bip)->bi_next;
+			return true;
+		bip = &(*bip)->bi_next;
 	}
+
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
-		goto overlap;
+		return true;

 	if (forwrite && raid5_has_ppl(conf)) {
 		/*
···
 		}

 		if (first + conf->chunk_sectors * (count - 1) != last)
-			goto overlap;
+			return true;
 	}
+
+	return false;
+}
+
+static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+			     int dd_idx, int forwrite, int previous)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct bio **bip;
+	int firstwrite = 0;
+
+	if (forwrite) {
+		bip = &sh->dev[dd_idx].towrite;
+		if (!*bip)
+			firstwrite = 1;
+	} else {
+		bip = &sh->dev[dd_idx].toread;
+	}
+
+	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
+		bip = &(*bip)->bi_next;

 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
···
 			sh->overwrite_disks++;
 	}

-	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
-		(unsigned long long)(*bip)->bi_iter.bi_sector,
-		(unsigned long long)sh->sector, dd_idx);
+	pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
+		 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
+		 sh->dev[dd_idx].sector);

 	if (conf->mddev->bitmap && firstwrite) {
 		/* Cannot hold spinlock over bitmap_startwrite,
···
 		 * we have added to the bitmap and set bm_seq.
 		 * So set STRIPE_BITMAP_PENDING to prevent
 		 * batching.
-		 * If multiple add_stripe_bio() calls race here they
+		 * If multiple __add_stripe_bio() calls race here they
 		 * much all set STRIPE_BITMAP_PENDING.  So only the first one
 		 * to complete "bitmap_startwrite" gets to set
 		 * STRIPE_BIT_DELAY.  This is important as once a stripe
···
 			set_bit(STRIPE_BIT_DELAY, &sh->state);
 		}
 	}
-	spin_unlock_irq(&sh->stripe_lock);
+}

-	if (stripe_can_batch(sh))
-		stripe_add_to_batch_list(conf, sh);
-	return 1;
+/*
+ * Each stripe/dev can have one or more bios attached.
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+			   int dd_idx, int forwrite, int previous)
+{
+	spin_lock_irq(&sh->stripe_lock);

- overlap:
-	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+		set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+		spin_unlock_irq(&sh->stripe_lock);
+		return false;
+	}
+
+	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
 	spin_unlock_irq(&sh->stripe_lock);
-	return 0;
+	return true;
 }

 static void end_reshape(struct r5conf *conf);
···
 	bio_endio(bi);
 }

+static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
+			     sector_t reshape_sector)
+{
+	return mddev->reshape_backwards ? sector < reshape_sector :
+					  sector >= reshape_sector;
+}
+
+static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
+				   sector_t max, sector_t reshape_sector)
+{
+	return mddev->reshape_backwards ? max < reshape_sector :
+					  min >= reshape_sector;
+}
+
+static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
+				    struct stripe_head *sh)
+{
+	sector_t max_sector = 0, min_sector = MaxSector;
+	bool ret = false;
+	int dd_idx;
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		if (dd_idx == sh->pd_idx)
+			continue;
+
+		min_sector = min(min_sector, sh->dev[dd_idx].sector);
+		max_sector = min(max_sector, sh->dev[dd_idx].sector);
+	}
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
+				     conf->reshape_progress))
+		/* mismatch, need to try again */
+		ret = true;
+
+	spin_unlock_irq(&conf->device_lock);
+
+	return ret;
+}
+
+static int add_all_stripe_bios(struct r5conf *conf,
+		struct stripe_request_ctx *ctx, struct stripe_head *sh,
+		struct bio *bi, int forwrite, int previous)
+{
+	int dd_idx;
+	int ret = 1;
+
+	spin_lock_irq(&sh->stripe_lock);
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		struct r5dev *dev = &sh->dev[dd_idx];
+
+		if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+			continue;
+
+		if (dev->sector < ctx->first_sector ||
+		    dev->sector >= ctx->last_sector)
+			continue;
+
+		if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+			set_bit(R5_Overlap, &dev->flags);
+			ret = 0;
+			continue;
+		}
+	}
+
+	if (!ret)
+		goto out;
+
+	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+		struct r5dev *dev = &sh->dev[dd_idx];
+
+		if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+			continue;
+
+		if (dev->sector < ctx->first_sector ||
+		    dev->sector >= ctx->last_sector)
+			continue;
+
+		__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
+		clear_bit((dev->sector - ctx->first_sector) >>
+			  RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
+	}
+
+out:
+	spin_unlock_irq(&sh->stripe_lock);
+	return ret;
+}
+
+static enum stripe_result make_stripe_request(struct mddev *mddev,
+		struct r5conf *conf, struct stripe_request_ctx *ctx,
+		sector_t logical_sector, struct bio *bi)
+{
+	const int rw = bio_data_dir(bi);
+	enum stripe_result ret;
+	struct stripe_head *sh;
+	sector_t new_sector;
+	int previous = 0;
+	int seq, dd_idx;
+
+	seq = read_seqcount_begin(&conf->gen_lock);
+
+	if (unlikely(conf->reshape_progress != MaxSector)) {
+		/*
+		 * Spinlock is needed as reshape_progress may be
+		 * 64bit on a 32bit platform, and so it might be
+		 * possible to see a half-updated value
+		 * Of course reshape_progress could change after
+		 * the lock is dropped, so once we get a reference
+		 * to the stripe that we think it is, we will have
+		 * to check again.
+		 */
+		spin_lock_irq(&conf->device_lock);
+		if (ahead_of_reshape(mddev, logical_sector,
+				     conf->reshape_progress)) {
+			previous = 1;
+		} else {
+			if (ahead_of_reshape(mddev, logical_sector,
+					     conf->reshape_safe)) {
+				spin_unlock_irq(&conf->device_lock);
+				return STRIPE_SCHEDULE_AND_RETRY;
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+	}
+
+	new_sector = raid5_compute_sector(conf, logical_sector, previous,
+					  &dd_idx, NULL);
+	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
+		 new_sector, logical_sector);
+
+	sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
+				       (bi->bi_opf & REQ_RAHEAD), 0);
+	if (unlikely(!sh)) {
+		/* cannot get stripe, just give-up */
+		bi->bi_status = BLK_STS_IOERR;
+		return STRIPE_FAIL;
+	}
+
+	if (unlikely(previous) &&
+	    stripe_ahead_of_reshape(mddev, conf, sh)) {
+		/*
+		 * Expansion moved on while waiting for a stripe.
+		 * Expansion could still move past after this
+		 * test, but as we are holding a reference to
+		 * 'sh', we know that if that happens,
+		 * STRIPE_EXPANDING will get set and the expansion
+		 * won't proceed until we finish with the stripe.
+		 */
+		ret = STRIPE_SCHEDULE_AND_RETRY;
+		goto out_release;
+	}
+
+	if (read_seqcount_retry(&conf->gen_lock, seq)) {
+		/* Might have got the wrong stripe_head by accident */
+		ret = STRIPE_RETRY;
+		goto out_release;
+	}
+
+	if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+	    !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+		/*
+		 * Stripe is busy expanding or add failed due to
+		 * overlap. Flush everything and wait a while.
+		 */
+		md_wakeup_thread(mddev->thread);
+		ret = STRIPE_SCHEDULE_AND_RETRY;
+		goto out_release;
+	}
+
+	if (stripe_can_batch(sh)) {
+		stripe_add_to_batch_list(conf, sh, ctx->batch_last);
+		if (ctx->batch_last)
+			raid5_release_stripe(ctx->batch_last);
+		atomic_inc(&sh->count);
+		ctx->batch_last = sh;
+	}
+
+	if (ctx->do_flush) {
+		set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+		/* we only need flush for one stripe */
+		ctx->do_flush = false;
+	}
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	if ((!sh->batch_head || sh == sh->batch_head) &&
+	    (bi->bi_opf & REQ_SYNC) &&
+	    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	release_stripe_plug(mddev, sh);
+	return STRIPE_SUCCESS;
+
+out_release:
+	raid5_release_stripe(sh);
+	return ret;
+}
+
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct r5conf *conf = mddev->private;
-	int dd_idx;
-	sector_t new_sector;
-	sector_t logical_sector, last_sector;
-	struct stripe_head *sh;
+	sector_t logical_sector;
+	struct stripe_request_ctx ctx = {};
 	const int rw = bio_data_dir(bi);
-	DEFINE_WAIT(w);
-	bool do_prepare;
-	bool do_flush = false;
+	enum stripe_result res;
+	int s, stripe_cnt;

 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = log_handle_flush_request(conf, bi);
···
 		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
 		 * we need to flush journal device
 		 */
-		do_flush = bi->bi_opf & REQ_PREFLUSH;
+		ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}

 	if (!md_write_start(mddev, bi))
···
 	}

 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
-	last_sector = bio_end_sector(bi);
+	ctx.first_sector = logical_sector;
+	ctx.last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
+
+	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+					   RAID5_STRIPE_SECTORS(conf));
+	bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
+
+	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
+		 bi->bi_iter.bi_sector, ctx.last_sector);

 	/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
 	if ((bi->bi_opf & REQ_NOWAIT) &&
 	    (conf->reshape_progress != MaxSector) &&
-	    (mddev->reshape_backwards
-	     ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
-	     : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+	    !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
+	    ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
 		bio_wouldblock_error(bi);
 		if (rw == WRITE)
 			md_write_end(mddev);
 		return true;
 	}
 	md_account_bio(mddev, &bi);
-	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-	for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
-		int previous;
-		int seq;
-
-		do_prepare = false;
-	retry:
-		seq = read_seqcount_begin(&conf->gen_lock);
-		previous = 0;
-		if (do_prepare)
-			prepare_to_wait(&conf->wait_for_overlap, &w,
-				TASK_UNINTERRUPTIBLE);
-		if (unlikely(conf->reshape_progress != MaxSector)) {
-			/* spinlock is needed as reshape_progress may be
-			 * 64bit on a 32bit platform, and so it might be
-			 * possible to see a half-updated value
-			 * Of course reshape_progress could change after
-			 * the lock is dropped, so once we get a reference
-			 * to the stripe that we think it is, we will have
-			 * to check again.
-			 */
-			spin_lock_irq(&conf->device_lock);
-			if (mddev->reshape_backwards
-			    ? logical_sector < conf->reshape_progress
-			    : logical_sector >= conf->reshape_progress) {
-				previous = 1;
-			} else {
-				if (mddev->reshape_backwards
-				    ? logical_sector < conf->reshape_safe
-				    : logical_sector >= conf->reshape_safe) {
-					spin_unlock_irq(&conf->device_lock);
-					schedule();
-					do_prepare = true;
-					goto retry;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-		}
-
-		new_sector = raid5_compute_sector(conf, logical_sector,
-						  previous,
-						  &dd_idx, NULL);
-		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector,
-			(unsigned long long)logical_sector);
-
-		sh = raid5_get_active_stripe(conf, new_sector, previous,
-				       (bi->bi_opf & REQ_RAHEAD), 0);
-		if (sh) {
-			if (unlikely(previous)) {
-				/* expansion might have moved on while waiting for a
-				 * stripe, so we must do the range check again.
-				 * Expansion could still move past after this
-				 * test, but as we are holding a reference to
-				 * 'sh', we know that if that happens,
-				 * STRIPE_EXPANDING will get set and the expansion
-				 * won't proceed until we finish with the stripe.
-				 */
-				int must_retry = 0;
-				spin_lock_irq(&conf->device_lock);
-				if (mddev->reshape_backwards
-				    ? logical_sector >= conf->reshape_progress
-				    : logical_sector < conf->reshape_progress)
-					/* mismatch, need to try again */
-					must_retry = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (must_retry) {
-					raid5_release_stripe(sh);
-					schedule();
-					do_prepare = true;
-					goto retry;
-				}
-			}
-			if (read_seqcount_retry(&conf->gen_lock, seq)) {
-				/* Might have got the wrong stripe_head
-				 * by accident
-				 */
-				raid5_release_stripe(sh);
-				goto retry;
-			}
-
-			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
-				/* Stripe is busy expanding or
-				 * add failed due to overlap.   Flush everything
-				 * and wait a while
-				 */
-				md_wakeup_thread(mddev->thread);
-				raid5_release_stripe(sh);
-				schedule();
-				do_prepare = true;
-				goto retry;
-			}
-			if (do_flush) {
-				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
-				/* we only need flush for one stripe */
-				do_flush = false;
-			}
-
-			set_bit(STRIPE_HANDLE, &sh->state);
-			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((!sh->batch_head || sh == sh->batch_head) &&
-			    (bi->bi_opf & REQ_SYNC) &&
-			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				atomic_inc(&conf->preread_active_stripes);
-			release_stripe_plug(mddev, sh);
-		} else {
-			/* cannot get stripe for read-ahead, just give-up */
-			bi->bi_status = BLK_STS_IOERR;
-			break;
-		}
-	}
-	finish_wait(&conf->wait_for_overlap, &w);
+
+	add_wait_queue(&conf->wait_for_overlap, &wait);
+	while (1) {
+		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+					  bi);
+		if (res == STRIPE_FAIL)
+			break;
+
+		if (res == STRIPE_RETRY)
+			continue;
+
+		if (res == STRIPE_SCHEDULE_AND_RETRY) {
+			/*
+			 * Must release the reference to batch_last before
+			 * scheduling and waiting for work to be done,
+			 * otherwise the batch_last stripe head could prevent
+			 * raid5_activate_delayed() from making progress
+			 * and thus deadlocking.
+			 */
+			if (ctx.batch_last) {
+				raid5_release_stripe(ctx.batch_last);
+				ctx.batch_last = NULL;
+			}
+
+			wait_woken(&wait, TASK_UNINTERRUPTIBLE,
+				   MAX_SCHEDULE_TIMEOUT);
+			continue;
+		}
+
+		s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
+		if (s == stripe_cnt)
+			break;
+
+		logical_sector = ctx.first_sector +
+			(s << RAID5_STRIPE_SHIFT(conf));
+	}
+	remove_wait_queue(&conf->wait_for_overlap, &wait);
+
+	if (ctx.batch_last)
+		raid5_release_stripe(ctx.batch_last);

 	if (rw == WRITE)
 		md_write_end(mddev);
···
 	    mddev->queue->limits.discard_granularity < stripe)
 		blk_queue_max_discard_sectors(mddev->queue, 0);

-	blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
+		/*
+		 * Requests require having a bitmap for each stripe.
+		 * Limit the max sectors based on this.
+		 */
+		blk_queue_max_hw_sectors(mddev->queue,
+			RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
+
+		/* No restrictions on the number of segments in the request */
+		blk_queue_max_segments(mddev->queue, USHRT_MAX);
 	}

 	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
···
 	 * find the disk ... but prefer rdev->saved_raid_disk
 	 * if possible.
 	 */
-	if (rdev->saved_raid_disk >= 0 &&
-	    rdev->saved_raid_disk >= first &&
+	if (rdev->saved_raid_disk >= first &&
 	    rdev->saved_raid_disk <= last &&
 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
 		first = rdev->saved_raid_disk;
···
 		err = log_init(conf, NULL, true);
 		if (!err) {
 			err = resize_stripes(conf, conf->pool_size);
-			if (err)
+			if (err) {
+				mddev_suspend(mddev);
 				log_exit(conf);
+				mddev_resume(mddev);
+			}
 		}
 	} else
 		err = -EINVAL;
drivers/md/raid5.h (+1 -1)
···
 			  struct stripe_head *sh);
 extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-			int previous, int noblock, int noquiesce);
+			bool previous, bool noblock, bool noquiesce);
 extern int raid5_calc_degraded(struct r5conf *conf);
 extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
 #endif
drivers/nvme/Kconfig (+1)
···
 # SPDX-License-Identifier: GPL-2.0-only
 menu "NVME Support"
 
+source "drivers/nvme/common/Kconfig"
 source "drivers/nvme/host/Kconfig"
 source "drivers/nvme/target/Kconfig"
 
drivers/nvme/Makefile (+1)
···
 # SPDX-License-Identifier: GPL-2.0-only
 
+obj-$(CONFIG_NVME_COMMON)	+= common/
 obj-y				+= host/
 obj-y				+= target/
drivers/nvme/common/Kconfig (new, +4)
···
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NVME_COMMON
+	tristate
drivers/nvme/common/Makefile (new, +7)
···
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y			+= -I$(src)
+
+obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
+
+nvme-common-y			+= auth.o
drivers/nvme/common/auth.c (new, +483)
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2020 Hannes Reinecke, SUSE Linux 4 + */ 5 + 6 + #include <linux/module.h> 7 + #include <linux/crc32.h> 8 + #include <linux/base64.h> 9 + #include <linux/prandom.h> 10 + #include <linux/scatterlist.h> 11 + #include <asm/unaligned.h> 12 + #include <crypto/hash.h> 13 + #include <crypto/dh.h> 14 + #include <linux/nvme.h> 15 + #include <linux/nvme-auth.h> 16 + 17 + static u32 nvme_dhchap_seqnum; 18 + static DEFINE_MUTEX(nvme_dhchap_mutex); 19 + 20 + u32 nvme_auth_get_seqnum(void) 21 + { 22 + u32 seqnum; 23 + 24 + mutex_lock(&nvme_dhchap_mutex); 25 + if (!nvme_dhchap_seqnum) 26 + nvme_dhchap_seqnum = prandom_u32(); 27 + else { 28 + nvme_dhchap_seqnum++; 29 + if (!nvme_dhchap_seqnum) 30 + nvme_dhchap_seqnum++; 31 + } 32 + seqnum = nvme_dhchap_seqnum; 33 + mutex_unlock(&nvme_dhchap_mutex); 34 + return seqnum; 35 + } 36 + EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum); 37 + 38 + static struct nvme_auth_dhgroup_map { 39 + const char name[16]; 40 + const char kpp[16]; 41 + } dhgroup_map[] = { 42 + [NVME_AUTH_DHGROUP_NULL] = { 43 + .name = "null", .kpp = "null" }, 44 + [NVME_AUTH_DHGROUP_2048] = { 45 + .name = "ffdhe2048", .kpp = "ffdhe2048(dh)" }, 46 + [NVME_AUTH_DHGROUP_3072] = { 47 + .name = "ffdhe3072", .kpp = "ffdhe3072(dh)" }, 48 + [NVME_AUTH_DHGROUP_4096] = { 49 + .name = "ffdhe4096", .kpp = "ffdhe4096(dh)" }, 50 + [NVME_AUTH_DHGROUP_6144] = { 51 + .name = "ffdhe6144", .kpp = "ffdhe6144(dh)" }, 52 + [NVME_AUTH_DHGROUP_8192] = { 53 + .name = "ffdhe8192", .kpp = "ffdhe8192(dh)" }, 54 + }; 55 + 56 + const char *nvme_auth_dhgroup_name(u8 dhgroup_id) 57 + { 58 + if (dhgroup_id >= ARRAY_SIZE(dhgroup_map)) 59 + return NULL; 60 + return dhgroup_map[dhgroup_id].name; 61 + } 62 + EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name); 63 + 64 + const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id) 65 + { 66 + if (dhgroup_id >= ARRAY_SIZE(dhgroup_map)) 67 + return NULL; 68 + return dhgroup_map[dhgroup_id].kpp; 69 + } 70 + 
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp); 71 + 72 + u8 nvme_auth_dhgroup_id(const char *dhgroup_name) 73 + { 74 + int i; 75 + 76 + if (!dhgroup_name || !strlen(dhgroup_name)) 77 + return NVME_AUTH_DHGROUP_INVALID; 78 + for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) { 79 + if (!strlen(dhgroup_map[i].name)) 80 + continue; 81 + if (!strncmp(dhgroup_map[i].name, dhgroup_name, 82 + strlen(dhgroup_map[i].name))) 83 + return i; 84 + } 85 + return NVME_AUTH_DHGROUP_INVALID; 86 + } 87 + EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id); 88 + 89 + static struct nvme_dhchap_hash_map { 90 + int len; 91 + const char hmac[15]; 92 + const char digest[8]; 93 + } hash_map[] = { 94 + [NVME_AUTH_HASH_SHA256] = { 95 + .len = 32, 96 + .hmac = "hmac(sha256)", 97 + .digest = "sha256", 98 + }, 99 + [NVME_AUTH_HASH_SHA384] = { 100 + .len = 48, 101 + .hmac = "hmac(sha384)", 102 + .digest = "sha384", 103 + }, 104 + [NVME_AUTH_HASH_SHA512] = { 105 + .len = 64, 106 + .hmac = "hmac(sha512)", 107 + .digest = "sha512", 108 + }, 109 + }; 110 + 111 + const char *nvme_auth_hmac_name(u8 hmac_id) 112 + { 113 + if (hmac_id >= ARRAY_SIZE(hash_map)) 114 + return NULL; 115 + return hash_map[hmac_id].hmac; 116 + } 117 + EXPORT_SYMBOL_GPL(nvme_auth_hmac_name); 118 + 119 + const char *nvme_auth_digest_name(u8 hmac_id) 120 + { 121 + if (hmac_id >= ARRAY_SIZE(hash_map)) 122 + return NULL; 123 + return hash_map[hmac_id].digest; 124 + } 125 + EXPORT_SYMBOL_GPL(nvme_auth_digest_name); 126 + 127 + u8 nvme_auth_hmac_id(const char *hmac_name) 128 + { 129 + int i; 130 + 131 + if (!hmac_name || !strlen(hmac_name)) 132 + return NVME_AUTH_HASH_INVALID; 133 + 134 + for (i = 0; i < ARRAY_SIZE(hash_map); i++) { 135 + if (!strlen(hash_map[i].hmac)) 136 + continue; 137 + if (!strncmp(hash_map[i].hmac, hmac_name, 138 + strlen(hash_map[i].hmac))) 139 + return i; 140 + } 141 + return NVME_AUTH_HASH_INVALID; 142 + } 143 + EXPORT_SYMBOL_GPL(nvme_auth_hmac_id); 144 + 145 + size_t nvme_auth_hmac_hash_len(u8 hmac_id) 146 + { 147 + if (hmac_id 
>= ARRAY_SIZE(hash_map)) 148 + return 0; 149 + return hash_map[hmac_id].len; 150 + } 151 + EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len); 152 + 153 + struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, 154 + u8 key_hash) 155 + { 156 + struct nvme_dhchap_key *key; 157 + unsigned char *p; 158 + u32 crc; 159 + int ret, key_len; 160 + size_t allocated_len = strlen(secret); 161 + 162 + /* Secret might be affixed with a ':' */ 163 + p = strrchr(secret, ':'); 164 + if (p) 165 + allocated_len = p - secret; 166 + key = kzalloc(sizeof(*key), GFP_KERNEL); 167 + if (!key) 168 + return ERR_PTR(-ENOMEM); 169 + key->key = kzalloc(allocated_len, GFP_KERNEL); 170 + if (!key->key) { 171 + ret = -ENOMEM; 172 + goto out_free_key; 173 + } 174 + 175 + key_len = base64_decode(secret, allocated_len, key->key); 176 + if (key_len < 0) { 177 + pr_debug("base64 key decoding error %d\n", 178 + key_len); 179 + ret = key_len; 180 + goto out_free_secret; 181 + } 182 + 183 + if (key_len != 36 && key_len != 52 && 184 + key_len != 68) { 185 + pr_err("Invalid key len %d\n", key_len); 186 + ret = -EINVAL; 187 + goto out_free_secret; 188 + } 189 + 190 + if (key_hash > 0 && 191 + (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) { 192 + pr_err("Mismatched key len %d for %s\n", key_len, 193 + nvme_auth_hmac_name(key_hash)); 194 + ret = -EINVAL; 195 + goto out_free_secret; 196 + } 197 + 198 + /* The last four bytes is the CRC in little-endian format */ 199 + key_len -= 4; 200 + /* 201 + * The linux implementation doesn't do pre- and post-increments, 202 + * so we have to do it manually. 
203 + */ 204 + crc = ~crc32(~0, key->key, key_len); 205 + 206 + if (get_unaligned_le32(key->key + key_len) != crc) { 207 + pr_err("key crc mismatch (key %08x, crc %08x)\n", 208 + get_unaligned_le32(key->key + key_len), crc); 209 + ret = -EKEYREJECTED; 210 + goto out_free_secret; 211 + } 212 + key->len = key_len; 213 + key->hash = key_hash; 214 + return key; 215 + out_free_secret: 216 + kfree_sensitive(key->key); 217 + out_free_key: 218 + kfree(key); 219 + return ERR_PTR(ret); 220 + } 221 + EXPORT_SYMBOL_GPL(nvme_auth_extract_key); 222 + 223 + void nvme_auth_free_key(struct nvme_dhchap_key *key) 224 + { 225 + if (!key) 226 + return; 227 + kfree_sensitive(key->key); 228 + kfree(key); 229 + } 230 + EXPORT_SYMBOL_GPL(nvme_auth_free_key); 231 + 232 + u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) 233 + { 234 + const char *hmac_name; 235 + struct crypto_shash *key_tfm; 236 + struct shash_desc *shash; 237 + u8 *transformed_key; 238 + int ret; 239 + 240 + if (!key || !key->key) { 241 + pr_warn("No key specified\n"); 242 + return ERR_PTR(-ENOKEY); 243 + } 244 + if (key->hash == 0) { 245 + transformed_key = kmemdup(key->key, key->len, GFP_KERNEL); 246 + return transformed_key ? 
transformed_key : ERR_PTR(-ENOMEM); 247 + } 248 + hmac_name = nvme_auth_hmac_name(key->hash); 249 + if (!hmac_name) { 250 + pr_warn("Invalid key hash id %d\n", key->hash); 251 + return ERR_PTR(-EINVAL); 252 + } 253 + 254 + key_tfm = crypto_alloc_shash(hmac_name, 0, 0); 255 + if (IS_ERR(key_tfm)) 256 + return (u8 *)key_tfm; 257 + 258 + shash = kmalloc(sizeof(struct shash_desc) + 259 + crypto_shash_descsize(key_tfm), 260 + GFP_KERNEL); 261 + if (!shash) { 262 + ret = -ENOMEM; 263 + goto out_free_key; 264 + } 265 + 266 + transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL); 267 + if (!transformed_key) { 268 + ret = -ENOMEM; 269 + goto out_free_shash; 270 + } 271 + 272 + shash->tfm = key_tfm; 273 + ret = crypto_shash_setkey(key_tfm, key->key, key->len); 274 + if (ret < 0) 275 + goto out_free_transformed_key; 276 + ret = crypto_shash_init(shash); 277 + if (ret < 0) 278 + goto out_free_transformed_key; 279 + ret = crypto_shash_update(shash, nqn, strlen(nqn)); 280 + if (ret < 0) 281 + goto out_free_transformed_key; 282 + ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17); 283 + if (ret < 0) 284 + goto out_free_transformed_key; 285 + ret = crypto_shash_final(shash, transformed_key); 286 + if (ret < 0) 287 + goto out_free_transformed_key; 288 + 289 + kfree(shash); 290 + crypto_free_shash(key_tfm); 291 + 292 + return transformed_key; 293 + 294 + out_free_transformed_key: 295 + kfree_sensitive(transformed_key); 296 + out_free_shash: 297 + kfree(shash); 298 + out_free_key: 299 + crypto_free_shash(key_tfm); 300 + 301 + return ERR_PTR(ret); 302 + } 303 + EXPORT_SYMBOL_GPL(nvme_auth_transform_key); 304 + 305 + static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey) 306 + { 307 + const char *digest_name; 308 + struct crypto_shash *tfm; 309 + int ret; 310 + 311 + digest_name = nvme_auth_digest_name(hmac_id); 312 + if (!digest_name) { 313 + pr_debug("%s: failed to get digest for %d\n", __func__, 314 + hmac_id); 315 + return -EINVAL; 
316 + } 317 + tfm = crypto_alloc_shash(digest_name, 0, 0); 318 + if (IS_ERR(tfm)) 319 + return -ENOMEM; 320 + 321 + ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey); 322 + if (ret < 0) 323 + pr_debug("%s: Failed to hash digest len %zu\n", __func__, 324 + skey_len); 325 + 326 + crypto_free_shash(tfm); 327 + return ret; 328 + } 329 + 330 + int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, 331 + u8 *challenge, u8 *aug, size_t hlen) 332 + { 333 + struct crypto_shash *tfm; 334 + struct shash_desc *desc; 335 + u8 *hashed_key; 336 + const char *hmac_name; 337 + int ret; 338 + 339 + hashed_key = kmalloc(hlen, GFP_KERNEL); 340 + if (!hashed_key) 341 + return -ENOMEM; 342 + 343 + ret = nvme_auth_hash_skey(hmac_id, skey, 344 + skey_len, hashed_key); 345 + if (ret < 0) 346 + goto out_free_key; 347 + 348 + hmac_name = nvme_auth_hmac_name(hmac_id); 349 + if (!hmac_name) { 350 + pr_warn("%s: invalid hash algorithm %d\n", 351 + __func__, hmac_id); 352 + ret = -EINVAL; 353 + goto out_free_key; 354 + } 355 + 356 + tfm = crypto_alloc_shash(hmac_name, 0, 0); 357 + if (IS_ERR(tfm)) { 358 + ret = PTR_ERR(tfm); 359 + goto out_free_key; 360 + } 361 + 362 + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), 363 + GFP_KERNEL); 364 + if (!desc) { 365 + ret = -ENOMEM; 366 + goto out_free_hash; 367 + } 368 + desc->tfm = tfm; 369 + 370 + ret = crypto_shash_setkey(tfm, hashed_key, hlen); 371 + if (ret) 372 + goto out_free_desc; 373 + 374 + ret = crypto_shash_init(desc); 375 + if (ret) 376 + goto out_free_desc; 377 + 378 + ret = crypto_shash_update(desc, challenge, hlen); 379 + if (ret) 380 + goto out_free_desc; 381 + 382 + ret = crypto_shash_final(desc, aug); 383 + out_free_desc: 384 + kfree_sensitive(desc); 385 + out_free_hash: 386 + crypto_free_shash(tfm); 387 + out_free_key: 388 + kfree_sensitive(hashed_key); 389 + return ret; 390 + } 391 + EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge); 392 + 393 + int nvme_auth_gen_privkey(struct 
crypto_kpp *dh_tfm, u8 dh_gid) 394 + { 395 + int ret; 396 + 397 + ret = crypto_kpp_set_secret(dh_tfm, NULL, 0); 398 + if (ret) 399 + pr_debug("failed to set private key, error %d\n", ret); 400 + 401 + return ret; 402 + } 403 + EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey); 404 + 405 + int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, 406 + u8 *host_key, size_t host_key_len) 407 + { 408 + struct kpp_request *req; 409 + struct crypto_wait wait; 410 + struct scatterlist dst; 411 + int ret; 412 + 413 + req = kpp_request_alloc(dh_tfm, GFP_KERNEL); 414 + if (!req) 415 + return -ENOMEM; 416 + 417 + crypto_init_wait(&wait); 418 + kpp_request_set_input(req, NULL, 0); 419 + sg_init_one(&dst, host_key, host_key_len); 420 + kpp_request_set_output(req, &dst, host_key_len); 421 + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, 422 + crypto_req_done, &wait); 423 + 424 + ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait); 425 + kpp_request_free(req); 426 + return ret; 427 + } 428 + EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey); 429 + 430 + int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, 431 + u8 *ctrl_key, size_t ctrl_key_len, 432 + u8 *sess_key, size_t sess_key_len) 433 + { 434 + struct kpp_request *req; 435 + struct crypto_wait wait; 436 + struct scatterlist src, dst; 437 + int ret; 438 + 439 + req = kpp_request_alloc(dh_tfm, GFP_KERNEL); 440 + if (!req) 441 + return -ENOMEM; 442 + 443 + crypto_init_wait(&wait); 444 + sg_init_one(&src, ctrl_key, ctrl_key_len); 445 + kpp_request_set_input(req, &src, ctrl_key_len); 446 + sg_init_one(&dst, sess_key, sess_key_len); 447 + kpp_request_set_output(req, &dst, sess_key_len); 448 + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, 449 + crypto_req_done, &wait); 450 + 451 + ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait); 452 + 453 + kpp_request_free(req); 454 + return ret; 455 + } 456 + EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); 457 + 458 + int nvme_auth_generate_key(u8 *secret, 
struct nvme_dhchap_key **ret_key) 459 + { 460 + struct nvme_dhchap_key *key; 461 + u8 key_hash; 462 + 463 + if (!secret) { 464 + *ret_key = NULL; 465 + return 0; 466 + } 467 + 468 + if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1) 469 + return -EINVAL; 470 + 471 + /* Pass in the secret without the 'DHHC-1:XX:' prefix */ 472 + key = nvme_auth_extract_key(secret + 10, key_hash); 473 + if (IS_ERR(key)) { 474 + *ret_key = NULL; 475 + return PTR_ERR(key); 476 + } 477 + 478 + *ret_key = key; 479 + return 0; 480 + } 481 + EXPORT_SYMBOL_GPL(nvme_auth_generate_key); 482 + 483 + MODULE_LICENSE("GPL v2");
drivers/nvme/host/Kconfig (+15)
···
 
	  If unsure, say N.
 
+config NVME_AUTH
+	bool "NVM Express over Fabrics In-Band Authentication"
+	depends on NVME_CORE
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
+	help
+	  This provides support for NVMe over Fabrics In-Band Authentication.
+
+	  If unsure, say N.
+
 config NVME_APPLE
 	tristate "Apple ANS2 NVM Express host driver"
 	depends on OF && BLOCK
drivers/nvme/host/Makefile (+3 -1)
···
 obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
 obj-$(CONFIG_NVME_APPLE)		+= nvme-apple.o
 
-nvme-core-y				:= core.o ioctl.o constants.o
+nvme-core-y				+= core.o ioctl.o
+nvme-core-$(CONFIG_NVME_VERBOSE_ERRORS)	+= constants.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
+nvme-core-$(CONFIG_NVME_AUTH)		+= auth.o
 
 nvme-y					+= pci.o
 
drivers/nvme/host/apple.c (+17 -11)
···
 	apple_nvme_handle_cq(&anv->adminq, true);
 	spin_unlock_irqrestore(&anv->lock, flags);
 
-	blk_mq_tagset_busy_iter(&anv->tagset, nvme_cancel_request, &anv->ctrl);
-	blk_mq_tagset_busy_iter(&anv->admin_tagset, nvme_cancel_request,
-				&anv->ctrl);
-	blk_mq_tagset_wait_completed_request(&anv->tagset);
-	blk_mq_tagset_wait_completed_request(&anv->admin_tagset);
+	nvme_cancel_tagset(&anv->ctrl);
+	nvme_cancel_admin_tagset(&anv->ctrl);
 
 	/*
 	 * The driver will not be starting up queues again if shutting down so
···
 	nvme_put_ctrl(&anv->ctrl);
 }
 
+static void devm_apple_nvme_put_tag_set(void *data)
+{
+	blk_mq_free_tag_set(data);
+}
+
 static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
 {
 	int ret;
···
 	ret = blk_mq_alloc_tag_set(&anv->admin_tagset);
 	if (ret)
 		return ret;
-	ret = devm_add_action_or_reset(anv->dev,
-			(void (*)(void *))blk_mq_free_tag_set,
+	ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
 				       &anv->admin_tagset);
 	if (ret)
 		return ret;
···
 	ret = blk_mq_alloc_tag_set(&anv->tagset);
 	if (ret)
 		return ret;
-	ret = devm_add_action_or_reset(
-		anv->dev, (void (*)(void *))blk_mq_free_tag_set, &anv->tagset);
+	ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
+				       &anv->tagset);
 	if (ret)
 		return ret;
···
 	return 0;
 }
 
+static void devm_apple_nvme_mempool_destroy(void *data)
+{
+	mempool_destroy(data);
+}
+
 static int apple_nvme_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
···
 		ret = -ENOMEM;
 		goto put_dev;
 	}
-	ret = devm_add_action_or_reset(
-		anv->dev, (void (*)(void *))mempool_destroy, anv->iod_mempool);
+	ret = devm_add_action_or_reset(anv->dev,
+			devm_apple_nvme_mempool_destroy, anv->iod_mempool);
 	if (ret)
 		goto put_dev;
 
drivers/nvme/host/auth.c (new, +1017)
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2020 Hannes Reinecke, SUSE Linux 4 + */ 5 + 6 + #include <linux/crc32.h> 7 + #include <linux/base64.h> 8 + #include <linux/prandom.h> 9 + #include <asm/unaligned.h> 10 + #include <crypto/hash.h> 11 + #include <crypto/dh.h> 12 + #include "nvme.h" 13 + #include "fabrics.h" 14 + #include <linux/nvme-auth.h> 15 + 16 + struct nvme_dhchap_queue_context { 17 + struct list_head entry; 18 + struct work_struct auth_work; 19 + struct nvme_ctrl *ctrl; 20 + struct crypto_shash *shash_tfm; 21 + struct crypto_kpp *dh_tfm; 22 + void *buf; 23 + size_t buf_size; 24 + int qid; 25 + int error; 26 + u32 s1; 27 + u32 s2; 28 + u16 transaction; 29 + u8 status; 30 + u8 hash_id; 31 + size_t hash_len; 32 + u8 dhgroup_id; 33 + u8 c1[64]; 34 + u8 c2[64]; 35 + u8 response[64]; 36 + u8 *host_response; 37 + u8 *ctrl_key; 38 + int ctrl_key_len; 39 + u8 *host_key; 40 + int host_key_len; 41 + u8 *sess_key; 42 + int sess_key_len; 43 + }; 44 + 45 + #define nvme_auth_flags_from_qid(qid) \ 46 + (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED 47 + #define nvme_auth_queue_from_qid(ctrl, qid) \ 48 + (qid == 0) ? 
(ctrl)->fabrics_q : (ctrl)->connect_q 49 + 50 + static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, 51 + void *data, size_t data_len, bool auth_send) 52 + { 53 + struct nvme_command cmd = {}; 54 + blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid); 55 + struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid); 56 + int ret; 57 + 58 + cmd.auth_common.opcode = nvme_fabrics_command; 59 + cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER; 60 + cmd.auth_common.spsp0 = 0x01; 61 + cmd.auth_common.spsp1 = 0x01; 62 + if (auth_send) { 63 + cmd.auth_send.fctype = nvme_fabrics_type_auth_send; 64 + cmd.auth_send.tl = cpu_to_le32(data_len); 65 + } else { 66 + cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive; 67 + cmd.auth_receive.al = cpu_to_le32(data_len); 68 + } 69 + 70 + ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len, 71 + qid == 0 ? NVME_QID_ANY : qid, 72 + 0, flags); 73 + if (ret > 0) 74 + dev_warn(ctrl->device, 75 + "qid %d auth_send failed with status %d\n", qid, ret); 76 + else if (ret < 0) 77 + dev_err(ctrl->device, 78 + "qid %d auth_send failed with error %d\n", qid, ret); 79 + return ret; 80 + } 81 + 82 + static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid, 83 + struct nvmf_auth_dhchap_failure_data *data, 84 + u16 transaction, u8 expected_msg) 85 + { 86 + dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n", 87 + __func__, qid, data->auth_type, data->auth_id); 88 + 89 + if (data->auth_type == NVME_AUTH_COMMON_MESSAGES && 90 + data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { 91 + return data->rescode_exp; 92 + } 93 + if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES || 94 + data->auth_id != expected_msg) { 95 + dev_warn(ctrl->device, 96 + "qid %d invalid message %02x/%02x\n", 97 + qid, data->auth_type, data->auth_id); 98 + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; 99 + } 100 + if (le16_to_cpu(data->t_id) != transaction) { 101 + dev_warn(ctrl->device, 102 + "qid %d invalid 
transaction ID %d\n", 103 + qid, le16_to_cpu(data->t_id)); 104 + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; 105 + } 106 + return 0; 107 + } 108 + 109 + static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, 110 + struct nvme_dhchap_queue_context *chap) 111 + { 112 + struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; 113 + size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); 114 + 115 + if (chap->buf_size < size) { 116 + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 117 + return -EINVAL; 118 + } 119 + memset((u8 *)chap->buf, 0, size); 120 + data->auth_type = NVME_AUTH_COMMON_MESSAGES; 121 + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 122 + data->t_id = cpu_to_le16(chap->transaction); 123 + data->sc_c = 0; /* No secure channel concatenation */ 124 + data->napd = 1; 125 + data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; 126 + data->auth_protocol[0].dhchap.halen = 3; 127 + data->auth_protocol[0].dhchap.dhlen = 6; 128 + data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256; 129 + data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384; 130 + data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512; 131 + data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL; 132 + data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048; 133 + data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072; 134 + data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096; 135 + data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144; 136 + data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192; 137 + 138 + return size; 139 + } 140 + 141 + static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, 142 + struct nvme_dhchap_queue_context *chap) 143 + { 144 + struct nvmf_auth_dhchap_challenge_data *data = chap->buf; 145 + u16 dhvlen = le16_to_cpu(data->dhvlen); 146 + size_t size = sizeof(*data) + data->hl + dhvlen; 147 + const 
char *gid_name = nvme_auth_dhgroup_name(data->dhgid); 148 + const char *hmac_name, *kpp_name; 149 + 150 + if (chap->buf_size < size) { 151 + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 152 + return NVME_SC_INVALID_FIELD; 153 + } 154 + 155 + hmac_name = nvme_auth_hmac_name(data->hashid); 156 + if (!hmac_name) { 157 + dev_warn(ctrl->device, 158 + "qid %d: invalid HASH ID %d\n", 159 + chap->qid, data->hashid); 160 + chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; 161 + return NVME_SC_INVALID_FIELD; 162 + } 163 + 164 + if (chap->hash_id == data->hashid && chap->shash_tfm && 165 + !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) && 166 + crypto_shash_digestsize(chap->shash_tfm) == data->hl) { 167 + dev_dbg(ctrl->device, 168 + "qid %d: reuse existing hash %s\n", 169 + chap->qid, hmac_name); 170 + goto select_kpp; 171 + } 172 + 173 + /* Reset if hash cannot be reused */ 174 + if (chap->shash_tfm) { 175 + crypto_free_shash(chap->shash_tfm); 176 + chap->hash_id = 0; 177 + chap->hash_len = 0; 178 + } 179 + chap->shash_tfm = crypto_alloc_shash(hmac_name, 0, 180 + CRYPTO_ALG_ALLOCATES_MEMORY); 181 + if (IS_ERR(chap->shash_tfm)) { 182 + dev_warn(ctrl->device, 183 + "qid %d: failed to allocate hash %s, error %ld\n", 184 + chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); 185 + chap->shash_tfm = NULL; 186 + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; 187 + return NVME_SC_AUTH_REQUIRED; 188 + } 189 + 190 + if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { 191 + dev_warn(ctrl->device, 192 + "qid %d: invalid hash length %d\n", 193 + chap->qid, data->hl); 194 + crypto_free_shash(chap->shash_tfm); 195 + chap->shash_tfm = NULL; 196 + chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; 197 + return NVME_SC_AUTH_REQUIRED; 198 + } 199 + 200 + /* Reset host response if the hash had been changed */ 201 + if (chap->hash_id != data->hashid) { 202 + kfree(chap->host_response); 203 + chap->host_response = NULL; 204 + } 205 + 206 + chap->hash_id = 
data->hashid; 207 + chap->hash_len = data->hl; 208 + dev_dbg(ctrl->device, "qid %d: selected hash %s\n", 209 + chap->qid, hmac_name); 210 + 211 + select_kpp: 212 + kpp_name = nvme_auth_dhgroup_kpp(data->dhgid); 213 + if (!kpp_name) { 214 + dev_warn(ctrl->device, 215 + "qid %d: invalid DH group id %d\n", 216 + chap->qid, data->dhgid); 217 + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; 218 + /* Leave previous dh_tfm intact */ 219 + return NVME_SC_AUTH_REQUIRED; 220 + } 221 + 222 + /* Clear host and controller key to avoid accidental reuse */ 223 + kfree_sensitive(chap->host_key); 224 + chap->host_key = NULL; 225 + chap->host_key_len = 0; 226 + kfree_sensitive(chap->ctrl_key); 227 + chap->ctrl_key = NULL; 228 + chap->ctrl_key_len = 0; 229 + 230 + if (chap->dhgroup_id == data->dhgid && 231 + (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { 232 + dev_dbg(ctrl->device, 233 + "qid %d: reuse existing DH group %s\n", 234 + chap->qid, gid_name); 235 + goto skip_kpp; 236 + } 237 + 238 + /* Reset dh_tfm if it can't be reused */ 239 + if (chap->dh_tfm) { 240 + crypto_free_kpp(chap->dh_tfm); 241 + chap->dh_tfm = NULL; 242 + } 243 + 244 + if (data->dhgid != NVME_AUTH_DHGROUP_NULL) { 245 + if (dhvlen == 0) { 246 + dev_warn(ctrl->device, 247 + "qid %d: empty DH value\n", 248 + chap->qid); 249 + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; 250 + return NVME_SC_INVALID_FIELD; 251 + } 252 + 253 + chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0); 254 + if (IS_ERR(chap->dh_tfm)) { 255 + int ret = PTR_ERR(chap->dh_tfm); 256 + 257 + dev_warn(ctrl->device, 258 + "qid %d: error %d initializing DH group %s\n", 259 + chap->qid, ret, gid_name); 260 + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; 261 + chap->dh_tfm = NULL; 262 + return NVME_SC_AUTH_REQUIRED; 263 + } 264 + dev_dbg(ctrl->device, "qid %d: selected DH group %s\n", 265 + chap->qid, gid_name); 266 + } else if (dhvlen != 0) { 267 + dev_warn(ctrl->device, 268 + "qid %d: invalid DH value 
for NULL DH\n", 269 + chap->qid); 270 + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 271 + return NVME_SC_INVALID_FIELD; 272 + } 273 + chap->dhgroup_id = data->dhgid; 274 + 275 + skip_kpp: 276 + chap->s1 = le32_to_cpu(data->seqnum); 277 + memcpy(chap->c1, data->cval, chap->hash_len); 278 + if (dhvlen) { 279 + chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL); 280 + if (!chap->ctrl_key) { 281 + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; 282 + return NVME_SC_AUTH_REQUIRED; 283 + } 284 + chap->ctrl_key_len = dhvlen; 285 + memcpy(chap->ctrl_key, data->cval + chap->hash_len, 286 + dhvlen); 287 + dev_dbg(ctrl->device, "ctrl public key %*ph\n", 288 + (int)chap->ctrl_key_len, chap->ctrl_key); 289 + } 290 + 291 + return 0; 292 + } 293 + 294 + static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, 295 + struct nvme_dhchap_queue_context *chap) 296 + { 297 + struct nvmf_auth_dhchap_reply_data *data = chap->buf; 298 + size_t size = sizeof(*data); 299 + 300 + size += 2 * chap->hash_len; 301 + 302 + if (chap->host_key_len) 303 + size += chap->host_key_len; 304 + 305 + if (chap->buf_size < size) { 306 + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 307 + return -EINVAL; 308 + } 309 + 310 + memset(chap->buf, 0, size); 311 + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; 312 + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY; 313 + data->t_id = cpu_to_le16(chap->transaction); 314 + data->hl = chap->hash_len; 315 + data->dhvlen = cpu_to_le16(chap->host_key_len); 316 + memcpy(data->rval, chap->response, chap->hash_len); 317 + if (ctrl->ctrl_key) { 318 + get_random_bytes(chap->c2, chap->hash_len); 319 + data->cvalid = 1; 320 + chap->s2 = nvme_auth_get_seqnum(); 321 + memcpy(data->rval + chap->hash_len, chap->c2, 322 + chap->hash_len); 323 + dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n", 324 + __func__, chap->qid, (int)chap->hash_len, chap->c2); 325 + } else { 326 + memset(chap->c2, 0, chap->hash_len); 327 + chap->s2 = 0; 328 + } 329 + 
+	data->seqnum = cpu_to_le32(chap->s2);
+	if (chap->host_key_len) {
+		dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
+			__func__, chap->qid,
+			chap->host_key_len, chap->host_key);
+		memcpy(data->rval + 2 * chap->hash_len, chap->host_key,
+		       chap->host_key_len);
+	}
+
+	return size;
+}
+
+static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success1_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	if (ctrl->ctrl_key)
+		size += chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (data->hl != chap->hash_len) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %u\n",
+			 chap->qid, data->hl);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: authenticated with hash %s dhgroup %s\n",
+			 nvme_auth_hmac_name(chap->hash_id),
+			 nvme_auth_dhgroup_name(chap->dhgroup_id));
+
+	if (!data->rvalid)
+		return 0;
+
+	/* Validate controller response */
+	if (memcmp(chap->response, data->rval, data->hl)) {
+		dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, data->rval);
+		dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len,
+			chap->response);
+		dev_warn(ctrl->device,
+			 "qid %d: controller authentication failed\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: controller authenticated\n");
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success2_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+	data->t_id = cpu_to_le16(chap->transaction);
+
+	return size;
+}
+
+static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_failure_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+	data->rescode_exp = chap->status;
+
+	return size;
+}
+
+static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 buf[4], *challenge = chap->c1;
+	int ret;
+
+	dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s1, chap->transaction);
+
+	if (!chap->host_response) {
+		chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+						ctrl->opts->host->nqn);
+		if (IS_ERR(chap->host_response)) {
+			ret = PTR_ERR(chap->host_response);
+			chap->host_response = NULL;
+			return ret;
+		}
+	} else {
+		dev_dbg(ctrl->device, "%s: qid %d re-using host response\n",
+			__func__, chap->qid);
+	}
+
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			chap->host_response, ctrl->host_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c1, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, sizeof(buf));
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+				  strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c1)
+		kfree(challenge);
+	return ret;
+}
+
+static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 *ctrl_response;
+	u8 buf[4], *challenge = chap->c2;
+	int ret;
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+					ctrl->opts->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		return ret;
+	}
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			ctrl_response, ctrl->ctrl_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c2, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s2, chap->transaction);
+	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
+		__func__, chap->qid, (int)chap->hash_len, challenge);
+	dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n",
+		__func__, chap->qid, ctrl->opts->subsysnqn);
+	dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
+		__func__, chap->qid, ctrl->opts->host->nqn);
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+				  strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c2)
+		kfree(challenge);
+	kfree(ctrl_response);
+	return ret;
+}
+
+static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	int ret;
+
+	if (chap->host_key && chap->host_key_len) {
+		dev_dbg(ctrl->device,
+			"qid %d: reusing host key\n", chap->qid);
+		goto gen_sesskey;
+	}
+	ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id);
+	if (ret < 0) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+	chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm);
+
+	chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL);
+	if (!chap->host_key) {
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+	ret = nvme_auth_gen_pubkey(chap->dh_tfm,
+				   chap->host_key, chap->host_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate public key, error %d\n", ret);
+		kfree(chap->host_key);
+		chap->host_key = NULL;
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+gen_sesskey:
+	chap->sess_key_len = chap->host_key_len;
+	chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
+	if (!chap->sess_key) {
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+
+	ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
+					  chap->ctrl_key, chap->ctrl_key_len,
+					  chap->sess_key, chap->sess_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate shared secret, error %d\n", ret);
+		kfree_sensitive(chap->sess_key);
+		chap->sess_key = NULL;
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+	dev_dbg(ctrl->device, "shared secret %*ph\n",
+		(int)chap->sess_key_len, chap->sess_key);
+	return 0;
+}
+
+static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+{
+	kfree_sensitive(chap->host_response);
+	chap->host_response = NULL;
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+	kfree_sensitive(chap->sess_key);
+	chap->sess_key = NULL;
+	chap->sess_key_len = 0;
+	chap->status = 0;
+	chap->error = 0;
+	chap->s1 = 0;
+	chap->s2 = 0;
+	chap->transaction = 0;
+	memset(chap->c1, 0, sizeof(chap->c1));
+	memset(chap->c2, 0, sizeof(chap->c2));
+}
+
+static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+{
+	__nvme_auth_reset(chap);
+	if (chap->shash_tfm)
+		crypto_free_shash(chap->shash_tfm);
+	if (chap->dh_tfm)
+		crypto_free_kpp(chap->dh_tfm);
+	kfree_sensitive(chap->ctrl_key);
+	kfree_sensitive(chap->host_key);
+	kfree_sensitive(chap->sess_key);
+	kfree_sensitive(chap->host_response);
+	kfree(chap->buf);
+	kfree(chap);
+}
+
+static void __nvme_auth_work(struct work_struct *work)
+{
+	struct nvme_dhchap_queue_context *chap =
+		container_of(work, struct nvme_dhchap_queue_context, auth_work);
+	struct nvme_ctrl *ctrl = chap->ctrl;
+	size_t tl;
+	int ret = 0;
+
+	chap->transaction = ctrl->transaction++;
+
+	/* DH-HMAC-CHAP Step 1: send negotiate */
+	dev_dbg(ctrl->device, "%s: qid %d send negotiate\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		return;
+	}
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		return;
+	}
+
+	/* DH-HMAC-CHAP Step 2: receive challenge */
+	dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive challenge, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	ret = nvme_auth_process_dhchap_challenge(ctrl, chap);
+	if (ret) {
+		/* Invalid challenge parameters */
+		chap->error = ret;
+		goto fail2;
+	}
+
+	if (chap->ctrl_key_len) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d DH exponential\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_exponential(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d host response\n",
+		__func__, chap->qid);
+	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 3: send reply */
+	dev_dbg(ctrl->device, "%s: qid %d send reply\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_reply_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 4: receive success1 */
+	dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive success1, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid,
+					 chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	if (ctrl->ctrl_key) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d controller response\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	ret = nvme_auth_process_dhchap_success1(ctrl, chap);
+	if (ret) {
+		/* Controller authentication failed */
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		goto fail2;
+	}
+
+	if (ctrl->ctrl_key) {
+		/* DH-HMAC-CHAP Step 5: send success2 */
+		dev_dbg(ctrl->device, "%s: qid %d send success2\n",
+			__func__, chap->qid);
+		tl = nvme_auth_set_dhchap_success2_data(ctrl, chap);
+		ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+		if (ret)
+			chap->error = ret;
+	}
+	if (!ret) {
+		chap->error = 0;
+		return;
+	}
+
+fail2:
+	dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
+		__func__, chap->qid, chap->status);
+	tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	/*
+	 * only update error if send failure2 failed and no other
+	 * error had been set during authentication.
+	 */
+	if (ret && !chap->error)
+		chap->error = ret;
+}
+
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	if (!ctrl->host_key) {
+		dev_warn(ctrl->device, "qid %d: no key\n", qid);
+		return -ENOKEY;
+	}
+
+	if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) {
+		dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid);
+		return -ENOKEY;
+	}
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	/* Check if the context is already queued */
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		WARN_ON(!chap->buf);
+		if (chap->qid == qid) {
+			dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
+			mutex_unlock(&ctrl->dhchap_auth_mutex);
+			flush_work(&chap->auth_work);
+			__nvme_auth_reset(chap);
+			queue_work(nvme_wq, &chap->auth_work);
+			return 0;
+		}
+	}
+	chap = kzalloc(sizeof(*chap), GFP_KERNEL);
+	if (!chap) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		return -ENOMEM;
+	}
+	chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
+	chap->ctrl = ctrl;
+
+	/*
+	 * Allocate a large enough buffer for the entire negotiation:
+	 * 4k should be enough to ffdhe8192.
+	 */
+	chap->buf_size = 4096;
+	chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
+	if (!chap->buf) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		kfree(chap);
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&chap->auth_work, __nvme_auth_work);
+	list_add(&chap->entry, &ctrl->dhchap_auth_list);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	queue_work(nvme_wq, &chap->auth_work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_negotiate);
+
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+	int ret;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		if (chap->qid != qid)
+			continue;
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		ret = chap->error;
+		return ret;
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_wait);
+
+void nvme_auth_reset(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		__nvme_auth_reset(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_reset);
+
+static void nvme_dhchap_auth_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, dhchap_auth_work);
+	int ret, q;
+
+	/* Authenticate admin queue first */
+	ret = nvme_auth_negotiate(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: error %d setting up authentication\n", ret);
+		return;
+	}
+	ret = nvme_auth_wait(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: authentication failed\n");
+		return;
+	}
+
+	for (q = 1; q < ctrl->queue_count; q++) {
+		ret = nvme_auth_negotiate(ctrl, q);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: error %d setting up authentication\n",
+				 q, ret);
+			break;
+		}
+	}
+
+	/*
+	 * Failure is a soft-state; credentials remain valid until
+	 * the controller terminates the connection.
+	 */
+}
+
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+	INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
+	INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+	mutex_init(&ctrl->dhchap_auth_mutex);
+	if (!ctrl->opts)
+		return;
+	nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
+	nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
+
+void nvme_auth_stop(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	cancel_work_sync(&ctrl->dhchap_auth_work);
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
+		cancel_work_sync(&chap->auth_work);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_stop);
+
+void nvme_auth_free(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
+		list_del_init(&chap->entry);
+		flush_work(&chap->auth_work);
+		__nvme_auth_free(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free);
+1 -2
drivers/nvme/host/constants.c
···
 #include "nvme.h"

-#ifdef CONFIG_NVME_VERBOSE_ERRORS
 static const char * const nvme_ops[] = {
 	[nvme_cmd_flush] = "Flush",
 	[nvme_cmd_write] = "Write",
···
 		return nvme_ops[opcode];
 	return "Unknown";
 }
+EXPORT_SYMBOL_GPL(nvme_get_opcode_str);

 const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
 {
···
 		return nvme_admin_ops[opcode];
 	return "Unknown";
 }
-#endif /* CONFIG_NVME_VERBOSE_ERRORS */
+360 -148
drivers/nvme/host/core.c
···
 #include "nvme.h"
 #include "fabrics.h"
+#include <linux/nvme-auth.h>

 #define CREATE_TRACE_POINTS
 #include "trace.h"

 #define NVME_MINORS	(1U << MINORBITS)
+
+struct nvme_ns_info {
+	struct nvme_ns_ids ids;
+	u32 nsid;
+	__le32 anagrpid;
+	bool is_shared;
+	bool is_readonly;
+	bool is_ready;
+};

 unsigned int admin_timeout = 60;
 module_param(admin_timeout, uint, 0644);
···
 	COMPLETE,
 	RETRY,
 	FAILOVER,
+	AUTHENTICATE,
 };

 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 {
 	if (likely(nvme_req(req)->status == 0))
 		return COMPLETE;
+
+	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+		return AUTHENTICATE;

 	if (blk_noretry_request(req) ||
 	    (nvme_req(req)->status & NVME_SC_DNR) ||
···
 void nvme_complete_rq(struct request *req)
 {
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);

-	if (nvme_req(req)->ctrl->kas)
-		nvme_req(req)->ctrl->comp_seen = true;
+	if (ctrl->kas)
+		ctrl->comp_seen = true;

 	switch (nvme_decide_disposition(req)) {
 	case COMPLETE:
···
 		return;
 	case FAILOVER:
 		nvme_failover_req(req);
+		return;
+	case AUTHENTICATE:
+#ifdef CONFIG_NVME_AUTH
+		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+		nvme_retry_req(req);
+#else
+		nvme_end_req(req);
+#endif
 		return;
 	}
 }
···
 	switch (ctrl->state) {
 	case NVME_CTRL_CONNECTING:
 		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-		    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+		    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
+		     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
+		     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 			return true;
 		break;
 	default:
···
  */
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		union nvme_result *result, void *buffer, unsigned bufflen,
-		unsigned timeout, int qid, int at_head,
-		blk_mq_req_flags_t flags)
+		int qid, int at_head, blk_mq_req_flags_t flags)
 {
 	struct request *req;
 	int ret;
···
 		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
 	else
 		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
-				qid ? qid - 1 : 0);
+				qid - 1);

 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	nvme_init_request(req, cmd);
-
-	if (timeout)
-		req->timeout = timeout;

 	if (buffer && bufflen) {
 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
···
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void *buffer, unsigned bufflen)
 {
-	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
+	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
 			NVME_QID_ANY, 0, 0);
 }
 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
···
 	}
 }

-static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
-		struct nvme_ns_ids *ids)
+static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
 {
 	struct nvme_command c = { };
 	bool csi_seen = false;
···
 		return 0;

 	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.nsid = cpu_to_le32(info->nsid);
 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;

 	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
···
 	if (status) {
 		dev_warn(ctrl->device,
 			"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
-			nsid, status);
+			info->nsid, status);
 		goto free_data;
 	}
···
 		if (cur->nidl == 0)
 			break;

-		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
+		len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
 		if (len < 0)
 			break;
···

 	if (nvme_multi_css(ctrl) && !csi_seen) {
 		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
-			nsid);
+			info->nsid);
 		status = -EINVAL;
 	}
···
 }

 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
-		struct nvme_ns_ids *ids, struct nvme_id_ns **id)
+		struct nvme_id_ns **id)
 {
 	struct nvme_command c = { };
 	int error;
···
 	error = NVME_SC_INVALID_NS | NVME_SC_DNR;
 	if ((*id)->ncap == 0) /* namespace not allocated or attached */
 		goto out_free_id;
-
-
-	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
-		dev_info(ctrl->device,
-			"Ignoring bogus Namespace Identifiers\n");
-	} else {
-		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
-		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
-			memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
-		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
-		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
-			memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
-	}
-
 	return 0;

 out_free_id:
···
 	return error;
 }

-static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
-		struct nvme_id_ns_cs_indep **id)
+static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
 {
+	struct nvme_ns_ids *ids = &info->ids;
+	struct nvme_id_ns *id;
+	int ret;
+
+	ret = nvme_identify_ns(ctrl, info->nsid, &id);
+	if (ret)
+		return ret;
+	info->anagrpid = id->anagrpid;
+	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+	info->is_ready = true;
+	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
+		dev_info(ctrl->device,
+			"Ignoring bogus Namespace Identifiers\n");
+	} else {
+		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+			memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
+		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
+	}
+	kfree(id);
+	return 0;
+}
+
+static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
+{
+	struct nvme_id_ns_cs_indep *id;
 	struct nvme_command c = {
 		.identify.opcode	= nvme_admin_identify,
-		.identify.nsid		= cpu_to_le32(nsid),
+		.identify.nsid		= cpu_to_le32(info->nsid),
 		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
 	};
 	int ret;

-	*id = kmalloc(sizeof(**id), GFP_KERNEL);
-	if (!*id)
+	id = kmalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
 		return -ENOMEM;

-	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
-	if (ret) {
-		dev_warn(ctrl->device,
-			"Identify namespace (CS independent) failed (%d)\n",
-			ret);
-		kfree(*id);
-		return ret;
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+	if (!ret) {
+		info->anagrpid = id->anagrpid;
+		info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
+		info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
+		info->is_ready = id->nstat & NVME_NSTAT_NRDY;
 	}
-
-	return 0;
+	kfree(id);
+	return ret;
 }

 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
···
 	c.features.dword11 = cpu_to_le32(dword11);

 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
-			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+			buffer, buflen, NVME_QID_ANY, 0, 0);
 	if (ret >= 0 && result)
 		*result = le32_to_cpu(res.u32);
 	return ret;
···
 			ns->ctrl->max_zeroes_sectors);
 }

+static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+	return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
+}
+
 static inline bool nvme_first_scan(struct gendisk *disk)
 {
 	/* nvme_alloc_ns() scans the disk prior to adding it */
···
 		blk_queue_chunk_sectors(ns->queue, iob);
 }

-static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_update_ns_info_generic(struct nvme_ns *ns,
+		struct nvme_ns_info *info)
 {
-	unsigned lbaf = nvme_lbaf_index(id->flbas);
+	blk_mq_freeze_queue(ns->disk->queue);
+	nvme_set_queue_limits(ns->ctrl, ns->queue);
+	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+	blk_mq_unfreeze_queue(ns->disk->queue);
+
+	if (nvme_ns_head_multipath(ns->head)) {
+		blk_mq_freeze_queue(ns->head->disk->queue);
+		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+		nvme_mpath_revalidate_paths(ns);
+		blk_stack_limits(&ns->head->disk->queue->limits,
+				 &ns->queue->limits, 0);
+		ns->head->disk->flags |= GENHD_FL_HIDDEN;
+		blk_mq_unfreeze_queue(ns->head->disk->queue);
+	}
+
+	/* Hide the block-interface for these devices */
+	ns->disk->flags |= GENHD_FL_HIDDEN;
+	set_bit(NVME_NS_READY, &ns->flags);
+
+	return 0;
+}
+
+static int nvme_update_ns_info_block(struct nvme_ns *ns,
+		struct nvme_ns_info *info)
+{
+	struct nvme_id_ns *id;
+	unsigned lbaf;
 	int ret;

+	ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
+	if (ret)
+		return ret;
+
 	blk_mq_freeze_queue(ns->disk->queue);
+	lbaf = nvme_lbaf_index(id->flbas);
 	ns->lba_shift = id->lbaf[lbaf].ds;
 	nvme_set_queue_limits(ns->ctrl, ns->queue);
···
 	if (ns->head->ids.csi == NVME_CSI_ZNS) {
 		ret = nvme_update_zone_info(ns, lbaf);
-		if (ret)
-			goto out_unfreeze;
+		if (ret) {
+			blk_mq_unfreeze_queue(ns->disk->queue);
+			goto out;
+		}
 	}

-	set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
-		test_bit(NVME_NS_FORCE_RO, &ns->flags));
+	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
 	set_bit(NVME_NS_READY, &ns->flags);
 	blk_mq_unfreeze_queue(ns->disk->queue);

 	if (blk_queue_is_zoned(ns->queue)) {
 		ret = nvme_revalidate_zones(ns);
 		if (ret && !nvme_first_scan(ns->disk))
-			return ret;
+			goto out;
 	}

 	if (nvme_ns_head_multipath(ns->head)) {
 		blk_mq_freeze_queue(ns->head->disk->queue);
 		nvme_update_disk_info(ns->head->disk, ns, id);
-		set_disk_ro(ns->head->disk,
-			    (id->nsattr & NVME_NS_ATTR_RO) ||
-				    test_bit(NVME_NS_FORCE_RO, &ns->flags));
+		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
 		nvme_mpath_revalidate_paths(ns);
 		blk_stack_limits(&ns->head->disk->queue->limits,
 				 &ns->queue->limits, 0);
 		disk_update_readahead(ns->head->disk);
 		blk_mq_unfreeze_queue(ns->head->disk->queue);
 	}
-	return 0;

-out_unfreeze:
+	ret = 0;
+out:
 	/*
 	 * If probing fails due an unsupported feature, hide the block device,
 	 * but still allow other access.
···
 		set_bit(NVME_NS_READY, &ns->flags);
 		ret = 0;
 	}
-	blk_mq_unfreeze_queue(ns->disk->queue);
+	kfree(id);
 	return ret;
+}
+
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+	switch (info->ids.csi) {
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			dev_info(ns->ctrl->device,
+	"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+				info->nsid);
+			return nvme_update_ns_info_generic(ns, info);
+		}
+		return nvme_update_ns_info_block(ns, info);
+	case NVME_CSI_NVM:
+		return nvme_update_ns_info_block(ns, info);
+	default:
+		dev_info(ns->ctrl->device,
+			"block device for nsid %u not supported (csi %u)\n",
+			info->nsid, info->ids.csi);
+		return nvme_update_ns_info_generic(ns, info);
+	}
 }

 static char nvme_pr_type(enum pr_type type)
···
 	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
 	cmd.common.cdw11 = cpu_to_le32(len);

-	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
+	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
 			NVME_QID_ANY, 1, 0);
 }
 EXPORT_SYMBOL_GPL(nvme_sec_submit);
···
 static const struct block_device_operations nvme_bdev_ops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
 	.open		= nvme_open,
 	.release	= nvme_release,
 	.getgeo		= nvme_getgeo,
···
 }
 static DEVICE_ATTR_RO(dctype);

+#ifdef CONFIG_NVME_AUTH
+static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_secret);
+		opts->dhchap_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_ctrl_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_ctrl_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_ctrl_secret);
+		opts->dhchap_ctrl_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
+#endif
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
···
 	&dev_attr_kato.attr,
 	&dev_attr_cntrltype.attr,
 	&dev_attr_dctype.attr,
+#ifdef CONFIG_NVME_AUTH
+	&dev_attr_dhchap_secret.attr,
+	&dev_attr_dhchap_ctrl_secret.attr,
+#endif
 	NULL
 };
···
 		return 0;
 	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
 		return 0;
+#ifdef CONFIG_NVME_AUTH
+	if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
+		return 0;
+#endif

 	return a->mode;
 }
···
 }

 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
-		unsigned nsid, struct nvme_ns_ids *ids, bool is_shared)
+		struct nvme_ns_info *info)
 {
 	struct nvme_ns_head *head;
 	size_t size = sizeof(*head);
···
 	if (ret)
 		goto out_ida_remove;
 	head->subsys = ctrl->subsys;
-	head->ns_id = nsid;
-	head->ids = *ids;
-	head->shared = is_shared;
+	head->ns_id = info->nsid;
+	head->ids = info->ids;
+	head->shared = info->is_shared;
 	kref_init(&head->ref);

 	if (head->ids.csi) {
···
 	return ret;
 }

-static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
-		struct nvme_ns_ids *ids, bool is_shared)
+static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 {
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_ns_head *head = NULL;
 	int ret;

-	ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
+	ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
 	if (ret) {
 		dev_err(ctrl->device,
-			"globally duplicate IDs for nsid %d\n", nsid);
+			"globally duplicate IDs for nsid %d\n", info->nsid);
 		nvme_print_device_info(ctrl);
 		return ret;
 	}

 	mutex_lock(&ctrl->subsys->lock);
-	head = nvme_find_ns_head(ctrl, nsid);
+	head = nvme_find_ns_head(ctrl, info->nsid);
 	if (!head) {
-		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
+		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
 		if (ret) {
 			dev_err(ctrl->device,
 				"duplicate IDs in subsystem for nsid %d\n",
-				nsid);
+				info->nsid);
 			goto out_unlock;
 		}
-		head = nvme_alloc_ns_head(ctrl, nsid, ids,
is_shared); 3894 + head = nvme_alloc_ns_head(ctrl, info); 4103 3895 if (IS_ERR(head)) { 4104 3896 ret = PTR_ERR(head); 4105 3897 goto out_unlock; 4106 3898 } 4107 3899 } else { 4108 3900 ret = -EINVAL; 4109 - if (!is_shared || !head->shared) { 3901 + if (!info->is_shared || !head->shared) { 4110 3902 dev_err(ctrl->device, 4111 - "Duplicate unshared namespace %d\n", nsid); 3903 + "Duplicate unshared namespace %d\n", 3904 + info->nsid); 4112 3905 goto out_put_ns_head; 4113 3906 } 4114 - if (!nvme_ns_ids_equal(&head->ids, ids)) { 3907 + if (!nvme_ns_ids_equal(&head->ids, &info->ids)) { 4115 3908 dev_err(ctrl->device, 4116 3909 "IDs don't match for shared namespace %d\n", 4117 - nsid); 3910 + info->nsid); 4118 3911 goto out_put_ns_head; 4119 3912 } 4120 3913 4121 3914 if (!multipath && !list_empty(&head->list)) { 4122 3915 dev_warn(ctrl->device, 4123 3916 "Found shared namespace %d, but multipathing not supported.\n", 4124 - nsid); 3917 + info->nsid); 4125 3918 dev_warn_once(ctrl->device, 4126 3919 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n."); 4127 3920 } ··· 4175 3968 list_add(&ns->list, &ns->ctrl->namespaces); 4176 3969 } 4177 3970 4178 - static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, 4179 - struct nvme_ns_ids *ids) 3971 + static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) 4180 3972 { 4181 3973 struct nvme_ns *ns; 4182 3974 struct gendisk *disk; 4183 - struct nvme_id_ns *id; 4184 3975 int node = ctrl->numa_node; 4185 - 4186 - if (nvme_identify_ns(ctrl, nsid, ids, &id)) 4187 - return; 4188 3976 4189 3977 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 4190 3978 if (!ns) 4191 - goto out_free_id; 3979 + return; 4192 3980 4193 3981 disk = blk_mq_alloc_disk(ctrl->tagset, ns); 4194 3982 if (IS_ERR(disk)) ··· 4204 4002 ns->ctrl = ctrl; 4205 4003 kref_init(&ns->kref); 4206 4004 4207 - if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) 4005 + if 
(nvme_init_ns_head(ns, info)) 4208 4006 goto out_cleanup_disk; 4209 4007 4210 4008 /* ··· 4230 4028 ns->head->instance); 4231 4029 } 4232 4030 4233 - if (nvme_update_ns_info(ns, id)) 4031 + if (nvme_update_ns_info(ns, info)) 4234 4032 goto out_unlink_ns; 4235 4033 4236 4034 down_write(&ctrl->namespaces_rwsem); ··· 4244 4042 if (!nvme_ns_head_multipath(ns->head)) 4245 4043 nvme_add_ns_cdev(ns); 4246 4044 4247 - nvme_mpath_add_disk(ns, id); 4045 + nvme_mpath_add_disk(ns, info->anagrpid); 4248 4046 nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); 4249 - kfree(id); 4250 4047 4251 4048 return; 4252 4049 ··· 4265 4064 put_disk(disk); 4266 4065 out_free_ns: 4267 4066 kfree(ns); 4268 - out_free_id: 4269 - kfree(id); 4270 4067 } 4271 4068 4272 4069 static void nvme_ns_remove(struct nvme_ns *ns) ··· 4322 4123 } 4323 4124 } 4324 4125 4325 - static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) 4126 + static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) 4326 4127 { 4327 - struct nvme_id_ns *id; 4328 4128 int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; 4329 4129 4330 4130 if (test_bit(NVME_NS_DEAD, &ns->flags)) 4331 4131 goto out; 4332 4132 4333 - ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); 4334 - if (ret) 4335 - goto out; 4336 - 4337 4133 ret = NVME_SC_INVALID_NS | NVME_SC_DNR; 4338 - if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { 4134 + if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) { 4339 4135 dev_err(ns->ctrl->device, 4340 4136 "identifiers changed for nsid %d\n", ns->head->ns_id); 4341 - goto out_free_id; 4137 + goto out; 4342 4138 } 4343 4139 4344 - ret = nvme_update_ns_info(ns, id); 4345 - 4346 - out_free_id: 4347 - kfree(id); 4140 + ret = nvme_update_ns_info(ns, info); 4348 4141 out: 4349 4142 /* 4350 4143 * Only remove the namespace if we got a fatal error back from the ··· 4348 4157 nvme_ns_remove(ns); 4349 4158 } 4350 4159 4351 - static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, 
unsigned nsid) 4160 + static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) 4352 4161 { 4353 - struct nvme_ns_ids ids = { }; 4354 - struct nvme_id_ns_cs_indep *id; 4162 + struct nvme_ns_info info = { .nsid = nsid }; 4355 4163 struct nvme_ns *ns; 4356 - bool ready = true; 4357 4164 4358 - if (nvme_identify_ns_descs(ctrl, nsid, &ids)) 4165 + if (nvme_identify_ns_descs(ctrl, &info)) 4359 4166 return; 4360 4167 4361 - /* 4362 - * Check if the namespace is ready. If not ignore it, we will get an 4363 - * AEN once it becomes ready and restart the scan. 4364 - */ 4365 - if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) && 4366 - !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) { 4367 - ready = id->nstat & NVME_NSTAT_NRDY; 4368 - kfree(id); 4168 + if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) { 4169 + dev_warn(ctrl->device, 4170 + "command set not reported for nsid: %d\n", nsid); 4171 + return; 4369 4172 } 4370 4173 4371 - if (!ready) 4174 + /* 4175 + * If available try to use the Command Set Idependent Identify Namespace 4176 + * data structure to find all the generic information that is needed to 4177 + * set up a namespace. If not fall back to the legacy version. 4178 + */ 4179 + if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) || 4180 + (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) { 4181 + if (nvme_ns_info_from_id_cs_indep(ctrl, &info)) 4182 + return; 4183 + } else { 4184 + if (nvme_ns_info_from_identify(ctrl, &info)) 4185 + return; 4186 + } 4187 + 4188 + /* 4189 + * Ignore the namespace if it is not ready. We will get an AEN once it 4190 + * becomes ready and restart the scan. 
4191 + */ 4192 + if (!info.is_ready) 4372 4193 return; 4373 4194 4374 4195 ns = nvme_find_get_ns(ctrl, nsid); 4375 4196 if (ns) { 4376 - nvme_validate_ns(ns, &ids); 4197 + nvme_validate_ns(ns, &info); 4377 4198 nvme_put_ns(ns); 4378 - return; 4379 - } 4380 - 4381 - switch (ids.csi) { 4382 - case NVME_CSI_NVM: 4383 - nvme_alloc_ns(ctrl, nsid, &ids); 4384 - break; 4385 - case NVME_CSI_ZNS: 4386 - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { 4387 - dev_warn(ctrl->device, 4388 - "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", 4389 - nsid); 4390 - break; 4391 - } 4392 - if (!nvme_multi_css(ctrl)) { 4393 - dev_warn(ctrl->device, 4394 - "command set not reported for nsid: %d\n", 4395 - nsid); 4396 - break; 4397 - } 4398 - nvme_alloc_ns(ctrl, nsid, &ids); 4399 - break; 4400 - default: 4401 - dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", 4402 - ids.csi, nsid); 4403 - break; 4199 + } else { 4200 + nvme_alloc_ns(ctrl, &info); 4404 4201 } 4405 4202 } 4406 4203 ··· 4444 4265 4445 4266 if (!nsid) /* end of the list? 
*/ 4446 4267 goto out; 4447 - nvme_validate_or_alloc_ns(ctrl, nsid); 4268 + nvme_scan_ns(ctrl, nsid); 4448 4269 while (++prev < nsid) 4449 4270 nvme_ns_remove_by_nsid(ctrl, prev); 4450 4271 } ··· 4467 4288 kfree(id); 4468 4289 4469 4290 for (i = 1; i <= nn; i++) 4470 - nvme_validate_or_alloc_ns(ctrl, i); 4291 + nvme_scan_ns(ctrl, i); 4471 4292 4472 4293 nvme_remove_invalid_namespaces(ctrl, nn); 4473 4294 } ··· 4704 4525 nvme_get_fw_slot_info(ctrl); 4705 4526 } 4706 4527 4528 + static u32 nvme_aer_type(u32 result) 4529 + { 4530 + return result & 0x7; 4531 + } 4532 + 4533 + static u32 nvme_aer_subtype(u32 result) 4534 + { 4535 + return (result & 0xff00) >> 8; 4536 + } 4537 + 4707 4538 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) 4708 4539 { 4709 - u32 aer_notice_type = (result & 0xff00) >> 8; 4540 + u32 aer_notice_type = nvme_aer_subtype(result); 4710 4541 4711 4542 trace_nvme_async_event(ctrl, aer_notice_type); 4712 4543 ··· 4731 4542 * recovery actions from interfering with the controller's 4732 4543 * firmware activation. 
4733 4544 */ 4734 - if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 4545 + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) { 4546 + nvme_auth_stop(ctrl); 4735 4547 queue_work(nvme_wq, &ctrl->fw_act_work); 4548 + } 4736 4549 break; 4737 4550 #ifdef CONFIG_NVME_MULTIPATH 4738 4551 case NVME_AER_NOTICE_ANA: ··· 4751 4560 } 4752 4561 } 4753 4562 4563 + static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl) 4564 + { 4565 + trace_nvme_async_event(ctrl, NVME_AER_ERROR); 4566 + dev_warn(ctrl->device, "resetting controller due to AER\n"); 4567 + nvme_reset_ctrl(ctrl); 4568 + } 4569 + 4754 4570 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, 4755 4571 volatile union nvme_result *res) 4756 4572 { 4757 4573 u32 result = le32_to_cpu(res->u32); 4758 - u32 aer_type = result & 0x07; 4574 + u32 aer_type = nvme_aer_type(result); 4575 + u32 aer_subtype = nvme_aer_subtype(result); 4759 4576 4760 4577 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) 4761 4578 return; ··· 4773 4574 nvme_handle_aen_notice(ctrl, result); 4774 4575 break; 4775 4576 case NVME_AER_ERROR: 4577 + /* 4578 + * For a persistent internal error, don't run async_event_work 4579 + * to submit a new AER. The controller reset will do it. 
4580 + */ 4581 + if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) { 4582 + nvme_handle_aer_persistent_error(ctrl); 4583 + return; 4584 + } 4585 + fallthrough; 4776 4586 case NVME_AER_SMART: 4777 4587 case NVME_AER_CSS: 4778 4588 case NVME_AER_VS: ··· 4798 4590 void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 4799 4591 { 4800 4592 nvme_mpath_stop(ctrl); 4593 + nvme_auth_stop(ctrl); 4801 4594 nvme_stop_keep_alive(ctrl); 4802 4595 nvme_stop_failfast_work(ctrl); 4803 4596 flush_work(&ctrl->async_event_work); ··· 4858 4649 4859 4650 nvme_free_cels(ctrl); 4860 4651 nvme_mpath_uninit(ctrl); 4652 + nvme_auth_stop(ctrl); 4653 + nvme_auth_free(ctrl); 4861 4654 __free_page(ctrl->discard_page); 4862 4655 4863 4656 if (subsys) { ··· 4950 4739 4951 4740 nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); 4952 4741 nvme_mpath_init_ctrl(ctrl); 4742 + nvme_auth_init_ctrl(ctrl); 4953 4743 4954 4744 return 0; 4955 4745 out_free_name:
+86 -8
drivers/nvme/host/fabrics.c
···
 	cmd.prop_get.fctype = nvme_fabrics_type_property_get;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
···
 	cmd.prop_get.attrib = 1;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
···
 	cmd.prop_set.offset = cpu_to_le32(off);
 	cmd.prop_set.value = cpu_to_le64(val);
 
-	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0,
 			NVME_QID_ANY, 0, 0);
 	if (unlikely(ret))
 		dev_err(ctrl->device,
···
 		dev_err(ctrl->device,
 			"Connect command failed: host path error\n");
 		break;
+	case NVME_SC_AUTH_REQUIRED:
+		dev_err(ctrl->device,
+			"Connect command failed: authentication required\n");
+		break;
 	default:
 		dev_err(ctrl->device,
 			"Connect command failed, error wo/DNR bit: %d\n",
···
 	union nvme_result res;
 	struct nvmf_connect_data *data;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
···
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
 	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
-			data, sizeof(*data), 0, NVME_QID_ANY, 1,
+			data, sizeof(*data), NVME_QID_ANY, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
				&cmd, data);
 		goto out_free_data;
 	}
 
-	ctrl->cntlid = le16_to_cpu(res.u16);
-
+	result = le32_to_cpu(res.u32);
+	ctrl->cntlid = result & 0xFFFF;
+	if ((result >> 16) & 0x3) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, 0);
+		if (ret) {
+			dev_warn(ctrl->device,
+				"qid 0: authentication setup failed\n");
+			ret = NVME_SC_AUTH_REQUIRED;
+			goto out_free_data;
+		}
+		ret = nvme_auth_wait(ctrl, 0);
+		if (ret)
+			dev_warn(ctrl->device,
+				"qid 0: authentication failed\n");
+		else
+			dev_info(ctrl->device,
+				"qid 0: authenticated\n");
+	}
 out_free_data:
 	kfree(data);
 	return ret;
···
 	struct nvmf_connect_data *data;
 	union nvme_result res;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
···
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
 	ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
-			data, sizeof(*data), 0, qid, 1,
+			data, sizeof(*data), qid, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				&cmd, data);
+	}
+	result = le32_to_cpu(res.u32);
+	if ((result >> 16) & 2) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, qid);
+		if (ret) {
+			dev_warn(ctrl->device,
+				"qid %d: authentication setup failed\n", qid);
+			ret = NVME_SC_AUTH_REQUIRED;
+		} else {
+			ret = nvme_auth_wait(ctrl, qid);
+			if (ret)
+				dev_warn(ctrl->device,
+					"qid %u: authentication failed\n", qid);
+		}
 	}
 	kfree(data);
 	return ret;
···
 	{ NVMF_OPT_TOS,			"tos=%d" },
 	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d" },
 	{ NVMF_OPT_DISCOVERY,		"discovery" },
+	{ NVMF_OPT_DHCHAP_SECRET,	"dhchap_secret=%s" },
+	{ NVMF_OPT_DHCHAP_CTRL_SECRET,	"dhchap_ctrl_secret=%s" },
 	{ NVMF_OPT_ERR,			NULL }
 };
···
 	case NVMF_OPT_DISCOVERY:
 		opts->discovery_nqn = true;
 		break;
+	case NVMF_OPT_DHCHAP_SECRET:
+		p = match_strdup(args);
+		if (!p) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+			pr_err("Invalid DH-CHAP secret %s\n", p);
+			ret = -EINVAL;
+			goto out;
+		}
+		kfree(opts->dhchap_secret);
+		opts->dhchap_secret = p;
+		break;
+	case NVMF_OPT_DHCHAP_CTRL_SECRET:
+		p = match_strdup(args);
+		if (!p) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+			pr_err("Invalid DH-CHAP secret %s\n", p);
+			ret = -EINVAL;
+			goto out;
+		}
+		kfree(opts->dhchap_ctrl_secret);
+		opts->dhchap_ctrl_secret = p;
+		break;
 	default:
 		pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 			p);
···
 	kfree(opts->subsysnqn);
 	kfree(opts->host_traddr);
 	kfree(opts->host_iface);
+	kfree(opts->dhchap_secret);
+	kfree(opts->dhchap_ctrl_secret);
 	kfree(opts);
 }
 EXPORT_SYMBOL_GPL(nvmf_free_options);
···
 		NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 		NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
 		NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
-		NVMF_OPT_FAIL_FAST_TMO)
+		NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
+		NVMF_OPT_DHCHAP_CTRL_SECRET)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
···
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
 }
 
 MODULE_LICENSE("GPL v2");
+7
drivers/nvme/host/fabrics.h
···
 	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
+	NVMF_OPT_DHCHAP_SECRET	= 1 << 23,
+	NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
 };
 
 /**
···
 * @max_reconnects: maximum number of allowed reconnect attempts before removing
 *	the controller, (-1) means reconnect forever, zero means remove
 *	immediately;
+ * @dhchap_secret: DH-HMAC-CHAP secret
+ * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
+ *	authentication
 * @disable_sqflow: disable controller sq flow control
 * @hdr_digest: generate/verify header digest (TCP)
 * @data_digest: generate/verify data digest (TCP)
···
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	char			*dhchap_secret;
+	char			*dhchap_ctrl_secret;
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
+5 -4
drivers/nvme/host/multipath.c
···
 	 * different queue via blk_steal_bios(), so we need to use the bio_split
 	 * pool from the original queue to allocate the bvecs from.
 	 */
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
···
 	.open		= nvme_ns_head_open,
 	.release	= nvme_ns_head_release,
 	.ioctl		= nvme_ns_head_ioctl,
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
 	.getgeo		= nvme_getgeo,
 	.report_zones	= nvme_ns_head_report_zones,
 	.pr_ops		= &nvme_pr_ops,
···
 		return -ENXIO; /* just break out of the loop */
 }
 
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 {
 	if (nvme_ctrl_use_ana(ns->ctrl)) {
 		struct nvme_ana_group_desc desc = {
-			.grpid = id->anagrpid,
+			.grpid = anagrpid,
 			.state = 0,
 		};
 
 		mutex_lock(&ns->ctrl->ana_lock);
-		ns->ana_grpid = le32_to_cpu(id->anagrpid);
+		ns->ana_grpid = le32_to_cpu(anagrpid);
 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
 		mutex_unlock(&ns->ctrl->ana_lock);
 		if (desc.state) {
+34 -5
drivers/nvme/host/nvme.h
···
 	NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16),
 
 	/*
-	 * The controller requires the command_id value be be limited, so skip
+	 * The controller requires the command_id value be limited, so skip
 	 * encoding the generation sequence number.
 	 */
 	NVME_QUIRK_SKIP_CID_GEN = (1 << 17),
···
 	size_t ana_log_size;
 	struct timer_list anatt_timer;
 	struct work_struct ana_work;
+#endif
+
+#ifdef CONFIG_NVME_AUTH
+	struct work_struct dhchap_auth_work;
+	struct list_head dhchap_auth_list;
+	struct mutex dhchap_auth_mutex;
+	struct nvme_dhchap_key *host_key;
+	struct nvme_dhchap_key *ctrl_key;
+	u16 transaction;
 #endif
 
 	/* Power saving configuration */
···
 		void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		union nvme_result *result, void *buffer, unsigned bufflen,
-		unsigned timeout, int qid, int at_head,
+		int qid, int at_head,
 		blk_mq_req_flags_t flags);
 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
 		unsigned int dword11, void *buffer, size_t buflen,
···
 void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
+void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
···
 {
 	return 0;
 }
-static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
-		struct nvme_id_ns *id)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 {
 }
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
···
 {
 	return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
+
+#ifdef CONFIG_NVME_AUTH
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_auth_stop(struct nvme_ctrl *ctrl);
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
+void nvme_auth_reset(struct nvme_ctrl *ctrl);
+void nvme_auth_free(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	return -EPROTONOSUPPORT;
+}
+static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	return NVME_SC_AUTH_REQUIRED;
+}
+static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+#endif
 
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
+74 -71
drivers/nvme/host/pci.c
···
 
 	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
 	if (!prp_list) {
-		iod->first_dma = dma_addr;
 		iod->npages = -1;
 		return BLK_STS_RESOURCE;
 	}
···
 	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
 
 	dev_warn(nvmeq->dev->ctrl.device,
-		"I/O %d QID %d timeout, aborting\n",
-		 req->tag, nvmeq->qid);
+		"I/O %d (%s) QID %d timeout, aborting\n",
+		 req->tag,
+		 nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
+		 nvmeq->qid);
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
 					 BLK_MQ_REQ_NOWAIT);
···
 	}
 }
 
-static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
 {
-	if (!dev->ctrl.admin_q) {
-		dev->admin_tagset.ops = &nvme_mq_admin_ops;
-		dev->admin_tagset.nr_hw_queues = 1;
+	struct blk_mq_tag_set *set = &dev->admin_tagset;
 
-		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
-		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
-		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
-		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
-		dev->admin_tagset.driver_data = dev;
+	set->ops = &nvme_mq_admin_ops;
+	set->nr_hw_queues = 1;
 
-		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
-			return -ENOMEM;
-		dev->ctrl.admin_tagset = &dev->admin_tagset;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->timeout = NVME_ADMIN_TIMEOUT;
+	set->numa_node = dev->ctrl.numa_node;
+	set->cmd_size = sizeof(struct nvme_iod);
+	set->flags = BLK_MQ_F_NO_SCHED;
+	set->driver_data = dev;
 
-		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
-		if (IS_ERR(dev->ctrl.admin_q)) {
-			blk_mq_free_tag_set(&dev->admin_tagset);
-			dev->ctrl.admin_q = NULL;
-			return -ENOMEM;
-		}
-		if (!blk_get_queue(dev->ctrl.admin_q)) {
-			nvme_dev_remove_admin(dev);
-			dev->ctrl.admin_q = NULL;
-			return -ENODEV;
-		}
-	} else
-		nvme_start_admin_queue(&dev->ctrl);
+	if (blk_mq_alloc_tag_set(set))
+		return -ENOMEM;
+	dev->ctrl.admin_tagset = set;
 
+	dev->ctrl.admin_q = blk_mq_init_queue(set);
+	if (IS_ERR(dev->ctrl.admin_q)) {
+		blk_mq_free_tag_set(set);
+		dev->ctrl.admin_q = NULL;
+		return -ENOMEM;
+	}
+	if (!blk_get_queue(dev->ctrl.admin_q)) {
+		nvme_dev_remove_admin(dev);
+		dev->ctrl.admin_q = NULL;
+		return -ENODEV;
+	}
 	return 0;
 }
···
 	return true;
 }
 
-static void nvme_dev_add(struct nvme_dev *dev)
+static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
 {
+	struct blk_mq_tag_set * set = &dev->tagset;
 	int ret;
 
-	if (!dev->ctrl.tagset) {
-		dev->tagset.ops = &nvme_mq_ops;
-		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = 2; /* default + read */
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.nr_maps++;
-		dev->tagset.timeout = NVME_IO_TIMEOUT;
-		dev->tagset.numa_node = dev->ctrl.numa_node;
-		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
-						BLK_MQ_MAX_DEPTH) - 1;
-		dev->tagset.cmd_size = sizeof(struct nvme_iod);
-		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
-		dev->tagset.driver_data = dev;
+	set->ops = &nvme_mq_ops;
+	set->nr_hw_queues = dev->online_queues - 1;
+	set->nr_maps = 2; /* default + read */
+	if (dev->io_queues[HCTX_TYPE_POLL])
+		set->nr_maps++;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->numa_node = dev->ctrl.numa_node;
+	set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
+	set->cmd_size = sizeof(struct nvme_iod);
+	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	set->driver_data = dev;
 
-		/*
-		 * Some Apple controllers requires tags to be unique
-		 * across admin and IO queue, so reserve the first 32
-		 * tags of the IO queue.
-		 */
-		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
-			dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+	/*
+	 * Some Apple controllers requires tags to be unique
+	 * across admin and IO queue, so reserve the first 32
+	 * tags of the IO queue.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+		set->reserved_tags = NVME_AQ_DEPTH;
 
-		ret = blk_mq_alloc_tag_set(&dev->tagset);
-		if (ret) {
-			dev_warn(dev->ctrl.device,
-				"IO queues tagset allocation failed %d\n", ret);
-			return;
-		}
-		dev->ctrl.tagset = &dev->tagset;
-	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
+	ret = blk_mq_alloc_tag_set(set);
+	if (ret) {
+		dev_warn(dev->ctrl.device,
+			"IO queues tagset allocation failed %d\n", ret);
+		return;
 	}
+	dev->ctrl.tagset = set;
+}
 
-	nvme_dbbuf_set(dev);
+static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
+{
+	blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+	/* free previously allocated queues that are no longer usable */
+	nvme_free_queues(dev, dev->online_queues);
 }
 
 static int nvme_pci_enable(struct nvme_dev *dev)
···
 	nvme_pci_disable(dev);
 	nvme_reap_pending_cqes(dev);
 
-	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
-	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
-	blk_mq_tagset_wait_completed_request(&dev->tagset);
-	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
+	nvme_cancel_tagset(&dev->ctrl);
+	nvme_cancel_admin_tagset(&dev->ctrl);
 
 	/*
 	 * The driver will not be starting up queues again if shutting down so
···
 	if (result)
 		goto out_unlock;
 
-	result = nvme_alloc_admin_tags(dev);
-	if (result)
-		goto out_unlock;
+	if (!dev->ctrl.admin_q) {
+		result = nvme_pci_alloc_admin_tag_set(dev);
+		if (result)
+			goto out_unlock;
+	} else {
+		nvme_start_admin_queue(&dev->ctrl);
+	}
 
 	/*
 	 * Limit the max command size to prevent iod->sg allocations going
···
 	} else {
 		nvme_start_queues(&dev->ctrl);
 		nvme_wait_freeze(&dev->ctrl);
-		nvme_dev_add(dev);
+		if (!dev->ctrl.tagset)
+			nvme_pci_alloc_tag_set(dev);
+		else
+			nvme_pci_update_nr_queues(dev);
+		nvme_dbbuf_set(dev);
 		nvme_unfreeze(&dev->ctrl);
 	}
+52 -54
drivers/nvme/host/rdma.c
···
 #include "fabrics.h"


-#define NVME_RDMA_CONNECT_TIMEOUT_MS	3000		/* 3 second */
+#define NVME_RDMA_CM_TIMEOUT_MS		3000		/* 3 second */

 #define NVME_RDMA_MAX_SEGMENTS		256
···
 {
 	int ret;

-	ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
-			msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
-	if (ret < 0)
+	ret = wait_for_completion_interruptible(&queue->cm_done);
+	if (ret)
 		return ret;
-	if (ret == 0)
-		return -ETIMEDOUT;
 	WARN_ON_ONCE(queue->cm_error > 0);
 	return queue->cm_error;
 }
···
 	queue->cm_error = -ETIMEDOUT;
 	ret = rdma_resolve_addr(queue->cm_id, src_addr,
 			(struct sockaddr *)&ctrl->addr,
-			NVME_RDMA_CONNECT_TIMEOUT_MS);
+			NVME_RDMA_CM_TIMEOUT_MS);
 	if (ret) {
 		dev_info(ctrl->ctrl.device,
 			"rdma_resolve_addr failed (%d).\n", ret);
···
 	return ret;
 }

-static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
-		bool admin)
+static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-	struct blk_mq_tag_set *set;
+	struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
 	int ret;

-	if (admin) {
-		set = &ctrl->admin_tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_rdma_admin_mq_ops;
-		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->cmd_size = sizeof(struct nvme_rdma_request) +
-			NVME_RDMA_DATA_SGL_SIZE;
-		set->driver_data = ctrl;
-		set->nr_hw_queues = 1;
-		set->timeout = NVME_ADMIN_TIMEOUT;
-		set->flags = BLK_MQ_F_NO_SCHED;
-	} else {
-		set = &ctrl->tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_rdma_mq_ops;
-		set->queue_depth = nctrl->sqsize + 1;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_SHOULD_MERGE;
-		set->cmd_size = sizeof(struct nvme_rdma_request) +
-			NVME_RDMA_DATA_SGL_SIZE;
-		if (nctrl->max_integrity_segments)
-			set->cmd_size += sizeof(struct nvme_rdma_sgl) +
-				NVME_RDMA_METADATA_SGL_SIZE;
-		set->driver_data = ctrl;
-		set->nr_hw_queues = nctrl->queue_count - 1;
-		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
-	}
-
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_rdma_admin_mq_ops;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->cmd_size = sizeof(struct nvme_rdma_request) +
+			NVME_RDMA_DATA_SGL_SIZE;
+	set->driver_data = ctrl;
+	set->nr_hw_queues = 1;
+	set->timeout = NVME_ADMIN_TIMEOUT;
+	set->flags = BLK_MQ_F_NO_SCHED;
 	ret = blk_mq_alloc_tag_set(set);
-	if (ret)
-		return ERR_PTR(ret);
+	if (!ret)
+		ctrl->ctrl.admin_tagset = set;
+	return ret;
+}

-	return set;
+static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int ret;
+
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_rdma_mq_ops;
+	set->queue_depth = nctrl->sqsize + 1;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	set->cmd_size = sizeof(struct nvme_rdma_request) +
+			NVME_RDMA_DATA_SGL_SIZE;
+	if (nctrl->max_integrity_segments)
+		set->cmd_size += sizeof(struct nvme_rdma_sgl) +
+				 NVME_RDMA_METADATA_SGL_SIZE;
+	set->driver_data = ctrl;
+	set->nr_hw_queues = nctrl->queue_count - 1;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	ret = blk_mq_alloc_tag_set(set);
+	if (!ret)
+		ctrl->ctrl.tagset = set;
+	return ret;
 }

 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
···
 		goto out_free_queue;

 	if (new) {
-		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
-		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
-			error = PTR_ERR(ctrl->ctrl.admin_tagset);
+		error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl);
+		if (error)
 			goto out_free_async_qe;
-		}

 		ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 		if (IS_ERR(ctrl->ctrl.fabrics_q)) {
···
 		return ret;

 	if (new) {
-		ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
-		if (IS_ERR(ctrl->ctrl.tagset)) {
-			ret = PTR_ERR(ctrl->ctrl.tagset);
+		ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
+		if (ret)
 			goto out_free_io_queues;
-		}

 		ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
 		if (ret)
···
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 			struct nvme_rdma_ctrl, err_work);

+	nvme_auth_stop(&ctrl->ctrl);
 	nvme_stop_keep_alive(&ctrl->ctrl);
 	flush_work(&ctrl->ctrl.async_event_work);
 	nvme_rdma_teardown_io_queues(ctrl, false);
···
 	if (ctrl->opts->tos >= 0)
 		rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
-	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
+	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
 	if (ret) {
 		dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
 			queue->cm_error);
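The RDMA refactor above also swaps error-handling styles: the old combined helper returned a pointer or an `ERR_PTR()`-encoded errno, while the new split helpers return a plain `int` and publish the tag set only on success. A self-contained userspace sketch of the two conventions (the toy `ERR_PTR`/`IS_ERR` here mimic `<linux/err.h>`; `alloc_tagset_*` are hypothetical names, not driver functions):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Userspace imitation of the kernel's errno-in-pointer helpers. */
#define MAX_ERRNO 4095
static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

static int tag_set_storage;	/* stands in for a real tag set */

/* Old style: return the object, or an errno encoded in the pointer. */
static void *alloc_tagset_errptr(int fail)
{
	if (fail)
		return ERR_PTR(-12);	/* -ENOMEM */
	return &tag_set_storage;
}

/* New style (as in nvme_rdma_alloc_admin_tag_set): return an int
 * and hand the pointer back through an out-parameter on success. */
static int alloc_tagset_int(int fail, void **out)
{
	if (fail)
		return -12;	/* -ENOMEM */
	*out = &tag_set_storage;
	return 0;
}
```

The int-returning form lets callers collapse `IS_ERR`/`PTR_ERR` boilerplate into a single `if (ret) goto out;`, which is exactly the simplification visible in the configure-queue hunks.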
+49 -46
drivers/nvme/host/tcp.c
···
 	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 }

-static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
 {
-	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+	if (nvme_is_fabrics(req->req.cmd))
+		return NVME_TCP_ADMIN_CCSZ;
+	return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
 }

 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
···
 	rq = blk_mq_rq_from_pdu(req);

 	return rq_data_dir(rq) == WRITE && req->data_len &&
-		req->data_len <= nvme_tcp_inline_data_size(req->queue);
+		req->data_len <= nvme_tcp_inline_data_size(req);
 }

 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
···
 	return ret;
 }

-static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
-		bool admin)
+static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
-	struct blk_mq_tag_set *set;
+	struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
 	int ret;

-	if (admin) {
-		set = &ctrl->admin_tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_tcp_admin_mq_ops;
-		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_BLOCKING;
-		set->cmd_size = sizeof(struct nvme_tcp_request);
-		set->driver_data = ctrl;
-		set->nr_hw_queues = 1;
-		set->timeout = NVME_ADMIN_TIMEOUT;
-	} else {
-		set = &ctrl->tag_set;
-		memset(set, 0, sizeof(*set));
-		set->ops = &nvme_tcp_mq_ops;
-		set->queue_depth = nctrl->sqsize + 1;
-		set->reserved_tags = NVMF_RESERVED_TAGS;
-		set->numa_node = nctrl->numa_node;
-		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
-		set->cmd_size = sizeof(struct nvme_tcp_request);
-		set->driver_data = ctrl;
-		set->nr_hw_queues = nctrl->queue_count - 1;
-		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
-	}
-
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_tcp_admin_mq_ops;
+	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_BLOCKING;
+	set->cmd_size = sizeof(struct nvme_tcp_request);
+	set->driver_data = ctrl;
+	set->nr_hw_queues = 1;
+	set->timeout = NVME_ADMIN_TIMEOUT;
 	ret = blk_mq_alloc_tag_set(set);
-	if (ret)
-		return ERR_PTR(ret);
+	if (!ret)
+		nctrl->admin_tagset = set;
+	return ret;
+}

-	return set;
+static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int ret;
+
+	memset(set, 0, sizeof(*set));
+	set->ops = &nvme_tcp_mq_ops;
+	set->queue_depth = nctrl->sqsize + 1;
+	set->reserved_tags = NVMF_RESERVED_TAGS;
+	set->numa_node = nctrl->numa_node;
+	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+	set->cmd_size = sizeof(struct nvme_tcp_request);
+	set->driver_data = ctrl;
+	set->nr_hw_queues = nctrl->queue_count - 1;
+	set->timeout = NVME_IO_TIMEOUT;
+	set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	ret = blk_mq_alloc_tag_set(set);
+	if (!ret)
+		nctrl->tagset = set;
+	return ret;
 }

 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
···
 		return ret;

 	if (new) {
-		ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
-		if (IS_ERR(ctrl->tagset)) {
-			ret = PTR_ERR(ctrl->tagset);
+		ret = nvme_tcp_alloc_tag_set(ctrl);
+		if (ret)
 			goto out_free_io_queues;
-		}

 		ret = nvme_ctrl_init_connect_q(ctrl);
 		if (ret)
···
 		return error;

 	if (new) {
-		ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
-		if (IS_ERR(ctrl->admin_tagset)) {
-			error = PTR_ERR(ctrl->admin_tagset);
+		error = nvme_tcp_alloc_admin_tag_set(ctrl);
+		if (error)
 			goto out_free_queue;
-		}

 		ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
 		if (IS_ERR(ctrl->fabrics_q)) {
···
 			struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	nvme_tcp_teardown_io_queues(ctrl, false);
···
 	if (!blk_rq_nr_phys_segments(rq))
 		nvme_tcp_set_sg_null(c);
 	else if (rq_data_dir(rq) == WRITE &&
-	    req->data_len <= nvme_tcp_inline_data_size(queue))
+	    req->data_len <= nvme_tcp_inline_data_size(req))
 		nvme_tcp_set_sg_inline(queue, c, req->data_len);
 	else
 		nvme_tcp_set_sg_host_data(c, req->data_len);
···
 		nvme_tcp_init_iter(req, rq_data_dir(rq));

 	if (rq_data_dir(rq) == WRITE &&
-	    req->data_len <= nvme_tcp_inline_data_size(queue))
+	    req->data_len <= nvme_tcp_inline_data_size(req))
 		req->pdu_len = req->data_len;

 	pdu->hdr.type = nvme_tcp_cmd;
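The TCP change above ("use in-capsule data for I/O queue connect") makes the inline-data capacity a per-request decision: fabrics commands get the admin in-capsule size, everything else gets the queue's capsule length minus one SQE. A small sketch of that computation, with made-up sizes (the 64-byte SQE matches `struct nvme_command`; the 8 KiB admin in-capsule size is an assumption standing in for `NVME_TCP_ADMIN_CCSZ`):

```c
#include <assert.h>
#include <stddef.h>

/* Illustrative sizes only; not taken from a live controller. */
#define SQE_SIZE	64			/* sizeof(struct nvme_command) */
#define IOQ_CAPSULE_LEN	(4096 + SQE_SIZE)	/* example I/O queue capsule */
#define ADMIN_CCSZ	8192			/* assumed NVME_TCP_ADMIN_CCSZ */

/* In-capsule data capacity for one request: fabrics commands
 * (e.g. Connect) may now carry admin-sized inline data too. */
static size_t inline_data_size(int is_fabrics, size_t capsule_len)
{
	if (is_fabrics)
		return ADMIN_CCSZ;
	return capsule_len - SQE_SIZE;
}
```

This is why the helper now takes the request rather than the queue: the same queue yields different inline capacities depending on the command type.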
+32
drivers/nvme/host/trace.c
···
 	return ret;
 }

+static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 tl = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
+			 spsp0, spsp1, secp, tl);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 al = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
+			 spsp0, spsp1, secp, al);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
 static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
···
 		return nvme_trace_fabrics_connect(p, spc);
 	case nvme_fabrics_type_property_get:
 		return nvme_trace_fabrics_property_get(p, spc);
+	case nvme_fabrics_type_auth_send:
+		return nvme_trace_fabrics_auth_send(p, spc);
+	case nvme_fabrics_type_auth_receive:
+		return nvme_trace_fabrics_auth_receive(p, spc);
 	default:
 		return nvme_trace_fabrics_common(p, spc);
 	}
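The tracer additions above decode the Authentication Send/Receive command-specific bytes: one byte each for SPSP0, SPSP1, and SECP, followed by an unaligned little-endian 32-bit transfer/allocation length. The same decode can be sketched in portable userspace C (the `decode_auth_send` helper and struct are hypothetical, mirroring what the trace functions print):

```c
#include <assert.h>
#include <stdint.h>

/* Portable stand-in for the kernel's get_unaligned_le32(). */
static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

struct auth_send_fields {
	uint8_t spsp0, spsp1, secp;
	uint32_t tl;	/* transfer length */
};

/* Decode the same command-specific bytes the tracer prints:
 * spc[1..3] carry SPSP0/SPSP1/SECP, spc[4..7] the length. */
static struct auth_send_fields decode_auth_send(const uint8_t *spc)
{
	struct auth_send_fields f = {
		.spsp0 = spc[1],
		.spsp1 = spc[2],
		.secp = spc[3],
		.tl = get_le32(spc + 4),
	};
	return f;
}
```

The byte-by-byte shifts make the read endian- and alignment-safe, which is exactly why the tracer uses `get_unaligned_le32()` rather than a cast.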
+1 -1
drivers/nvme/host/trace.h
···
 	TP_fast_assign(
 		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
 		__entry->qid = nvme_req_qid(req);
-		__entry->cid = req->tag;
+		__entry->cid = nvme_req(req)->cmd->common.command_id;
 		__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
 		__entry->retries = nvme_req(req)->retries;
 		__entry->flags = nvme_req(req)->flags;
+15
drivers/nvme/target/Kconfig
···
 	  devices over TCP.

 	  If unsure, say N.
+
+config NVME_TARGET_AUTH
+	bool "NVMe over Fabrics In-band Authentication support"
+	depends on NVME_TARGET
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
+	help
+	  This enables support for NVMe over Fabrics In-band Authentication
+
+	  If unsure, say N.
+1
drivers/nvme/target/Makefile
···
 			discovery.o io-cmd-file.o io-cmd-bdev.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
+nvmet-$(CONFIG_NVME_TARGET_AUTH)	+= fabrics-cmd-auth.o auth.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
+3 -1
drivers/nvme/target/admin-cmd.c
···
 	u16 ret;

 	if (nvme_is_fabrics(cmd))
-		return nvmet_parse_fabrics_cmd(req);
+		return nvmet_parse_fabrics_admin_cmd(req);
+	if (unlikely(!nvmet_check_auth_status(req)))
+		return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
 	if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
 		return nvmet_parse_discovery_cmd(req);
+525
drivers/nvme/target/auth.c
···
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics DH-HMAC-CHAP authentication.
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/ctype.h>
+#include <linux/random.h>
+#include <linux/nvme-auth.h>
+#include <asm/unaligned.h>
+
+#include "nvmet.h"
+
+int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
+		       bool set_ctrl)
+{
+	unsigned char key_hash;
+	char *dhchap_secret;
+
+	if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
+		return -EINVAL;
+	if (key_hash > 3) {
+		pr_warn("Invalid DH-HMAC-CHAP hash id %d\n",
+			key_hash);
+		return -EINVAL;
+	}
+	if (key_hash > 0) {
+		/* Validate selected hash algorithm */
+		const char *hmac = nvme_auth_hmac_name(key_hash);
+
+		if (!crypto_has_shash(hmac, 0, 0)) {
+			pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
+			return -ENOTSUPP;
+		}
+	}
+	dhchap_secret = kstrdup(secret, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	if (set_ctrl) {
+		host->dhchap_ctrl_secret = strim(dhchap_secret);
+		host->dhchap_ctrl_key_hash = key_hash;
+	} else {
+		host->dhchap_secret = strim(dhchap_secret);
+		host->dhchap_key_hash = key_hash;
+	}
+	return 0;
+}
+
+int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
+{
+	const char *dhgroup_kpp;
+	int ret = 0;
+
+	pr_debug("%s: ctrl %d selecting dhgroup %d\n",
+		 __func__, ctrl->cntlid, dhgroup_id);
+
+	if (ctrl->dh_tfm) {
+		if (ctrl->dh_gid == dhgroup_id) {
+			pr_debug("%s: ctrl %d reuse existing DH group %d\n",
+				 __func__, ctrl->cntlid, dhgroup_id);
+			return 0;
+		}
+		crypto_free_kpp(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	}
+
+	if (dhgroup_id == NVME_AUTH_DHGROUP_NULL)
+		return 0;
+
+	dhgroup_kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+	if (!dhgroup_kpp) {
+		pr_debug("%s: ctrl %d invalid DH group %d\n",
+			 __func__, ctrl->cntlid, dhgroup_id);
+		return -EINVAL;
+	}
+	ctrl->dh_tfm = crypto_alloc_kpp(dhgroup_kpp, 0, 0);
+	if (IS_ERR(ctrl->dh_tfm)) {
+		pr_debug("%s: ctrl %d failed to setup DH group %d, err %ld\n",
+			 __func__, ctrl->cntlid, dhgroup_id,
+			 PTR_ERR(ctrl->dh_tfm));
+		ret = PTR_ERR(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	} else {
+		ctrl->dh_gid = dhgroup_id;
+		pr_debug("%s: ctrl %d setup DH group %d\n",
+			 __func__, ctrl->cntlid, ctrl->dh_gid);
+		ret = nvme_auth_gen_privkey(ctrl->dh_tfm, ctrl->dh_gid);
+		if (ret < 0) {
+			pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
+				 __func__, ctrl->cntlid, ret);
+			kfree_sensitive(ctrl->dh_key);
+			return ret;
+		}
+		ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);
+		kfree_sensitive(ctrl->dh_key);
+		ctrl->dh_key = kzalloc(ctrl->dh_keysize, GFP_KERNEL);
+		if (!ctrl->dh_key) {
+			pr_warn("ctrl %d failed to allocate public key\n",
+				ctrl->cntlid);
+			return -ENOMEM;
+		}
+		ret = nvme_auth_gen_pubkey(ctrl->dh_tfm, ctrl->dh_key,
+					   ctrl->dh_keysize);
+		if (ret < 0) {
+			pr_warn("ctrl %d failed to generate public key\n",
+				ctrl->cntlid);
+			kfree(ctrl->dh_key);
+			ctrl->dh_key = NULL;
+		}
+	}
+
+	return ret;
+}
+
+int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
+{
+	int ret = 0;
+	struct nvmet_host_link *p;
+	struct nvmet_host *host = NULL;
+	const char *hash_name;
+
+	down_read(&nvmet_config_sem);
+	if (nvmet_is_disc_subsys(ctrl->subsys))
+		goto out_unlock;
+
+	if (ctrl->subsys->allow_any_host)
+		goto out_unlock;
+
+	list_for_each_entry(p, &ctrl->subsys->hosts, entry) {
+		pr_debug("check %s\n", nvmet_host_name(p->host));
+		if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn))
+			continue;
+		host = p->host;
+		break;
+	}
+	if (!host) {
+		pr_debug("host %s not found\n", ctrl->hostnqn);
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id);
+	if (ret < 0)
+		pr_warn("Failed to setup DH group");
+
+	if (!host->dhchap_secret) {
+		pr_debug("No authentication provided\n");
+		goto out_unlock;
+	}
+
+	if (host->dhchap_hash_id == ctrl->shash_id) {
+		pr_debug("Re-use existing hash ID %d\n",
+			 ctrl->shash_id);
+	} else {
+		hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+		if (!hash_name) {
+			pr_warn("Hash ID %d invalid\n", host->dhchap_hash_id);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		ctrl->shash_id = host->dhchap_hash_id;
+	}
+
+	/* Skip the 'DHHC-1:XX:' prefix */
+	nvme_auth_free_key(ctrl->host_key);
+	ctrl->host_key = nvme_auth_extract_key(host->dhchap_secret + 10,
+					       host->dhchap_key_hash);
+	if (IS_ERR(ctrl->host_key)) {
+		ret = PTR_ERR(ctrl->host_key);
+		ctrl->host_key = NULL;
+		goto out_free_hash;
+	}
+	pr_debug("%s: using hash %s key %*ph\n", __func__,
+		 ctrl->host_key->hash > 0 ?
+		 nvme_auth_hmac_name(ctrl->host_key->hash) : "none",
+		 (int)ctrl->host_key->len, ctrl->host_key->key);
+
+	nvme_auth_free_key(ctrl->ctrl_key);
+	if (!host->dhchap_ctrl_secret) {
+		ctrl->ctrl_key = NULL;
+		goto out_unlock;
+	}
+
+	ctrl->ctrl_key = nvme_auth_extract_key(host->dhchap_ctrl_secret + 10,
+					       host->dhchap_ctrl_key_hash);
+	if (IS_ERR(ctrl->ctrl_key)) {
+		ret = PTR_ERR(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+	pr_debug("%s: using ctrl hash %s key %*ph\n", __func__,
+		 ctrl->ctrl_key->hash > 0 ?
+		 nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none",
+		 (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key);
+
+out_free_hash:
+	if (ret) {
+		if (ctrl->host_key) {
+			nvme_auth_free_key(ctrl->host_key);
+			ctrl->host_key = NULL;
+		}
+		ctrl->shash_id = 0;
+	}
+out_unlock:
+	up_read(&nvmet_config_sem);
+
+	return ret;
+}
+
+void nvmet_auth_sq_free(struct nvmet_sq *sq)
+{
+	cancel_delayed_work(&sq->auth_expired_work);
+	kfree(sq->dhchap_c1);
+	sq->dhchap_c1 = NULL;
+	kfree(sq->dhchap_c2);
+	sq->dhchap_c2 = NULL;
+	kfree(sq->dhchap_skey);
+	sq->dhchap_skey = NULL;
+}
+
+void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
+{
+	ctrl->shash_id = 0;
+
+	if (ctrl->dh_tfm) {
+		crypto_free_kpp(ctrl->dh_tfm);
+		ctrl->dh_tfm = NULL;
+		ctrl->dh_gid = 0;
+	}
+	kfree_sensitive(ctrl->dh_key);
+	ctrl->dh_key = NULL;
+
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+
+bool nvmet_check_auth_status(struct nvmet_req *req)
+{
+	if (req->sq->ctrl->host_key &&
+	    !req->sq->authenticated)
+		return false;
+	return true;
+}
+
+int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int shash_len)
+{
+	struct crypto_shash *shash_tfm;
+	struct shash_desc *shash;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	const char *hash_name;
+	u8 *challenge = req->sq->dhchap_c1, *host_response;
+	u8 buf[4];
+	int ret;
+
+	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+	if (!hash_name) {
+		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+		return -EINVAL;
+	}
+
+	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+	if (IS_ERR(shash_tfm)) {
+		pr_err("failed to allocate shash %s\n", hash_name);
+		return PTR_ERR(shash_tfm);
+	}
+
+	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+			 __func__, shash_len,
+			 crypto_shash_digestsize(shash_tfm));
+		ret = -EINVAL;
+		goto out_free_tfm;
+	}
+
+	host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
+	if (IS_ERR(host_response)) {
+		ret = PTR_ERR(host_response);
+		goto out_free_tfm;
+	}
+
+	ret = crypto_shash_setkey(shash_tfm, host_response,
+				  ctrl->host_key->len);
+	if (ret)
+		goto out_free_response;
+
+	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+		challenge = kmalloc(shash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out_free_response;
+		}
+		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+						    req->sq->dhchap_skey,
+						    req->sq->dhchap_skey_len,
+						    req->sq->dhchap_c1,
+						    challenge, shash_len);
+		if (ret)
+			goto out_free_response;
+	}
+
+	pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
+		 ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
+		 req->sq->dhchap_tid);
+
+	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_response;
+	}
+	shash->tfm = shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, shash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(req->sq->dhchap_s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(req->sq->dhchap_tid, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->subsysnqn,
+				  strlen(ctrl->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, response);
+out:
+	if (challenge != req->sq->dhchap_c1)
+		kfree(challenge);
+	kfree(shash);
+out_free_response:
+	kfree_sensitive(host_response);
+out_free_tfm:
+	crypto_free_shash(shash_tfm);
+	return 0;
+}
+
+int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
+			 unsigned int shash_len)
+{
+	struct crypto_shash *shash_tfm;
+	struct shash_desc *shash;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	const char *hash_name;
+	u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
+	u8 buf[4];
+	int ret;
+
+	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
+	if (!hash_name) {
+		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
+		return -EINVAL;
+	}
+
+	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
+	if (IS_ERR(shash_tfm)) {
+		pr_err("failed to allocate shash %s\n", hash_name);
+		return PTR_ERR(shash_tfm);
+	}
+
+	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
+		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
+			 __func__, shash_len,
+			 crypto_shash_digestsize(shash_tfm));
+		ret = -EINVAL;
+		goto out_free_tfm;
+	}
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+						ctrl->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		goto out_free_tfm;
+	}
+
+	ret = crypto_shash_setkey(shash_tfm, ctrl_response,
+				  ctrl->ctrl_key->len);
+	if (ret)
+		goto out_free_response;
+
+	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
+		challenge = kmalloc(shash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out_free_response;
+		}
+		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
+						    req->sq->dhchap_skey,
+						    req->sq->dhchap_skey_len,
+						    req->sq->dhchap_c2,
+						    challenge, shash_len);
+		if (ret)
+			goto out_free_response;
+	}
+
+	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_response;
+	}
+	shash->tfm = shash_tfm;
+
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, shash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(req->sq->dhchap_s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(req->sq->dhchap_tid, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->subsysnqn,
+				  strlen(ctrl->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, response);
+out:
+	if (challenge != req->sq->dhchap_c2)
+		kfree(challenge);
+	kfree(shash);
+out_free_response:
+	kfree_sensitive(ctrl_response);
+out_free_tfm:
+	crypto_free_shash(shash_tfm);
+	return 0;
+}
+
+int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
+				u8 *buf, int buf_size)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int ret = 0;
+
+	if (!ctrl->dh_key) {
+		pr_warn("ctrl %d no DH public key!\n", ctrl->cntlid);
+		return -ENOKEY;
+	}
+	if (buf_size != ctrl->dh_keysize) {
+		pr_warn("ctrl %d DH public key size mismatch, need %zu is %d\n",
+			ctrl->cntlid, ctrl->dh_keysize, buf_size);
+		ret = -EINVAL;
+	} else {
+		memcpy(buf, ctrl->dh_key, buf_size);
+		pr_debug("%s: ctrl %d public key %*ph\n", __func__,
+			 ctrl->cntlid, (int)buf_size, buf);
+	}
+
+	return ret;
+}
+
+int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
+			    u8 *pkey, int pkey_size)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	int ret;
+
+	req->sq->dhchap_skey_len = ctrl->dh_keysize;
+	req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL);
+	if (!req->sq->dhchap_skey)
+		return -ENOMEM;
+	ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm,
+					  pkey, pkey_size,
+					  req->sq->dhchap_skey,
+					  req->sq->dhchap_skey_len);
+	if (ret)
+		pr_debug("failed to compute shared secret, err %d\n", ret);
+	else
+		pr_debug("%s: shared secret %*ph\n", __func__,
+			 (int)req->sq->dhchap_skey_len,
+			 req->sq->dhchap_skey);
+
+	return ret;
+}
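The new target auth code accepts secrets in the `DHHC-1:NN:<base64 key>:` transport format, where the middle field selects the HMAC used to transform the key. A userspace sketch of the format check performed by `nvmet_auth_set_key()` (the helper name is hypothetical, and the assumption that only hash IDs 0 through 3 are valid mirrors the range check in the code above):

```c
#include <stdio.h>

/* Minimal sketch of the DH-HMAC-CHAP secret-format check: a
 * transport secret looks like "DHHC-1:00:<base64 key>:", where
 * the middle field is the key-transform hash ID (0 means no
 * transform; IDs above 3 are rejected, as in the code above). */
static int parse_dhchap_hash_id(const char *secret, unsigned char *key_hash)
{
	if (sscanf(secret, "DHHC-1:%hhd:%*s", key_hash) != 1)
		return -1;	/* not a DH-HMAC-CHAP secret */
	if (*key_hash > 3)
		return -1;	/* unknown hash identifier */
	return 0;
}
```

Anything that fails this shape check is rejected before any key material is copied, which is why the configfs attribute store below can simply forward the page to the parser.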
+136
drivers/nvme/target/configfs.c
···
 #include <linux/ctype.h>
 #include <linux/pci.h>
 #include <linux/pci-p2pdma.h>
+#ifdef CONFIG_NVME_TARGET_AUTH
+#include <linux/nvme-auth.h>
+#endif
+#include <crypto/hash.h>
+#include <crypto/kpp.h>

 #include "nvmet.h"
···
 static struct config_group nvmet_subsystems_group;
 static struct config_group nvmet_ports_group;

+#ifdef CONFIG_NVME_TARGET_AUTH
+static ssize_t nvmet_host_dhchap_key_show(struct config_item *item,
+		char *page)
+{
+	u8 *dhchap_secret = to_host(item)->dhchap_secret;
+
+	if (!dhchap_secret)
+		return sprintf(page, "\n");
+	return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_key_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int ret;
+
+	ret = nvmet_auth_set_key(host, page, false);
+	/*
+	 * Re-authentication is a soft state, so keep the
+	 * current authentication valid until the host
+	 * requests re-authentication.
+	 */
+	return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_key);
+
+static ssize_t nvmet_host_dhchap_ctrl_key_show(struct config_item *item,
+		char *page)
+{
+	u8 *dhchap_secret = to_host(item)->dhchap_ctrl_secret;
+
+	if (!dhchap_secret)
+		return sprintf(page, "\n");
+	return sprintf(page, "%s\n", dhchap_secret);
+}
+
+static ssize_t nvmet_host_dhchap_ctrl_key_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int ret;
+
+	ret = nvmet_auth_set_key(host, page, true);
+	/*
+	 * Re-authentication is a soft state, so keep the
+	 * current authentication valid until the host
+	 * requests re-authentication.
+	 */
+	return ret < 0 ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_ctrl_key);
+
+static ssize_t nvmet_host_dhchap_hash_show(struct config_item *item,
+		char *page)
+{
+	struct nvmet_host *host = to_host(item);
+	const char *hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
+
+	return sprintf(page, "%s\n", hash_name ? hash_name : "none");
+}
+
+static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	u8 hmac_id;
+
+	hmac_id = nvme_auth_hmac_id(page);
+	if (hmac_id == NVME_AUTH_HASH_INVALID)
+		return -EINVAL;
+	if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0))
+		return -ENOTSUPP;
+	host->dhchap_hash_id = hmac_id;
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_hash);
+
+static ssize_t nvmet_host_dhchap_dhgroup_show(struct config_item *item,
+		char *page)
+{
+	struct nvmet_host *host = to_host(item);
+	const char *dhgroup = nvme_auth_dhgroup_name(host->dhchap_dhgroup_id);
+
+	return sprintf(page, "%s\n", dhgroup ? dhgroup : "none");
+}
+
+static ssize_t nvmet_host_dhchap_dhgroup_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_host *host = to_host(item);
+	int dhgroup_id;
+
+	dhgroup_id = nvme_auth_dhgroup_id(page);
+	if (dhgroup_id == NVME_AUTH_DHGROUP_INVALID)
+		return -EINVAL;
+	if (dhgroup_id != NVME_AUTH_DHGROUP_NULL) {
+		const char *kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
+
+		if (!crypto_has_kpp(kpp, 0, 0))
+			return -EINVAL;
+	}
+	host->dhchap_dhgroup_id = dhgroup_id;
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_host_, dhchap_dhgroup);
+
+static struct configfs_attribute *nvmet_host_attrs[] = {
+	&nvmet_host_attr_dhchap_key,
+	&nvmet_host_attr_dhchap_ctrl_key,
+	&nvmet_host_attr_dhchap_hash,
+	&nvmet_host_attr_dhchap_dhgroup,
+	NULL,
+};
+#endif /* CONFIG_NVME_TARGET_AUTH */
+
 static void nvmet_host_release(struct config_item *item)
 {
 	struct nvmet_host *host = to_host(item);

+#ifdef CONFIG_NVME_TARGET_AUTH
+	kfree(host->dhchap_secret);
+#endif
 	kfree(host);
 }
···
 static const struct config_item_type nvmet_host_type = {
 	.ct_item_ops		= &nvmet_host_item_ops,
+#ifdef CONFIG_NVME_TARGET_AUTH
+	.ct_attrs		= nvmet_host_attrs,
+#endif
 	.ct_owner		= THIS_MODULE,
 };
···
 	host = kzalloc(sizeof(*host), GFP_KERNEL);
 	if (!host)
 		return ERR_PTR(-ENOMEM);
+
+#ifdef CONFIG_NVME_TARGET_AUTH
+	/* Default to SHA256 */
+	host->dhchap_hash_id = NVME_AUTH_HASH_SHA256;
+#endif

 	config_group_init_type_name(&host->group, name, &nvmet_host_type);
+15
drivers/nvme/target/core.c
··· 795 795 wait_for_completion(&sq->confirm_done); 796 796 wait_for_completion(&sq->free_done); 797 797 percpu_ref_exit(&sq->ref); 798 + nvmet_auth_sq_free(sq); 798 799 799 800 if (ctrl) { 800 801 /* ··· 866 865 867 866 static u16 nvmet_parse_io_cmd(struct nvmet_req *req) 868 867 { 868 + struct nvme_command *cmd = req->cmd; 869 869 u16 ret; 870 + 871 + if (nvme_is_fabrics(cmd)) 872 + return nvmet_parse_fabrics_io_cmd(req); 873 + 874 + if (unlikely(!nvmet_check_auth_status(req))) 875 + return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; 870 876 871 877 ret = nvmet_check_ctrl_status(req); 872 878 if (unlikely(ret)) ··· 1279 1271 req->cmd->common.opcode, req->sq->qid); 1280 1272 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; 1281 1273 } 1274 + 1275 + if (unlikely(!nvmet_check_auth_status(req))) { 1276 + pr_warn("qid %d not authenticated\n", req->sq->qid); 1277 + return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; 1278 + } 1282 1279 return 0; 1283 1280 } 1284 1281 ··· 1479 1466 1480 1467 flush_work(&ctrl->async_event_work); 1481 1468 cancel_work_sync(&ctrl->fatal_err_work); 1469 + 1470 + nvmet_destroy_auth(ctrl); 1482 1471 1483 1472 ida_free(&cntlid_ida, ctrl->cntlid); 1484 1473
+544
drivers/nvme/target/fabrics-cmd-auth.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * NVMe over Fabrics DH-HMAC-CHAP authentication command handling. 4 + * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions. 5 + * All rights reserved. 6 + */ 7 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 8 + #include <linux/blkdev.h> 9 + #include <linux/random.h> 10 + #include <linux/nvme-auth.h> 11 + #include <crypto/hash.h> 12 + #include <crypto/kpp.h> 13 + #include "nvmet.h" 14 + 15 + static void nvmet_auth_expired_work(struct work_struct *work) 16 + { 17 + struct nvmet_sq *sq = container_of(to_delayed_work(work), 18 + struct nvmet_sq, auth_expired_work); 19 + 20 + pr_debug("%s: ctrl %d qid %d transaction %u expired, resetting\n", 21 + __func__, sq->ctrl->cntlid, sq->qid, sq->dhchap_tid); 22 + sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 23 + sq->dhchap_tid = -1; 24 + } 25 + 26 + void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) 27 + { 28 + u32 result = le32_to_cpu(req->cqe->result.u32); 29 + 30 + /* Initialize in-band authentication */ 31 + INIT_DELAYED_WORK(&req->sq->auth_expired_work, 32 + nvmet_auth_expired_work); 33 + req->sq->authenticated = false; 34 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 35 + result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16; 36 + req->cqe->result.u32 = cpu_to_le32(result); 37 + } 38 + 39 + static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) 40 + { 41 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 42 + struct nvmf_auth_dhchap_negotiate_data *data = d; 43 + int i, hash_id = 0, fallback_hash_id = 0, dhgid, fallback_dhgid; 44 + 45 + pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n", 46 + __func__, ctrl->cntlid, req->sq->qid, 47 + data->sc_c, data->napd, data->auth_protocol[0].dhchap.authid, 48 + data->auth_protocol[0].dhchap.halen, 49 + data->auth_protocol[0].dhchap.dhlen); 50 + req->sq->dhchap_tid = le16_to_cpu(data->t_id); 51 + if (data->sc_c) 52 + return 
NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; 53 + 54 + if (data->napd != 1) 55 + return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; 56 + 57 + if (data->auth_protocol[0].dhchap.authid != 58 + NVME_AUTH_DHCHAP_AUTH_ID) 59 + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 60 + 61 + for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) { 62 + u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i]; 63 + 64 + if (!fallback_hash_id && 65 + crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0)) 66 + fallback_hash_id = host_hmac_id; 67 + if (ctrl->shash_id != host_hmac_id) 68 + continue; 69 + hash_id = ctrl->shash_id; 70 + break; 71 + } 72 + if (hash_id == 0) { 73 + if (fallback_hash_id == 0) { 74 + pr_debug("%s: ctrl %d qid %d: no usable hash found\n", 75 + __func__, ctrl->cntlid, req->sq->qid); 76 + return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; 77 + } 78 + pr_debug("%s: ctrl %d qid %d: no usable hash found, falling back to %s\n", 79 + __func__, ctrl->cntlid, req->sq->qid, 80 + nvme_auth_hmac_name(fallback_hash_id)); 81 + ctrl->shash_id = fallback_hash_id; 82 + } 83 + 84 + dhgid = -1; 85 + fallback_dhgid = -1; 86 + for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) { 87 + int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30]; 88 + 89 + if (tmp_dhgid == ctrl->dh_gid) { 90 + dhgid = tmp_dhgid; 91 + break; 92 + } 93 + if (fallback_dhgid < 0) { 94 + const char *kpp = nvme_auth_dhgroup_kpp(tmp_dhgid); 95 + 96 + if (crypto_has_kpp(kpp, 0, 0)) 97 + fallback_dhgid = tmp_dhgid; 98 + } 99 + } 100 + if (dhgid < 0) { 101 + if (fallback_dhgid < 0) { 102 + pr_debug("%s: ctrl %d qid %d: no usable DH group found\n", 103 + __func__, ctrl->cntlid, req->sq->qid); 104 + return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; 105 + } 106 + pr_debug("%s: ctrl %d qid %d: configured DH group %s not found\n", 107 + __func__, ctrl->cntlid, req->sq->qid, 108 + nvme_auth_dhgroup_name(fallback_dhgid)); 109 + ctrl->dh_gid = fallback_dhgid; 110 + } 111 + pr_debug("%s: ctrl %d qid %d: 
selected DH group %s (%d)\n", 112 + __func__, ctrl->cntlid, req->sq->qid, 113 + nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid); 114 + return 0; 115 + } 116 + 117 + static u16 nvmet_auth_reply(struct nvmet_req *req, void *d) 118 + { 119 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 120 + struct nvmf_auth_dhchap_reply_data *data = d; 121 + u16 dhvlen = le16_to_cpu(data->dhvlen); 122 + u8 *response; 123 + 124 + pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n", 125 + __func__, ctrl->cntlid, req->sq->qid, 126 + data->hl, data->cvalid, dhvlen); 127 + 128 + if (dhvlen) { 129 + if (!ctrl->dh_tfm) 130 + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 131 + if (nvmet_auth_ctrl_sesskey(req, data->rval + 2 * data->hl, 132 + dhvlen) < 0) 133 + return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; 134 + } 135 + 136 + response = kmalloc(data->hl, GFP_KERNEL); 137 + if (!response) 138 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 139 + 140 + if (!ctrl->host_key) { 141 + pr_warn("ctrl %d qid %d no host key\n", 142 + ctrl->cntlid, req->sq->qid); 143 + kfree(response); 144 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 145 + } 146 + if (nvmet_auth_host_hash(req, response, data->hl) < 0) { 147 + pr_debug("ctrl %d qid %d host hash failed\n", 148 + ctrl->cntlid, req->sq->qid); 149 + kfree(response); 150 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 151 + } 152 + 153 + if (memcmp(data->rval, response, data->hl)) { 154 + pr_info("ctrl %d qid %d host response mismatch\n", 155 + ctrl->cntlid, req->sq->qid); 156 + kfree(response); 157 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 158 + } 159 + kfree(response); 160 + pr_debug("%s: ctrl %d qid %d host authenticated\n", 161 + __func__, ctrl->cntlid, req->sq->qid); 162 + if (data->cvalid) { 163 + req->sq->dhchap_c2 = kmalloc(data->hl, GFP_KERNEL); 164 + if (!req->sq->dhchap_c2) 165 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 166 + memcpy(req->sq->dhchap_c2, data->rval + data->hl, data->hl); 167 + 168 + pr_debug("%s: ctrl %d qid %d challenge 
%*ph\n", 169 + __func__, ctrl->cntlid, req->sq->qid, data->hl, 170 + req->sq->dhchap_c2); 171 + req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); 172 + } else { 173 + req->sq->authenticated = true; 174 + req->sq->dhchap_c2 = NULL; 175 + } 176 + 177 + return 0; 178 + } 179 + 180 + static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) 181 + { 182 + struct nvmf_auth_dhchap_failure_data *data = d; 183 + 184 + return data->rescode_exp; 185 + } 186 + 187 + void nvmet_execute_auth_send(struct nvmet_req *req) 188 + { 189 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 190 + struct nvmf_auth_dhchap_success2_data *data; 191 + void *d; 192 + u32 tl; 193 + u16 status = 0; 194 + 195 + if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { 196 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 197 + req->error_loc = 198 + offsetof(struct nvmf_auth_send_command, secp); 199 + goto done; 200 + } 201 + if (req->cmd->auth_send.spsp0 != 0x01) { 202 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 203 + req->error_loc = 204 + offsetof(struct nvmf_auth_send_command, spsp0); 205 + goto done; 206 + } 207 + if (req->cmd->auth_send.spsp1 != 0x01) { 208 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 209 + req->error_loc = 210 + offsetof(struct nvmf_auth_send_command, spsp1); 211 + goto done; 212 + } 213 + tl = le32_to_cpu(req->cmd->auth_send.tl); 214 + if (!tl) { 215 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 216 + req->error_loc = 217 + offsetof(struct nvmf_auth_send_command, tl); 218 + goto done; 219 + } 220 + if (!nvmet_check_transfer_len(req, tl)) { 221 + pr_debug("%s: transfer length mismatch (%u)\n", __func__, tl); 222 + return; 223 + } 224 + 225 + d = kmalloc(tl, GFP_KERNEL); 226 + if (!d) { 227 + status = NVME_SC_INTERNAL; 228 + goto done; 229 + } 230 + 231 + status = nvmet_copy_from_sgl(req, 0, d, tl); 232 + if (status) { 233 + kfree(d); 234 + goto done; 235 + } 236 + 237 + data = d; 238 + pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, 239 + 
ctrl->cntlid, req->sq->qid, data->auth_type, data->auth_id, 240 + req->sq->dhchap_step); 241 + if (data->auth_type != NVME_AUTH_COMMON_MESSAGES && 242 + data->auth_type != NVME_AUTH_DHCHAP_MESSAGES) 243 + goto done_failure1; 244 + if (data->auth_type == NVME_AUTH_COMMON_MESSAGES) { 245 + if (data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE) { 246 + /* Restart negotiation */ 247 + pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, 248 + ctrl->cntlid, req->sq->qid); 249 + if (!req->sq->qid) { 250 + if (nvmet_setup_auth(ctrl) < 0) { 251 + status = NVME_SC_INTERNAL; 252 + pr_err("ctrl %d qid 0 failed to setup" 253 + "re-authentication", 254 + ctrl->cntlid); 255 + goto done_failure1; 256 + } 257 + } 258 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; 259 + } else if (data->auth_id != req->sq->dhchap_step) 260 + goto done_failure1; 261 + /* Validate negotiation parameters */ 262 + status = nvmet_auth_negotiate(req, d); 263 + if (status == 0) 264 + req->sq->dhchap_step = 265 + NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE; 266 + else { 267 + req->sq->dhchap_step = 268 + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; 269 + req->sq->dhchap_status = status; 270 + status = 0; 271 + } 272 + goto done_kfree; 273 + } 274 + if (data->auth_id != req->sq->dhchap_step) { 275 + pr_debug("%s: ctrl %d qid %d step mismatch (%d != %d)\n", 276 + __func__, ctrl->cntlid, req->sq->qid, 277 + data->auth_id, req->sq->dhchap_step); 278 + goto done_failure1; 279 + } 280 + if (le16_to_cpu(data->t_id) != req->sq->dhchap_tid) { 281 + pr_debug("%s: ctrl %d qid %d invalid transaction %d (expected %d)\n", 282 + __func__, ctrl->cntlid, req->sq->qid, 283 + le16_to_cpu(data->t_id), 284 + req->sq->dhchap_tid); 285 + req->sq->dhchap_step = 286 + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; 287 + req->sq->dhchap_status = 288 + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; 289 + goto done_kfree; 290 + } 291 + 292 + switch (data->auth_id) { 293 + case NVME_AUTH_DHCHAP_MESSAGE_REPLY: 294 + status = 
nvmet_auth_reply(req, d); 295 + if (status == 0) 296 + req->sq->dhchap_step = 297 + NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1; 298 + else { 299 + req->sq->dhchap_step = 300 + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; 301 + req->sq->dhchap_status = status; 302 + status = 0; 303 + } 304 + goto done_kfree; 305 + break; 306 + case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: 307 + req->sq->authenticated = true; 308 + pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", 309 + __func__, ctrl->cntlid, req->sq->qid); 310 + goto done_kfree; 311 + break; 312 + case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: 313 + status = nvmet_auth_failure2(req, d); 314 + if (status) { 315 + pr_warn("ctrl %d qid %d: authentication failed (%d)\n", 316 + ctrl->cntlid, req->sq->qid, status); 317 + req->sq->dhchap_status = status; 318 + req->sq->authenticated = false; 319 + status = 0; 320 + } 321 + goto done_kfree; 322 + break; 323 + default: 324 + req->sq->dhchap_status = 325 + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; 326 + req->sq->dhchap_step = 327 + NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; 328 + req->sq->authenticated = false; 329 + goto done_kfree; 330 + break; 331 + } 332 + done_failure1: 333 + req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; 334 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; 335 + 336 + done_kfree: 337 + kfree(d); 338 + done: 339 + pr_debug("%s: ctrl %d qid %d dhchap status %x step %x\n", __func__, 340 + ctrl->cntlid, req->sq->qid, 341 + req->sq->dhchap_status, req->sq->dhchap_step); 342 + if (status) 343 + pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n", 344 + __func__, ctrl->cntlid, req->sq->qid, 345 + status, req->error_loc); 346 + req->cqe->result.u64 = 0; 347 + nvmet_req_complete(req, status); 348 + if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 && 349 + req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { 350 + unsigned long auth_expire_secs = ctrl->kato ? 
ctrl->kato : 120; 351 + 352 + mod_delayed_work(system_wq, &req->sq->auth_expired_work, 353 + auth_expire_secs * HZ); 354 + return; 355 + } 356 + /* Final states, clear up variables */ 357 + nvmet_auth_sq_free(req->sq); 358 + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) 359 + nvmet_ctrl_fatal_error(ctrl); 360 + } 361 + 362 + static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al) 363 + { 364 + struct nvmf_auth_dhchap_challenge_data *data = d; 365 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 366 + int ret = 0; 367 + int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id); 368 + int data_size = sizeof(*d) + hash_len; 369 + 370 + if (ctrl->dh_tfm) 371 + data_size += ctrl->dh_keysize; 372 + if (al < data_size) { 373 + pr_debug("%s: buffer too small (al %d need %d)\n", __func__, 374 + al, data_size); 375 + return -EINVAL; 376 + } 377 + memset(data, 0, data_size); 378 + req->sq->dhchap_s1 = nvme_auth_get_seqnum(); 379 + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; 380 + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE; 381 + data->t_id = cpu_to_le16(req->sq->dhchap_tid); 382 + data->hashid = ctrl->shash_id; 383 + data->hl = hash_len; 384 + data->seqnum = cpu_to_le32(req->sq->dhchap_s1); 385 + req->sq->dhchap_c1 = kmalloc(data->hl, GFP_KERNEL); 386 + if (!req->sq->dhchap_c1) 387 + return -ENOMEM; 388 + get_random_bytes(req->sq->dhchap_c1, data->hl); 389 + memcpy(data->cval, req->sq->dhchap_c1, data->hl); 390 + if (ctrl->dh_tfm) { 391 + data->dhgid = ctrl->dh_gid; 392 + data->dhvlen = cpu_to_le16(ctrl->dh_keysize); 393 + ret = nvmet_auth_ctrl_exponential(req, data->cval + data->hl, 394 + ctrl->dh_keysize); 395 + } 396 + pr_debug("%s: ctrl %d qid %d seq %d transaction %d hl %d dhvlen %zu\n", 397 + __func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, 398 + req->sq->dhchap_tid, data->hl, ctrl->dh_keysize); 399 + return ret; 400 + } 401 + 402 + static int nvmet_auth_success1(struct nvmet_req *req, void *d, int al) 403 + { 404 + struct 
nvmf_auth_dhchap_success1_data *data = d; 405 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 406 + int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id); 407 + 408 + WARN_ON(al < sizeof(*data)); 409 + memset(data, 0, sizeof(*data)); 410 + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; 411 + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1; 412 + data->t_id = cpu_to_le16(req->sq->dhchap_tid); 413 + data->hl = hash_len; 414 + if (req->sq->dhchap_c2) { 415 + if (!ctrl->ctrl_key) { 416 + pr_warn("ctrl %d qid %d no ctrl key\n", 417 + ctrl->cntlid, req->sq->qid); 418 + return NVME_AUTH_DHCHAP_FAILURE_FAILED; 419 + } 420 + if (nvmet_auth_ctrl_hash(req, data->rval, data->hl)) 421 + return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; 422 + data->rvalid = 1; 423 + pr_debug("ctrl %d qid %d response %*ph\n", 424 + ctrl->cntlid, req->sq->qid, data->hl, data->rval); 425 + } 426 + return 0; 427 + } 428 + 429 + static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al) 430 + { 431 + struct nvmf_auth_dhchap_failure_data *data = d; 432 + 433 + WARN_ON(al < sizeof(*data)); 434 + data->auth_type = NVME_AUTH_COMMON_MESSAGES; 435 + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; 436 + data->t_id = cpu_to_le16(req->sq->dhchap_tid); 437 + data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED; 438 + data->rescode_exp = req->sq->dhchap_status; 439 + } 440 + 441 + void nvmet_execute_auth_receive(struct nvmet_req *req) 442 + { 443 + struct nvmet_ctrl *ctrl = req->sq->ctrl; 444 + void *d; 445 + u32 al; 446 + u16 status = 0; 447 + 448 + if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { 449 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 450 + req->error_loc = 451 + offsetof(struct nvmf_auth_receive_command, secp); 452 + goto done; 453 + } 454 + if (req->cmd->auth_receive.spsp0 != 0x01) { 455 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 456 + req->error_loc = 457 + offsetof(struct nvmf_auth_receive_command, spsp0); 458 + goto done; 459 + } 460 + if 
(req->cmd->auth_receive.spsp1 != 0x01) { 461 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 462 + req->error_loc = 463 + offsetof(struct nvmf_auth_receive_command, spsp1); 464 + goto done; 465 + } 466 + al = le32_to_cpu(req->cmd->auth_receive.al); 467 + if (!al) { 468 + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 469 + req->error_loc = 470 + offsetof(struct nvmf_auth_receive_command, al); 471 + goto done; 472 + } 473 + if (!nvmet_check_transfer_len(req, al)) { 474 + pr_debug("%s: transfer length mismatch (%u)\n", __func__, al); 475 + return; 476 + } 477 + 478 + d = kmalloc(al, GFP_KERNEL); 479 + if (!d) { 480 + status = NVME_SC_INTERNAL; 481 + goto done; 482 + } 483 + pr_debug("%s: ctrl %d qid %d step %x\n", __func__, 484 + ctrl->cntlid, req->sq->qid, req->sq->dhchap_step); 485 + switch (req->sq->dhchap_step) { 486 + case NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE: 487 + if (nvmet_auth_challenge(req, d, al) < 0) { 488 + pr_warn("ctrl %d qid %d: challenge error (%d)\n", 489 + ctrl->cntlid, req->sq->qid, status); 490 + status = NVME_SC_INTERNAL; 491 + break; 492 + } 493 + if (status) { 494 + req->sq->dhchap_status = status; 495 + nvmet_auth_failure1(req, d, al); 496 + pr_warn("ctrl %d qid %d: challenge status (%x)\n", 497 + ctrl->cntlid, req->sq->qid, 498 + req->sq->dhchap_status); 499 + status = 0; 500 + break; 501 + } 502 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY; 503 + break; 504 + case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1: 505 + status = nvmet_auth_success1(req, d, al); 506 + if (status) { 507 + req->sq->dhchap_status = status; 508 + req->sq->authenticated = false; 509 + nvmet_auth_failure1(req, d, al); 510 + pr_warn("ctrl %d qid %d: success1 status (%x)\n", 511 + ctrl->cntlid, req->sq->qid, 512 + req->sq->dhchap_status); 513 + break; 514 + } 515 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2; 516 + break; 517 + case NVME_AUTH_DHCHAP_MESSAGE_FAILURE1: 518 + req->sq->authenticated = false; 519 + nvmet_auth_failure1(req, d, al); 520 + 
pr_warn("ctrl %d qid %d failure1 (%x)\n", 521 + ctrl->cntlid, req->sq->qid, req->sq->dhchap_status); 522 + break; 523 + default: 524 + pr_warn("ctrl %d qid %d unhandled step (%d)\n", 525 + ctrl->cntlid, req->sq->qid, req->sq->dhchap_step); 526 + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; 527 + req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_FAILED; 528 + nvmet_auth_failure1(req, d, al); 529 + status = 0; 530 + break; 531 + } 532 + 533 + status = nvmet_copy_to_sgl(req, 0, d, al); 534 + kfree(d); 535 + done: 536 + req->cqe->result.u64 = 0; 537 + nvmet_req_complete(req, status); 538 + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) 539 + nvmet_auth_sq_free(req->sq); 540 + else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { 541 + nvmet_auth_sq_free(req->sq); 542 + nvmet_ctrl_fatal_error(ctrl); 543 + } 544 + }
+52 -3
drivers/nvme/target/fabrics-cmd.c
··· 82 82 nvmet_req_complete(req, status); 83 83 } 84 84 85 - u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) 85 + u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) 86 86 { 87 87 struct nvme_command *cmd = req->cmd; 88 88 ··· 93 93 case nvme_fabrics_type_property_get: 94 94 req->execute = nvmet_execute_prop_get; 95 95 break; 96 + #ifdef CONFIG_NVME_TARGET_AUTH 97 + case nvme_fabrics_type_auth_send: 98 + req->execute = nvmet_execute_auth_send; 99 + break; 100 + case nvme_fabrics_type_auth_receive: 101 + req->execute = nvmet_execute_auth_receive; 102 + break; 103 + #endif 104 + default: 105 + pr_debug("received unknown capsule type 0x%x\n", 106 + cmd->fabrics.fctype); 107 + req->error_loc = offsetof(struct nvmf_common_command, fctype); 108 + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 109 + } 110 + 111 + return 0; 112 + } 113 + 114 + u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req) 115 + { 116 + struct nvme_command *cmd = req->cmd; 117 + 118 + switch (cmd->fabrics.fctype) { 119 + #ifdef CONFIG_NVME_TARGET_AUTH 120 + case nvme_fabrics_type_auth_send: 121 + req->execute = nvmet_execute_auth_send; 122 + break; 123 + case nvme_fabrics_type_auth_receive: 124 + req->execute = nvmet_execute_auth_receive; 125 + break; 126 + #endif 96 127 default: 97 128 pr_debug("received unknown capsule type 0x%x\n", 98 129 cmd->fabrics.fctype); ··· 204 173 struct nvmf_connect_data *d; 205 174 struct nvmet_ctrl *ctrl = NULL; 206 175 u16 status = 0; 176 + int ret; 207 177 208 178 if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data))) 209 179 return; ··· 247 215 248 216 uuid_copy(&ctrl->hostid, &d->hostid); 249 217 218 + ret = nvmet_setup_auth(ctrl); 219 + if (ret < 0) { 220 + pr_err("Failed to setup authentication, error %d\n", ret); 221 + nvmet_ctrl_put(ctrl); 222 + if (ret == -EPERM) 223 + status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR); 224 + else 225 + status = NVME_SC_INTERNAL; 226 + goto out; 227 + } 228 + 250 229 status = nvmet_install_queue(ctrl, 
req); 251 230 if (status) { 252 231 nvmet_ctrl_put(ctrl); 253 232 goto out; 254 233 } 255 234 256 - pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n", 235 + pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n", 257 236 nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", 258 237 ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, 259 - ctrl->pi_support ? " T10-PI is enabled" : ""); 238 + ctrl->pi_support ? " T10-PI is enabled" : "", 239 + nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); 260 240 req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 261 241 242 + if (nvmet_has_auth(ctrl)) 243 + nvmet_init_auth(ctrl, req); 262 244 out: 263 245 kfree(d); 264 246 complete: ··· 332 286 req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 333 287 334 288 pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); 289 + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); 290 + if (nvmet_has_auth(ctrl)) 291 + nvmet_init_auth(ctrl, req); 335 292 336 293 out: 337 294 kfree(d);
+2 -6
drivers/nvme/target/loop.c
··· 424 424 { 425 425 if (ctrl->ctrl.queue_count > 1) { 426 426 nvme_stop_queues(&ctrl->ctrl); 427 - blk_mq_tagset_busy_iter(&ctrl->tag_set, 428 - nvme_cancel_request, &ctrl->ctrl); 429 - blk_mq_tagset_wait_completed_request(&ctrl->tag_set); 427 + nvme_cancel_tagset(&ctrl->ctrl); 430 428 nvme_loop_destroy_io_queues(ctrl); 431 429 } 432 430 ··· 432 434 if (ctrl->ctrl.state == NVME_CTRL_LIVE) 433 435 nvme_shutdown_ctrl(&ctrl->ctrl); 434 436 435 - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 436 - nvme_cancel_request, &ctrl->ctrl); 437 - blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); 437 + nvme_cancel_admin_tagset(&ctrl->ctrl); 438 438 nvme_loop_destroy_admin_queue(ctrl); 439 439 } 440 440
+74 -1
drivers/nvme/target/nvmet.h
··· 108 108 u16 size; 109 109 u32 sqhd; 110 110 bool sqhd_disabled; 111 + #ifdef CONFIG_NVME_TARGET_AUTH 112 + struct delayed_work auth_expired_work; 113 + bool authenticated; 114 + u16 dhchap_tid; 115 + u16 dhchap_status; 116 + int dhchap_step; 117 + u8 *dhchap_c1; 118 + u8 *dhchap_c2; 119 + u32 dhchap_s1; 120 + u32 dhchap_s2; 121 + u8 *dhchap_skey; 122 + int dhchap_skey_len; 123 + #endif 111 124 struct completion free_done; 112 125 struct completion confirm_done; 113 126 }; ··· 222 209 u64 err_counter; 223 210 struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; 224 211 bool pi_support; 212 + #ifdef CONFIG_NVME_TARGET_AUTH 213 + struct nvme_dhchap_key *host_key; 214 + struct nvme_dhchap_key *ctrl_key; 215 + u8 shash_id; 216 + struct crypto_kpp *dh_tfm; 217 + u8 dh_gid; 218 + u8 *dh_key; 219 + size_t dh_keysize; 220 + #endif 225 221 }; 226 222 227 223 struct nvmet_subsys { ··· 293 271 294 272 struct nvmet_host { 295 273 struct config_group group; 274 + u8 *dhchap_secret; 275 + u8 *dhchap_ctrl_secret; 276 + u8 dhchap_key_hash; 277 + u8 dhchap_ctrl_key_hash; 278 + u8 dhchap_hash_id; 279 + u8 dhchap_dhgroup_id; 296 280 }; 297 281 298 282 static inline struct nvmet_host *to_host(struct config_item *item) ··· 448 420 u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); 449 421 u16 nvmet_parse_admin_cmd(struct nvmet_req *req); 450 422 u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); 451 - u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); 423 + u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req); 424 + u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req); 452 425 453 426 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, 454 427 struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); ··· 696 667 if (bio != &req->b.inline_bio) 697 668 bio_put(bio); 698 669 } 670 + 671 + #ifdef CONFIG_NVME_TARGET_AUTH 672 + void nvmet_execute_auth_send(struct nvmet_req *req); 673 + void nvmet_execute_auth_receive(struct nvmet_req *req); 674 + int 
nvmet_auth_set_key(struct nvmet_host *host, const char *secret, 675 + bool set_ctrl); 676 + int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); 677 + int nvmet_setup_auth(struct nvmet_ctrl *ctrl); 678 + void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); 679 + void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); 680 + void nvmet_auth_sq_free(struct nvmet_sq *sq); 681 + int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id); 682 + bool nvmet_check_auth_status(struct nvmet_req *req); 683 + int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, 684 + unsigned int hash_len); 685 + int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, 686 + unsigned int hash_len); 687 + static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) 688 + { 689 + return ctrl->host_key != NULL; 690 + } 691 + int nvmet_auth_ctrl_exponential(struct nvmet_req *req, 692 + u8 *buf, int buf_size); 693 + int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, 694 + u8 *buf, int buf_size); 695 + #else 696 + static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl) 697 + { 698 + return 0; 699 + } 700 + static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl, 701 + struct nvmet_req *req) {}; 702 + static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {}; 703 + static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {}; 704 + static inline bool nvmet_check_auth_status(struct nvmet_req *req) 705 + { 706 + return true; 707 + } 708 + static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) 709 + { 710 + return false; 711 + } 712 + static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } 713 + #endif 699 714 700 715 #endif /* _NVMET_H */
+2 -1
drivers/nvme/target/tcp.c
··· 1839 1839 { 1840 1840 int ret; 1841 1841 1842 - nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); 1842 + nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", 1843 + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 1843 1844 if (!nvmet_tcp_wq) 1844 1845 return -ENOMEM; 1845 1846
+1 -1
drivers/s390/block/dasd.c
··· 1725 1725 dasd_put_device(device); 1726 1726 } 1727 1727 1728 - /* check for for attention message */ 1728 + /* check for attention message */ 1729 1729 if (scsw_dstat(&irb->scsw) & DEV_STAT_ATTENTION) { 1730 1730 device = dasd_device_from_cdev_locked(cdev); 1731 1731 if (!IS_ERR(device)) {
+1
drivers/s390/block/dasd_diag.c
··· 639 639 /* With page sized segments each segment can be translated into one idaw/tidaw */ 640 640 blk_queue_max_segment_size(q, PAGE_SIZE); 641 641 blk_queue_segment_boundary(q, PAGE_SIZE - 1); 642 + blk_queue_dma_alignment(q, PAGE_SIZE - 1); 642 643 } 643 644 644 645 static int dasd_diag_pe_handler(struct dasd_device *device,
+1
drivers/s390/block/dasd_eckd.c
··· 6626 6626 /* With page sized segments each segment can be translated into one idaw/tidaw */ 6627 6627 blk_queue_max_segment_size(q, PAGE_SIZE); 6628 6628 blk_queue_segment_boundary(q, PAGE_SIZE - 1); 6629 + blk_queue_dma_alignment(q, PAGE_SIZE - 1); 6629 6630 } 6630 6631 6631 6632 static struct ccw_driver dasd_eckd_driver = {
+1 -1
drivers/s390/block/dcssblk.c
··· 863 863 unsigned long source_addr; 864 864 unsigned long bytes_done; 865 865 866 - blk_queue_split(&bio); 866 + bio = bio_split_to_limits(bio); 867 867 868 868 bytes_done = 0; 869 869 dev_info = bio->bi_bdev->bd_disk->private_data;
+2
include/crypto/hash.h
··· 718 718 struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, 719 719 u32 mask); 720 720 721 + int crypto_has_shash(const char *alg_name, u32 type, u32 mask); 722 + 721 723 static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) 722 724 { 723 725 return &tfm->base;
+2
include/crypto/kpp.h
··· 104 104 */ 105 105 struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask); 106 106 107 + int crypto_has_kpp(const char *alg_name, u32 type, u32 mask); 108 + 107 109 static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm) 108 110 { 109 111 return &tfm->base;
+16
include/linux/base64.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * base64 encoding, lifted from fs/crypto/fname.c. 4 + */ 5 + 6 + #ifndef _LINUX_BASE64_H 7 + #define _LINUX_BASE64_H 8 + 9 + #include <linux/types.h> 10 + 11 + #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) 12 + 13 + int base64_encode(const u8 *src, int len, char *dst); 14 + int base64_decode(const char *src, int len, u8 *dst); 15 + 16 + #endif /* _LINUX_BASE64_H */
+3 -2
include/linux/blkdev.h
···
 	struct request_queue *queue;
 	void *private_data;
 
+	struct bio_set bio_split;
+
 	int flags;
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
···
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		bio_split;
 
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
···
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
+41
include/linux/nvme-auth.h
···
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_AUTH_H
+#define _NVME_AUTH_H
+
+#include <crypto/kpp.h>
+
+struct nvme_dhchap_key {
+	u8 *key;
+	size_t len;
+	u8 hash;
+};
+
+u32 nvme_auth_get_seqnum(void);
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
+
+const char *nvme_auth_hmac_name(u8 hmac_id);
+const char *nvme_auth_digest_name(u8 hmac_id);
+size_t nvme_auth_hmac_hash_len(u8 hmac_id);
+u8 nvme_auth_hmac_id(const char *hmac_name);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash);
+void nvme_auth_free_key(struct nvme_dhchap_key *key);
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+				  u8 *challenge, u8 *aug, size_t hlen);
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+			 u8 *host_key, size_t host_key_len);
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+				u8 *ctrl_key, size_t ctrl_key_len,
+				u8 *sess_key, size_t sess_key_len);
+
+#endif /* _NVME_AUTH_H */
+212 -1
include/linux/nvme.h
···
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
+#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
···
 };
 
 enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
+enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
···
 	nvme_fabrics_type_property_set	= 0x00,
 	nvme_fabrics_type_connect	= 0x01,
 	nvme_fabrics_type_property_get	= 0x04,
+	nvme_fabrics_type_auth_send	= 0x05,
+	nvme_fabrics_type_auth_receive	= 0x06,
 };
 
 #define nvme_fabrics_type_name(type)	{ type, #type }
···
 	__print_symbolic(type, \
 		nvme_fabrics_type_name(nvme_fabrics_type_property_set), \
 		nvme_fabrics_type_name(nvme_fabrics_type_connect), \
-		nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+		nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_send), \
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
 
 /*
  * If not fabrics command, fctype will be ignored.
···
 	__u8		resv4[12];
 };
 
+enum {
+	NVME_CONNECT_AUTHREQ_ASCR	= (1 << 2),
+	NVME_CONNECT_AUTHREQ_ATR	= (1 << 1),
+};
+
 struct nvmf_connect_data {
 	uuid_t		hostid;
 	__le16		cntlid;
···
 	__le32		offset;
 	__u8		resv4[16];
 };
+
+struct nvmf_auth_common_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al_tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_send_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_receive_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al;
+	__u8		resv4[16];
+};
+
+/* Value for secp */
+enum {
+	NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER	= 0xe9,
+};
+
+/* Defined value for auth_type */
+enum {
+	NVME_AUTH_COMMON_MESSAGES	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGES	= 0x01,
+};
+
+/* Defined messages for auth_id */
+enum {
+	NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE	= 0x01,
+	NVME_AUTH_DHCHAP_MESSAGE_REPLY		= 0x02,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1	= 0x03,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2	= 0x04,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE2	= 0xf0,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE1	= 0xf1,
+};
+
+struct nvmf_auth_dhchap_protocol_descriptor {
+	__u8		authid;
+	__u8		rsvd;
+	__u8		halen;
+	__u8		dhlen;
+	__u8		idlist[60];
+};
+
+enum {
+	NVME_AUTH_DHCHAP_AUTH_ID	= 0x01,
+};
+
+/* Defined hash functions for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_HASH_SHA256	= 0x01,
+	NVME_AUTH_HASH_SHA384	= 0x02,
+	NVME_AUTH_HASH_SHA512	= 0x03,
+	NVME_AUTH_HASH_INVALID	= 0xff,
+};
+
+/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_DHGROUP_NULL		= 0x00,
+	NVME_AUTH_DHGROUP_2048		= 0x01,
+	NVME_AUTH_DHGROUP_3072		= 0x02,
+	NVME_AUTH_DHGROUP_4096		= 0x03,
+	NVME_AUTH_DHGROUP_6144		= 0x04,
+	NVME_AUTH_DHGROUP_8192		= 0x05,
+	NVME_AUTH_DHGROUP_INVALID	= 0xff,
+};
+
+union nvmf_auth_protocol {
+	struct nvmf_auth_dhchap_protocol_descriptor dhchap;
+};
+
+struct nvmf_auth_dhchap_negotiate_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd;
+	__le16		t_id;
+	__u8		sc_c;
+	__u8		napd;
+	union nvmf_auth_protocol auth_protocol[];
+};
+
+struct nvmf_auth_dhchap_challenge_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__u16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		hashid;
+	__u8		dhgid;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of challenge value */
+	__u8		cval[];
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+struct nvmf_auth_dhchap_reply_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		cvalid;
+	__u8		rsvd3;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of response data */
+	__u8		rval[];
+	/* followed by 'hl' bytes of Challenge value */
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+enum {
+	NVME_AUTH_DHCHAP_RESPONSE_VALID	= (1 << 0),
+};
+
+struct nvmf_auth_dhchap_success1_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		rvalid;
+	__u8		rsvd3[7];
+	/* 'hl' bytes of response value if 'rvalid' is set */
+	__u8		rval[];
+};
+
+struct nvmf_auth_dhchap_success2_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rsvd2[10];
+};
+
+struct nvmf_auth_dhchap_failure_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rescode;
+	__u8		rescode_exp;
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED	= 0x01,
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_FAILED			= 0x01,
+	NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE		= 0x02,
+	NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH	= 0x03,
+	NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE		= 0x04,
+	NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE	= 0x05,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD	= 0x06,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE	= 0x07,
+};
+
 
 struct nvme_dbbuf {
 	__u8			opcode;
···
 	struct nvmf_connect_command connect;
 	struct nvmf_property_set_command prop_set;
 	struct nvmf_property_get_command prop_get;
+	struct nvmf_auth_common_command auth_common;
+	struct nvmf_auth_send_command auth_send;
+	struct nvmf_auth_receive_command auth_receive;
 	struct nvme_dbbuf dbbuf;
 	struct nvme_directive_cmd directive;
 };
+73 -7
include/uapi/linux/ublk_cmd.h
···
 #define	UBLK_CMD_DEL_DEV	0x05
 #define	UBLK_CMD_START_DEV	0x06
 #define	UBLK_CMD_STOP_DEV	0x07
+#define	UBLK_CMD_SET_PARAMS	0x08
+#define	UBLK_CMD_GET_PARAMS	0x09
 
 /*
  * IO commands, issued by ublk server, and handled by ublk driver.
···
  * this IO request, request's handling result is committed to ublk
  * driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
  * handled before completing io request.
+ *
+ * NEED_GET_DATA: only used for write requests to set io addr and copy data
+ * When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
+ * command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
+ *
+ * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
+ * while starting a ublk device.
  */
 #define	UBLK_IO_FETCH_REQ		0x20
 #define	UBLK_IO_COMMIT_AND_FETCH_REQ	0x21
+#define	UBLK_IO_NEED_GET_DATA		0x22
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
+#define UBLK_IO_RES_NEED_GET_DATA	1
 #define UBLK_IO_RES_ABORT		(-ENODEV)
 
 #define UBLKSRV_CMD_BUF_OFFSET	0
···
  * performance comparison is done easily with using task_work_add
  */
 #define UBLK_F_URING_CMD_COMP_IN_TASK	(1ULL << 1)
+
+/*
+ * User should issue io cmd again for write requests to
+ * set io buffer address and copy data from bio vectors
+ * to the userspace io buffer.
+ *
+ * In this mode, task_work is not used.
+ */
+#define UBLK_F_NEED_GET_DATA	(1UL << 2)
 
 /* device state */
 #define UBLK_S_DEV_DEAD	0
···
 struct ublksrv_ctrl_dev_info {
 	__u16	nr_hw_queues;
 	__u16	queue_depth;
-	__u16	block_size;
 	__u16	state;
+	__u16	pad0;
 
-	__u32	rq_max_blocks;
+	__u32	max_io_buf_bytes;
 	__u32	dev_id;
 
-	__u64	dev_blocks;
-
 	__s32	ublksrv_pid;
-	__s32	reserved0;
+	__u32	pad1;
+
 	__u64	flags;
-	__u64	flags_reserved;
 
 	/* For ublksrv internal use, invisible to ublk driver */
 	__u64	ublksrv_flags;
-	__u64	reserved1[9];
+
+	__u64	reserved0;
+	__u64	reserved1;
+	__u64	reserved2;
 };
 
 #define UBLK_IO_OP_READ		0
···
  * FETCH* command only
  */
 	__u64	addr;
+};
+
+struct ublk_param_basic {
+#define UBLK_ATTR_READ_ONLY		(1 << 0)
+#define UBLK_ATTR_ROTATIONAL		(1 << 1)
+#define UBLK_ATTR_VOLATILE_CACHE	(1 << 2)
+#define UBLK_ATTR_FUA			(1 << 3)
+	__u32	attrs;
+	__u8	logical_bs_shift;
+	__u8	physical_bs_shift;
+	__u8	io_opt_shift;
+	__u8	io_min_shift;
+
+	__u32	max_sectors;
+	__u32	chunk_sectors;
+
+	__u64	dev_sectors;
+	__u64	virt_boundary_mask;
+};
+
+struct ublk_param_discard {
+	__u32	discard_alignment;
+
+	__u32	discard_granularity;
+	__u32	max_discard_sectors;
+
+	__u32	max_write_zeroes_sectors;
+	__u16	max_discard_segments;
+	__u16	reserved0;
+};
+
+struct ublk_params {
+	/*
+	 * Total length of parameters, userspace has to set 'len' for both
+	 * SET_PARAMS and GET_PARAMS command, and driver may update len
+	 * if two sides use different version of 'ublk_params', same with
+	 * 'types' fields.
+	 */
+	__u32	len;
+#define UBLK_PARAM_TYPE_BASIC	(1 << 0)
+#define UBLK_PARAM_TYPE_DISCARD	(1 << 1)
+	__u32	types;	/* types of parameter included */
+
+	struct ublk_param_basic		basic;
+	struct ublk_param_discard	discard;
 };
 
 #endif
+1 -1
lib/Makefile
···
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o rhashtable.o \
+	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
+103
lib/base64.c
···
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64.c - RFC4648-compliant base64 encoding
+ *
+ * Copyright (c) 2020 Hannes Reinecke, SUSE
+ *
+ * Based on the base64url routines from fs/crypto/fname.c
+ * (which are using the URL-safe base64 encoding),
+ * modified to use the standard coding table from RFC4648 section 4.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/base64.h>
+
+static const char base64_table[65] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/**
+ * base64_encode() - base64-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64-encoded string.  Not NUL-terminated.
+ *
+ * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
+ * by RFC 4648, including the '='-padding.
+ *
+ * Return: the length of the resulting base64-encoded string in bytes.
+ */
+int base64_encode(const u8 *src, int srclen, char *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	char *cp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		ac = (ac << 8) | src[i];
+		bits += 8;
+		do {
+			bits -= 6;
+			*cp++ = base64_table[(ac >> bits) & 0x3f];
+		} while (bits >= 6);
+	}
+	if (bits) {
+		*cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+		bits -= 6;
+	}
+	while (bits < 0) {
+		*cp++ = '=';
+		bits += 2;
+	}
+	return cp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_encode);
+
+/**
+ * base64_decode() - base64-decode a string
+ * @src: the string to decode.  Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
+ * specified by RFC 4648, including the '='-padding.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ *	   or -1 if the string isn't a valid base64 string.
+ */
+int base64_decode(const char *src, int srclen, u8 *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	u8 *bp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		const char *p = strchr(base64_table, src[i]);
+
+		if (src[i] == '=') {
+			ac = (ac << 6);
+			bits += 6;
+			if (bits >= 8)
+				bits -= 8;
+			continue;
+		}
+		if (p == NULL || src[i] == 0)
+			return -1;
+		ac = (ac << 6) | (p - base64_table);
+		bits += 6;
+		if (bits >= 8) {
+			bits -= 8;
+			*bp++ = (u8)(ac >> bits);
+		}
+	}
+	if (ac & ((1 << bits) - 1))
+		return -1;
+	return bp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_decode);