Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:
"Pretty quiet round for this release. This contains:

- Add support for zoned storage to ublk (Andreas, Ming)

- Series improving performance for drivers that mark themselves as
needing a blocking context for issue (Bart)

- Cleanup the flush logic (Chengming)

- sed opal keyring support (Greg)

- Fixes and improvements to the integrity support (Jinyoung)

- Add some exports for bcachefs that we can hopefully delete again in
the future (Kent)

- deadline throttling fix (Zhiguo)

- Series allowing building the kernel without buffer_head support
(Christoph)

- Sanitize the bio page adding flow (Christoph)

- Write back cache fixes (Christoph)

- MD updates via Song:
- Fix perf regression for raid0 large sequential writes (Jan)
- Fix split bio iostat for raid0 (David)
- Various raid1 fixes (Heinz, Xueshi)
- raid6test build fixes (WANG)
- Deprecate bitmap file support (Christoph)
- Fix deadlock with md sync thread (Yu)
- Refactor md io accounting (Yu)
- Various non-urgent fixes (Li, Yu, Jack)

- Various fixes and cleanups (Arnd, Azeem, Chengming, Damien, Li,
Ming, Nitesh, Ruan, Tejun, Thomas, Xu)"

* tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux: (113 commits)
block: use strscpy() instead of strncpy()
block: sed-opal: keyring support for SED keys
block: sed-opal: Implement IOC_OPAL_REVERT_LSP
block: sed-opal: Implement IOC_OPAL_DISCOVERY
blk-mq: prealloc tags when increase tagset nr_hw_queues
blk-mq: delete redundant tagset map update when fallback
blk-mq: fix tags leak when shrink nr_hw_queues
ublk: zoned: support REQ_OP_ZONE_RESET_ALL
md: raid0: account for split bio in iostat accounting
md/raid0: Fix performance regression for large sequential writes
md/raid0: Factor out helper for mapping and submitting a bio
md raid1: allow writebehind to work on any leg device set WriteMostly
md/raid1: hold the barrier until handle_read_error() finishes
md/raid1: free the r1bio before waiting for blocked rdev
md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io()
blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init
drivers/rnbd: restore sysfs interface to rnbd-client
md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid()
raid6: test: only check for Altivec if building on powerpc hosts
raid6: test: make sure all intermediate and artifact files are .gitignored
...

+1685 -846
+3
block/Kconfig
··· 5 5 menuconfig BLOCK 6 6 bool "Enable the block layer" if EXPERT 7 7 default y 8 + select FS_IOMAP 8 9 select SBITMAP 9 10 help 10 11 Provide block layer support for the kernel. ··· 184 183 185 184 config BLK_SED_OPAL 186 185 bool "Logic for interfacing with Opal enabled SEDs" 186 + depends on KEYS 187 + select PSERIES_PLPKS if PPC_PSERIES 187 188 help 188 189 Builds Logic for interfacing with Opal enabled controllers. 189 190 Enabling this option enables users to setup/unlock/lock
+30 -29
block/bio-integrity.c
··· 123 123 int bio_integrity_add_page(struct bio *bio, struct page *page, 124 124 unsigned int len, unsigned int offset) 125 125 { 126 + struct request_queue *q = bdev_get_queue(bio->bi_bdev); 126 127 struct bio_integrity_payload *bip = bio_integrity(bio); 127 128 128 - if (bip->bip_vcnt >= bip->bip_max_vcnt) { 129 - printk(KERN_ERR "%s: bip_vec full\n", __func__); 129 + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > 130 + queue_max_hw_sectors(q)) 130 131 return 0; 131 - } 132 132 133 - if (bip->bip_vcnt && 134 - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, 135 - &bip->bip_vec[bip->bip_vcnt - 1], offset)) 136 - return 0; 133 + if (bip->bip_vcnt > 0) { 134 + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; 135 + bool same_page = false; 136 + 137 + if (bvec_try_merge_hw_page(q, bv, page, len, offset, 138 + &same_page)) { 139 + bip->bip_iter.bi_size += len; 140 + return len; 141 + } 142 + 143 + if (bip->bip_vcnt >= 144 + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) 145 + return 0; 146 + 147 + /* 148 + * If the queue doesn't support SG gaps and adding this segment 149 + * would create a gap, disallow it. 
150 + */ 151 + if (bvec_gap_to_prev(&q->limits, bv, offset)) 152 + return 0; 153 + } 137 154 138 155 bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); 139 156 bip->bip_vcnt++; 157 + bip->bip_iter.bi_size += len; 140 158 141 159 return len; 142 160 } ··· 217 199 unsigned long start, end; 218 200 unsigned int len, nr_pages; 219 201 unsigned int bytes, offset, i; 220 - unsigned int intervals; 221 - blk_status_t status; 222 202 223 203 if (!bi) 224 204 return true; ··· 240 224 !(bi->flags & BLK_INTEGRITY_GENERATE)) 241 225 return true; 242 226 } 243 - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); 244 227 245 228 /* Allocate kernel buffer for protection data */ 246 - len = intervals * bi->tuple_size; 229 + len = bio_integrity_bytes(bi, bio_sectors(bio)); 247 230 buf = kmalloc(len, GFP_NOIO); 248 - status = BLK_STS_RESOURCE; 249 231 if (unlikely(buf == NULL)) { 250 232 printk(KERN_ERR "could not allocate integrity buffer\n"); 251 233 goto err_end_io; ··· 258 244 if (IS_ERR(bip)) { 259 245 printk(KERN_ERR "could not allocate data integrity bioset\n"); 260 246 kfree(buf); 261 - status = BLK_STS_RESOURCE; 262 247 goto err_end_io; 263 248 } 264 249 265 250 bip->bip_flags |= BIP_BLOCK_INTEGRITY; 266 - bip->bip_iter.bi_size = len; 267 251 bip_set_seed(bip, bio->bi_iter.bi_sector); 268 252 269 253 if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) ··· 269 257 270 258 /* Map it */ 271 259 offset = offset_in_page(buf); 272 - for (i = 0 ; i < nr_pages ; i++) { 273 - int ret; 260 + for (i = 0; i < nr_pages && len > 0; i++) { 274 261 bytes = PAGE_SIZE - offset; 275 - 276 - if (len <= 0) 277 - break; 278 262 279 263 if (bytes > len) 280 264 bytes = len; 281 265 282 - ret = bio_integrity_add_page(bio, virt_to_page(buf), 283 - bytes, offset); 284 - 285 - if (ret == 0) { 266 + if (bio_integrity_add_page(bio, virt_to_page(buf), 267 + bytes, offset) < bytes) { 286 268 printk(KERN_ERR "could not attach integrity payload\n"); 287 - status = BLK_STS_RESOURCE; 288 269 goto 
err_end_io; 289 270 } 290 - 291 - if (ret < bytes) 292 - break; 293 271 294 272 buf += bytes; 295 273 len -= bytes; ··· 296 294 return true; 297 295 298 296 err_end_io: 299 - bio->bi_status = status; 297 + bio->bi_status = BLK_STS_RESOURCE; 300 298 bio_endio(bio); 301 299 return false; 302 - 303 300 } 304 301 EXPORT_SYMBOL(bio_integrity_prep); 305 302
+64 -78
block/bio.c
··· 606 606 } 607 607 EXPORT_SYMBOL(bio_kmalloc); 608 608 609 - void zero_fill_bio(struct bio *bio) 609 + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) 610 610 { 611 611 struct bio_vec bv; 612 612 struct bvec_iter iter; 613 613 614 - bio_for_each_segment(bv, bio, iter) 614 + __bio_for_each_segment(bv, bio, iter, start) 615 615 memzero_bvec(&bv); 616 616 } 617 - EXPORT_SYMBOL(zero_fill_bio); 617 + EXPORT_SYMBOL(zero_fill_bio_iter); 618 618 619 619 /** 620 620 * bio_truncate - truncate the bio to small size of @new_size ··· 903 903 return false; 904 904 } 905 905 906 - static inline bool page_is_mergeable(const struct bio_vec *bv, 907 - struct page *page, unsigned int len, unsigned int off, 908 - bool *same_page) 906 + static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, 907 + unsigned int len, unsigned int off, bool *same_page) 909 908 { 910 909 size_t bv_end = bv->bv_offset + bv->bv_len; 911 910 phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; ··· 918 919 return false; 919 920 920 921 *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); 921 - if (*same_page) 922 - return true; 923 - else if (IS_ENABLED(CONFIG_KMSAN)) 924 - return false; 925 - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); 926 - } 927 - 928 - /** 929 - * __bio_try_merge_page - try appending data to an existing bvec. 930 - * @bio: destination bio 931 - * @page: start page to add 932 - * @len: length of the data to add 933 - * @off: offset of the data relative to @page 934 - * @same_page: return if the segment has been merged inside the same page 935 - * 936 - * Try to add the data at @page + @off to the last bvec of @bio. This is a 937 - * useful optimisation for file systems with a block size smaller than the 938 - * page size. 939 - * 940 - * Warn if (@len, @off) crosses pages in case that @same_page is true. 941 - * 942 - * Return %true on success or %false on failure. 
943 - */ 944 - static bool __bio_try_merge_page(struct bio *bio, struct page *page, 945 - unsigned int len, unsigned int off, bool *same_page) 946 - { 947 - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) 948 - return false; 949 - 950 - if (bio->bi_vcnt > 0) { 951 - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; 952 - 953 - if (page_is_mergeable(bv, page, len, off, same_page)) { 954 - if (bio->bi_iter.bi_size > UINT_MAX - len) { 955 - *same_page = false; 956 - return false; 957 - } 958 - bv->bv_len += len; 959 - bio->bi_iter.bi_size += len; 960 - return true; 961 - } 922 + if (!*same_page) { 923 + if (IS_ENABLED(CONFIG_KMSAN)) 924 + return false; 925 + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) 926 + return false; 962 927 } 963 - return false; 928 + 929 + bv->bv_len += len; 930 + return true; 964 931 } 965 932 966 933 /* ··· 934 969 * size limit. This is not for normal read/write bios, but for passthrough 935 970 * or Zone Append operations that we can't split. 
936 971 */ 937 - static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, 938 - struct page *page, unsigned len, 939 - unsigned offset, bool *same_page) 972 + bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, 973 + struct page *page, unsigned len, unsigned offset, 974 + bool *same_page) 940 975 { 941 - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; 942 976 unsigned long mask = queue_segment_boundary(q); 943 977 phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; 944 978 phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; ··· 946 982 return false; 947 983 if (bv->bv_len + len > queue_max_segment_size(q)) 948 984 return false; 949 - return __bio_try_merge_page(bio, page, len, offset, same_page); 985 + return bvec_try_merge_page(bv, page, len, offset, same_page); 950 986 } 951 987 952 988 /** ··· 966 1002 struct page *page, unsigned int len, unsigned int offset, 967 1003 unsigned int max_sectors, bool *same_page) 968 1004 { 969 - struct bio_vec *bvec; 970 - 971 1005 if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) 972 1006 return 0; 973 1007 974 - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) 1008 + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) 975 1009 return 0; 976 1010 977 1011 if (bio->bi_vcnt > 0) { 978 - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) 1012 + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; 1013 + 1014 + if (bvec_try_merge_hw_page(q, bv, page, len, offset, 1015 + same_page)) { 1016 + bio->bi_iter.bi_size += len; 979 1017 return len; 1018 + } 1019 + 1020 + if (bio->bi_vcnt >= 1021 + min(bio->bi_max_vecs, queue_max_segments(q))) 1022 + return 0; 980 1023 981 1024 /* 982 1025 * If the queue doesn't support SG gaps and adding this segment 983 1026 * would create a gap, disallow it. 
984 1027 */ 985 - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; 986 - if (bvec_gap_to_prev(&q->limits, bvec, offset)) 1028 + if (bvec_gap_to_prev(&q->limits, bv, offset)) 987 1029 return 0; 988 1030 } 989 - 990 - if (bio_full(bio, len)) 991 - return 0; 992 - 993 - if (bio->bi_vcnt >= queue_max_segments(q)) 994 - return 0; 995 1031 996 1032 bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); 997 1033 bio->bi_vcnt++; ··· 1093 1129 { 1094 1130 bool same_page = false; 1095 1131 1096 - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { 1097 - if (bio_full(bio, len)) 1098 - return 0; 1099 - __bio_add_page(bio, page, len, offset); 1132 + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) 1133 + return 0; 1134 + if (bio->bi_iter.bi_size > UINT_MAX - len) 1135 + return 0; 1136 + 1137 + if (bio->bi_vcnt > 0 && 1138 + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], 1139 + page, len, offset, &same_page)) { 1140 + bio->bi_iter.bi_size += len; 1141 + return len; 1100 1142 } 1143 + 1144 + if (bio->bi_vcnt >= bio->bi_max_vecs) 1145 + return 0; 1146 + __bio_add_page(bio, page, len, offset); 1101 1147 return len; 1102 1148 } 1103 1149 EXPORT_SYMBOL(bio_add_page); ··· 1181 1207 { 1182 1208 bool same_page = false; 1183 1209 1184 - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { 1185 - __bio_add_page(bio, page, len, offset); 1210 + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) 1211 + return -EIO; 1212 + 1213 + if (bio->bi_vcnt > 0 && 1214 + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], 1215 + page, len, offset, &same_page)) { 1216 + bio->bi_iter.bi_size += len; 1217 + if (same_page) 1218 + bio_release_page(bio, page); 1186 1219 return 0; 1187 1220 } 1188 - 1189 - if (same_page) 1190 - bio_release_page(bio, page); 1221 + __bio_add_page(bio, page, len, offset); 1191 1222 return 0; 1192 1223 } 1193 1224 ··· 1231 1252 struct page **pages = (struct page **)bv; 1232 1253 ssize_t size, left; 1233 1254 unsigned len, i = 0; 1234 - 
size_t offset, trim; 1255 + size_t offset; 1235 1256 int ret = 0; 1236 1257 1237 1258 /* ··· 1260 1281 1261 1282 nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); 1262 1283 1263 - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); 1264 - iov_iter_revert(iter, trim); 1284 + if (bio->bi_bdev) { 1285 + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); 1286 + iov_iter_revert(iter, trim); 1287 + size -= trim; 1288 + } 1265 1289 1266 - size -= trim; 1267 1290 if (unlikely(!size)) { 1268 1291 ret = -EFAULT; 1269 1292 goto out; ··· 1317 1336 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) 1318 1337 { 1319 1338 int ret = 0; 1339 + 1340 + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) 1341 + return -EIO; 1320 1342 1321 1343 if (iov_iter_is_bvec(iter)) { 1322 1344 bio_iov_bvec_set(bio, iter); ··· 1474 1490 set_page_dirty_lock(bvec->bv_page); 1475 1491 } 1476 1492 } 1493 + EXPORT_SYMBOL_GPL(bio_set_pages_dirty); 1477 1494 1478 1495 /* 1479 1496 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. ··· 1534 1549 spin_unlock_irqrestore(&bio_dirty_lock, flags); 1535 1550 schedule_work(&bio_dirty_work); 1536 1551 } 1552 + EXPORT_SYMBOL_GPL(bio_check_pages_dirty); 1537 1553 1538 1554 static inline bool bio_remaining_done(struct bio *bio) 1539 1555 {
+18 -14
block/blk-cgroup.c
··· 1511 1511 retry: 1512 1512 spin_lock_irq(&q->queue_lock); 1513 1513 1514 - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ 1514 + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ 1515 1515 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { 1516 1516 struct blkg_policy_data *pd; 1517 1517 ··· 1549 1549 goto enomem; 1550 1550 } 1551 1551 1552 - blkg->pd[pol->plid] = pd; 1552 + spin_lock(&blkg->blkcg->lock); 1553 + 1553 1554 pd->blkg = blkg; 1554 1555 pd->plid = pol->plid; 1555 - pd->online = false; 1556 - } 1556 + blkg->pd[pol->plid] = pd; 1557 1557 1558 - /* all allocated, init in the same order */ 1559 - if (pol->pd_init_fn) 1560 - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) 1561 - pol->pd_init_fn(blkg->pd[pol->plid]); 1558 + if (pol->pd_init_fn) 1559 + pol->pd_init_fn(pd); 1562 1560 1563 - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { 1564 1561 if (pol->pd_online_fn) 1565 - pol->pd_online_fn(blkg->pd[pol->plid]); 1566 - blkg->pd[pol->plid]->online = true; 1562 + pol->pd_online_fn(pd); 1563 + pd->online = true; 1564 + 1565 + spin_unlock(&blkg->blkcg->lock); 1567 1566 } 1568 1567 1569 1568 __set_bit(pol->plid, q->blkcg_pols); ··· 1579 1580 return ret; 1580 1581 1581 1582 enomem: 1582 - /* alloc failed, nothing's initialized yet, free everything */ 1583 + /* alloc failed, take down everything */ 1583 1584 spin_lock_irq(&q->queue_lock); 1584 1585 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1585 1586 struct blkcg *blkcg = blkg->blkcg; 1587 + struct blkg_policy_data *pd; 1586 1588 1587 1589 spin_lock(&blkcg->lock); 1588 - if (blkg->pd[pol->plid]) { 1589 - pol->pd_free_fn(blkg->pd[pol->plid]); 1590 + pd = blkg->pd[pol->plid]; 1591 + if (pd) { 1592 + if (pd->online && pol->pd_offline_fn) 1593 + pol->pd_offline_fn(pd); 1594 + pd->online = false; 1595 + pol->pd_free_fn(pd); 1590 1596 blkg->pd[pol->plid] = NULL; 1591 1597 } 1592 1598 spin_unlock(&blkcg->lock);
+1
block/blk-core.c
··· 208 208 return "<null>"; 209 209 return blk_errors[idx].name; 210 210 } 211 + EXPORT_SYMBOL_GPL(blk_status_to_str); 211 212 212 213 /** 213 214 * blk_sync_queue - cancel any pending callbacks on a queue
+15 -11
block/blk-flush.c
··· 183 183 /* queue for flush */ 184 184 if (list_empty(pending)) 185 185 fq->flush_pending_since = jiffies; 186 - list_move_tail(&rq->flush.list, pending); 186 + list_move_tail(&rq->queuelist, pending); 187 187 break; 188 188 189 189 case REQ_FSEQ_DATA: 190 - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); 190 + fq->flush_data_in_flight++; 191 191 spin_lock(&q->requeue_lock); 192 - list_add(&rq->queuelist, &q->requeue_list); 192 + list_move(&rq->queuelist, &q->requeue_list); 193 193 spin_unlock(&q->requeue_lock); 194 194 blk_mq_kick_requeue_list(q); 195 195 break; ··· 201 201 * flush data request completion path. Restore @rq for 202 202 * normal completion and end it. 203 203 */ 204 - list_del_init(&rq->flush.list); 204 + list_del_init(&rq->queuelist); 205 205 blk_flush_restore_request(rq); 206 206 blk_mq_end_request(rq, error); 207 207 break; ··· 257 257 fq->flush_running_idx ^= 1; 258 258 259 259 /* and push the waiting requests to the next stage */ 260 - list_for_each_entry_safe(rq, n, running, flush.list) { 260 + list_for_each_entry_safe(rq, n, running, queuelist) { 261 261 unsigned int seq = blk_flush_cur_seq(rq); 262 262 263 263 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); ··· 291 291 { 292 292 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 293 293 struct request *first_rq = 294 - list_first_entry(pending, struct request, flush.list); 294 + list_first_entry(pending, struct request, queuelist); 295 295 struct request *flush_rq = fq->flush_rq; 296 296 297 297 /* C1 described at the top of this file */ ··· 299 299 return; 300 300 301 301 /* C2 and C3 */ 302 - if (!list_empty(&fq->flush_data_in_flight) && 302 + if (fq->flush_data_in_flight && 303 303 time_before(jiffies, 304 304 fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) 305 305 return; ··· 374 374 * the comment in flush_end_io(). 
375 375 */ 376 376 spin_lock_irqsave(&fq->mq_flush_lock, flags); 377 + fq->flush_data_in_flight--; 378 + /* 379 + * May have been corrupted by rq->rq_next reuse, we need to 380 + * re-initialize rq->queuelist before reusing it here. 381 + */ 382 + INIT_LIST_HEAD(&rq->queuelist); 377 383 blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); 378 384 spin_unlock_irqrestore(&fq->mq_flush_lock, flags); 379 385 ··· 390 384 static void blk_rq_init_flush(struct request *rq) 391 385 { 392 386 rq->flush.seq = 0; 393 - INIT_LIST_HEAD(&rq->flush.list); 394 387 rq->rq_flags |= RQF_FLUSH_SEQ; 395 388 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 396 389 rq->end_io = mq_flush_data_end_io; ··· 448 443 * the post flush, and then just pass the command on. 449 444 */ 450 445 blk_rq_init_flush(rq); 451 - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; 446 + rq->flush.seq |= REQ_FSEQ_PREFLUSH; 452 447 spin_lock_irq(&fq->mq_flush_lock); 453 - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); 448 + fq->flush_data_in_flight++; 454 449 spin_unlock_irq(&fq->mq_flush_lock); 455 450 return false; 456 451 default: ··· 501 496 502 497 INIT_LIST_HEAD(&fq->flush_queue[0]); 503 498 INIT_LIST_HEAD(&fq->flush_queue[1]); 504 - INIT_LIST_HEAD(&fq->flush_data_in_flight); 505 499 506 500 return fq; 507 501
+11 -24
block/blk-iolatency.c
··· 824 824 } 825 825 } 826 826 827 - static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) 828 - { 829 - static DEFINE_MUTEX(init_mutex); 830 - int ret; 831 - 832 - ret = blkg_conf_open_bdev(ctx); 833 - if (ret) 834 - return ret; 835 - 836 - /* 837 - * blk_iolatency_init() may fail after rq_qos_add() succeeds which can 838 - * confuse iolat_rq_qos() test. Make the test and init atomic. 839 - */ 840 - mutex_lock(&init_mutex); 841 - 842 - if (!iolat_rq_qos(ctx->bdev->bd_queue)) 843 - ret = blk_iolatency_init(ctx->bdev->bd_disk); 844 - 845 - mutex_unlock(&init_mutex); 846 - 847 - return ret; 848 - } 849 - 850 827 static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, 851 828 size_t nbytes, loff_t off) 852 829 { ··· 838 861 839 862 blkg_conf_init(&ctx, buf); 840 863 841 - ret = blk_iolatency_try_init(&ctx); 864 + ret = blkg_conf_open_bdev(&ctx); 865 + if (ret) 866 + goto out; 867 + 868 + /* 869 + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can 870 + * confuse iolat_rq_qos() test. Make the test and init atomic. 871 + */ 872 + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); 873 + if (!iolat_rq_qos(ctx.bdev->bd_queue)) 874 + ret = blk_iolatency_init(ctx.bdev->bd_disk); 842 875 if (ret) 843 876 goto out; 844 877
+31 -14
block/blk-mq.c
··· 43 43 #include "blk-ioprio.h" 44 44 45 45 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); 46 + static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); 46 47 47 48 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); 48 49 static void blk_mq_request_bypass_insert(struct request *rq, ··· 1175 1174 1176 1175 static void blk_mq_complete_send_ipi(struct request *rq) 1177 1176 { 1178 - struct llist_head *list; 1179 1177 unsigned int cpu; 1180 1178 1181 1179 cpu = rq->mq_ctx->cpu; 1182 - list = &per_cpu(blk_cpu_done, cpu); 1183 - if (llist_add(&rq->ipi_list, list)) { 1184 - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); 1185 - smp_call_function_single_async(cpu, &rq->csd); 1186 - } 1180 + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) 1181 + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); 1187 1182 } 1188 1183 1189 1184 static void blk_mq_raise_softirq(struct request *rq) ··· 1340 1343 } 1341 1344 1342 1345 blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); 1343 - blk_mq_run_hw_queue(hctx, false); 1346 + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 1344 1347 } 1345 1348 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 1346 1349 ··· 2239 2242 */ 2240 2243 WARN_ON_ONCE(!async && in_interrupt()); 2241 2244 2245 + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); 2246 + 2242 2247 /* 2243 2248 * When queue is quiesced, we may be switching io scheduler, or 2244 2249 * updating nr_hw_queues, or other things, and we can't run queue ··· 2256 2257 if (!need_run) 2257 2258 return; 2258 2259 2259 - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || 2260 - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { 2260 + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { 2261 2261 blk_mq_delay_run_hw_queue(hctx, 0); 2262 2262 return; 2263 2263 } ··· 2391 2393 { 2392 2394 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2393 2395 2394 - blk_mq_run_hw_queue(hctx, false); 2396 + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 2395 2397 } 2396 2398 EXPORT_SYMBOL(blk_mq_start_hw_queue); 2397 2399 ··· 2421 2423 unsigned long i; 2422 2424 2423 2425 queue_for_each_hw_ctx(q, hctx, i) 2424 - blk_mq_start_stopped_hw_queue(hctx, async); 2426 + blk_mq_start_stopped_hw_queue(hctx, async || 2427 + (hctx->flags & BLK_MQ_F_BLOCKING)); 2425 2428 } 2426 2429 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 2427 2430 ··· 2480 2481 list_for_each_entry(rq, list, queuelist) { 2481 2482 BUG_ON(rq->mq_ctx != ctx); 2482 2483 trace_block_rq_insert(rq); 2484 + if (rq->cmd_flags & REQ_NOWAIT) 2485 + run_queue_async = true; 2483 2486 } 2484 2487 2485 2488 spin_lock(&ctx->lock); ··· 2642 2641 2643 2642 if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { 2644 2643 blk_mq_insert_request(rq, 0); 2645 - blk_mq_run_hw_queue(hctx, false); 2644 + blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); 2646 2645 return; 2647 2646 } 2648 2647 ··· 4403 4402 int new_nr_hw_queues) 
4404 4403 { 4405 4404 struct blk_mq_tags **new_tags; 4405 + int i; 4406 4406 4407 - if (set->nr_hw_queues >= new_nr_hw_queues) 4407 + if (set->nr_hw_queues >= new_nr_hw_queues) { 4408 + for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++) 4409 + __blk_mq_free_map_and_rqs(set, i); 4408 4410 goto done; 4411 + } 4409 4412 4410 4413 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), 4411 4414 GFP_KERNEL, set->numa_node); ··· 4421 4416 sizeof(*set->tags)); 4422 4417 kfree(set->tags); 4423 4418 set->tags = new_tags; 4419 + 4420 + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { 4421 + if (!__blk_mq_alloc_map_and_rqs(set, i)) { 4422 + while (--i >= set->nr_hw_queues) 4423 + __blk_mq_free_map_and_rqs(set, i); 4424 + return -ENOMEM; 4425 + } 4426 + cond_resched(); 4427 + } 4428 + 4424 4429 done: 4425 4430 set->nr_hw_queues = new_nr_hw_queues; 4426 4431 return 0; ··· 4764 4749 __blk_mq_free_map_and_rqs(set, i); 4765 4750 4766 4751 set->nr_hw_queues = prev_nr_hw_queues; 4767 - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4768 4752 goto fallback; 4769 4753 } 4770 4754 blk_mq_map_swqueue(q); ··· 4867 4853 4868 4854 for_each_possible_cpu(i) 4869 4855 init_llist_head(&per_cpu(blk_cpu_done, i)); 4856 + for_each_possible_cpu(i) 4857 + INIT_CSD(&per_cpu(blk_cpu_csd, i), 4858 + __blk_mq_complete_request_remote, NULL); 4870 4859 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); 4871 4860 4872 4861 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+5 -2
block/blk-settings.c
··· 830 830 */ 831 831 void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) 832 832 { 833 - if (wc) 833 + if (wc) { 834 + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); 834 835 blk_queue_flag_set(QUEUE_FLAG_WC, q); 835 - else 836 + } else { 837 + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); 836 838 blk_queue_flag_clear(QUEUE_FLAG_WC, q); 839 + } 837 840 if (fua) 838 841 blk_queue_flag_set(QUEUE_FLAG_FUA, q); 839 842 else
+8 -13
block/blk-sysfs.c
··· 449 449 static ssize_t queue_wc_store(struct request_queue *q, const char *page, 450 450 size_t count) 451 451 { 452 - int set = -1; 453 - 454 - if (!strncmp(page, "write back", 10)) 455 - set = 1; 456 - else if (!strncmp(page, "write through", 13) || 457 - !strncmp(page, "none", 4)) 458 - set = 0; 459 - 460 - if (set == -1) 461 - return -EINVAL; 462 - 463 - if (set) 452 + if (!strncmp(page, "write back", 10)) { 453 + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) 454 + return -EINVAL; 464 455 blk_queue_flag_set(QUEUE_FLAG_WC, q); 465 - else 456 + } else if (!strncmp(page, "write through", 13) || 457 + !strncmp(page, "none", 4)) { 466 458 blk_queue_flag_clear(QUEUE_FLAG_WC, q); 459 + } else { 460 + return -EINVAL; 461 + } 467 462 468 463 return count; 469 464 }
+6 -4
block/blk.h
··· 15 15 extern struct dentry *blk_debugfs_root; 16 16 17 17 struct blk_flush_queue { 18 + spinlock_t mq_flush_lock; 18 19 unsigned int flush_pending_idx:1; 19 20 unsigned int flush_running_idx:1; 20 21 blk_status_t rq_status; 21 22 unsigned long flush_pending_since; 22 23 struct list_head flush_queue[2]; 23 - struct list_head flush_data_in_flight; 24 + unsigned long flush_data_in_flight; 24 25 struct request *flush_rq; 25 - 26 - spinlock_t mq_flush_lock; 27 26 }; 28 27 29 28 bool is_flush_rq(struct request *req); ··· 74 75 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, 75 76 gfp_t gfp_mask); 76 77 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); 78 + 79 + bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, 80 + struct page *page, unsigned len, unsigned offset, 81 + bool *same_page); 77 82 78 83 static inline bool biovec_phys_mergeable(struct request_queue *q, 79 84 struct bio_vec *vec1, struct bio_vec *vec2) ··· 254 251 255 252 unsigned long blk_rq_timeout(unsigned long timeout); 256 253 void blk_add_timer(struct request *req); 257 - const char *blk_status_to_str(blk_status_t status); 258 254 259 255 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 260 256 unsigned int nr_segs);
+130 -13
block/fops.c
··· 15 15 #include <linux/falloc.h> 16 16 #include <linux/suspend.h> 17 17 #include <linux/fs.h> 18 + #include <linux/iomap.h> 18 19 #include <linux/module.h> 19 20 #include "blk.h" 20 21 21 22 static inline struct inode *bdev_file_inode(struct file *file) 22 23 { 23 24 return file->f_mapping->host; 24 - } 25 - 26 - static int blkdev_get_block(struct inode *inode, sector_t iblock, 27 - struct buffer_head *bh, int create) 28 - { 29 - bh->b_bdev = I_BDEV(inode); 30 - bh->b_blocknr = iblock; 31 - set_buffer_mapped(bh); 32 - return 0; 33 25 } 34 26 35 27 static blk_opf_t dio_bio_write_op(struct kiocb *iocb) ··· 379 387 return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); 380 388 } 381 389 390 + static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 391 + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) 392 + { 393 + struct block_device *bdev = I_BDEV(inode); 394 + loff_t isize = i_size_read(inode); 395 + 396 + iomap->bdev = bdev; 397 + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); 398 + if (iomap->offset >= isize) 399 + return -EIO; 400 + iomap->type = IOMAP_MAPPED; 401 + iomap->addr = iomap->offset; 402 + iomap->length = isize - iomap->offset; 403 + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ 404 + return 0; 405 + } 406 + 407 + static const struct iomap_ops blkdev_iomap_ops = { 408 + .iomap_begin = blkdev_iomap_begin, 409 + }; 410 + 411 + #ifdef CONFIG_BUFFER_HEAD 412 + static int blkdev_get_block(struct inode *inode, sector_t iblock, 413 + struct buffer_head *bh, int create) 414 + { 415 + bh->b_bdev = I_BDEV(inode); 416 + bh->b_blocknr = iblock; 417 + set_buffer_mapped(bh); 418 + return 0; 419 + } 420 + 382 421 static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 383 422 { 384 423 return block_write_full_page(page, blkdev_get_block, wbc); ··· 452 429 .writepage = blkdev_writepage, 453 430 .write_begin = blkdev_write_begin, 454 431 .write_end = 
blkdev_write_end, 455 - .direct_IO = blkdev_direct_IO, 456 432 .migrate_folio = buffer_migrate_folio_norefs, 457 433 .is_dirty_writeback = buffer_check_dirty_writeback, 458 434 }; 435 + #else /* CONFIG_BUFFER_HEAD */ 436 + static int blkdev_read_folio(struct file *file, struct folio *folio) 437 + { 438 + return iomap_read_folio(folio, &blkdev_iomap_ops); 439 + } 440 + 441 + static void blkdev_readahead(struct readahead_control *rac) 442 + { 443 + iomap_readahead(rac, &blkdev_iomap_ops); 444 + } 445 + 446 + static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, 447 + struct inode *inode, loff_t offset) 448 + { 449 + loff_t isize = i_size_read(inode); 450 + 451 + if (WARN_ON_ONCE(offset >= isize)) 452 + return -EIO; 453 + if (offset >= wpc->iomap.offset && 454 + offset < wpc->iomap.offset + wpc->iomap.length) 455 + return 0; 456 + return blkdev_iomap_begin(inode, offset, isize - offset, 457 + IOMAP_WRITE, &wpc->iomap, NULL); 458 + } 459 + 460 + static const struct iomap_writeback_ops blkdev_writeback_ops = { 461 + .map_blocks = blkdev_map_blocks, 462 + }; 463 + 464 + static int blkdev_writepages(struct address_space *mapping, 465 + struct writeback_control *wbc) 466 + { 467 + struct iomap_writepage_ctx wpc = { }; 468 + 469 + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); 470 + } 471 + 472 + const struct address_space_operations def_blk_aops = { 473 + .dirty_folio = filemap_dirty_folio, 474 + .release_folio = iomap_release_folio, 475 + .invalidate_folio = iomap_invalidate_folio, 476 + .read_folio = blkdev_read_folio, 477 + .readahead = blkdev_readahead, 478 + .writepages = blkdev_writepages, 479 + .is_partially_uptodate = iomap_is_partially_uptodate, 480 + .error_remove_page = generic_error_remove_page, 481 + .migrate_folio = filemap_migrate_folio, 482 + }; 483 + #endif /* CONFIG_BUFFER_HEAD */ 459 484 460 485 /* 461 486 * for a block special file file_inode(file)->i_size is zero ··· 577 506 * during an unstable branch. 
578 507 */ 579 508 filp->f_flags |= O_LARGEFILE; 580 - filp->f_mode |= FMODE_BUF_RASYNC; 509 + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; 581 510 582 511 /* 583 512 * Use the file private data to store the holder for exclusive openes. ··· 605 534 return 0; 606 535 } 607 536 537 + static ssize_t 538 + blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) 539 + { 540 + size_t count = iov_iter_count(from); 541 + ssize_t written; 542 + 543 + written = kiocb_invalidate_pages(iocb, count); 544 + if (written) { 545 + if (written == -EBUSY) 546 + return 0; 547 + return written; 548 + } 549 + 550 + written = blkdev_direct_IO(iocb, from); 551 + if (written > 0) { 552 + kiocb_invalidate_post_direct_write(iocb, count); 553 + iocb->ki_pos += written; 554 + count -= written; 555 + } 556 + if (written != -EIOCBQUEUED) 557 + iov_iter_revert(from, count - iov_iter_count(from)); 558 + return written; 559 + } 560 + 561 + static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) 562 + { 563 + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); 564 + } 565 + 608 566 /* 609 567 * Write data to the block device. Only intended for the block device itself 610 568 * and the raw driver which basically is a fake block device. 
··· 643 543 */ 644 544 static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) 645 545 { 646 - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 546 + struct file *file = iocb->ki_filp; 547 + struct block_device *bdev = I_BDEV(file->f_mapping->host); 647 548 struct inode *bd_inode = bdev->bd_inode; 648 549 loff_t size = bdev_nr_bytes(bdev); 649 550 size_t shorted = 0; ··· 671 570 iov_iter_truncate(from, size); 672 571 } 673 572 674 - ret = __generic_file_write_iter(iocb, from); 573 + ret = file_remove_privs(file); 574 + if (ret) 575 + return ret; 576 + 577 + ret = file_update_time(file); 578 + if (ret) 579 + return ret; 580 + 581 + if (iocb->ki_flags & IOCB_DIRECT) { 582 + ret = blkdev_direct_write(iocb, from); 583 + if (ret >= 0 && iov_iter_count(from)) 584 + ret = direct_write_fallback(iocb, from, ret, 585 + blkdev_buffered_write(iocb, from)); 586 + } else { 587 + ret = blkdev_buffered_write(iocb, from); 588 + } 589 + 675 590 if (ret > 0) 676 591 ret = generic_write_sync(iocb, ret); 677 592 iov_iter_reexpand(from, iov_iter_count(from) + shorted);
+2 -1
block/mq-deadline.c
··· 646 646 struct request_queue *q = hctx->queue; 647 647 struct deadline_data *dd = q->elevator->elevator_data; 648 648 struct blk_mq_tags *tags = hctx->sched_tags; 649 + unsigned int shift = tags->bitmap_tags.sb.shift; 649 650 650 - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); 651 + dd->async_depth = max(1U, 3 * (1U << shift) / 4); 651 652 652 653 sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); 653 654 }
+4
block/opal_proto.h
··· 225 225 OPAL_SUM_SET_LIST = 0x060000, 226 226 }; 227 227 228 + enum opal_revertlsp { 229 + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, 230 + }; 231 + 228 232 /* Packets derived from: 229 233 * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 230 234 * Secion: 3.2.3 ComPackets, Packets & Subpackets
+4 -8
block/partitions/cmdline.c
··· 81 81 82 82 length = min_t(int, next - partdef, 83 83 sizeof(new_subpart->name) - 1); 84 - strncpy(new_subpart->name, partdef, length); 85 - new_subpart->name[length] = '\0'; 84 + strscpy(new_subpart->name, partdef, length); 86 85 87 86 partdef = ++next; 88 87 } else ··· 139 140 } 140 141 141 142 length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); 142 - strncpy(newparts->name, bdevdef, length); 143 - newparts->name[length] = '\0'; 143 + strscpy(newparts->name, bdevdef, length); 144 144 newparts->nr_subparts = 0; 145 145 146 146 next_subpart = &newparts->subpart; ··· 151 153 length = (!next) ? (sizeof(buf) - 1) : 152 154 min_t(int, next - bdevdef, sizeof(buf) - 1); 153 155 154 - strncpy(buf, bdevdef, length); 155 - buf[length] = '\0'; 156 + strscpy(buf, bdevdef, length); 156 157 157 158 ret = parse_subpart(next_subpart, buf); 158 159 if (ret) ··· 264 267 265 268 label_min = min_t(int, sizeof(info->volname) - 1, 266 269 sizeof(subpart->name)); 267 - strncpy(info->volname, subpart->name, label_min); 268 - info->volname[label_min] = '\0'; 270 + strscpy(info->volname, subpart->name, label_min); 269 271 270 272 snprintf(tmp, sizeof(tmp), "(%s)", info->volname); 271 273 strlcat(state->pp_buf, tmp, PAGE_SIZE);
+247 -5
block/sed-opal.c
··· 20 20 #include <linux/sed-opal.h> 21 21 #include <linux/string.h> 22 22 #include <linux/kdev_t.h> 23 + #include <linux/key.h> 24 + #include <linux/key-type.h> 25 + #include <keys/user-type.h> 23 26 24 27 #include "opal_proto.h" 25 28 ··· 31 28 32 29 /* Number of bytes needed by cmd_finalize. */ 33 30 #define CMD_FINALIZE_BYTES_NEEDED 7 31 + 32 + static struct key *sed_opal_keyring; 34 33 35 34 struct opal_step { 36 35 int (*fn)(struct opal_dev *dev, void *data); ··· 274 269 #endif 275 270 } 276 271 272 + /* 273 + * Allocate/update a SED Opal key and add it to the SED Opal keyring. 274 + */ 275 + static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) 276 + { 277 + key_ref_t kr; 278 + 279 + if (!sed_opal_keyring) 280 + return -ENOKEY; 281 + 282 + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", 283 + desc, (const void *)key_data, keylen, 284 + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, 285 + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | 286 + KEY_ALLOC_BYPASS_RESTRICTION); 287 + if (IS_ERR(kr)) { 288 + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); 289 + return PTR_ERR(kr); 290 + } 291 + 292 + return 0; 293 + } 294 + 295 + /* 296 + * Read a SED Opal key from the SED Opal keyring. 
297 + */ 298 + static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) 299 + { 300 + int ret; 301 + key_ref_t kref; 302 + struct key *key; 303 + 304 + if (!sed_opal_keyring) 305 + return -ENOKEY; 306 + 307 + kref = keyring_search(make_key_ref(sed_opal_keyring, true), 308 + &key_type_user, key_name, true); 309 + 310 + if (IS_ERR(kref)) 311 + ret = PTR_ERR(kref); 312 + 313 + key = key_ref_to_ptr(kref); 314 + down_read(&key->sem); 315 + ret = key_validate(key); 316 + if (ret == 0) { 317 + if (buflen > key->datalen) 318 + buflen = key->datalen; 319 + 320 + ret = key->type->read(key, (char *)buffer, buflen); 321 + } 322 + up_read(&key->sem); 323 + 324 + key_ref_put(kref); 325 + 326 + return ret; 327 + } 328 + 329 + static int opal_get_key(struct opal_dev *dev, struct opal_key *key) 330 + { 331 + int ret = 0; 332 + 333 + switch (key->key_type) { 334 + case OPAL_INCLUDED: 335 + /* the key is ready to use */ 336 + break; 337 + case OPAL_KEYRING: 338 + /* the key is in the keyring */ 339 + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); 340 + if (ret > 0) { 341 + if (ret > U8_MAX) { 342 + ret = -ENOSPC; 343 + goto error; 344 + } 345 + key->key_len = ret; 346 + key->key_type = OPAL_INCLUDED; 347 + } 348 + break; 349 + default: 350 + ret = -EINVAL; 351 + break; 352 + } 353 + if (ret < 0) 354 + goto error; 355 + 356 + /* must have a PEK by now or it's an error */ 357 + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { 358 + ret = -EINVAL; 359 + goto error; 360 + } 361 + return 0; 362 + error: 363 + pr_debug("Error getting password: %d\n", ret); 364 + return ret; 365 + } 366 + 277 367 static bool check_tper(const void *data) 278 368 { 279 369 const struct d0_tper_features *tper = data; ··· 563 463 return error; 564 464 } 565 465 566 - static int opal_discovery0_end(struct opal_dev *dev) 466 + static int opal_discovery0_end(struct opal_dev *dev, void *data) 567 467 { 468 + struct opal_discovery *discv_out = data; /* may be NULL */ 
469 + u8 __user *buf_out; 470 + u64 len_out; 568 471 bool found_com_id = false, supported = true, single_user = false; 569 472 const struct d0_header *hdr = (struct d0_header *)dev->resp; 570 473 const u8 *epos = dev->resp, *cpos = dev->resp; ··· 581 478 pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", 582 479 sizeof(*hdr), hlen, IO_BUFFER_LENGTH); 583 480 return -EFAULT; 481 + } 482 + 483 + if (discv_out) { 484 + buf_out = (u8 __user *)(uintptr_t)discv_out->data; 485 + len_out = min_t(u64, discv_out->size, hlen); 486 + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) 487 + return -EFAULT; 488 + 489 + discv_out->size = hlen; /* actual size of data */ 584 490 } 585 491 586 492 epos += hlen; /* end of buffer */ ··· 677 565 if (ret) 678 566 return ret; 679 567 680 - return opal_discovery0_end(dev); 568 + return opal_discovery0_end(dev, data); 681 569 } 682 570 683 571 static int opal_discovery0_step(struct opal_dev *dev) 684 572 { 685 573 const struct opal_step discovery0_step = { 686 - opal_discovery0, 574 + opal_discovery0, NULL 687 575 }; 688 576 689 577 return execute_step(dev, &discovery0_step, 0); ··· 1869 1757 return finalize_and_send(dev, parse_and_check_status); 1870 1758 } 1871 1759 1760 + static int revert_lsp(struct opal_dev *dev, void *data) 1761 + { 1762 + struct opal_revert_lsp *rev = data; 1763 + int err; 1764 + 1765 + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], 1766 + opalmethod[OPAL_REVERTSP]); 1767 + add_token_u8(&err, dev, OPAL_STARTNAME); 1768 + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); 1769 + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? 
1770 + OPAL_TRUE : OPAL_FALSE); 1771 + add_token_u8(&err, dev, OPAL_ENDNAME); 1772 + if (err) { 1773 + pr_debug("Error building REVERT SP command.\n"); 1774 + return err; 1775 + } 1776 + 1777 + return finalize_and_send(dev, parse_and_check_status); 1778 + } 1779 + 1872 1780 static int erase_locking_range(struct opal_dev *dev, void *data) 1873 1781 { 1874 1782 struct opal_session_info *session = data; ··· 2559 2427 }; 2560 2428 int ret; 2561 2429 2430 + ret = opal_get_key(dev, &opal_session->opal_key); 2431 + if (ret) 2432 + return ret; 2562 2433 mutex_lock(&dev->dev_lock); 2563 2434 setup_opal_dev(dev); 2564 2435 ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); 2436 + mutex_unlock(&dev->dev_lock); 2437 + 2438 + return ret; 2439 + } 2440 + 2441 + static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) 2442 + { 2443 + const struct opal_step discovery0_step = { 2444 + opal_discovery0, discv 2445 + }; 2446 + int ret = 0; 2447 + 2448 + mutex_lock(&dev->dev_lock); 2449 + setup_opal_dev(dev); 2450 + ret = execute_step(dev, &discovery0_step, 0); 2451 + mutex_unlock(&dev->dev_lock); 2452 + if (ret) 2453 + return ret; 2454 + return discv->size; /* modified to actual length of data */ 2455 + } 2456 + 2457 + static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) 2458 + { 2459 + /* controller will terminate session */ 2460 + const struct opal_step steps[] = { 2461 + { start_admin1LSP_opal_session, &rev->key }, 2462 + { revert_lsp, rev } 2463 + }; 2464 + int ret; 2465 + 2466 + ret = opal_get_key(dev, &rev->key); 2467 + if (ret) 2468 + return ret; 2469 + mutex_lock(&dev->dev_lock); 2470 + setup_opal_dev(dev); 2471 + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); 2565 2472 mutex_unlock(&dev->dev_lock); 2566 2473 2567 2474 return ret; ··· 2616 2445 }; 2617 2446 int ret; 2618 2447 2448 + ret = opal_get_key(dev, &opal_session->opal_key); 2449 + if (ret) 2450 + return ret; 2619 2451 mutex_lock(&dev->dev_lock); 2620 2452 
setup_opal_dev(dev); 2621 2453 ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); ··· 2647 2473 opal_mbr->enable_disable != OPAL_MBR_DISABLE) 2648 2474 return -EINVAL; 2649 2475 2476 + ret = opal_get_key(dev, &opal_mbr->key); 2477 + if (ret) 2478 + return ret; 2650 2479 mutex_lock(&dev->dev_lock); 2651 2480 setup_opal_dev(dev); 2652 2481 ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); ··· 2675 2498 mbr_done->done_flag != OPAL_MBR_NOT_DONE) 2676 2499 return -EINVAL; 2677 2500 2501 + ret = opal_get_key(dev, &mbr_done->key); 2502 + if (ret) 2503 + return ret; 2678 2504 mutex_lock(&dev->dev_lock); 2679 2505 setup_opal_dev(dev); 2680 2506 ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); ··· 2699 2519 if (info->size == 0) 2700 2520 return 0; 2701 2521 2522 + ret = opal_get_key(dev, &info->key); 2523 + if (ret) 2524 + return ret; 2702 2525 mutex_lock(&dev->dev_lock); 2703 2526 setup_opal_dev(dev); 2704 2527 ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); ··· 2759 2576 return -EINVAL; 2760 2577 } 2761 2578 2579 + ret = opal_get_key(dev, &lk_unlk->session.opal_key); 2580 + if (ret) 2581 + return ret; 2762 2582 mutex_lock(&dev->dev_lock); 2763 2583 setup_opal_dev(dev); 2764 2584 ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); ··· 2784 2598 2785 2599 int ret; 2786 2600 2601 + ret = opal_get_key(dev, opal); 2602 + 2603 + if (ret) 2604 + return ret; 2787 2605 mutex_lock(&dev->dev_lock); 2788 2606 setup_opal_dev(dev); 2789 2607 if (psid) ··· 2888 2698 if (lk_unlk->session.who > OPAL_USER9) 2889 2699 return -EINVAL; 2890 2700 2701 + ret = opal_get_key(dev, &lk_unlk->session.opal_key); 2702 + if (ret) 2703 + return ret; 2891 2704 mutex_lock(&dev->dev_lock); 2892 2705 opal_lock_check_for_saved_key(dev, lk_unlk); 2893 2706 ret = __opal_lock_unlock(dev, lk_unlk); ··· 2914 2721 if (!dev) 2915 2722 return -ENODEV; 2916 2723 2724 + ret = opal_get_key(dev, opal); 2725 + if (ret) 2726 + return ret; 2917 2727 mutex_lock(&dev->dev_lock); 
2918 2728 setup_opal_dev(dev); 2919 2729 ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); ··· 2939 2743 if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) 2940 2744 return -EINVAL; 2941 2745 2746 + ret = opal_get_key(dev, &opal_lr_act->key); 2747 + if (ret) 2748 + return ret; 2942 2749 mutex_lock(&dev->dev_lock); 2943 2750 setup_opal_dev(dev); 2944 2751 ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); ··· 2960 2761 }; 2961 2762 int ret; 2962 2763 2764 + ret = opal_get_key(dev, &opal_lrs->session.opal_key); 2765 + if (ret) 2766 + return ret; 2963 2767 mutex_lock(&dev->dev_lock); 2964 2768 setup_opal_dev(dev); 2965 2769 ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); ··· 3016 2814 ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); 3017 2815 mutex_unlock(&dev->dev_lock); 3018 2816 2817 + if (ret) 2818 + return ret; 2819 + 2820 + /* update keyring with new password */ 2821 + ret = update_sed_opal_key(OPAL_AUTH_KEY, 2822 + opal_pw->new_user_pw.opal_key.key, 2823 + opal_pw->new_user_pw.opal_key.key_len); 2824 + 3019 2825 return ret; 3020 2826 } 3021 2827 ··· 3044 2834 return -EINVAL; 3045 2835 } 3046 2836 2837 + ret = opal_get_key(dev, &opal_session->opal_key); 2838 + if (ret) 2839 + return ret; 3047 2840 mutex_lock(&dev->dev_lock); 3048 2841 setup_opal_dev(dev); 3049 2842 ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); ··· 3133 2920 { 3134 2921 int ret, bit_set; 3135 2922 2923 + ret = opal_get_key(dev, &rw_tbl->key); 2924 + if (ret) 2925 + return ret; 3136 2926 mutex_lock(&dev->dev_lock); 3137 2927 setup_opal_dev(dev); 3138 2928 ··· 3204 2988 if (!capable(CAP_SYS_ADMIN)) 3205 2989 return -EACCES; 3206 2990 if (!dev) 3207 - return -ENOTSUPP; 2991 + return -EOPNOTSUPP; 3208 2992 if (!(dev->flags & OPAL_FL_SUPPORTED)) 3209 - return -ENOTSUPP; 2993 + return -EOPNOTSUPP; 3210 2994 3211 2995 if (cmd & IOC_IN) { 3212 2996 p = memdup_user(arg, _IOC_SIZE(cmd)); ··· 3272 3056 case IOC_OPAL_GET_GEOMETRY: 
3273 3057 ret = opal_get_geometry(dev, arg); 3274 3058 break; 3059 + case IOC_OPAL_REVERT_LSP: 3060 + ret = opal_revertlsp(dev, p); 3061 + break; 3062 + case IOC_OPAL_DISCOVERY: 3063 + ret = opal_get_discv(dev, p); 3064 + break; 3065 + 3275 3066 default: 3276 3067 break; 3277 3068 } ··· 3288 3065 return ret; 3289 3066 } 3290 3067 EXPORT_SYMBOL_GPL(sed_ioctl); 3068 + 3069 + static int __init sed_opal_init(void) 3070 + { 3071 + struct key *kr; 3072 + 3073 + kr = keyring_alloc(".sed_opal", 3074 + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), 3075 + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | 3076 + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, 3077 + KEY_ALLOC_NOT_IN_QUOTA, 3078 + NULL, NULL); 3079 + if (IS_ERR(kr)) 3080 + return PTR_ERR(kr); 3081 + 3082 + sed_opal_keyring = kr; 3083 + 3084 + return 0; 3085 + } 3086 + late_initcall(sed_opal_init);
+1
drivers/block/nbd.c
··· 2334 2334 .mcgrps = nbd_mcast_grps, 2335 2335 .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), 2336 2336 }; 2337 + MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME); 2337 2338 2338 2339 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) 2339 2340 {
+1 -1
drivers/block/swim3.c
··· 1277 1277 }; 1278 1278 1279 1279 1280 - int swim3_init(void) 1280 + static int swim3_init(void) 1281 1281 { 1282 1282 macio_register_driver(&swim3_driver); 1283 1283 return 0;
+338 -28
drivers/block/ublk_drv.c
··· 56 56 | UBLK_F_USER_RECOVERY_REISSUE \ 57 57 | UBLK_F_UNPRIVILEGED_DEV \ 58 58 | UBLK_F_CMD_IOCTL_ENCODE \ 59 - | UBLK_F_USER_COPY) 59 + | UBLK_F_USER_COPY \ 60 + | UBLK_F_ZONED) 60 61 61 62 /* All UBLK_PARAM_TYPE_* should be included here */ 62 - #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ 63 - UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) 63 + #define UBLK_PARAM_TYPE_ALL \ 64 + (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ 65 + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) 64 66 65 67 struct ublk_rq_data { 66 68 struct llist_node node; 67 69 68 70 struct kref ref; 71 + __u64 sector; 72 + __u32 operation; 73 + __u32 nr_zones; 69 74 }; 70 75 71 76 struct ublk_uring_cmd_pdu { ··· 190 185 __u32 types; 191 186 }; 192 187 188 + static inline unsigned int ublk_req_build_flags(struct request *req); 189 + static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, 190 + int tag); 191 + 192 + static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) 193 + { 194 + return ub->dev_info.flags & UBLK_F_USER_COPY; 195 + } 196 + 197 + static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) 198 + { 199 + return ub->dev_info.flags & UBLK_F_ZONED; 200 + } 201 + 202 + static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) 203 + { 204 + return ubq->flags & UBLK_F_ZONED; 205 + } 206 + 207 + #ifdef CONFIG_BLK_DEV_ZONED 208 + 209 + static int ublk_get_nr_zones(const struct ublk_device *ub) 210 + { 211 + const struct ublk_param_basic *p = &ub->params.basic; 212 + 213 + /* Zone size is a power of 2 */ 214 + return p->dev_sectors >> ilog2(p->chunk_sectors); 215 + } 216 + 217 + static int ublk_revalidate_disk_zones(struct ublk_device *ub) 218 + { 219 + return blk_revalidate_disk_zones(ub->ub_disk, NULL); 220 + } 221 + 222 + static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 223 + { 224 + const struct ublk_param_zoned *p = &ub->params.zoned; 225 + int nr_zones; 226 + 227 + if (!ublk_dev_is_zoned(ub)) 
228 + return -EINVAL; 229 + 230 + if (!p->max_zone_append_sectors) 231 + return -EINVAL; 232 + 233 + nr_zones = ublk_get_nr_zones(ub); 234 + 235 + if (p->max_active_zones > nr_zones) 236 + return -EINVAL; 237 + 238 + if (p->max_open_zones > nr_zones) 239 + return -EINVAL; 240 + 241 + return 0; 242 + } 243 + 244 + static int ublk_dev_param_zoned_apply(struct ublk_device *ub) 245 + { 246 + const struct ublk_param_zoned *p = &ub->params.zoned; 247 + 248 + disk_set_zoned(ub->ub_disk, BLK_ZONED_HM); 249 + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); 250 + blk_queue_required_elevator_features(ub->ub_disk->queue, 251 + ELEVATOR_F_ZBD_SEQ_WRITE); 252 + disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); 253 + disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); 254 + blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); 255 + 256 + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); 257 + 258 + return 0; 259 + } 260 + 261 + /* Based on virtblk_alloc_report_buffer */ 262 + static void *ublk_alloc_report_buffer(struct ublk_device *ublk, 263 + unsigned int nr_zones, size_t *buflen) 264 + { 265 + struct request_queue *q = ublk->ub_disk->queue; 266 + size_t bufsize; 267 + void *buf; 268 + 269 + nr_zones = min_t(unsigned int, nr_zones, 270 + ublk->ub_disk->nr_zones); 271 + 272 + bufsize = nr_zones * sizeof(struct blk_zone); 273 + bufsize = 274 + min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); 275 + 276 + while (bufsize >= sizeof(struct blk_zone)) { 277 + buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); 278 + if (buf) { 279 + *buflen = bufsize; 280 + return buf; 281 + } 282 + bufsize >>= 1; 283 + } 284 + 285 + *buflen = 0; 286 + return NULL; 287 + } 288 + 289 + static int ublk_report_zones(struct gendisk *disk, sector_t sector, 290 + unsigned int nr_zones, report_zones_cb cb, void *data) 291 + { 292 + struct ublk_device *ub = disk->private_data; 293 + unsigned int zone_size_sectors = 
disk->queue->limits.chunk_sectors; 294 + unsigned int first_zone = sector >> ilog2(zone_size_sectors); 295 + unsigned int done_zones = 0; 296 + unsigned int max_zones_per_request; 297 + int ret; 298 + struct blk_zone *buffer; 299 + size_t buffer_length; 300 + 301 + nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, 302 + nr_zones); 303 + 304 + buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); 305 + if (!buffer) 306 + return -ENOMEM; 307 + 308 + max_zones_per_request = buffer_length / sizeof(struct blk_zone); 309 + 310 + while (done_zones < nr_zones) { 311 + unsigned int remaining_zones = nr_zones - done_zones; 312 + unsigned int zones_in_request = 313 + min_t(unsigned int, remaining_zones, max_zones_per_request); 314 + struct request *req; 315 + struct ublk_rq_data *pdu; 316 + blk_status_t status; 317 + 318 + memset(buffer, 0, buffer_length); 319 + 320 + req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); 321 + if (IS_ERR(req)) { 322 + ret = PTR_ERR(req); 323 + goto out; 324 + } 325 + 326 + pdu = blk_mq_rq_to_pdu(req); 327 + pdu->operation = UBLK_IO_OP_REPORT_ZONES; 328 + pdu->sector = sector; 329 + pdu->nr_zones = zones_in_request; 330 + 331 + ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, 332 + GFP_KERNEL); 333 + if (ret) { 334 + blk_mq_free_request(req); 335 + goto out; 336 + } 337 + 338 + status = blk_execute_rq(req, 0); 339 + ret = blk_status_to_errno(status); 340 + blk_mq_free_request(req); 341 + if (ret) 342 + goto out; 343 + 344 + for (unsigned int i = 0; i < zones_in_request; i++) { 345 + struct blk_zone *zone = buffer + i; 346 + 347 + /* A zero length zone means no more zones in this response */ 348 + if (!zone->len) 349 + break; 350 + 351 + ret = cb(zone, i, data); 352 + if (ret) 353 + goto out; 354 + 355 + done_zones++; 356 + sector += zone_size_sectors; 357 + 358 + } 359 + } 360 + 361 + ret = done_zones; 362 + 363 + out: 364 + kvfree(buffer); 365 + return ret; 366 + } 367 + 368 + static blk_status_t 
ublk_setup_iod_zoned(struct ublk_queue *ubq, 369 + struct request *req) 370 + { 371 + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 372 + struct ublk_io *io = &ubq->ios[req->tag]; 373 + struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); 374 + u32 ublk_op; 375 + 376 + switch (req_op(req)) { 377 + case REQ_OP_ZONE_OPEN: 378 + ublk_op = UBLK_IO_OP_ZONE_OPEN; 379 + break; 380 + case REQ_OP_ZONE_CLOSE: 381 + ublk_op = UBLK_IO_OP_ZONE_CLOSE; 382 + break; 383 + case REQ_OP_ZONE_FINISH: 384 + ublk_op = UBLK_IO_OP_ZONE_FINISH; 385 + break; 386 + case REQ_OP_ZONE_RESET: 387 + ublk_op = UBLK_IO_OP_ZONE_RESET; 388 + break; 389 + case REQ_OP_ZONE_APPEND: 390 + ublk_op = UBLK_IO_OP_ZONE_APPEND; 391 + break; 392 + case REQ_OP_ZONE_RESET_ALL: 393 + ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; 394 + break; 395 + case REQ_OP_DRV_IN: 396 + ublk_op = pdu->operation; 397 + switch (ublk_op) { 398 + case UBLK_IO_OP_REPORT_ZONES: 399 + iod->op_flags = ublk_op | ublk_req_build_flags(req); 400 + iod->nr_zones = pdu->nr_zones; 401 + iod->start_sector = pdu->sector; 402 + return BLK_STS_OK; 403 + default: 404 + return BLK_STS_IOERR; 405 + } 406 + case REQ_OP_DRV_OUT: 407 + /* We do not support drv_out */ 408 + return BLK_STS_NOTSUPP; 409 + default: 410 + return BLK_STS_IOERR; 411 + } 412 + 413 + iod->op_flags = ublk_op | ublk_req_build_flags(req); 414 + iod->nr_sectors = blk_rq_sectors(req); 415 + iod->start_sector = blk_rq_pos(req); 416 + iod->addr = io->addr; 417 + 418 + return BLK_STS_OK; 419 + } 420 + 421 + #else 422 + 423 + #define ublk_report_zones (NULL) 424 + 425 + static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 426 + { 427 + return -EOPNOTSUPP; 428 + } 429 + 430 + static int ublk_dev_param_zoned_apply(struct ublk_device *ub) 431 + { 432 + return -EOPNOTSUPP; 433 + } 434 + 435 + static int ublk_revalidate_disk_zones(struct ublk_device *ub) 436 + { 437 + return 0; 438 + } 439 + 440 + static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, 441 + 
struct request *req) 442 + { 443 + return BLK_STS_NOTSUPP; 444 + } 445 + 446 + #endif 447 + 193 448 static inline void __ublk_complete_rq(struct request *req); 194 449 static void ublk_complete_rq(struct kref *ref); 195 450 ··· 546 281 547 282 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) 548 283 return -EINVAL; 284 + 285 + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) 286 + return -EINVAL; 549 287 } else 550 288 return -EINVAL; 551 289 ··· 567 299 if (ub->params.types & UBLK_PARAM_TYPE_DEVT) 568 300 return -EINVAL; 569 301 302 + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 303 + return ublk_dev_param_zoned_validate(ub); 304 + else if (ublk_dev_is_zoned(ub)) 305 + return -EINVAL; 306 + 570 307 return 0; 571 308 } 572 309 ··· 584 311 585 312 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) 586 313 ublk_dev_param_discard_apply(ub); 314 + 315 + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 316 + return ublk_dev_param_zoned_apply(ub); 587 317 588 318 return 0; 589 319 } ··· 758 482 .owner = THIS_MODULE, 759 483 .open = ublk_open, 760 484 .free_disk = ublk_free_disk, 485 + .report_zones = ublk_report_zones, 761 486 }; 762 487 763 488 #define UBLK_MAX_PIN_PAGES 32 ··· 873 596 874 597 static inline bool ublk_need_unmap_req(const struct request *req) 875 598 { 876 - return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; 599 + return ublk_rq_has_data(req) && 600 + (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); 877 601 } 878 602 879 603 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, ··· 958 680 { 959 681 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 960 682 struct ublk_io *io = &ubq->ios[req->tag]; 683 + enum req_op op = req_op(req); 961 684 u32 ublk_op; 685 + 686 + if (!ublk_queue_is_zoned(ubq) && 687 + (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) 688 + return BLK_STS_IOERR; 962 689 963 690 switch (req_op(req)) { 964 691 case REQ_OP_READ: ··· 982 699 ublk_op = UBLK_IO_OP_WRITE_ZEROES; 983 
700 break; 984 701 default: 702 + if (ublk_queue_is_zoned(ubq)) 703 + return ublk_setup_iod_zoned(ubq, req); 985 704 return BLK_STS_IOERR; 986 705 } 987 706 ··· 1036 751 * 1037 752 * Both the two needn't unmap. 1038 753 */ 1039 - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) 754 + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && 755 + req_op(req) != REQ_OP_DRV_IN) 1040 756 goto exit; 1041 757 1042 758 /* for READ request, writing data in iod->addr to rq buffers */ ··· 1400 1114 1401 1115 /* find the io request and complete */ 1402 1116 req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); 1117 + if (WARN_ON_ONCE(unlikely(!req))) 1118 + return; 1403 1119 1404 - if (req && likely(!blk_should_fake_timeout(req->q))) 1120 + if (req_op(req) == REQ_OP_ZONE_APPEND) 1121 + req->__sector = ub_cmd->zone_append_lba; 1122 + 1123 + if (likely(!blk_should_fake_timeout(req->q))) 1405 1124 ublk_put_req_ref(ubq, req); 1406 1125 } 1407 1126 ··· 1705 1414 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) 1706 1415 goto out; 1707 1416 1708 - if (ublk_support_user_copy(ubq) && ub_cmd->addr) { 1709 - ret = -EINVAL; 1710 - goto out; 1711 - } 1712 - 1713 1417 ret = ublk_check_cmd_op(cmd_op); 1714 1418 if (ret) 1715 1419 goto out; ··· 1731 1445 */ 1732 1446 if (!ub_cmd->addr && !ublk_need_get_data(ubq)) 1733 1447 goto out; 1448 + } else if (ub_cmd->addr) { 1449 + /* User copy requires addr to be unset */ 1450 + ret = -EINVAL; 1451 + goto out; 1734 1452 } 1735 1453 1736 1454 ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ··· 1754 1464 if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || 1755 1465 req_op(req) == REQ_OP_READ)) 1756 1466 goto out; 1467 + } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { 1468 + /* 1469 + * User copy requires addr to be unset when command is 1470 + * not zone append 1471 + */ 1472 + ret = -EINVAL; 1473 + goto out; 1757 1474 } 1475 + 1758 1476 ublk_fill_io_cmd(io, cmd, ub_cmd->addr); 1759 1477 ublk_commit_completion(ub, ub_cmd); 
1760 1478 break; ··· 1835 1537 int ubuf_dir) 1836 1538 { 1837 1539 /* copy ubuf to request pages */ 1838 - if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) 1540 + if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && 1541 + ubuf_dir == ITER_SOURCE) 1839 1542 return true; 1840 1543 1841 1544 /* copy request pages to ubuf */ 1842 - if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) 1545 + if ((req_op(req) == REQ_OP_WRITE || 1546 + req_op(req) == REQ_OP_ZONE_APPEND) && 1547 + ubuf_dir == ITER_DEST) 1843 1548 return true; 1844 1549 1845 1550 return false; ··· 2182 1881 2183 1882 get_device(&ub->cdev_dev); 2184 1883 ub->dev_info.state = UBLK_S_DEV_LIVE; 1884 + 1885 + if (ublk_dev_is_zoned(ub)) { 1886 + ret = ublk_revalidate_disk_zones(ub); 1887 + if (ret) 1888 + goto out_put_cdev; 1889 + } 1890 + 2185 1891 ret = add_disk(disk); 1892 + if (ret) 1893 + goto out_put_cdev; 1894 + 1895 + set_bit(UB_STATE_USED, &ub->state); 1896 + 1897 + out_put_cdev: 2186 1898 if (ret) { 2187 - /* 2188 - * Has to drop the reference since ->free_disk won't be 2189 - * called in case of add_disk failure. 
2190 - */ 2191 1899 ub->dev_info.state = UBLK_S_DEV_DEAD; 2192 1900 ublk_put_device(ub); 2193 - goto out_put_disk; 2194 1901 } 2195 - set_bit(UB_STATE_USED, &ub->state); 2196 1902 out_put_disk: 2197 1903 if (ret) 2198 1904 put_disk(disk); ··· 2346 2038 UBLK_F_URING_CMD_COMP_IN_TASK; 2347 2039 2348 2040 /* GET_DATA isn't needed any more with USER_COPY */ 2349 - if (ub->dev_info.flags & UBLK_F_USER_COPY) 2041 + if (ublk_dev_is_user_copy(ub)) 2350 2042 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 2043 + 2044 + /* Zoned storage support requires user copy feature */ 2045 + if (ublk_dev_is_zoned(ub) && 2046 + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { 2047 + ret = -EINVAL; 2048 + goto out_free_dev_number; 2049 + } 2351 2050 2352 2051 /* We are not ready to support zero copy */ 2353 2052 ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; ··· 2748 2433 if (header->len < header->dev_path_len) 2749 2434 return -EINVAL; 2750 2435 2751 - dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL); 2752 - if (!dev_path) 2753 - return -ENOMEM; 2754 - 2755 - ret = -EFAULT; 2756 - if (copy_from_user(dev_path, argp, header->dev_path_len)) 2757 - goto exit; 2758 - dev_path[header->dev_path_len] = 0; 2436 + dev_path = memdup_user_nul(argp, header->dev_path_len); 2437 + if (IS_ERR(dev_path)) 2438 + return PTR_ERR(dev_path); 2759 2439 2760 2440 ret = -EINVAL; 2761 2441 switch (_IOC_NR(cmd->cmd_op)) {
+11
drivers/md/Kconfig
··· 15 15 config BLK_DEV_MD 16 16 tristate "RAID support" 17 17 select BLOCK_HOLDER_DEPRECATED if SYSFS 18 + select BUFFER_HEAD 18 19 # BLOCK_LEGACY_AUTOLOAD requirement should be removed 19 20 # after relevant mdadm enhancements - to make "names=yes" 20 21 # the default - are widely available. ··· 50 49 synchronisation steps that are part of this step. 51 50 52 51 If unsure, say Y. 52 + 53 + config MD_BITMAP_FILE 54 + bool "MD bitmap file support (deprecated)" 55 + default y 56 + help 57 + If you say Y here, support for write intent bitmaps in files on an 58 + external file system is enabled. This is an alternative to the internal 59 + bitmaps near the MD superblock, and very problematic code that abuses 60 + various kernel APIs and can only work with files on a file system not 61 + actually sitting on the MD device. 53 62 54 63 config MD_LINEAR 55 64 tristate "Linear (append) mode (deprecated)"
-1
drivers/md/dm-crypt.c
··· 1160 1160 1161 1161 tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); 1162 1162 1163 - bip->bip_iter.bi_size = tag_len; 1164 1163 bip->bip_iter.bi_sector = io->cc->start + io->sector; 1165 1164 1166 1165 ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
-1
drivers/md/dm-raid.c
··· 3723 3723 if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { 3724 3724 if (mddev->sync_thread) { 3725 3725 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3726 - md_unregister_thread(&mddev->sync_thread); 3727 3726 md_reap_sync_thread(mddev); 3728 3727 } 3729 3728 } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
+180 -171
drivers/md/md-bitmap.c
··· 139 139 */ 140 140 141 141 /* IO operations when bitmap is stored near all superblocks */ 142 - static int read_sb_page(struct mddev *mddev, loff_t offset, 143 - struct page *page, 144 - unsigned long index, int size) 145 - { 146 - /* choose a good rdev and read the page from there */ 147 142 143 + /* choose a good rdev and read the page from there */ 144 + static int read_sb_page(struct mddev *mddev, loff_t offset, 145 + struct page *page, unsigned long index, int size) 146 + { 147 + 148 + sector_t sector = mddev->bitmap_info.offset + offset + 149 + index * (PAGE_SIZE / SECTOR_SIZE); 148 150 struct md_rdev *rdev; 149 - sector_t target; 150 151 151 152 rdev_for_each(rdev, mddev) { 152 - if (! test_bit(In_sync, &rdev->flags) 153 - || test_bit(Faulty, &rdev->flags) 154 - || test_bit(Bitmap_sync, &rdev->flags)) 153 + u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); 154 + 155 + if (!test_bit(In_sync, &rdev->flags) || 156 + test_bit(Faulty, &rdev->flags) || 157 + test_bit(Bitmap_sync, &rdev->flags)) 155 158 continue; 156 159 157 - target = offset + index * (PAGE_SIZE/512); 158 - 159 - if (sync_page_io(rdev, target, 160 - roundup(size, bdev_logical_block_size(rdev->bdev)), 161 - page, REQ_OP_READ, true)) { 162 - page->index = index; 160 + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) 163 161 return 0; 164 - } 165 162 } 166 163 return -EIO; 167 164 } ··· 222 225 } 223 226 224 227 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, 225 - struct page *page) 228 + unsigned long pg_index, struct page *page) 226 229 { 227 230 struct block_device *bdev; 228 231 struct mddev *mddev = bitmap->mddev; 229 232 struct bitmap_storage *store = &bitmap->storage; 230 233 loff_t sboff, offset = mddev->bitmap_info.offset; 231 - sector_t ps, doff; 234 + sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; 232 235 unsigned int size = PAGE_SIZE; 233 236 unsigned int opt_size = PAGE_SIZE; 237 + sector_t doff; 234 238 235 239 bdev = 
(rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; 236 - if (page->index == store->file_pages - 1) { 240 + if (pg_index == store->file_pages - 1) { 237 241 unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); 238 242 239 243 if (last_page_size == 0) ··· 243 245 opt_size = optimal_io_size(bdev, last_page_size, size); 244 246 } 245 247 246 - ps = page->index * PAGE_SIZE / SECTOR_SIZE; 247 248 sboff = rdev->sb_start + offset; 248 249 doff = rdev->data_offset; 249 250 ··· 276 279 return 0; 277 280 } 278 281 279 - static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 282 + static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, 283 + struct page *page, bool wait) 280 284 { 281 - struct md_rdev *rdev; 282 285 struct mddev *mddev = bitmap->mddev; 283 - int ret; 284 286 285 287 do { 286 - rdev = NULL; 288 + struct md_rdev *rdev = NULL; 289 + 287 290 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 288 - ret = __write_sb_page(rdev, bitmap, page); 289 - if (ret) 290 - return ret; 291 + if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { 292 + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); 293 + return; 294 + } 291 295 } 292 296 } while (wait && md_super_wait(mddev) < 0); 293 - 294 - return 0; 295 297 } 296 298 297 299 static void md_bitmap_file_kick(struct bitmap *bitmap); 298 - /* 299 - * write out a page to a file 300 - */ 301 - static void write_page(struct bitmap *bitmap, struct page *page, int wait) 300 + 301 + #ifdef CONFIG_MD_BITMAP_FILE 302 + static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) 302 303 { 303 - struct buffer_head *bh; 304 + struct buffer_head *bh = page_buffers(page); 304 305 305 - if (bitmap->storage.file == NULL) { 306 - switch (write_sb_page(bitmap, page, wait)) { 307 - case -EINVAL: 308 - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); 309 - } 310 - } else { 311 - 312 - bh = page_buffers(page); 313 - 314 - while (bh && bh->b_blocknr) { 315 - 
atomic_inc(&bitmap->pending_writes); 316 - set_buffer_locked(bh); 317 - set_buffer_mapped(bh); 318 - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); 319 - bh = bh->b_this_page; 320 - } 321 - 322 - if (wait) 323 - wait_event(bitmap->write_wait, 324 - atomic_read(&bitmap->pending_writes)==0); 306 + while (bh && bh->b_blocknr) { 307 + atomic_inc(&bitmap->pending_writes); 308 + set_buffer_locked(bh); 309 + set_buffer_mapped(bh); 310 + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); 311 + bh = bh->b_this_page; 325 312 } 326 - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 327 - md_bitmap_file_kick(bitmap); 313 + 314 + if (wait) 315 + wait_event(bitmap->write_wait, 316 + atomic_read(&bitmap->pending_writes) == 0); 328 317 } 329 318 330 319 static void end_bitmap_write(struct buffer_head *bh, int uptodate) ··· 347 364 * This usage is similar to how swap files are handled, and allows us 348 365 * to write to a file with no concerns of memory allocation failing. 349 366 */ 350 - static int read_page(struct file *file, unsigned long index, 351 - struct bitmap *bitmap, 352 - unsigned long count, 353 - struct page *page) 367 + static int read_file_page(struct file *file, unsigned long index, 368 + struct bitmap *bitmap, unsigned long count, struct page *page) 354 369 { 355 370 int ret = 0; 356 371 struct inode *inode = file_inode(file); ··· 396 415 blk_cur++; 397 416 bh = bh->b_this_page; 398 417 } 399 - page->index = index; 400 418 401 419 wait_event(bitmap->write_wait, 402 420 atomic_read(&bitmap->pending_writes)==0); ··· 409 429 ret); 410 430 return ret; 411 431 } 432 + #else /* CONFIG_MD_BITMAP_FILE */ 433 + static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) 434 + { 435 + } 436 + static int read_file_page(struct file *file, unsigned long index, 437 + struct bitmap *bitmap, unsigned long count, struct page *page) 438 + { 439 + return -EIO; 440 + } 441 + static void free_buffers(struct page *page) 442 + { 443 + put_page(page); 444 + } 445 + #endif /* 
CONFIG_MD_BITMAP_FILE */ 412 446 413 447 /* 414 448 * bitmap file superblock operations 415 449 */ 450 + 451 + /* 452 + * write out a page to a file 453 + */ 454 + static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, 455 + bool wait) 456 + { 457 + struct bitmap_storage *store = &bitmap->storage; 458 + struct page *page = store->filemap[pg_index]; 459 + 460 + if (mddev_is_clustered(bitmap->mddev)) { 461 + pg_index += bitmap->cluster_slot * 462 + DIV_ROUND_UP(store->bytes, PAGE_SIZE); 463 + } 464 + 465 + if (store->file) 466 + write_file_page(bitmap, page, wait); 467 + else 468 + write_sb_page(bitmap, pg_index, page, wait); 469 + } 416 470 417 471 /* 418 472 * md_bitmap_wait_writes() should be called before writing any bitmap ··· 502 488 sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> 503 489 bitmap_info.space); 504 490 kunmap_atomic(sb); 505 - write_page(bitmap, bitmap->storage.sb_page, 1); 491 + 492 + if (bitmap->storage.file) 493 + write_file_page(bitmap, bitmap->storage.sb_page, 1); 494 + else 495 + write_sb_page(bitmap, bitmap->storage.sb_index, 496 + bitmap->storage.sb_page, 1); 506 497 } 507 498 EXPORT_SYMBOL(md_bitmap_update_sb); 508 499 ··· 559 540 bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 560 541 if (bitmap->storage.sb_page == NULL) 561 542 return -ENOMEM; 562 - bitmap->storage.sb_page->index = 0; 543 + bitmap->storage.sb_index = 0; 563 544 564 545 sb = kmap_atomic(bitmap->storage.sb_page); 565 546 ··· 620 601 unsigned long sectors_reserved = 0; 621 602 int err = -EINVAL; 622 603 struct page *sb_page; 623 - loff_t offset = bitmap->mddev->bitmap_info.offset; 604 + loff_t offset = 0; 624 605 625 606 if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { 626 607 chunksize = 128 * 1024 * 1024; ··· 647 628 bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); 648 629 /* to 4k blocks */ 649 630 bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); 650 - offset = bitmap->mddev->bitmap_info.offset + 
(bitmap->cluster_slot * (bm_blocks << 3)); 631 + offset = bitmap->cluster_slot * (bm_blocks << 3); 651 632 pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, 652 633 bitmap->cluster_slot, offset); 653 634 } ··· 656 637 loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); 657 638 int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; 658 639 659 - err = read_page(bitmap->storage.file, 0, 640 + err = read_file_page(bitmap->storage.file, 0, 660 641 bitmap, bytes, sb_page); 661 642 } else { 662 - err = read_sb_page(bitmap->mddev, 663 - offset, 664 - sb_page, 665 - 0, sizeof(bitmap_super_t)); 643 + err = read_sb_page(bitmap->mddev, offset, sb_page, 0, 644 + sizeof(bitmap_super_t)); 666 645 } 667 646 if (err) 668 647 return err; ··· 836 819 if (store->sb_page) { 837 820 store->filemap[0] = store->sb_page; 838 821 pnum = 1; 839 - store->sb_page->index = offset; 822 + store->sb_index = offset; 840 823 } 841 824 842 825 for ( ; pnum < num_pages; pnum++) { ··· 845 828 store->file_pages = pnum; 846 829 return -ENOMEM; 847 830 } 848 - store->filemap[pnum]->index = pnum + offset; 849 831 } 850 832 store->file_pages = pnum; 851 833 ··· 863 847 864 848 static void md_bitmap_file_unmap(struct bitmap_storage *store) 865 849 { 866 - struct page **map, *sb_page; 867 - int pages; 868 - struct file *file; 869 - 870 - file = store->file; 871 - map = store->filemap; 872 - pages = store->file_pages; 873 - sb_page = store->sb_page; 850 + struct file *file = store->file; 851 + struct page *sb_page = store->sb_page; 852 + struct page **map = store->filemap; 853 + int pages = store->file_pages; 874 854 875 855 while (pages--) 876 856 if (map[pages] != sb_page) /* 0 is sb_page, release it below */ ··· 891 879 */ 892 880 static void md_bitmap_file_kick(struct bitmap *bitmap) 893 881 { 894 - char *path, *ptr = NULL; 895 - 896 882 if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { 897 883 md_bitmap_update_sb(bitmap); 898 884 899 885 if (bitmap->storage.file) { 900 - 
path = kmalloc(PAGE_SIZE, GFP_KERNEL); 901 - if (path) 902 - ptr = file_path(bitmap->storage.file, 903 - path, PAGE_SIZE); 886 + pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", 887 + bmname(bitmap), bitmap->storage.file); 904 888 905 - pr_warn("%s: kicking failed bitmap file %s from array!\n", 906 - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); 907 - 908 - kfree(path); 909 889 } else 910 890 pr_warn("%s: disabling internal bitmap due to errors\n", 911 891 bmname(bitmap)); ··· 949 945 void *kaddr; 950 946 unsigned long chunk = block >> bitmap->counts.chunkshift; 951 947 struct bitmap_storage *store = &bitmap->storage; 948 + unsigned long index = file_page_index(store, chunk); 952 949 unsigned long node_offset = 0; 953 950 954 951 if (mddev_is_clustered(bitmap->mddev)) ··· 967 962 else 968 963 set_bit_le(bit, kaddr); 969 964 kunmap_atomic(kaddr); 970 - pr_debug("set file bit %lu page %lu\n", bit, page->index); 965 + pr_debug("set file bit %lu page %lu\n", bit, index); 971 966 /* record page number so it gets flushed to disk when unplug occurs */ 972 - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); 967 + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); 973 968 } 974 969 975 970 static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) ··· 979 974 void *paddr; 980 975 unsigned long chunk = block >> bitmap->counts.chunkshift; 981 976 struct bitmap_storage *store = &bitmap->storage; 977 + unsigned long index = file_page_index(store, chunk); 982 978 unsigned long node_offset = 0; 983 979 984 980 if (mddev_is_clustered(bitmap->mddev)) ··· 995 989 else 996 990 clear_bit_le(bit, paddr); 997 991 kunmap_atomic(paddr); 998 - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { 999 - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); 992 + if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { 993 + set_page_attr(bitmap, index - node_offset, 
BITMAP_PAGE_PENDING); 1000 994 bitmap->allclean = 0; 1001 995 } 1002 996 } ··· 1048 1042 "md bitmap_unplug"); 1049 1043 } 1050 1044 clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); 1051 - write_page(bitmap, bitmap->storage.filemap[i], 0); 1045 + filemap_write_page(bitmap, i, false); 1052 1046 writing = 1; 1053 1047 } 1054 1048 } ··· 1090 1084 EXPORT_SYMBOL(md_bitmap_unplug_async); 1091 1085 1092 1086 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); 1093 - /* * bitmap_init_from_disk -- called at bitmap_create time to initialize 1094 - * the in-memory bitmap from the on-disk bitmap -- also, sets up the 1095 - * memory mapping of the bitmap file 1096 - * Special cases: 1097 - * if there's no bitmap file, or if the bitmap file had been 1098 - * previously kicked from the array, we mark all the bits as 1099 - * 1's in order to cause a full resync. 1087 + 1088 + /* 1089 + * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory 1090 + * mapping of the bitmap file. 1091 + * 1092 + * Special case: If there's no bitmap file, or if the bitmap file had been 1093 + * previously kicked from the array, we mark all the bits as 1's in order to 1094 + * cause a full resync. 1100 1095 * 1101 1096 * We ignore all bits for sectors that end earlier than 'start'. 1102 - * This is used when reading an out-of-date bitmap... 1097 + * This is used when reading an out-of-date bitmap. 
1103 1098 */ 1104 1099 static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) 1105 1100 { 1106 - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; 1107 - struct page *page = NULL; 1108 - unsigned long bit_cnt = 0; 1109 - struct file *file; 1110 - unsigned long offset; 1111 - int outofdate; 1112 - int ret = -ENOSPC; 1113 - void *paddr; 1101 + bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 1102 + struct mddev *mddev = bitmap->mddev; 1103 + unsigned long chunks = bitmap->counts.chunks; 1114 1104 struct bitmap_storage *store = &bitmap->storage; 1105 + struct file *file = store->file; 1106 + unsigned long node_offset = 0; 1107 + unsigned long bit_cnt = 0; 1108 + unsigned long i; 1109 + int ret; 1115 1110 1116 - chunks = bitmap->counts.chunks; 1117 - file = store->file; 1118 - 1119 - if (!file && !bitmap->mddev->bitmap_info.offset) { 1111 + if (!file && !mddev->bitmap_info.offset) { 1120 1112 /* No permanent bitmap - fill with '1s'. */ 1121 1113 store->filemap = NULL; 1122 1114 store->file_pages = 0; ··· 1129 1125 return 0; 1130 1126 } 1131 1127 1132 - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 1133 - if (outofdate) 1134 - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); 1135 - 1136 1128 if (file && i_size_read(file->f_mapping->host) < store->bytes) { 1137 1129 pr_warn("%s: bitmap file too short %lu < %lu\n", 1138 1130 bmname(bitmap), 1139 1131 (unsigned long) i_size_read(file->f_mapping->host), 1140 1132 store->bytes); 1133 + ret = -ENOSPC; 1141 1134 goto err; 1142 1135 } 1143 1136 1144 - oldindex = ~0L; 1145 - offset = 0; 1146 - if (!bitmap->mddev->bitmap_info.external) 1147 - offset = sizeof(bitmap_super_t); 1148 - 1149 - if (mddev_is_clustered(bitmap->mddev)) 1137 + if (mddev_is_clustered(mddev)) 1150 1138 node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); 1151 1139 1152 - for (i = 0; i < chunks; i++) { 1153 - int b; 1154 - index = 
file_page_index(&bitmap->storage, i); 1155 - bit = file_page_offset(&bitmap->storage, i); 1156 - if (index != oldindex) { /* this is a new page, read it in */ 1157 - int count; 1158 - /* unmap the old page, we're done with it */ 1159 - if (index == store->file_pages-1) 1160 - count = store->bytes - index * PAGE_SIZE; 1161 - else 1162 - count = PAGE_SIZE; 1163 - page = store->filemap[index]; 1164 - if (file) 1165 - ret = read_page(file, index, bitmap, 1166 - count, page); 1167 - else 1168 - ret = read_sb_page( 1169 - bitmap->mddev, 1170 - bitmap->mddev->bitmap_info.offset, 1171 - page, 1172 - index + node_offset, count); 1140 + for (i = 0; i < store->file_pages; i++) { 1141 + struct page *page = store->filemap[i]; 1142 + int count; 1173 1143 1174 - if (ret) 1175 - goto err; 1144 + /* unmap the old page, we're done with it */ 1145 + if (i == store->file_pages - 1) 1146 + count = store->bytes - i * PAGE_SIZE; 1147 + else 1148 + count = PAGE_SIZE; 1176 1149 1177 - oldindex = index; 1150 + if (file) 1151 + ret = read_file_page(file, i, bitmap, count, page); 1152 + else 1153 + ret = read_sb_page(mddev, 0, page, i + node_offset, 1154 + count); 1155 + if (ret) 1156 + goto err; 1157 + } 1178 1158 1179 - if (outofdate) { 1180 - /* 1181 - * if bitmap is out of date, dirty the 1182 - * whole page and write it out 1183 - */ 1184 - paddr = kmap_atomic(page); 1185 - memset(paddr + offset, 0xff, 1186 - PAGE_SIZE - offset); 1187 - kunmap_atomic(paddr); 1188 - write_page(bitmap, page, 1); 1159 + if (outofdate) { 1160 + pr_warn("%s: bitmap file is out of date, doing full recovery\n", 1161 + bmname(bitmap)); 1189 1162 1163 + for (i = 0; i < store->file_pages; i++) { 1164 + struct page *page = store->filemap[i]; 1165 + unsigned long offset = 0; 1166 + void *paddr; 1167 + 1168 + if (i == 0 && !mddev->bitmap_info.external) 1169 + offset = sizeof(bitmap_super_t); 1170 + 1171 + /* 1172 + * If the bitmap is out of date, dirty the whole page 1173 + * and write it out 1174 + */ 1175 + paddr = 
kmap_atomic(page); 1176 + memset(paddr + offset, 0xff, PAGE_SIZE - offset); 1177 + kunmap_atomic(paddr); 1178 + 1179 + filemap_write_page(bitmap, i, true); 1180 + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { 1190 1181 ret = -EIO; 1191 - if (test_bit(BITMAP_WRITE_ERROR, 1192 - &bitmap->flags)) 1193 - goto err; 1182 + goto err; 1194 1183 } 1195 1184 } 1185 + } 1186 + 1187 + for (i = 0; i < chunks; i++) { 1188 + struct page *page = filemap_get_page(&bitmap->storage, i); 1189 + unsigned long bit = file_page_offset(&bitmap->storage, i); 1190 + void *paddr; 1191 + bool was_set; 1192 + 1196 1193 paddr = kmap_atomic(page); 1197 1194 if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 1198 - b = test_bit(bit, paddr); 1195 + was_set = test_bit(bit, paddr); 1199 1196 else 1200 - b = test_bit_le(bit, paddr); 1197 + was_set = test_bit_le(bit, paddr); 1201 1198 kunmap_atomic(paddr); 1202 - if (b) { 1199 + 1200 + if (was_set) { 1203 1201 /* if the disk bit is set, set the memory bit */ 1204 1202 int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift 1205 1203 >= start); ··· 1210 1204 needed); 1211 1205 bit_cnt++; 1212 1206 } 1213 - offset = 0; 1214 1207 } 1215 1208 1216 1209 pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", ··· 1401 1396 break; 1402 1397 if (bitmap->storage.filemap && 1403 1398 test_and_clear_page_attr(bitmap, j, 1404 - BITMAP_PAGE_NEEDWRITE)) { 1405 - write_page(bitmap, bitmap->storage.filemap[j], 0); 1406 - } 1399 + BITMAP_PAGE_NEEDWRITE)) 1400 + filemap_write_page(bitmap, j, false); 1407 1401 } 1408 1402 1409 1403 done: ··· 2546 2542 if (backlog > COUNTER_MAX) 2547 2543 return -EINVAL; 2548 2544 2545 + rv = mddev_lock(mddev); 2546 + if (rv) 2547 + return rv; 2548 + 2549 2549 /* 2550 2550 * Without write mostly device, it doesn't make sense to set 2551 2551 * backlog for max_write_behind. 
··· 2563 2555 if (!has_write_mostly) { 2564 2556 pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", 2565 2557 mdname(mddev)); 2558 + mddev_unlock(mddev); 2566 2559 return -EINVAL; 2567 2560 } 2568 2561 ··· 2574 2565 mddev_destroy_serial_pool(mddev, NULL, false); 2575 2566 } else if (backlog && !mddev->serial_info_pool) { 2576 2567 /* serial_info_pool is needed since backlog is not zero */ 2577 - struct md_rdev *rdev; 2578 - 2579 2568 rdev_for_each(rdev, mddev) 2580 2569 mddev_create_serial_pool(mddev, rdev, false); 2581 2570 } 2582 2571 if (old_mwb != backlog) 2583 2572 md_bitmap_update_sb(mddev->bitmap); 2573 + 2574 + mddev_unlock(mddev); 2584 2575 return len; 2585 2576 } 2586 2577
+1
drivers/md/md-bitmap.h
··· 201 201 struct file *file; /* backing disk file */ 202 202 struct page *sb_page; /* cached copy of the bitmap 203 203 * file superblock */ 204 + unsigned long sb_index; 204 205 struct page **filemap; /* list of cache pages for 205 206 * the file */ 206 207 unsigned long *filemap_attr; /* attributes associated
+4 -4
drivers/md/md-cluster.c
··· 952 952 return 0; 953 953 err: 954 954 set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 955 - md_unregister_thread(&cinfo->recovery_thread); 956 - md_unregister_thread(&cinfo->recv_thread); 955 + md_unregister_thread(mddev, &cinfo->recovery_thread); 956 + md_unregister_thread(mddev, &cinfo->recv_thread); 957 957 lockres_free(cinfo->message_lockres); 958 958 lockres_free(cinfo->token_lockres); 959 959 lockres_free(cinfo->ack_lockres); ··· 1015 1015 resync_bitmap(mddev); 1016 1016 1017 1017 set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 1018 - md_unregister_thread(&cinfo->recovery_thread); 1019 - md_unregister_thread(&cinfo->recv_thread); 1018 + md_unregister_thread(mddev, &cinfo->recovery_thread); 1019 + md_unregister_thread(mddev, &cinfo->recv_thread); 1020 1020 lockres_free(cinfo->message_lockres); 1021 1021 lockres_free(cinfo->token_lockres); 1022 1022 lockres_free(cinfo->ack_lockres);
+2
drivers/md/md-faulty.c
··· 204 204 failit = 1; 205 205 } 206 206 } 207 + 208 + md_account_bio(mddev, &bio); 207 209 if (failit) { 208 210 struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, 209 211 &mddev->bio_set);
+1
drivers/md/md-linear.c
··· 238 238 bio = split; 239 239 } 240 240 241 + md_account_bio(mddev, &bio); 241 242 bio_set_dev(bio, tmp_dev->rdev->bdev); 242 243 bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - 243 244 start_sector + data_offset;
+1
drivers/md/md-multipath.c
··· 107 107 && md_flush_request(mddev, bio)) 108 108 return true; 109 109 110 + md_account_bio(mddev, &bio); 110 111 mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); 111 112 112 113 mp_bh->master_bio = bio;
+135 -91
drivers/md/md.c
··· 453 453 mddev->pers->prepare_suspend(mddev); 454 454 455 455 wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); 456 - mddev->pers->quiesce(mddev, 1); 457 456 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); 458 457 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); 459 458 ··· 464 465 465 466 void mddev_resume(struct mddev *mddev) 466 467 { 467 - /* entred the memalloc scope from mddev_suspend() */ 468 - memalloc_noio_restore(mddev->noio_flag); 469 468 lockdep_assert_held(&mddev->reconfig_mutex); 470 469 if (--mddev->suspended) 471 470 return; 471 + 472 + /* entred the memalloc scope from mddev_suspend() */ 473 + memalloc_noio_restore(mddev->noio_flag); 474 + 472 475 percpu_ref_resurrect(&mddev->active_io); 473 476 wake_up(&mddev->sb_wait); 474 - mddev->pers->quiesce(mddev, 0); 475 477 476 478 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 477 479 md_wakeup_thread(mddev->thread); ··· 643 643 { 644 644 mutex_init(&mddev->open_mutex); 645 645 mutex_init(&mddev->reconfig_mutex); 646 + mutex_init(&mddev->sync_mutex); 646 647 mutex_init(&mddev->bitmap_info.mutex); 647 648 INIT_LIST_HEAD(&mddev->disks); 648 649 INIT_LIST_HEAD(&mddev->all_mddevs); ··· 651 650 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 652 651 atomic_set(&mddev->active, 1); 653 652 atomic_set(&mddev->openers, 0); 653 + atomic_set(&mddev->sync_seq, 0); 654 654 spin_lock_init(&mddev->lock); 655 655 atomic_set(&mddev->flush_pending, 0); 656 656 init_waitqueue_head(&mddev->sb_wait); ··· 2306 2304 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2307 2305 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2308 2306 (mddev->level != 1 && mddev->level != 10 && 2309 - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { 2307 + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2310 2308 /* 2311 2309 * No need to handle the failure of bioset_integrity_create, 2312 2310 * because the function is called by 
md_run() -> pers->run(), ··· 4749 4747 return sprintf(page, "%s\n", type); 4750 4748 } 4751 4749 4750 + static void stop_sync_thread(struct mddev *mddev) 4751 + { 4752 + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4753 + return; 4754 + 4755 + if (mddev_lock(mddev)) 4756 + return; 4757 + 4758 + /* 4759 + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4760 + * held. 4761 + */ 4762 + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4763 + mddev_unlock(mddev); 4764 + return; 4765 + } 4766 + 4767 + if (work_pending(&mddev->del_work)) 4768 + flush_workqueue(md_misc_wq); 4769 + 4770 + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4771 + /* 4772 + * Thread might be blocked waiting for metadata update which will now 4773 + * never happen 4774 + */ 4775 + md_wakeup_thread_directly(mddev->sync_thread); 4776 + 4777 + mddev_unlock(mddev); 4778 + } 4779 + 4780 + static void idle_sync_thread(struct mddev *mddev) 4781 + { 4782 + int sync_seq = atomic_read(&mddev->sync_seq); 4783 + 4784 + mutex_lock(&mddev->sync_mutex); 4785 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4786 + stop_sync_thread(mddev); 4787 + 4788 + wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4789 + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4790 + 4791 + mutex_unlock(&mddev->sync_mutex); 4792 + } 4793 + 4794 + static void frozen_sync_thread(struct mddev *mddev) 4795 + { 4796 + mutex_lock(&mddev->sync_mutex); 4797 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4798 + stop_sync_thread(mddev); 4799 + 4800 + wait_event(resync_wait, mddev->sync_thread == NULL && 4801 + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4802 + 4803 + mutex_unlock(&mddev->sync_mutex); 4804 + } 4805 + 4752 4806 static ssize_t 4753 4807 action_store(struct mddev *mddev, const char *page, size_t len) 4754 4808 { ··· 4812 4754 return -EINVAL; 4813 4755 4814 4756 4815 - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4816 - if (cmd_match(page, "frozen")) 
4817 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4818 - else 4819 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4820 - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4821 - mddev_lock(mddev) == 0) { 4822 - if (work_pending(&mddev->del_work)) 4823 - flush_workqueue(md_misc_wq); 4824 - if (mddev->sync_thread) { 4825 - sector_t save_rp = mddev->reshape_position; 4826 - 4827 - mddev_unlock(mddev); 4828 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4829 - md_unregister_thread(&mddev->sync_thread); 4830 - mddev_lock_nointr(mddev); 4831 - /* 4832 - * set RECOVERY_INTR again and restore reshape 4833 - * position in case others changed them after 4834 - * got lock, eg, reshape_position_store and 4835 - * md_check_recovery. 4836 - */ 4837 - mddev->reshape_position = save_rp; 4838 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4839 - md_reap_sync_thread(mddev); 4840 - } 4841 - mddev_unlock(mddev); 4842 - } 4843 - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4757 + if (cmd_match(page, "idle")) 4758 + idle_sync_thread(mddev); 4759 + else if (cmd_match(page, "frozen")) 4760 + frozen_sync_thread(mddev); 4761 + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4844 4762 return -EBUSY; 4845 4763 else if (cmd_match(page, "resync")) 4846 4764 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ··· 5876 5842 goto exit_bio_set; 5877 5843 } 5878 5844 5845 + if (!bioset_initialized(&mddev->io_clone_set)) { 5846 + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5847 + offsetof(struct md_io_clone, bio_clone), 0); 5848 + if (err) 5849 + goto exit_sync_set; 5850 + } 5851 + 5879 5852 spin_lock(&pers_lock); 5880 5853 pers = find_pers(mddev->level, mddev->clevel); 5881 5854 if (!pers || !try_module_get(pers->owner)) { ··· 6060 6019 module_put(pers->owner); 6061 6020 md_bitmap_destroy(mddev); 6062 6021 abort: 6022 + bioset_exit(&mddev->io_clone_set); 6023 + exit_sync_set: 6063 6024 bioset_exit(&mddev->sync_set); 6064 6025 exit_bio_set: 6065 6026 
bioset_exit(&mddev->bio_set); ··· 6219 6176 flush_workqueue(md_misc_wq); 6220 6177 if (mddev->sync_thread) { 6221 6178 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6222 - md_unregister_thread(&mddev->sync_thread); 6223 6179 md_reap_sync_thread(mddev); 6224 6180 } 6225 6181 ··· 6258 6216 mddev->pers->quiesce(mddev, 1); 6259 6217 mddev->pers->quiesce(mddev, 0); 6260 6218 } 6261 - md_unregister_thread(&mddev->thread); 6219 + md_unregister_thread(mddev, &mddev->thread); 6262 6220 if (mddev->queue) 6263 6221 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6264 6222 } ··· 6285 6243 percpu_ref_exit(&mddev->active_io); 6286 6244 bioset_exit(&mddev->bio_set); 6287 6245 bioset_exit(&mddev->sync_set); 6246 + bioset_exit(&mddev->io_clone_set); 6288 6247 } 6289 6248 6290 6249 void md_stop(struct mddev *mddev) ··· 7055 7012 7056 7013 if (mddev->bitmap || mddev->bitmap_info.file) 7057 7014 return -EEXIST; /* cannot add when bitmap is present */ 7015 + 7016 + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7017 + pr_warn("%s: bitmap files not supported by this kernel\n", 7018 + mdname(mddev)); 7019 + return -EINVAL; 7020 + } 7021 + pr_warn("%s: using deprecated bitmap file support\n", 7022 + mdname(mddev)); 7023 + 7058 7024 f = fget(fd); 7059 7025 7060 7026 if (f == NULL) { ··· 7992 7940 } 7993 7941 EXPORT_SYMBOL(md_register_thread); 7994 7942 7995 - void md_unregister_thread(struct md_thread __rcu **threadp) 7943 + void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 7996 7944 { 7997 - struct md_thread *thread = rcu_dereference_protected(*threadp, true); 7945 + struct md_thread *thread = rcu_dereference_protected(*threadp, 7946 + lockdep_is_held(&mddev->reconfig_mutex)); 7998 7947 7999 7948 if (!thread) 8000 7949 return; ··· 8654 8601 } 8655 8602 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8656 8603 8657 - int acct_bioset_init(struct mddev *mddev) 8604 + static void md_end_clone_io(struct bio *bio) 8658 8605 { 8659 - int err = 0; 8660 - 8661 
- if (!bioset_initialized(&mddev->io_acct_set)) 8662 - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, 8663 - offsetof(struct md_io_acct, bio_clone), 0); 8664 - return err; 8665 - } 8666 - EXPORT_SYMBOL_GPL(acct_bioset_init); 8667 - 8668 - void acct_bioset_exit(struct mddev *mddev) 8669 - { 8670 - bioset_exit(&mddev->io_acct_set); 8671 - } 8672 - EXPORT_SYMBOL_GPL(acct_bioset_exit); 8673 - 8674 - static void md_end_io_acct(struct bio *bio) 8675 - { 8676 - struct md_io_acct *md_io_acct = bio->bi_private; 8677 - struct bio *orig_bio = md_io_acct->orig_bio; 8678 - struct mddev *mddev = md_io_acct->mddev; 8606 + struct md_io_clone *md_io_clone = bio->bi_private; 8607 + struct bio *orig_bio = md_io_clone->orig_bio; 8608 + struct mddev *mddev = md_io_clone->mddev; 8679 8609 8680 8610 orig_bio->bi_status = bio->bi_status; 8681 8611 8682 - bio_end_io_acct(orig_bio, md_io_acct->start_time); 8612 + if (md_io_clone->start_time) 8613 + bio_end_io_acct(orig_bio, md_io_clone->start_time); 8614 + 8683 8615 bio_put(bio); 8684 8616 bio_endio(orig_bio); 8685 - 8686 8617 percpu_ref_put(&mddev->active_io); 8687 8618 } 8688 8619 8689 - /* 8690 - * Used by personalities that don't already clone the bio and thus can't 8691 - * easily add the timestamp to their extended bio structure. 
8692 - */ 8693 - void md_account_bio(struct mddev *mddev, struct bio **bio) 8620 + static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8694 8621 { 8695 8622 struct block_device *bdev = (*bio)->bi_bdev; 8696 - struct md_io_acct *md_io_acct; 8697 - struct bio *clone; 8623 + struct md_io_clone *md_io_clone; 8624 + struct bio *clone = 8625 + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8698 8626 8699 - if (!blk_queue_io_stat(bdev->bd_disk->queue)) 8700 - return; 8627 + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8628 + md_io_clone->orig_bio = *bio; 8629 + md_io_clone->mddev = mddev; 8630 + if (blk_queue_io_stat(bdev->bd_disk->queue)) 8631 + md_io_clone->start_time = bio_start_io_acct(*bio); 8701 8632 8702 - percpu_ref_get(&mddev->active_io); 8703 - 8704 - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); 8705 - md_io_acct = container_of(clone, struct md_io_acct, bio_clone); 8706 - md_io_acct->orig_bio = *bio; 8707 - md_io_acct->start_time = bio_start_io_acct(*bio); 8708 - md_io_acct->mddev = mddev; 8709 - 8710 - clone->bi_end_io = md_end_io_acct; 8711 - clone->bi_private = md_io_acct; 8633 + clone->bi_end_io = md_end_clone_io; 8634 + clone->bi_private = md_io_clone; 8712 8635 *bio = clone; 8636 + } 8637 + 8638 + void md_account_bio(struct mddev *mddev, struct bio **bio) 8639 + { 8640 + percpu_ref_get(&mddev->active_io); 8641 + md_clone_bio(mddev, bio); 8713 8642 } 8714 8643 EXPORT_SYMBOL_GPL(md_account_bio); 8715 8644 ··· 9364 9329 * ->spare_active and clear saved_raid_disk 9365 9330 */ 9366 9331 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9367 - md_unregister_thread(&mddev->sync_thread); 9368 9332 md_reap_sync_thread(mddev); 9369 9333 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9370 9334 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ··· 9392 9358 if (mddev->sb_flags) 9393 9359 md_update_sb(mddev, 0); 9394 9360 9395 - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 9396 - 
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9397 - /* resync/recovery still happening */ 9398 - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9399 - goto unlock; 9400 - } 9401 - if (mddev->sync_thread) { 9402 - md_unregister_thread(&mddev->sync_thread); 9361 + /* 9362 + * Never start a new sync thread if MD_RECOVERY_RUNNING is 9363 + * still set. 9364 + */ 9365 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9366 + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9367 + /* resync/recovery still happening */ 9368 + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9369 + goto unlock; 9370 + } 9371 + 9372 + if (WARN_ON_ONCE(!mddev->sync_thread)) 9373 + goto unlock; 9374 + 9403 9375 md_reap_sync_thread(mddev); 9404 9376 goto unlock; 9405 9377 } 9378 + 9406 9379 /* Set RUNNING before clearing NEEDED to avoid 9407 9380 * any transients in the value of "sync_action". 9408 9381 */ ··· 9486 9445 sector_t old_dev_sectors = mddev->dev_sectors; 9487 9446 bool is_reshaped = false; 9488 9447 9489 - /* sync_thread should be unregistered, collect result */ 9448 + /* resync has finished, collect result */ 9449 + md_unregister_thread(mddev, &mddev->sync_thread); 9450 + atomic_inc(&mddev->sync_seq); 9451 + 9490 9452 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9491 9453 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9492 9454 mddev->degraded != mddev->raid_disks) { ··· 9534 9490 if (mddev_is_clustered(mddev) && is_reshaped 9535 9491 && !test_bit(MD_CLOSING, &mddev->flags)) 9536 9492 md_cluster_ops->update_size(mddev, old_dev_sectors); 9537 - wake_up(&resync_wait); 9538 9493 /* flag recovery needed just to double check */ 9539 9494 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9540 9495 sysfs_notify_dirent_safe(mddev->sysfs_completed); ··· 9541 9498 md_new_event(); 9542 9499 if (mddev->event_work.func) 9543 9500 queue_work(md_misc_wq, &mddev->event_work); 9501 + wake_up(&resync_wait); 9544 9502 } 9545 9503 EXPORT_SYMBOL(md_reap_sync_thread); 9546 
9504
+8 -5
drivers/md/md.h
··· 510 510 struct bio_set sync_set; /* for sync operations like 511 511 * metadata and bitmap writes 512 512 */ 513 - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ 513 + struct bio_set io_clone_set; 514 514 515 515 /* Generic flush handling. 516 516 * The last to finish preflush schedules a worker to submit ··· 534 534 * reconfig_mutex is unlocked, protected by reconfig_mutex. 535 535 */ 536 536 struct list_head deleting; 537 + 538 + /* Used to synchronize idle and frozen for action_store() */ 539 + struct mutex sync_mutex; 540 + /* The sequence number for sync thread */ 541 + atomic_t sync_seq; 537 542 538 543 bool has_superblocks:1; 539 544 bool fail_last_dev:1; ··· 736 731 void *private; 737 732 }; 738 733 739 - struct md_io_acct { 734 + struct md_io_clone { 740 735 struct mddev *mddev; 741 736 struct bio *orig_bio; 742 737 unsigned long start_time; ··· 761 756 void (*run)(struct md_thread *thread), 762 757 struct mddev *mddev, 763 758 const char *name); 764 - extern void md_unregister_thread(struct md_thread __rcu **threadp); 759 + extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); 765 760 extern void md_wakeup_thread(struct md_thread __rcu *thread); 766 761 extern void md_check_recovery(struct mddev *mddev); 767 762 extern void md_reap_sync_thread(struct mddev *mddev); ··· 774 769 extern void md_finish_reshape(struct mddev *mddev); 775 770 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 776 771 struct bio *bio, sector_t start, sector_t size); 777 - int acct_bioset_init(struct mddev *mddev); 778 - void acct_bioset_exit(struct mddev *mddev); 779 772 void md_account_bio(struct mddev *mddev, struct bio **bio); 780 773 781 774 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
+44 -56
drivers/md/raid0.c
··· 377 377 struct r0conf *conf = priv; 378 378 379 379 free_conf(mddev, conf); 380 - acct_bioset_exit(mddev); 381 380 } 382 381 383 382 static int raid0_run(struct mddev *mddev) ··· 391 392 if (md_check_no_bitmap(mddev)) 392 393 return -EINVAL; 393 394 394 - if (acct_bioset_init(mddev)) { 395 - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); 396 - return -ENOMEM; 397 - } 398 - 399 395 /* if private is not null, we are here after takeover */ 400 396 if (mddev->private == NULL) { 401 397 ret = create_strip_zones(mddev, &conf); 402 398 if (ret < 0) 403 - goto exit_acct_set; 399 + return ret; 404 400 mddev->private = conf; 405 401 } 406 402 conf = mddev->private; ··· 426 432 427 433 ret = md_integrity_register(mddev); 428 434 if (ret) 429 - goto free; 435 + free_conf(mddev, conf); 430 436 431 - return ret; 432 - 433 - free: 434 - free_conf(mddev, conf); 435 - exit_acct_set: 436 - acct_bioset_exit(mddev); 437 437 return ret; 438 438 } 439 439 ··· 545 557 bio_endio(bio); 546 558 } 547 559 548 - static bool raid0_make_request(struct mddev *mddev, struct bio *bio) 560 + static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) 549 561 { 550 562 struct r0conf *conf = mddev->private; 551 563 struct strip_zone *zone; 552 564 struct md_rdev *tmp_dev; 553 - sector_t bio_sector; 565 + sector_t bio_sector = bio->bi_iter.bi_sector; 566 + sector_t sector = bio_sector; 567 + 568 + md_account_bio(mddev, &bio); 569 + 570 + zone = find_zone(mddev->private, &sector); 571 + switch (conf->layout) { 572 + case RAID0_ORIG_LAYOUT: 573 + tmp_dev = map_sector(mddev, zone, bio_sector, &sector); 574 + break; 575 + case RAID0_ALT_MULTIZONE_LAYOUT: 576 + tmp_dev = map_sector(mddev, zone, sector, &sector); 577 + break; 578 + default: 579 + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); 580 + bio_io_error(bio); 581 + return; 582 + } 583 + 584 + if (unlikely(is_rdev_broken(tmp_dev))) { 585 + bio_io_error(bio); 586 + md_error(mddev, tmp_dev); 587 + return; 
588 + } 589 + 590 + bio_set_dev(bio, tmp_dev->bdev); 591 + bio->bi_iter.bi_sector = sector + zone->dev_start + 592 + tmp_dev->data_offset; 593 + 594 + if (mddev->gendisk) 595 + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), 596 + bio_sector); 597 + mddev_check_write_zeroes(mddev, bio); 598 + submit_bio_noacct(bio); 599 + } 600 + 601 + static bool raid0_make_request(struct mddev *mddev, struct bio *bio) 602 + { 554 603 sector_t sector; 555 - sector_t orig_sector; 556 604 unsigned chunk_sects; 557 605 unsigned sectors; 558 606 ··· 601 577 return true; 602 578 } 603 579 604 - bio_sector = bio->bi_iter.bi_sector; 605 - sector = bio_sector; 580 + sector = bio->bi_iter.bi_sector; 606 581 chunk_sects = mddev->chunk_sectors; 607 582 608 583 sectors = chunk_sects - ··· 609 586 ? (sector & (chunk_sects-1)) 610 587 : sector_div(sector, chunk_sects)); 611 588 612 - /* Restore due to sector_div */ 613 - sector = bio_sector; 614 - 615 589 if (sectors < bio_sectors(bio)) { 616 590 struct bio *split = bio_split(bio, sectors, GFP_NOIO, 617 591 &mddev->bio_set); 618 592 bio_chain(split, bio); 619 - submit_bio_noacct(bio); 593 + raid0_map_submit_bio(mddev, bio); 620 594 bio = split; 621 595 } 622 596 623 - if (bio->bi_pool != &mddev->bio_set) 624 - md_account_bio(mddev, &bio); 625 - 626 - orig_sector = sector; 627 - zone = find_zone(mddev->private, &sector); 628 - switch (conf->layout) { 629 - case RAID0_ORIG_LAYOUT: 630 - tmp_dev = map_sector(mddev, zone, orig_sector, &sector); 631 - break; 632 - case RAID0_ALT_MULTIZONE_LAYOUT: 633 - tmp_dev = map_sector(mddev, zone, sector, &sector); 634 - break; 635 - default: 636 - WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); 637 - bio_io_error(bio); 638 - return true; 639 - } 640 - 641 - if (unlikely(is_rdev_broken(tmp_dev))) { 642 - bio_io_error(bio); 643 - md_error(mddev, tmp_dev); 644 - return true; 645 - } 646 - 647 - bio_set_dev(bio, tmp_dev->bdev); 648 - bio->bi_iter.bi_sector = sector + zone->dev_start + 649 - 
tmp_dev->data_offset; 650 - 651 - if (mddev->gendisk) 652 - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), 653 - bio_sector); 654 - mddev_check_write_zeroes(mddev, bio); 655 - submit_bio_noacct(bio); 597 + raid0_map_submit_bio(mddev, bio); 656 598 return true; 657 599 } 658 600
+50 -36
drivers/md/raid1.c
··· 304 304 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) 305 305 bio->bi_status = BLK_STS_IOERR; 306 306 307 - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 308 - bio_end_io_acct(bio, r1_bio->start_time); 309 307 bio_endio(bio); 310 308 } 311 309 ··· 311 313 { 312 314 struct bio *bio = r1_bio->master_bio; 313 315 struct r1conf *conf = r1_bio->mddev->private; 316 + sector_t sector = r1_bio->sector; 314 317 315 318 /* if nobody has done the final endio yet, do it now */ 316 319 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { ··· 322 323 323 324 call_bio_endio(r1_bio); 324 325 } 326 + 327 + free_r1bio(r1_bio); 325 328 /* 326 329 * Wake up any possible resync thread that waits for the device 327 330 * to go idle. All I/Os, even write-behind writes, are done. 328 331 */ 329 - allow_barrier(conf, r1_bio->sector); 330 - 331 - free_r1bio(r1_bio); 332 + allow_barrier(conf, sector); 332 333 } 333 334 334 335 /* ··· 790 791 return best_disk; 791 792 } 792 793 794 + static void wake_up_barrier(struct r1conf *conf) 795 + { 796 + if (wq_has_sleeper(&conf->wait_barrier)) 797 + wake_up(&conf->wait_barrier); 798 + } 799 + 793 800 static void flush_bio_list(struct r1conf *conf, struct bio *bio) 794 801 { 795 802 /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 796 803 raid1_prepare_flush_writes(conf->mddev->bitmap); 797 - wake_up(&conf->wait_barrier); 804 + wake_up_barrier(conf); 798 805 799 806 while (bio) { /* submit pending writes */ 800 807 struct bio *next = bio->bi_next; ··· 977 972 * In case freeze_array() is waiting for 978 973 * get_unqueued_pending() == extra 979 974 */ 980 - wake_up(&conf->wait_barrier); 975 + wake_up_barrier(conf); 981 976 /* Wait for the barrier in same barrier unit bucket to drop. 
*/ 982 977 983 978 /* Return false when nowait flag is set */ ··· 1020 1015 * In case freeze_array() is waiting for 1021 1016 * get_unqueued_pending() == extra 1022 1017 */ 1023 - wake_up(&conf->wait_barrier); 1018 + wake_up_barrier(conf); 1024 1019 /* Wait for array to be unfrozen */ 1025 1020 1026 1021 /* Return false when nowait flag is set */ ··· 1049 1044 static void _allow_barrier(struct r1conf *conf, int idx) 1050 1045 { 1051 1046 atomic_dec(&conf->nr_pending[idx]); 1052 - wake_up(&conf->wait_barrier); 1047 + wake_up_barrier(conf); 1053 1048 } 1054 1049 1055 1050 static void allow_barrier(struct r1conf *conf, sector_t sector_nr) ··· 1178 1173 spin_lock_irq(&conf->device_lock); 1179 1174 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1180 1175 spin_unlock_irq(&conf->device_lock); 1181 - wake_up(&conf->wait_barrier); 1176 + wake_up_barrier(conf); 1182 1177 md_wakeup_thread(mddev->thread); 1183 1178 kfree(plug); 1184 1179 return; ··· 1308 1303 } 1309 1304 1310 1305 r1_bio->read_disk = rdisk; 1311 - 1312 - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1313 - r1_bio->start_time = bio_start_io_acct(bio); 1314 - 1306 + if (!r1bio_existed) { 1307 + md_account_bio(mddev, &bio); 1308 + r1_bio->master_bio = bio; 1309 + } 1315 1310 read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, 1316 1311 &mddev->bio_set); 1317 1312 ··· 1378 1373 return; 1379 1374 } 1380 1375 1376 + retry_write: 1381 1377 r1_bio = alloc_r1bio(mddev, bio); 1382 1378 r1_bio->sectors = max_write_sectors; 1383 1379 ··· 1394 1388 */ 1395 1389 1396 1390 disks = conf->raid_disks * 2; 1397 - retry_write: 1398 1391 blocked_rdev = NULL; 1399 1392 rcu_read_lock(); 1400 1393 max_sectors = r1_bio->sectors; ··· 1473 1468 for (j = 0; j < i; j++) 1474 1469 if (r1_bio->bios[j]) 1475 1470 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1476 - r1_bio->state = 0; 1471 + free_r1bio(r1_bio); 1477 1472 allow_barrier(conf, bio->bi_iter.bi_sector); 1478 1473 1479 1474 if (bio->bi_opf 
& REQ_NOWAIT) { ··· 1505 1500 r1_bio->sectors = max_sectors; 1506 1501 } 1507 1502 1508 - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1509 - r1_bio->start_time = bio_start_io_acct(bio); 1503 + md_account_bio(mddev, &bio); 1504 + r1_bio->master_bio = bio; 1510 1505 atomic_set(&r1_bio->remaining, 1); 1511 1506 atomic_set(&r1_bio->behind_remaining, 0); 1512 1507 ··· 1523 1518 * Not if there are too many, or cannot 1524 1519 * allocate memory, or a reader on WriteMostly 1525 1520 * is waiting for behind writes to flush */ 1526 - if (bitmap && 1527 - test_bit(WriteMostly, &rdev->flags) && 1521 + if (bitmap && write_behind && 1528 1522 (atomic_read(&bitmap->behind_writes) 1529 1523 < mddev->bitmap_info.max_write_behind) && 1530 1524 !waitqueue_active(&bitmap->behind_wait)) { ··· 1580 1576 r1_bio_write_done(r1_bio); 1581 1577 1582 1578 /* In case raid1d snuck in to freeze_array */ 1583 - wake_up(&conf->wait_barrier); 1579 + wake_up_barrier(conf); 1584 1580 } 1585 1581 1586 1582 static bool raid1_make_request(struct mddev *mddev, struct bio *bio) ··· 1770 1766 { 1771 1767 struct r1conf *conf = mddev->private; 1772 1768 int err = -EEXIST; 1773 - int mirror = 0; 1769 + int mirror = 0, repl_slot = -1; 1774 1770 struct raid1_info *p; 1775 1771 int first = 0; 1776 1772 int last = conf->raid_disks - 1; ··· 1813 1809 break; 1814 1810 } 1815 1811 if (test_bit(WantReplacement, &p->rdev->flags) && 1816 - p[conf->raid_disks].rdev == NULL) { 1817 - /* Add this device as a replacement */ 1818 - clear_bit(In_sync, &rdev->flags); 1819 - set_bit(Replacement, &rdev->flags); 1820 - rdev->raid_disk = mirror; 1821 - err = 0; 1822 - conf->fullsync = 1; 1823 - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); 1824 - break; 1825 - } 1812 + p[conf->raid_disks].rdev == NULL && repl_slot < 0) 1813 + repl_slot = mirror; 1826 1814 } 1815 + 1816 + if (err && repl_slot >= 0) { 1817 + /* Add this device as a replacement */ 1818 + p = conf->mirrors + repl_slot; 1819 + clear_bit(In_sync, 
&rdev->flags); 1820 + set_bit(Replacement, &rdev->flags); 1821 + rdev->raid_disk = repl_slot; 1822 + err = 0; 1823 + conf->fullsync = 1; 1824 + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); 1825 + } 1826 + 1827 1827 print_conf(conf); 1828 1828 return err; 1829 1829 } ··· 1837 1829 struct r1conf *conf = mddev->private; 1838 1830 int err = 0; 1839 1831 int number = rdev->raid_disk; 1832 + 1833 + if (unlikely(number >= conf->raid_disks)) 1834 + goto abort; 1835 + 1840 1836 struct raid1_info *p = conf->mirrors + number; 1841 1837 1842 1838 if (rdev != p->rdev) ··· 2311 2299 d++; 2312 2300 if (d == conf->raid_disks * 2) 2313 2301 d = 0; 2314 - } while (!success && d != read_disk); 2302 + } while (d != read_disk); 2315 2303 2316 2304 if (!success) { 2317 2305 /* Cannot read from anywhere - mark it bad */ ··· 2510 2498 struct mddev *mddev = conf->mddev; 2511 2499 struct bio *bio; 2512 2500 struct md_rdev *rdev; 2501 + sector_t sector; 2513 2502 2514 2503 clear_bit(R1BIO_ReadError, &r1_bio->state); 2515 2504 /* we got a read error. Maybe the drive is bad. 
Maybe just ··· 2540 2527 } 2541 2528 2542 2529 rdev_dec_pending(rdev, conf->mddev); 2543 - allow_barrier(conf, r1_bio->sector); 2530 + sector = r1_bio->sector; 2544 2531 bio = r1_bio->master_bio; 2545 2532 2546 2533 /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ 2547 2534 r1_bio->state = 0; 2548 2535 raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); 2536 + allow_barrier(conf, sector); 2549 2537 } 2550 2538 2551 2539 static void raid1d(struct md_thread *thread) ··· 3158 3144 * RAID1 needs at least one disk in active 3159 3145 */ 3160 3146 if (conf->raid_disks - mddev->degraded < 1) { 3161 - md_unregister_thread(&conf->thread); 3147 + md_unregister_thread(mddev, &conf->thread); 3162 3148 ret = -EINVAL; 3163 3149 goto abort; 3164 3150 } ··· 3185 3171 3186 3172 ret = md_integrity_register(mddev); 3187 3173 if (ret) { 3188 - md_unregister_thread(&mddev->thread); 3174 + md_unregister_thread(mddev, &mddev->thread); 3189 3175 goto abort; 3190 3176 } 3191 3177 return 0;
-1
drivers/md/raid1.h
··· 157 157 sector_t sector; 158 158 int sectors; 159 159 unsigned long state; 160 - unsigned long start_time; 161 160 struct mddev *mddev; 162 161 /* 163 162 * original bio going to /dev/mdx
+45 -40
drivers/md/raid10.c
··· 325 325 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 326 326 bio->bi_status = BLK_STS_IOERR; 327 327 328 - if (r10_bio->start_time) 329 - bio_end_io_acct(bio, r10_bio->start_time); 330 328 bio_endio(bio); 331 329 /* 332 330 * Wake up any possible resync thread that waits for the device ··· 1170 1172 } 1171 1173 1172 1174 static void raid10_read_request(struct mddev *mddev, struct bio *bio, 1173 - struct r10bio *r10_bio) 1175 + struct r10bio *r10_bio, bool io_accounting) 1174 1176 { 1175 1177 struct r10conf *conf = mddev->private; 1176 1178 struct bio *read_bio; ··· 1241 1243 } 1242 1244 slot = r10_bio->read_slot; 1243 1245 1244 - if (!r10_bio->start_time && 1245 - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1246 - r10_bio->start_time = bio_start_io_acct(bio); 1246 + if (io_accounting) { 1247 + md_account_bio(mddev, &bio); 1248 + r10_bio->master_bio = bio; 1249 + } 1247 1250 read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 1248 1251 1249 1252 r10_bio->devs[slot].bio = read_bio; ··· 1321 1322 } 1322 1323 } 1323 1324 1325 + static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, 1326 + struct md_rdev **prrdev) 1327 + { 1328 + struct md_rdev *rdev, *rrdev; 1329 + 1330 + rrdev = rcu_dereference(mirror->replacement); 1331 + /* 1332 + * Read replacement first to prevent reading both rdev and 1333 + * replacement as NULL during replacement replace rdev. 
1334 + */ 1335 + smp_mb(); 1336 + rdev = rcu_dereference(mirror->rdev); 1337 + if (rdev == rrdev) 1338 + rrdev = NULL; 1339 + 1340 + *prrdev = rrdev; 1341 + return rdev; 1342 + } 1343 + 1324 1344 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 1325 1345 { 1326 1346 int i; ··· 1350 1332 blocked_rdev = NULL; 1351 1333 rcu_read_lock(); 1352 1334 for (i = 0; i < conf->copies; i++) { 1353 - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 1354 - struct md_rdev *rrdev = rcu_dereference( 1355 - conf->mirrors[i].replacement); 1356 - if (rdev == rrdev) 1357 - rrdev = NULL; 1335 + struct md_rdev *rdev, *rrdev; 1336 + 1337 + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); 1358 1338 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1359 1339 atomic_inc(&rdev->nr_pending); 1360 1340 blocked_rdev = rdev; ··· 1481 1465 int d = r10_bio->devs[i].devnum; 1482 1466 struct md_rdev *rdev, *rrdev; 1483 1467 1484 - rrdev = rcu_dereference(conf->mirrors[d].replacement); 1485 - /* 1486 - * Read replacement first to prevent reading both rdev and 1487 - * replacement as NULL during replacement replace rdev. 
1488 - */ 1489 - smp_mb(); 1490 - rdev = rcu_dereference(conf->mirrors[d].rdev); 1491 - if (rdev == rrdev) 1492 - rrdev = NULL; 1468 + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); 1493 1469 if (rdev && (test_bit(Faulty, &rdev->flags))) 1494 1470 rdev = NULL; 1495 1471 if (rrdev && (test_bit(Faulty, &rrdev->flags))) ··· 1551 1543 r10_bio->master_bio = bio; 1552 1544 } 1553 1545 1554 - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 1555 - r10_bio->start_time = bio_start_io_acct(bio); 1546 + md_account_bio(mddev, &bio); 1547 + r10_bio->master_bio = bio; 1556 1548 atomic_set(&r10_bio->remaining, 1); 1557 1549 md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1558 1550 ··· 1579 1571 r10_bio->sector = bio->bi_iter.bi_sector; 1580 1572 r10_bio->state = 0; 1581 1573 r10_bio->read_slot = -1; 1582 - r10_bio->start_time = 0; 1583 1574 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 1584 1575 conf->geo.raid_disks); 1585 1576 1586 1577 if (bio_data_dir(bio) == READ) 1587 - raid10_read_request(mddev, bio, r10_bio); 1578 + raid10_read_request(mddev, bio, r10_bio, true); 1588 1579 else 1589 1580 raid10_write_request(mddev, bio, r10_bio); 1590 1581 } ··· 1787 1780 */ 1788 1781 rcu_read_lock(); 1789 1782 for (disk = 0; disk < geo->raid_disks; disk++) { 1790 - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); 1791 - struct md_rdev *rrdev = rcu_dereference( 1792 - conf->mirrors[disk].replacement); 1783 + struct md_rdev *rdev, *rrdev; 1793 1784 1785 + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); 1794 1786 r10_bio->devs[disk].bio = NULL; 1795 1787 r10_bio->devs[disk].repl_bio = NULL; 1796 1788 ··· 2726 2720 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2727 2721 { 2728 2722 int sect = 0; /* Offset from r10_bio->sector */ 2729 - int sectors = r10_bio->sectors; 2723 + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; 2730 2724 struct md_rdev 
*rdev; 2731 2725 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2732 - int d = r10_bio->devs[r10_bio->read_slot].devnum; 2726 + int d = r10_bio->devs[slot].devnum; 2733 2727 2734 2728 /* still own a reference to this rdev, so it cannot 2735 2729 * have been cleared recently. ··· 2750 2744 pr_notice("md/raid10:%s: %pg: Failing raid device\n", 2751 2745 mdname(mddev), rdev->bdev); 2752 2746 md_error(mddev, rdev); 2753 - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2747 + r10_bio->devs[slot].bio = IO_BLOCKED; 2754 2748 return; 2755 2749 } 2756 2750 2757 2751 while(sectors) { 2758 2752 int s = sectors; 2759 - int sl = r10_bio->read_slot; 2753 + int sl = slot; 2760 2754 int success = 0; 2761 2755 int start; 2762 2756 ··· 2791 2785 sl++; 2792 2786 if (sl == conf->copies) 2793 2787 sl = 0; 2794 - } while (!success && sl != r10_bio->read_slot); 2788 + } while (sl != slot); 2795 2789 rcu_read_unlock(); 2796 2790 2797 2791 if (!success) { ··· 2799 2793 * as bad on the first device to discourage future 2800 2794 * reads. 
2801 2795 */ 2802 - int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2796 + int dn = r10_bio->devs[slot].devnum; 2803 2797 rdev = conf->mirrors[dn].rdev; 2804 2798 2805 2799 if (!rdev_set_badblocks( 2806 2800 rdev, 2807 - r10_bio->devs[r10_bio->read_slot].addr 2801 + r10_bio->devs[slot].addr 2808 2802 + sect, 2809 2803 s, 0)) { 2810 2804 md_error(mddev, rdev); 2811 - r10_bio->devs[r10_bio->read_slot].bio 2805 + r10_bio->devs[slot].bio 2812 2806 = IO_BLOCKED; 2813 2807 } 2814 2808 break; ··· 2817 2811 start = sl; 2818 2812 /* write it back and re-read */ 2819 2813 rcu_read_lock(); 2820 - while (sl != r10_bio->read_slot) { 2814 + while (sl != slot) { 2821 2815 if (sl==0) 2822 2816 sl = conf->copies; 2823 2817 sl--; ··· 2851 2845 rcu_read_lock(); 2852 2846 } 2853 2847 sl = start; 2854 - while (sl != r10_bio->read_slot) { 2848 + while (sl != slot) { 2855 2849 if (sl==0) 2856 2850 sl = conf->copies; 2857 2851 sl--; ··· 2991 2985 2992 2986 rdev_dec_pending(rdev, mddev); 2993 2987 r10_bio->state = 0; 2994 - raid10_read_request(mddev, r10_bio->master_bio, r10_bio); 2988 + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); 2995 2989 /* 2996 2990 * allow_barrier after re-submit to ensure no sync io 2997 2991 * can be issued while regular io pending. ··· 4320 4314 return 0; 4321 4315 4322 4316 out_free_conf: 4323 - md_unregister_thread(&mddev->thread); 4317 + md_unregister_thread(mddev, &mddev->thread); 4324 4318 raid10_free_conf(conf); 4325 4319 mddev->private = NULL; 4326 4320 out: ··· 4417 4411 rdev->new_raid_disk = rdev->raid_disk * 2; 4418 4412 rdev->sectors = size; 4419 4413 } 4420 - WRITE_ONCE(conf->barrier, 1); 4421 4414 } 4422 4415 4423 4416 return conf;
-1
drivers/md/raid10.h
··· 123 123 sector_t sector; /* virtual sector number */ 124 124 int sectors; 125 125 unsigned long state; 126 - unsigned long start_time; 127 126 struct mddev *mddev; 128 127 /* 129 128 * original bio going to /dev/mdx
+8 -6
drivers/md/raid5-cache.c
··· 1260 1260 1261 1261 if (bio->bi_status) 1262 1262 md_error(log->rdev->mddev, log->rdev); 1263 + bio_uninit(bio); 1263 1264 1264 1265 spin_lock_irqsave(&log->io_list_lock, flags); 1265 1266 list_for_each_entry(io, &log->flushing_ios, log_sibling) 1266 1267 r5l_io_run_stripes(io); 1267 1268 list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1268 1269 spin_unlock_irqrestore(&log->io_list_lock, flags); 1269 - 1270 - bio_uninit(bio); 1271 1270 } 1272 1271 1273 1272 /* ··· 3167 3168 { 3168 3169 struct r5l_log *log = conf->log; 3169 3170 3170 - /* Ensure disable_writeback_work wakes up and exits */ 3171 + md_unregister_thread(conf->mddev, &log->reclaim_thread); 3172 + 3173 + /* 3174 + * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to 3175 + * ensure disable_writeback_work wakes up and exits. 3176 + */ 3177 + conf->log = NULL; 3171 3178 wake_up(&conf->mddev->sb_wait); 3172 3179 flush_work(&log->disable_writeback_work); 3173 - md_unregister_thread(&log->reclaim_thread); 3174 - 3175 - conf->log = NULL; 3176 3180 3177 3181 mempool_exit(&log->meta_pool); 3178 3182 bioset_exit(&log->bs);
+20 -52
drivers/md/raid5.c
··· 5468 5468 */ 5469 5469 static void raid5_align_endio(struct bio *bi) 5470 5470 { 5471 - struct md_io_acct *md_io_acct = bi->bi_private; 5472 - struct bio *raid_bi = md_io_acct->orig_bio; 5473 - struct mddev *mddev; 5474 - struct r5conf *conf; 5475 - struct md_rdev *rdev; 5471 + struct bio *raid_bi = bi->bi_private; 5472 + struct md_rdev *rdev = (void *)raid_bi->bi_next; 5473 + struct mddev *mddev = rdev->mddev; 5474 + struct r5conf *conf = mddev->private; 5476 5475 blk_status_t error = bi->bi_status; 5477 - unsigned long start_time = md_io_acct->start_time; 5478 5476 5479 5477 bio_put(bi); 5480 - 5481 - rdev = (void*)raid_bi->bi_next; 5482 5478 raid_bi->bi_next = NULL; 5483 - mddev = rdev->mddev; 5484 - conf = mddev->private; 5485 - 5486 5479 rdev_dec_pending(rdev, conf->mddev); 5487 5480 5488 5481 if (!error) { 5489 - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) 5490 - bio_end_io_acct(raid_bi, start_time); 5491 5482 bio_endio(raid_bi); 5492 5483 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5493 5484 wake_up(&conf->wait_for_quiescent); ··· 5497 5506 struct md_rdev *rdev; 5498 5507 sector_t sector, end_sector, first_bad; 5499 5508 int bad_sectors, dd_idx; 5500 - struct md_io_acct *md_io_acct; 5501 5509 bool did_inc; 5502 5510 5503 5511 if (!in_chunk_boundary(mddev, raid_bio)) { ··· 5533 5543 return 0; 5534 5544 } 5535 5545 5536 - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, 5537 - &mddev->io_acct_set); 5538 - md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); 5546 + md_account_bio(mddev, &raid_bio); 5539 5547 raid_bio->bi_next = (void *)rdev; 5540 - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) 5541 - md_io_acct->start_time = bio_start_io_acct(raid_bio); 5542 - md_io_acct->orig_bio = raid_bio; 5543 5548 5549 + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, 5550 + &mddev->bio_set); 5544 5551 align_bio->bi_end_io = raid5_align_endio; 5545 - align_bio->bi_private = md_io_acct; 5552 + 
align_bio->bi_private = raid_bio; 5546 5553 align_bio->bi_iter.bi_sector = sector; 5547 5554 5548 5555 /* No reshape active, so we can trust rdev->data_offset */ ··· 7774 7787 struct md_rdev *rdev; 7775 7788 struct md_rdev *journal_dev = NULL; 7776 7789 sector_t reshape_offset = 0; 7777 - int i, ret = 0; 7790 + int i; 7778 7791 long long min_offset_diff = 0; 7779 7792 int first = 1; 7780 7793 7781 - if (acct_bioset_init(mddev)) { 7782 - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev)); 7794 + if (mddev_init_writes_pending(mddev) < 0) 7783 7795 return -ENOMEM; 7784 - } 7785 - 7786 - if (mddev_init_writes_pending(mddev) < 0) { 7787 - ret = -ENOMEM; 7788 - goto exit_acct_set; 7789 - } 7790 7796 7791 7797 if (mddev->recovery_cp != MaxSector) 7792 7798 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", ··· 7810 7830 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7811 7831 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7812 7832 mdname(mddev)); 7813 - ret = -EINVAL; 7814 - goto exit_acct_set; 7833 + return -EINVAL; 7815 7834 } 7816 7835 7817 7836 if (mddev->reshape_position != MaxSector) { ··· 7835 7856 if (journal_dev) { 7836 7857 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7837 7858 mdname(mddev)); 7838 - ret = -EINVAL; 7839 - goto exit_acct_set; 7859 + return -EINVAL; 7840 7860 } 7841 7861 7842 7862 if (mddev->new_level != mddev->level) { 7843 7863 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7844 7864 mdname(mddev)); 7845 - ret = -EINVAL; 7846 - goto exit_acct_set; 7865 + return -EINVAL; 7847 7866 } 7848 7867 old_disks = mddev->raid_disks - mddev->delta_disks; 7849 7868 /* reshape_position must be on a new-stripe boundary, and one ··· 7857 7880 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7858 7881 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7859 7882 mdname(mddev)); 7860 - ret = -EINVAL; 7861 - goto 
exit_acct_set; 7883 + return -EINVAL; 7862 7884 } 7863 7885 reshape_offset = here_new * chunk_sectors; 7864 7886 /* here_new is the stripe we will write to */ ··· 7879 7903 else if (mddev->ro == 0) { 7880 7904 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7881 7905 mdname(mddev)); 7882 - ret = -EINVAL; 7883 - goto exit_acct_set; 7906 + return -EINVAL; 7884 7907 } 7885 7908 } else if (mddev->reshape_backwards 7886 7909 ? (here_new * chunk_sectors + min_offset_diff <= ··· 7889 7914 /* Reading from the same stripe as writing to - bad */ 7890 7915 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7891 7916 mdname(mddev)); 7892 - ret = -EINVAL; 7893 - goto exit_acct_set; 7917 + return -EINVAL; 7894 7918 } 7895 7919 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7896 7920 /* OK, we should be able to continue; */ ··· 7913 7939 else 7914 7940 conf = mddev->private; 7915 7941 7916 - if (IS_ERR(conf)) { 7917 - ret = PTR_ERR(conf); 7918 - goto exit_acct_set; 7919 - } 7942 + if (IS_ERR(conf)) 7943 + return PTR_ERR(conf); 7920 7944 7921 7945 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7922 7946 if (!journal_dev) { ··· 8107 8135 8108 8136 return 0; 8109 8137 abort: 8110 - md_unregister_thread(&mddev->thread); 8138 + md_unregister_thread(mddev, &mddev->thread); 8111 8139 print_raid5_conf(conf); 8112 8140 free_conf(conf); 8113 8141 mddev->private = NULL; 8114 8142 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 8115 - ret = -EIO; 8116 - exit_acct_set: 8117 - acct_bioset_exit(mddev); 8118 - return ret; 8143 + return -EIO; 8119 8144 } 8120 8145 8121 8146 static void raid5_free(struct mddev *mddev, void *priv) ··· 8120 8151 struct r5conf *conf = priv; 8121 8152 8122 8153 free_conf(conf); 8123 - acct_bioset_exit(mddev); 8124 8154 mddev->to_remove = &raid5_attrs_group; 8125 8155 } 8126 8156
-1
drivers/nvme/host/ioctl.c
··· 118 118 goto out_free_meta; 119 119 } 120 120 121 - bip->bip_iter.bi_size = len; 122 121 bip->bip_iter.bi_sector = seed; 123 122 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 124 123 offset_in_page(buf));
+1 -2
drivers/nvme/target/io-cmd-bdev.c
··· 206 206 return PTR_ERR(bip); 207 207 } 208 208 209 - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); 210 209 /* virtual start sector must be in integrity interval units */ 211 210 bip_set_seed(bip, bio->bi_iter.bi_sector >> 212 211 (bi->interval_exp - SECTOR_SHIFT)); 213 212 214 - resid = bip->bip_iter.bi_size; 213 + resid = bio_integrity_bytes(bi, bio_sectors(bio)); 215 214 while (resid > 0 && sg_miter_next(miter)) { 216 215 len = min_t(size_t, miter->length, resid); 217 216 rc = bio_integrity_add_page(bio, miter->page, len,
+4 -8
drivers/scsi/scsi_lib.c
··· 300 300 cmd->budget_token = -1; 301 301 } 302 302 303 - static void scsi_kick_queue(struct request_queue *q) 304 - { 305 - blk_mq_run_hw_queues(q, false); 306 - } 307 - 308 303 /* 309 304 * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with 310 305 * interrupts disabled. ··· 335 340 * but in most cases, we will be first. Ideally, each LU on the 336 341 * target would get some limited time or requests on the target. 337 342 */ 338 - scsi_kick_queue(current_sdev->request_queue); 343 + blk_mq_run_hw_queues(current_sdev->request_queue, 344 + shost->queuecommand_may_block); 339 345 340 346 spin_lock_irqsave(shost->host_lock, flags); 341 347 if (!starget->starget_sdev_user) ··· 423 427 continue; 424 428 spin_unlock_irqrestore(shost->host_lock, flags); 425 429 426 - scsi_kick_queue(slq); 430 + blk_mq_run_hw_queues(slq, false); 427 431 blk_put_queue(slq); 428 432 429 433 spin_lock_irqsave(shost->host_lock, flags); ··· 448 452 if (!list_empty(&sdev->host->starved_list)) 449 453 scsi_starved_list_run(sdev->host); 450 454 455 + /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */ 451 456 blk_mq_kick_requeue_list(q); 452 - blk_mq_run_hw_queues(q, false); 453 457 } 454 458 455 459 void scsi_requeue_run_queue(struct work_struct *work)
+1 -2
drivers/target/target_core_iblock.c
··· 689 689 return PTR_ERR(bip); 690 690 } 691 691 692 - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); 693 692 /* virtual start sector must be in integrity interval units */ 694 693 bip_set_seed(bip, bio->bi_iter.bi_sector >> 695 694 (bi->interval_exp - SECTOR_SHIFT)); ··· 696 697 pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, 697 698 (unsigned long long)bip->bip_iter.bi_sector); 698 699 699 - resid = bip->bip_iter.bi_size; 700 + resid = bio_integrity_bytes(bi, bio_sectors(bio)); 700 701 while (resid > 0 && sg_miter_next(miter)) { 701 702 702 703 len = min_t(size_t, miter->length, resid);
+4
fs/Kconfig
··· 18 18 config FS_IOMAP 19 19 bool 20 20 21 + config BUFFER_HEAD 22 + bool 23 + 21 24 # old blockdev_direct_IO implementation. Use iomap for new code instead 22 25 config LEGACY_DIRECT_IO 26 + depends on BUFFER_HEAD 23 27 bool 24 28 25 29 if BLOCK
+1 -1
fs/Makefile
··· 17 17 fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ 18 18 kernel_read_file.o mnt_idmapping.o remap_range.o 19 19 20 - obj-$(CONFIG_BLOCK) += buffer.o mpage.o 20 + obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o 21 21 obj-$(CONFIG_PROC_FS) += proc_namespace.o 22 22 obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o 23 23 obj-y += notify/
+1
fs/adfs/Kconfig
··· 2 2 config ADFS_FS 3 3 tristate "ADFS file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 The Acorn Disc Filing System is the standard file system of the 7 8 RiscOS operating system which runs on Acorn's ARM-based Risc PC
+1
fs/affs/Kconfig
··· 2 2 config AFFS_FS 3 3 tristate "Amiga FFS file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select LEGACY_DIRECT_IO 6 7 help 7 8 The Fast File System (FFS) is the common file system used on hard
+1
fs/befs/Kconfig
··· 2 2 config BEFS_FS 3 3 tristate "BeOS file system (BeFS) support (read only)" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select NLS 6 7 help 7 8 The BeOS File System (BeFS) is the native file system of Be, Inc's
+1
fs/bfs/Kconfig
··· 2 2 config BFS_FS 3 3 tristate "BFS file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 Boot File System (BFS) is a file system used under SCO UnixWare to 7 8 allow the bootloader access to the kernel image and other important
-6
fs/buffer.c
··· 563 563 return err; 564 564 } 565 565 566 - void emergency_thaw_bdev(struct super_block *sb) 567 - { 568 - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) 569 - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); 570 - } 571 - 572 566 /** 573 567 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 574 568 * @mapping: the mapping which wants those buffers written
+1
fs/efs/Kconfig
··· 2 2 config EFS_FS 3 3 tristate "EFS file system support (read only)" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 EFS is an older file system used for non-ISO9660 CD-ROMs and hard 7 8 disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+1
fs/exfat/Kconfig
··· 2 2 3 3 config EXFAT_FS 4 4 tristate "exFAT filesystem support" 5 + select BUFFER_HEAD 5 6 select NLS 6 7 select LEGACY_DIRECT_IO 7 8 help
+1
fs/ext2/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config EXT2_FS 3 3 tristate "Second extended fs support" 4 + select BUFFER_HEAD 4 5 select FS_IOMAP 5 6 select LEGACY_DIRECT_IO 6 7 help
+1
fs/ext4/Kconfig
··· 28 28 29 29 config EXT4_FS 30 30 tristate "The Extended 4 (ext4) filesystem" 31 + select BUFFER_HEAD 31 32 select JBD2 32 33 select CRC16 33 34 select CRYPTO
+1 -1
fs/ext4/inode.c
··· 6138 6138 if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 6139 6139 goto retry_alloc; 6140 6140 out_ret: 6141 - ret = block_page_mkwrite_return(err); 6141 + ret = vmf_fs_error(err); 6142 6142 out: 6143 6143 filemap_invalidate_unlock_shared(mapping); 6144 6144 sb_end_pagefault(inode->i_sb);
+1
fs/f2fs/Kconfig
··· 2 2 config F2FS_FS 3 3 tristate "F2FS filesystem support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select NLS 6 7 select CRYPTO 7 8 select CRYPTO_CRC32
+1 -1
fs/f2fs/file.c
··· 159 159 160 160 sb_end_pagefault(inode->i_sb); 161 161 err: 162 - return block_page_mkwrite_return(err); 162 + return vmf_fs_error(err); 163 163 } 164 164 165 165 static const struct vm_operations_struct f2fs_file_vm_ops = {
+1
fs/fat/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config FAT_FS 3 3 tristate 4 + select BUFFER_HEAD 4 5 select NLS 5 6 select LEGACY_DIRECT_IO 6 7 help
+1
fs/freevxfs/Kconfig
··· 2 2 config VXFS_FS 3 3 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 FreeVxFS is a file system driver that support the VERITAS VxFS(TM) 7 8 file system format. VERITAS VxFS(TM) is the standard file system
+1
fs/gfs2/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config GFS2_FS 3 3 tristate "GFS2 file system support" 4 + select BUFFER_HEAD 4 5 select FS_POSIX_ACL 5 6 select CRC32 6 7 select LIBCRC32C
+8 -8
fs/gfs2/file.c
··· 432 432 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 433 433 err = gfs2_glock_nq(&gh); 434 434 if (err) { 435 - ret = block_page_mkwrite_return(err); 435 + ret = vmf_fs_error(err); 436 436 goto out_uninit; 437 437 } 438 438 ··· 474 474 475 475 err = gfs2_rindex_update(sdp); 476 476 if (err) { 477 - ret = block_page_mkwrite_return(err); 477 + ret = vmf_fs_error(err); 478 478 goto out_unlock; 479 479 } 480 480 ··· 482 482 ap.target = data_blocks + ind_blocks; 483 483 err = gfs2_quota_lock_check(ip, &ap); 484 484 if (err) { 485 - ret = block_page_mkwrite_return(err); 485 + ret = vmf_fs_error(err); 486 486 goto out_unlock; 487 487 } 488 488 err = gfs2_inplace_reserve(ip, &ap); 489 489 if (err) { 490 - ret = block_page_mkwrite_return(err); 490 + ret = vmf_fs_error(err); 491 491 goto out_quota_unlock; 492 492 } 493 493 ··· 500 500 } 501 501 err = gfs2_trans_begin(sdp, rblocks, 0); 502 502 if (err) { 503 - ret = block_page_mkwrite_return(err); 503 + ret = vmf_fs_error(err); 504 504 goto out_trans_fail; 505 505 } 506 506 ··· 508 508 if (gfs2_is_stuffed(ip)) { 509 509 err = gfs2_unstuff_dinode(ip); 510 510 if (err) { 511 - ret = block_page_mkwrite_return(err); 511 + ret = vmf_fs_error(err); 512 512 goto out_trans_end; 513 513 } 514 514 } ··· 524 524 525 525 err = gfs2_allocate_page_backing(page, length); 526 526 if (err) 527 - ret = block_page_mkwrite_return(err); 527 + ret = vmf_fs_error(err); 528 528 529 529 out_page_locked: 530 530 if (ret != VM_FAULT_LOCKED) ··· 558 558 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); 559 559 err = gfs2_glock_nq(&gh); 560 560 if (err) { 561 - ret = block_page_mkwrite_return(err); 561 + ret = vmf_fs_error(err); 562 562 goto out_uninit; 563 563 } 564 564 ret = filemap_fault(vmf);
+1
fs/hfs/Kconfig
··· 2 2 config HFS_FS 3 3 tristate "Apple Macintosh file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select NLS 6 7 select LEGACY_DIRECT_IO 7 8 help
+1
fs/hfsplus/Kconfig
··· 2 2 config HFSPLUS_FS 3 3 tristate "Apple Extended HFS file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select NLS 6 7 select NLS_UTF8 7 8 select LEGACY_DIRECT_IO
+1
fs/hpfs/Kconfig
··· 2 2 config HPFS_FS 3 3 tristate "OS/2 HPFS file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select FS_IOMAP 6 7 help 7 8 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
-6
fs/internal.h
··· 23 23 */ 24 24 #ifdef CONFIG_BLOCK 25 25 extern void __init bdev_cache_init(void); 26 - 27 - void emergency_thaw_bdev(struct super_block *sb); 28 26 #else 29 27 static inline void bdev_cache_init(void) 30 28 { 31 - } 32 - static inline int emergency_thaw_bdev(struct super_block *sb) 33 - { 34 - return 0; 35 29 } 36 30 #endif /* CONFIG_BLOCK */ 37 31
+1 -1
fs/iomap/buffered-io.c
··· 1436 1436 return VM_FAULT_LOCKED; 1437 1437 out_unlock: 1438 1438 folio_unlock(folio); 1439 - return block_page_mkwrite_return(ret); 1439 + return vmf_fs_error(ret); 1440 1440 } 1441 1441 EXPORT_SYMBOL_GPL(iomap_page_mkwrite); 1442 1442
+1
fs/isofs/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config ISO9660_FS 3 3 tristate "ISO 9660 CDROM file system support" 4 + select BUFFER_HEAD 4 5 help 5 6 This is the standard file system used on CD-ROMs. It was previously 6 7 known as "High Sierra File System" and is called "hsfs" on other
+1
fs/jfs/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config JFS_FS 3 3 tristate "JFS filesystem support" 4 + select BUFFER_HEAD 4 5 select NLS 5 6 select CRC32 6 7 select LEGACY_DIRECT_IO
+1
fs/minix/Kconfig
··· 2 2 config MINIX_FS 3 3 tristate "Minix file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 Minix is a simple operating system used in many classes about OS's. 7 8 The minix file system (method to organize files on a hard disk
+1
fs/nilfs2/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config NILFS2_FS 3 3 tristate "NILFS2 file system support" 4 + select BUFFER_HEAD 4 5 select CRC32 5 6 select LEGACY_DIRECT_IO 6 7 help
+1 -1
fs/nilfs2/file.c
··· 108 108 wait_for_stable_page(page); 109 109 out: 110 110 sb_end_pagefault(inode->i_sb); 111 - return block_page_mkwrite_return(ret); 111 + return vmf_fs_error(ret); 112 112 } 113 113 114 114 static const struct vm_operations_struct nilfs_file_vm_ops = {
+1
fs/ntfs/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config NTFS_FS 3 3 tristate "NTFS file system support" 4 + select BUFFER_HEAD 4 5 select NLS 5 6 help 6 7 NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+1
fs/ntfs3/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config NTFS3_FS 3 3 tristate "NTFS Read-Write file system support" 4 + select BUFFER_HEAD 4 5 select NLS 5 6 select LEGACY_DIRECT_IO 6 7 help
+1
fs/ocfs2/Kconfig
··· 2 2 config OCFS2_FS 3 3 tristate "OCFS2 file system support" 4 4 depends on INET && SYSFS && CONFIGFS_FS 5 + select BUFFER_HEAD 5 6 select JBD2 6 7 select CRC32 7 8 select QUOTA
+1
fs/omfs/Kconfig
··· 2 2 config OMFS_FS 3 3 tristate "SonicBlue Optimized MPEG File System support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 select CRC_ITU_T 6 7 help 7 8 This is the proprietary file system used by the Rio Karma music
+1
fs/qnx4/Kconfig
··· 2 2 config QNX4FS_FS 3 3 tristate "QNX4 file system support (read only)" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 This is the file system used by the real-time operating systems 7 8 QNX 4 and QNX 6 (the latter is also called QNX RTP).
+1
fs/qnx6/Kconfig
··· 2 2 config QNX6FS_FS 3 3 tristate "QNX6 file system support (read only)" 4 4 depends on BLOCK && CRC32 5 + select BUFFER_HEAD 5 6 help 6 7 This is the file system used by the real-time operating systems 7 8 QNX 6 (also called QNX RTP).
+1
fs/reiserfs/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config REISERFS_FS 3 3 tristate "Reiserfs support (deprecated)" 4 + select BUFFER_HEAD 4 5 select CRC32 5 6 select LEGACY_DIRECT_IO 6 7 help
+1
fs/romfs/Kconfig
··· 57 57 config ROMFS_ON_BLOCK 58 58 bool 59 59 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH 60 + select BUFFER_HEAD 60 61 61 62 config ROMFS_ON_MTD 62 63 bool
+3 -1
fs/super.c
··· 1209 1209 bool born = super_lock_excl(sb); 1210 1210 1211 1211 if (born && sb->s_root) { 1212 - emergency_thaw_bdev(sb); 1212 + if (IS_ENABLED(CONFIG_BLOCK)) 1213 + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) 1214 + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); 1213 1215 thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); 1214 1216 } else { 1215 1217 super_unlock_excl(sb);
+1
fs/sysv/Kconfig
··· 2 2 config SYSV_FS 3 3 tristate "System V/Xenix/V7/Coherent file system support" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 SCO, Xenix and Coherent are commercial Unix systems for Intel 7 8 machines, and Version 7 was used on the DEC PDP-11. Saying Y
+1
fs/udf/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config UDF_FS 3 3 tristate "UDF file system support" 4 + select BUFFER_HEAD 4 5 select CRC_ITU_T 5 6 select NLS 6 7 select LEGACY_DIRECT_IO
+1 -1
fs/udf/file.c
··· 65 65 err = __block_write_begin(page, 0, end, udf_get_block); 66 66 if (err) { 67 67 unlock_page(page); 68 - ret = block_page_mkwrite_return(err); 68 + ret = vmf_fs_error(err); 69 69 goto out_unlock; 70 70 } 71 71
+1
fs/ufs/Kconfig
··· 2 2 config UFS_FS 3 3 tristate "UFS file system support (read only)" 4 4 depends on BLOCK 5 + select BUFFER_HEAD 5 6 help 6 7 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 7 8 OpenBSD and NeXTstep) use a file system called UFS. Some System V
+6 -1
include/linux/bio.h
··· 493 493 extern void bio_copy_data(struct bio *dst, struct bio *src); 494 494 extern void bio_free_pages(struct bio *bio); 495 495 void guard_bio_eod(struct bio *bio); 496 - void zero_fill_bio(struct bio *bio); 496 + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); 497 + 498 + static inline void zero_fill_bio(struct bio *bio) 499 + { 500 + zero_fill_bio_iter(bio, bio->bi_iter); 501 + } 497 502 498 503 static inline void bio_release_pages(struct bio *bio, bool mark_dirty) 499 504 {
+1 -5
include/linux/blk-mq.h
··· 178 178 179 179 struct { 180 180 unsigned int seq; 181 - struct list_head list; 182 181 rq_end_io_fn *saved_end_io; 183 182 } flush; 184 183 185 - union { 186 - struct __call_single_data csd; 187 - u64 fifo_time; 188 - }; 184 + u64 fifo_time; 189 185 190 186 /* 191 187 * completion callback.
+2
include/linux/blkdev.h
··· 538 538 #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ 539 539 #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ 540 540 #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ 541 + #define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ 541 542 #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ 542 543 #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ 543 544 #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ ··· 847 846 848 847 int blk_status_to_errno(blk_status_t status); 849 848 blk_status_t errno_to_blk_status(int errno); 849 + const char *blk_status_to_str(blk_status_t status); 850 850 851 851 /* only poll the hardware once, don't continue until a completion was found */ 852 852 #define BLK_POLL_ONESHOT (1 << 0)
+16 -28
include/linux/buffer_head.h
··· 16 16 #include <linux/wait.h> 17 17 #include <linux/atomic.h> 18 18 19 - #ifdef CONFIG_BLOCK 20 - 21 19 enum bh_state_bits { 22 20 BH_Uptodate, /* Contains valid data */ 23 21 BH_Dirty, /* Is dirty */ ··· 194 196 void touch_buffer(struct buffer_head *bh); 195 197 void folio_set_bh(struct buffer_head *bh, struct folio *folio, 196 198 unsigned long offset); 197 - bool try_to_free_buffers(struct folio *); 198 199 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, 199 200 bool retry); 200 201 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, ··· 208 211 209 212 /* Things to do with buffers at mapping->private_list */ 210 213 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); 211 - int inode_has_buffers(struct inode *); 212 - void invalidate_inode_buffers(struct inode *); 213 - int remove_inode_buffers(struct inode *inode); 214 - int sync_mapping_buffers(struct address_space *mapping); 215 214 int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, 216 215 bool datasync); 217 216 int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, ··· 231 238 void __breadahead(struct block_device *, sector_t block, unsigned int size); 232 239 struct buffer_head *__bread_gfp(struct block_device *, 233 240 sector_t block, unsigned size, gfp_t gfp); 234 - void invalidate_bh_lrus(void); 235 - void invalidate_bh_lrus_cpu(void); 236 - bool has_bh_in_lru(int cpu, void *dummy); 237 241 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); 238 242 void free_buffer_head(struct buffer_head * bh); 239 243 void unlock_buffer(struct buffer_head *bh); ··· 245 255 int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); 246 256 void __bh_read_batch(int nr, struct buffer_head *bhs[], 247 257 blk_opf_t op_flags, bool force_lock); 248 - 249 - extern int buffer_heads_over_limit; 250 258 251 259 /* 252 260 * Generic address_space_operations implementations for 
buffer_head-backed ··· 277 289 void block_commit_write(struct page *page, unsigned int from, unsigned int to); 278 290 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 279 291 get_block_t get_block); 280 - /* Convert errno to return value from ->page_mkwrite() call */ 281 - static inline vm_fault_t block_page_mkwrite_return(int err) 282 - { 283 - if (err == 0) 284 - return VM_FAULT_LOCKED; 285 - if (err == -EFAULT || err == -EAGAIN) 286 - return VM_FAULT_NOPAGE; 287 - if (err == -ENOMEM) 288 - return VM_FAULT_OOM; 289 - /* -ENOSPC, -EDQUOT, -EIO ... */ 290 - return VM_FAULT_SIGBUS; 291 - } 292 292 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); 293 293 int block_truncate_page(struct address_space *, loff_t, get_block_t *); 294 294 ··· 289 313 #define buffer_migrate_folio NULL 290 314 #define buffer_migrate_folio_norefs NULL 291 315 #endif 292 - 293 - void buffer_init(void); 294 316 295 317 /* 296 318 * inline definitions ··· 449 475 450 476 bool block_dirty_folio(struct address_space *mapping, struct folio *folio); 451 477 452 - #else /* CONFIG_BLOCK */ 478 + #ifdef CONFIG_BUFFER_HEAD 479 + 480 + void buffer_init(void); 481 + bool try_to_free_buffers(struct folio *folio); 482 + int inode_has_buffers(struct inode *inode); 483 + void invalidate_inode_buffers(struct inode *inode); 484 + int remove_inode_buffers(struct inode *inode); 485 + int sync_mapping_buffers(struct address_space *mapping); 486 + void invalidate_bh_lrus(void); 487 + void invalidate_bh_lrus_cpu(void); 488 + bool has_bh_in_lru(int cpu, void *dummy); 489 + extern int buffer_heads_over_limit; 490 + 491 + #else /* CONFIG_BUFFER_HEAD */ 453 492 454 493 static inline void buffer_init(void) {} 455 494 static inline bool try_to_free_buffers(struct folio *folio) { return true; } ··· 470 483 static inline void invalidate_inode_buffers(struct inode *inode) {} 471 484 static inline int remove_inode_buffers(struct inode *inode) { return 1; } 472 485 static 
inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } 486 + static inline void invalidate_bh_lrus(void) {} 473 487 static inline void invalidate_bh_lrus_cpu(void) {} 474 488 static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } 475 489 #define buffer_heads_over_limit 0 476 490 477 - #endif /* CONFIG_BLOCK */ 491 + #endif /* CONFIG_BUFFER_HEAD */ 478 492 #endif /* _LINUX_BUFFER_HEAD_H */
+4
include/linux/iomap.h
··· 58 58 #define IOMAP_F_DIRTY (1U << 1) 59 59 #define IOMAP_F_SHARED (1U << 2) 60 60 #define IOMAP_F_MERGED (1U << 3) 61 + #ifdef CONFIG_BUFFER_HEAD 61 62 #define IOMAP_F_BUFFER_HEAD (1U << 4) 63 + #else 64 + #define IOMAP_F_BUFFER_HEAD 0 65 + #endif /* CONFIG_BUFFER_HEAD */ 62 66 #define IOMAP_F_XATTR (1U << 5) 63 67 64 68 /*
+18
include/linux/mm.h
··· 3475 3475 return VM_FAULT_SIGBUS; 3476 3476 } 3477 3477 3478 + /* 3479 + * Convert errno to return value for ->page_mkwrite() calls. 3480 + * 3481 + * This should eventually be merged with vmf_error() above, but will need a 3482 + * careful audit of all vmf_error() callers. 3483 + */ 3484 + static inline vm_fault_t vmf_fs_error(int err) 3485 + { 3486 + if (err == 0) 3487 + return VM_FAULT_LOCKED; 3488 + if (err == -EFAULT || err == -EAGAIN) 3489 + return VM_FAULT_NOPAGE; 3490 + if (err == -ENOMEM) 3491 + return VM_FAULT_OOM; 3492 + /* -ENOSPC, -EDQUOT, -EIO ... */ 3493 + return VM_FAULT_SIGBUS; 3494 + } 3495 + 3478 3496 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 3479 3497 unsigned int foll_flags); 3480 3498
+5
include/linux/sed-opal.h
··· 25 25 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); 26 26 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); 27 27 28 + #define OPAL_AUTH_KEY "opal-boot-pin" 29 + #define OPAL_AUTH_KEY_PREV "opal-boot-pin-prev" 30 + 28 31 static inline bool is_sed_ioctl(unsigned int cmd) 29 32 { 30 33 switch (cmd) { ··· 50 47 case IOC_OPAL_GET_STATUS: 51 48 case IOC_OPAL_GET_LR_STATUS: 52 49 case IOC_OPAL_GET_GEOMETRY: 50 + case IOC_OPAL_DISCOVERY: 51 + case IOC_OPAL_REVERT_LSP: 53 52 return true; 54 53 } 55 54 return false;
+2
include/trace/events/block.h
··· 12 12 13 13 #define RWBS_LEN 8 14 14 15 + #ifdef CONFIG_BUFFER_HEAD 15 16 DECLARE_EVENT_CLASS(block_buffer, 16 17 17 18 TP_PROTO(struct buffer_head *bh), ··· 62 61 63 62 TP_ARGS(bh) 64 63 ); 64 + #endif /* CONFIG_BUFFER_HEAD */ 65 65 66 66 /** 67 67 * block_rq_requeue - place block IO request back on a queue
+4 -4
include/trace/events/kyber.h
··· 31 31 32 32 TP_fast_assign( 33 33 __entry->dev = dev; 34 - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); 35 - strlcpy(__entry->type, type, sizeof(__entry->type)); 34 + strscpy(__entry->domain, domain, sizeof(__entry->domain)); 35 + strscpy(__entry->type, type, sizeof(__entry->type)); 36 36 __entry->percentile = percentile; 37 37 __entry->numerator = numerator; 38 38 __entry->denominator = denominator; ··· 59 59 60 60 TP_fast_assign( 61 61 __entry->dev = dev; 62 - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); 62 + strscpy(__entry->domain, domain, sizeof(__entry->domain)); 63 63 __entry->depth = depth; 64 64 ), 65 65 ··· 81 81 82 82 TP_fast_assign( 83 83 __entry->dev = dev; 84 - strlcpy(__entry->domain, domain, sizeof(__entry->domain)); 84 + strscpy(__entry->domain, domain, sizeof(__entry->domain)); 85 85 ), 86 86 87 87 TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev),
+4 -4
include/trace/events/wbt.h
··· 33 33 ), 34 34 35 35 TP_fast_assign( 36 - strlcpy(__entry->name, bdi_dev_name(bdi), 36 + strscpy(__entry->name, bdi_dev_name(bdi), 37 37 ARRAY_SIZE(__entry->name)); 38 38 __entry->rmean = stat[0].mean; 39 39 __entry->rmin = stat[0].min; ··· 68 68 ), 69 69 70 70 TP_fast_assign( 71 - strlcpy(__entry->name, bdi_dev_name(bdi), 71 + strscpy(__entry->name, bdi_dev_name(bdi), 72 72 ARRAY_SIZE(__entry->name)); 73 73 __entry->lat = div_u64(lat, 1000); 74 74 ), ··· 105 105 ), 106 106 107 107 TP_fast_assign( 108 - strlcpy(__entry->name, bdi_dev_name(bdi), 108 + strscpy(__entry->name, bdi_dev_name(bdi), 109 109 ARRAY_SIZE(__entry->name)); 110 110 __entry->msg = msg; 111 111 __entry->step = step; ··· 141 141 ), 142 142 143 143 TP_fast_assign( 144 - strlcpy(__entry->name, bdi_dev_name(bdi), 144 + strscpy(__entry->name, bdi_dev_name(bdi), 145 145 ARRAY_SIZE(__entry->name)); 146 146 __entry->status = status; 147 147 __entry->step = step;
+11 -10
include/uapi/linux/ioprio.h
··· 107 107 /* 108 108 * Return an I/O priority value based on a class, a level and a hint. 109 109 */ 110 - static __always_inline __u16 ioprio_value(int class, int level, int hint) 110 + static __always_inline __u16 ioprio_value(int prioclass, int priolevel, 111 + int priohint) 111 112 { 112 - if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) || 113 - IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) || 114 - IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS)) 113 + if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) || 114 + IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) || 115 + IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS)) 115 116 return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; 116 117 117 - return (class << IOPRIO_CLASS_SHIFT) | 118 - (hint << IOPRIO_HINT_SHIFT) | level; 118 + return (prioclass << IOPRIO_CLASS_SHIFT) | 119 + (priohint << IOPRIO_HINT_SHIFT) | priolevel; 119 120 } 120 121 121 - #define IOPRIO_PRIO_VALUE(class, level) \ 122 - ioprio_value(class, level, IOPRIO_HINT_NONE) 123 - #define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ 124 - ioprio_value(class, level, hint) 122 + #define IOPRIO_PRIO_VALUE(prioclass, priolevel) \ 123 + ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE) 124 + #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \ 125 + ioprio_value(prioclass, priolevel, priohint) 125 126 126 127 #endif /* _UAPI_LINUX_IOPRIO_H */
+24 -1
include/uapi/linux/sed-opal.h
··· 49 49 OPAL_SAVE_FOR_LOCK = 0x01, 50 50 }; 51 51 52 + enum opal_key_type { 53 + OPAL_INCLUDED = 0, /* key[] is the key */ 54 + OPAL_KEYRING, /* key is in keyring */ 55 + }; 56 + 52 57 struct opal_key { 53 58 __u8 lr; 54 59 __u8 key_len; 55 - __u8 __align[6]; 60 + __u8 key_type; 61 + __u8 __align[5]; 56 62 __u8 key[OPAL_KEY_MAX]; 63 + }; 64 + 65 + enum opal_revert_lsp_opts { 66 + OPAL_PRESERVE = 0x01, 57 67 }; 58 68 59 69 struct opal_lr_act { ··· 183 173 __u8 __align[3]; 184 174 }; 185 175 176 + struct opal_discovery { 177 + __u64 data; 178 + __u64 size; 179 + }; 180 + 181 + struct opal_revert_lsp { 182 + struct opal_key key; 183 + __u32 options; 184 + __u32 __pad; 185 + }; 186 + 186 187 #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) 187 188 #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) 188 189 #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) ··· 213 192 #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) 214 193 #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) 215 194 #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) 195 + #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) 196 + #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) 216 197 217 198 #endif /* _UAPI_SED_OPAL_H */
+55 -9
include/uapi/linux/ublk_cmd.h
··· 176 176 /* Copy between request and user buffer by pread()/pwrite() */ 177 177 #define UBLK_F_USER_COPY (1UL << 7) 178 178 179 + /* 180 + * User space sets this flag when setting up the device to request zoned storage support. Kernel may 181 + * deny the request by returning an error. 182 + */ 183 + #define UBLK_F_ZONED (1ULL << 8) 184 + 179 185 /* device state */ 180 186 #define UBLK_S_DEV_DEAD 0 181 187 #define UBLK_S_DEV_LIVE 1 ··· 238 232 #define UBLK_IO_OP_READ 0 239 233 #define UBLK_IO_OP_WRITE 1 240 234 #define UBLK_IO_OP_FLUSH 2 241 - #define UBLK_IO_OP_DISCARD 3 242 - #define UBLK_IO_OP_WRITE_SAME 4 243 - #define UBLK_IO_OP_WRITE_ZEROES 5 235 + #define UBLK_IO_OP_DISCARD 3 236 + #define UBLK_IO_OP_WRITE_SAME 4 237 + #define UBLK_IO_OP_WRITE_ZEROES 5 238 + #define UBLK_IO_OP_ZONE_OPEN 10 239 + #define UBLK_IO_OP_ZONE_CLOSE 11 240 + #define UBLK_IO_OP_ZONE_FINISH 12 241 + #define UBLK_IO_OP_ZONE_APPEND 13 242 + #define UBLK_IO_OP_ZONE_RESET_ALL 14 243 + #define UBLK_IO_OP_ZONE_RESET 15 244 + /* 245 + * Construct a zone report. The report request is carried in `struct 246 + * ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone 247 + * and shall indicate the first zone of the report. The `nr_zones` shall 248 + * indicate how many zones should be reported at most. The report shall be 249 + * delivered as a `struct blk_zone` array. To report fewer zones than requested, 250 + * zero the last entry of the returned array. 251 + * 252 + * Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in 253 + * include/uapi/linux/blkzoned.h are part of ublk UAPI. 
254 + */ 255 + #define UBLK_IO_OP_REPORT_ZONES 18 244 256 245 257 #define UBLK_IO_F_FAILFAST_DEV (1U << 8) 246 258 #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) ··· 279 255 /* op: bit 0-7, flags: bit 8-31 */ 280 256 __u32 op_flags; 281 257 282 - __u32 nr_sectors; 258 + union { 259 + __u32 nr_sectors; 260 + __u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */ 261 + }; 283 262 284 263 /* start sector for this io */ 285 264 __u64 start_sector; ··· 311 284 /* io result, it is valid for COMMIT* command only */ 312 285 __s32 result; 313 286 314 - /* 315 - * userspace buffer address in ublksrv daemon process, valid for 316 - * FETCH* command only 317 - */ 318 - __u64 addr; 287 + union { 288 + /* 289 + * userspace buffer address in ublksrv daemon process, valid for 290 + * FETCH* command only 291 + * 292 + * `addr` should not be used when UBLK_F_USER_COPY is enabled, 293 + * because userspace handles data copy by pread()/pwrite() over 294 + * /dev/ublkcN. But in case of UBLK_F_ZONED, this union is 295 + * re-used to pass back the allocated LBA for 296 + * UBLK_IO_OP_ZONE_APPEND which actually depends on 297 + * UBLK_F_USER_COPY 298 + */ 299 + __u64 addr; 300 + __u64 zone_append_lba; 301 + }; 319 302 }; 320 303 321 304 struct ublk_param_basic { ··· 368 331 __u32 disk_minor; 369 332 }; 370 333 334 + struct ublk_param_zoned { 335 + __u32 max_open_zones; 336 + __u32 max_active_zones; 337 + __u32 max_zone_append_sectors; 338 + __u8 reserved[20]; 339 + }; 340 + 371 341 struct ublk_params { 372 342 /* 373 343 * Total length of parameters, userspace has to set 'len' for both ··· 386 342 #define UBLK_PARAM_TYPE_BASIC (1 << 0) 387 343 #define UBLK_PARAM_TYPE_DISCARD (1 << 1) 388 344 #define UBLK_PARAM_TYPE_DEVT (1 << 2) 345 + #define UBLK_PARAM_TYPE_ZONED (1 << 3) 389 346 __u32 types; /* types of parameter included */ 390 347 391 348 struct ublk_param_basic basic; 392 349 struct ublk_param_discard discard; 393 350 struct ublk_param_devt devt; 351 + struct ublk_param_zoned zoned; 394 
352 }; 395 353 396 354 #endif
+2
lib/raid6/mktables.c
··· 56 56 uint8_t v; 57 57 uint8_t exptbl[256], invtbl[256]; 58 58 59 + printf("#ifdef __KERNEL__\n"); 59 60 printf("#include <linux/export.h>\n"); 61 + printf("#endif\n"); 60 62 printf("#include <linux/raid/pq.h>\n"); 61 63 62 64 /* Compute multiplication table */
-1
lib/raid6/recov.c
··· 13 13 * the syndrome.) 14 14 */ 15 15 16 - #include <linux/export.h> 17 16 #include <linux/raid/pq.h> 18 17 19 18 /* Recover two failed data blocks. */
+3
lib/raid6/test/.gitignore
··· 1 + /int.uc 2 + /neon.uc 3 + /raid6test
+26 -24
lib/raid6/test/Makefile
··· 6 6 7 7 pound := \# 8 8 9 - CC = gcc 10 - OPTFLAGS = -O2 # Adjust as desired 11 - CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) 12 - LD = ld 13 - AWK = awk -f 14 - AR = ar 15 - RANLIB = ranlib 16 - OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o 9 + # Adjust as desired 10 + CC = gcc 11 + OPTFLAGS = -O2 12 + CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) 13 + LD = ld 14 + AWK = awk -f 15 + AR = ar 16 + RANLIB = ranlib 17 + OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o 17 18 18 19 ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) 19 20 ifeq ($(ARCH),i386) ··· 35 34 HAS_NEON = yes 36 35 endif 37 36 37 + ifeq ($(findstring ppc,$(ARCH)),ppc) 38 + CFLAGS += -I../../../arch/powerpc/include 39 + HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ 40 + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) 41 + endif 42 + 38 43 ifeq ($(IS_X86),yes) 39 44 OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o 40 45 CFLAGS += -DCONFIG_X86 41 - CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ 42 - gcc -c -x assembler - >/dev/null 2>&1 && \ 43 - rm ./-.o && echo -DCONFIG_AS_AVX512=1) 46 + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ 47 + gcc -c -x assembler - >/dev/null 2>&1 && \ 48 + rm ./-.o && echo -DCONFIG_AS_AVX512=1) 44 49 else ifeq ($(HAS_NEON),yes) 45 50 OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o 46 51 CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 47 - else 48 - HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ 49 - gcc -c -x c - >/dev/null && rm ./-.o && echo yes) 50 - ifeq ($(HAS_ALTIVEC),yes) 51 - CFLAGS += -I../../../arch/powerpc/include 52 - CFLAGS += -DCONFIG_ALTIVEC 53 - OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ 54 - vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o 55 - endif 52 + else ifeq ($(HAS_ALTIVEC),yes) 53 + CFLAGS += -DCONFIG_ALTIVEC 54 + OBJS += 
altivec1.o altivec2.o altivec4.o altivec8.o \ 55 + vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o 56 56 endif 57 57 58 58 .c.o: ··· 65 63 %.uc: ../%.uc 66 64 cp -f $< $@ 67 65 68 - all: raid6.a raid6test 66 + all: raid6.a raid6test 69 67 70 68 raid6.a: $(OBJS) 71 - rm -f $@ 72 - $(AR) cq $@ $^ 73 - $(RANLIB) $@ 69 + rm -f $@ 70 + $(AR) cq $@ $^ 71 + $(RANLIB) $@ 74 72 75 73 raid6test: test.c raid6.a 76 74 $(CC) $(CFLAGS) -o raid6test $^
+2 -2
mm/migrate.c
··· 684 684 } 685 685 EXPORT_SYMBOL(migrate_folio); 686 686 687 - #ifdef CONFIG_BLOCK 687 + #ifdef CONFIG_BUFFER_HEAD 688 688 /* Returns true if all buffers are successfully locked */ 689 689 static bool buffer_migrate_lock_buffers(struct buffer_head *head, 690 690 enum migrate_mode mode) ··· 837 837 return __buffer_migrate_folio(mapping, dst, src, mode, true); 838 838 } 839 839 EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); 840 - #endif 840 + #endif /* CONFIG_BUFFER_HEAD */ 841 841 842 842 int filemap_migrate_folio(struct address_space *mapping, 843 843 struct folio *dst, struct folio *src, enum migrate_mode mode)
+14 -7
tools/cgroup/iocost_monitor.py
··· 100 100 self.period_at = ioc.period_at.value_() / 1_000_000 101 101 self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC 102 102 self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC 103 + self.ivrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC 103 104 self.busy_level = ioc.busy_level.value_() 104 105 self.autop_idx = ioc.autop_idx.value_() 105 106 self.user_cost_model = ioc.user_cost_model.value_() ··· 120 119 'period_at' : self.period_at, 121 120 'period_vtime_at' : self.vperiod_at, 122 121 'busy_level' : self.busy_level, 123 - 'vrate_pct' : self.vrate_pct, } 122 + 'vrate_pct' : self.vrate_pct, 123 + 'ivrate_pct' : self.ivrate_pct, 124 + } 124 125 125 126 def table_preamble_str(self): 126 127 state = ('RUN' if self.running else 'IDLE') if self.enabled else 'OFF' ··· 130 127 f'per={self.period_ms}ms ' \ 131 128 f'cur_per={self.period_at:.3f}:v{self.vperiod_at:.3f} ' \ 132 129 f'busy={self.busy_level:+3} ' \ 133 - f'vrate={self.vrate_pct:6.2f}% ' \ 130 + f'vrate={self.vrate_pct:6.2f}%:{self.ivrate_pct:6.2f}% ' \ 134 131 f'params={self.autop_name}' 135 132 if self.user_cost_model or self.user_qos_params: 136 133 output += f'({"C" if self.user_cost_model else ""}{"Q" if self.user_qos_params else ""})' ··· 138 135 139 136 def table_header_str(self): 140 137 return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \ 141 - f'{"debt":>7} {"delay":>7} {"usage%"}' 138 + f'{"usage%":>6} {"wait":>7} {"debt":>7} {"delay":>7}' 142 139 143 140 class IocgStat: 144 141 def __init__(self, iocg): ··· 164 161 165 162 self.usage = (100 * iocg.usage_delta_us.value_() / 166 163 ioc.period_us.value_()) if self.active else 0 164 + self.wait_ms = (iocg.stat.wait_us.value_() - 165 + iocg.last_stat.wait_us.value_()) / 1000 167 166 self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 168 167 if blkg.use_delay.counter.value_() != 0: 169 168 self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 ··· 182 177 
'hweight_active_pct' : self.hwa_pct, 183 178 'hweight_inuse_pct' : self.hwi_pct, 184 179 'inflight_pct' : self.inflight_pct, 180 + 'usage_pct' : self.usage, 181 + 'wait_ms' : self.wait_ms, 185 182 'debt_ms' : self.debt_ms, 186 183 'delay_ms' : self.delay_ms, 187 - 'usage_pct' : self.usage, 188 184 'address' : self.address } 189 185 return out 190 186 ··· 195 189 f'{round(self.inuse):5}/{round(self.active):5} ' \ 196 190 f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \ 197 191 f'{self.inflight_pct:6.2f} ' \ 192 + f'{min(self.usage, 999):6.2f} ' \ 193 + f'{self.wait_ms:7.2f} ' \ 198 194 f'{self.debt_ms:7.2f} ' \ 199 - f'{self.delay_ms:7.2f} '\ 200 - f'{min(self.usage, 999):6.2f}' 195 + f'{self.delay_ms:7.2f}' 201 196 out = out.rstrip(':') 202 197 return out 203 198 ··· 228 221 for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()): 229 222 blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr) 230 223 try: 231 - if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'): 224 + if devname == blkg.q.mq_kobj.parent.name.string_().decode('utf-8'): 232 225 q_id = blkg.q.id.value_() 233 226 if blkg.pd[plid]: 234 227 root_iocg = container_of(blkg.pd[plid], 'struct ioc_gq', 'pd')