Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

- MD pull requests via Song:
- Cleanup redundant checks (Yu Kuai)
- Remove deprecated headers (Marc Zyngier, Song Liu)
- Concurrency fixes (Li Lingfeng)
- Memory leak fix (Li Nan)
- Refactor raid1 read_balance (Yu Kuai, Paul Luse)
- Clean up and fix for md_ioctl (Li Nan)
- Other small fixes (Gui-Dong Han, Heming Zhao)
- MD atomic limits (Christoph)

- NVMe pull request via Keith:
- RDMA target enhancements (Max)
- Fabrics fixes (Max, Guixin, Hannes)
- Atomic queue_limits usage (Christoph)
- Const use for class_register (Ricardo)
- Identification error handling fixes (Shin'ichiro, Keith)

- Improvement and cleanup for cached request handling (Christoph)

- Moving towards atomic queue limits. Core changes and driver bits so
far (Christoph)

- Fix UAF issues in aoeblk (Chun-Yi)

- Zoned fix and cleanups (Damien)

- s390 dasd cleanups and fixes (Jan, Miroslav)

- Block issue timestamp caching (me)

- noio scope guarding for zoned IO (Johannes)

- block/nvme PI improvements (Kanchan)

- Ability to terminate long running discard loop (Keith)

- bdev revalidation fix (Li)

- Get rid of old nr_queues hack for kdump kernels (Ming)

- Support for async deletion of ublk (Ming)

- Improve IRQ bio recycling (Pavel)

- Factor in CPU capacity for remote vs local completion (Qais)

- Add shared_tags configfs entry for null_blk (Shin'ichiro)

- Fix for a regression in page refcounts introduced by the folio
unification (Tony)

- Misc fixes and cleanups (Arnd, Colin, John, Kunwu, Li, Navid,
Ricardo, Roman, Tang, Uwe)

* tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux: (221 commits)
block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC
block/swim: Convert to platform remove callback returning void
cdrom: gdrom: Convert to platform remove callback returning void
block: remove disk_stack_limits
md: remove mddev->queue
md: don't initialize queue limits
md/raid10: use the atomic queue limit update APIs
md/raid5: use the atomic queue limit update APIs
md/raid1: use the atomic queue limit update APIs
md/raid0: use the atomic queue limit update APIs
md: add queue limit helpers
md: add a mddev_is_dm helper
md: add a mddev_add_trace_msg helper
md: add a mddev_trace_remap helper
bcache: move calculation of stripe_size and io_opt into bcache_device_init
virtio_blk: Do not use disk_set_max_open/active_zones()
aoe: fix the potential use-after-free problem in aoecmd_cfg_pkts
block: move capacity validation to blkpg_do_ioctl()
block: prevent division by zero in blk_rq_stat_sum()
drbd: atomically update queue limits in drbd_reconsider_queue_parameters
...

+3571 -3298
+7 -3
arch/m68k/emu/nfblock.c
··· 96 96 97 97 static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) 98 98 { 99 + struct queue_limits lim = { 100 + .logical_block_size = bsize, 101 + }; 99 102 struct nfhd_device *dev; 100 103 int dev_id = id - NFHD_DEV_OFFSET; 101 104 int err = -ENOMEM; ··· 120 117 dev->bsize = bsize; 121 118 dev->bshift = ffs(bsize) - 10; 122 119 123 - dev->disk = blk_alloc_disk(NUMA_NO_NODE); 124 - if (!dev->disk) 120 + dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 121 + if (IS_ERR(dev->disk)) { 122 + err = PTR_ERR(dev->disk); 125 123 goto free_dev; 124 + } 126 125 127 126 dev->disk->major = major_num; 128 127 dev->disk->first_minor = dev_id * 16; ··· 133 128 dev->disk->private_data = dev; 134 129 sprintf(dev->disk->disk_name, "nfhd%u", dev_id); 135 130 set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); 136 - blk_queue_logical_block_size(dev->disk->queue, bsize); 137 131 err = add_disk(dev->disk); 138 132 if (err) 139 133 goto out_cleanup_disk;
+49 -86
arch/um/drivers/ubd_kern.c
··· 108 108 static DEFINE_MUTEX(ubd_lock); 109 109 static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ 110 110 111 - static int ubd_open(struct gendisk *disk, blk_mode_t mode); 112 - static void ubd_release(struct gendisk *disk); 113 111 static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, 114 112 unsigned int cmd, unsigned long arg); 115 113 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); ··· 116 118 117 119 static const struct block_device_operations ubd_blops = { 118 120 .owner = THIS_MODULE, 119 - .open = ubd_open, 120 - .release = ubd_release, 121 121 .ioctl = ubd_ioctl, 122 122 .compat_ioctl = blkdev_compat_ptr_ioctl, 123 123 .getgeo = ubd_getgeo, 124 124 }; 125 - 126 - /* Protected by ubd_lock */ 127 - static struct gendisk *ubd_gendisk[MAX_DEV]; 128 125 129 126 #ifdef CONFIG_BLK_DEV_UBD_SYNC 130 127 #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ ··· 148 155 * backing or the cow file. */ 149 156 char *file; 150 157 char *serial; 151 - int count; 152 158 int fd; 153 159 __u64 size; 154 160 struct openflags boot_openflags; ··· 157 165 unsigned no_trim:1; 158 166 struct cow cow; 159 167 struct platform_device pdev; 160 - struct request_queue *queue; 168 + struct gendisk *disk; 161 169 struct blk_mq_tag_set tag_set; 162 170 spinlock_t lock; 163 171 }; ··· 173 181 #define DEFAULT_UBD { \ 174 182 .file = NULL, \ 175 183 .serial = NULL, \ 176 - .count = 0, \ 177 184 .fd = -1, \ 178 185 .size = -1, \ 179 186 .boot_openflags = OPEN_FLAGS, \ ··· 765 774 ubd_dev->fd = fd; 766 775 767 776 if(ubd_dev->cow.file != NULL){ 768 - blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); 769 - 770 777 err = -ENOMEM; 771 778 ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); 772 779 if(ubd_dev->cow.bitmap == NULL){ ··· 786 797 if(err < 0) goto error; 787 798 ubd_dev->cow.fd = err; 788 799 } 789 - if (ubd_dev->no_trim == 0) { 790 - blk_queue_max_discard_sectors(ubd_dev->queue, 
UBD_MAX_REQUEST); 791 - blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); 792 - } 793 - blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); 794 800 return 0; 795 801 error: 796 802 os_close_file(ubd_dev->fd); ··· 835 851 NULL, 836 852 }; 837 853 838 - static int ubd_disk_register(int major, u64 size, int unit, 839 - struct gendisk *disk) 840 - { 841 - disk->major = major; 842 - disk->first_minor = unit << UBD_SHIFT; 843 - disk->minors = 1 << UBD_SHIFT; 844 - disk->fops = &ubd_blops; 845 - set_capacity(disk, size / 512); 846 - sprintf(disk->disk_name, "ubd%c", 'a' + unit); 847 - 848 - ubd_devs[unit].pdev.id = unit; 849 - ubd_devs[unit].pdev.name = DRIVER_NAME; 850 - ubd_devs[unit].pdev.dev.release = ubd_device_release; 851 - dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); 852 - platform_device_register(&ubd_devs[unit].pdev); 853 - 854 - disk->private_data = &ubd_devs[unit]; 855 - disk->queue = ubd_devs[unit].queue; 856 - return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); 857 - } 858 - 859 854 #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) 860 855 861 856 static const struct blk_mq_ops ubd_mq_ops = { ··· 844 881 static int ubd_add(int n, char **error_out) 845 882 { 846 883 struct ubd *ubd_dev = &ubd_devs[n]; 884 + struct queue_limits lim = { 885 + .max_segments = MAX_SG, 886 + .seg_boundary_mask = PAGE_SIZE - 1, 887 + }; 847 888 struct gendisk *disk; 848 889 int err = 0; 849 890 850 891 if(ubd_dev->file == NULL) 851 892 goto out; 852 893 894 + if (ubd_dev->cow.file) 895 + lim.max_hw_sectors = 8 * sizeof(long); 896 + if (!ubd_dev->no_trim) { 897 + lim.max_hw_discard_sectors = UBD_MAX_REQUEST; 898 + lim.max_write_zeroes_sectors = UBD_MAX_REQUEST; 899 + } 900 + 853 901 err = ubd_file_size(ubd_dev, &ubd_dev->size); 854 902 if(err < 0){ 855 903 *error_out = "Couldn't determine size of device's file"; 904 + goto out; 905 + } 906 + 907 + err = ubd_open_dev(ubd_dev); 908 + if (err) { 909 + 
pr_err("ubd%c: Can't open \"%s\": errno = %d\n", 910 + 'a' + n, ubd_dev->file, -err); 856 911 goto out; 857 912 } 858 913 ··· 885 904 886 905 err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); 887 906 if (err) 888 - goto out; 907 + goto out_close; 889 908 890 - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); 909 + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev); 891 910 if (IS_ERR(disk)) { 892 911 err = PTR_ERR(disk); 893 912 goto out_cleanup_tags; 894 913 } 895 - ubd_dev->queue = disk->queue; 896 914 897 - blk_queue_write_cache(ubd_dev->queue, true, false); 898 - blk_queue_max_segments(ubd_dev->queue, MAX_SG); 899 - blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); 900 - err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); 915 + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 916 + blk_queue_write_cache(disk->queue, true, false); 917 + disk->major = UBD_MAJOR; 918 + disk->first_minor = n << UBD_SHIFT; 919 + disk->minors = 1 << UBD_SHIFT; 920 + disk->fops = &ubd_blops; 921 + set_capacity(disk, ubd_dev->size / 512); 922 + sprintf(disk->disk_name, "ubd%c", 'a' + n); 923 + disk->private_data = ubd_dev; 924 + set_disk_ro(disk, !ubd_dev->openflags.w); 925 + 926 + ubd_dev->pdev.id = n; 927 + ubd_dev->pdev.name = DRIVER_NAME; 928 + ubd_dev->pdev.dev.release = ubd_device_release; 929 + dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev); 930 + platform_device_register(&ubd_dev->pdev); 931 + 932 + err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups); 901 933 if (err) 902 934 goto out_cleanup_disk; 903 935 904 - ubd_gendisk[n] = disk; 905 936 return 0; 906 937 907 938 out_cleanup_disk: 908 939 put_disk(disk); 909 940 out_cleanup_tags: 910 941 blk_mq_free_tag_set(&ubd_dev->tag_set); 942 + out_close: 943 + ubd_close_dev(ubd_dev); 911 944 out: 912 945 return err; 913 946 } ··· 1007 1012 1008 1013 static int ubd_remove(int n, char **error_out) 1009 1014 { 1010 - struct gendisk *disk = ubd_gendisk[n]; 1011 1015 struct ubd *ubd_dev; 1012 1016 
int err = -ENODEV; 1013 1017 ··· 1017 1023 if(ubd_dev->file == NULL) 1018 1024 goto out; 1019 1025 1020 - /* you cannot remove a open disk */ 1021 - err = -EBUSY; 1022 - if(ubd_dev->count > 0) 1023 - goto out; 1026 + if (ubd_dev->disk) { 1027 + /* you cannot remove a open disk */ 1028 + err = -EBUSY; 1029 + if (disk_openers(ubd_dev->disk)) 1030 + goto out; 1024 1031 1025 - ubd_gendisk[n] = NULL; 1026 - if(disk != NULL){ 1027 - del_gendisk(disk); 1028 - put_disk(disk); 1032 + del_gendisk(ubd_dev->disk); 1033 + ubd_close_dev(ubd_dev); 1034 + put_disk(ubd_dev->disk); 1029 1035 } 1030 1036 1031 1037 err = 0; ··· 1146 1152 } 1147 1153 1148 1154 device_initcall(ubd_driver_init); 1149 - 1150 - static int ubd_open(struct gendisk *disk, blk_mode_t mode) 1151 - { 1152 - struct ubd *ubd_dev = disk->private_data; 1153 - int err = 0; 1154 - 1155 - mutex_lock(&ubd_mutex); 1156 - if(ubd_dev->count == 0){ 1157 - err = ubd_open_dev(ubd_dev); 1158 - if(err){ 1159 - printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", 1160 - disk->disk_name, ubd_dev->file, -err); 1161 - goto out; 1162 - } 1163 - } 1164 - ubd_dev->count++; 1165 - set_disk_ro(disk, !ubd_dev->openflags.w); 1166 - out: 1167 - mutex_unlock(&ubd_mutex); 1168 - return err; 1169 - } 1170 - 1171 - static void ubd_release(struct gendisk *disk) 1172 - { 1173 - struct ubd *ubd_dev = disk->private_data; 1174 - 1175 - mutex_lock(&ubd_mutex); 1176 - if(--ubd_dev->count == 0) 1177 - ubd_close_dev(ubd_dev); 1178 - mutex_unlock(&ubd_mutex); 1179 - } 1180 1155 1181 1156 static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, 1182 1157 __u64 *cow_offset, unsigned long *bitmap,
+5 -3
arch/xtensa/platforms/iss/simdisk.c
··· 264 264 struct proc_dir_entry *procdir) 265 265 { 266 266 char tmp[2] = { '0' + which, 0 }; 267 - int err = -ENOMEM; 267 + int err; 268 268 269 269 dev->fd = -1; 270 270 dev->filename = NULL; 271 271 spin_lock_init(&dev->lock); 272 272 dev->users = 0; 273 273 274 - dev->gd = blk_alloc_disk(NUMA_NO_NODE); 275 - if (!dev->gd) 274 + dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); 275 + if (IS_ERR(dev->gd)) { 276 + err = PTR_ERR(dev->gd); 276 277 goto out; 278 + } 277 279 dev->gd->major = simdisk_major; 278 280 dev->gd->first_minor = which; 279 281 dev->gd->minors = SIMDISK_MINORS;
+1 -1
block/bdev.c
··· 383 383 384 384 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 385 385 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 386 - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), 386 + SLAB_ACCOUNT|SLAB_PANIC), 387 387 init_once); 388 388 err = register_filesystem(&bd_type); 389 389 if (err)
+7 -7
block/bfq-cgroup.c
··· 127 127 if (!bfqg_stats_waiting(stats)) 128 128 return; 129 129 130 - now = ktime_get_ns(); 130 + now = blk_time_get_ns(); 131 131 if (now > stats->start_group_wait_time) 132 132 bfq_stat_add(&stats->group_wait_time, 133 133 now - stats->start_group_wait_time); ··· 144 144 return; 145 145 if (bfqg == curr_bfqg) 146 146 return; 147 - stats->start_group_wait_time = ktime_get_ns(); 147 + stats->start_group_wait_time = blk_time_get_ns(); 148 148 bfqg_stats_mark_waiting(stats); 149 149 } 150 150 ··· 156 156 if (!bfqg_stats_empty(stats)) 157 157 return; 158 158 159 - now = ktime_get_ns(); 159 + now = blk_time_get_ns(); 160 160 if (now > stats->start_empty_time) 161 161 bfq_stat_add(&stats->empty_time, 162 162 now - stats->start_empty_time); ··· 183 183 if (bfqg_stats_empty(stats)) 184 184 return; 185 185 186 - stats->start_empty_time = ktime_get_ns(); 186 + stats->start_empty_time = blk_time_get_ns(); 187 187 bfqg_stats_mark_empty(stats); 188 188 } 189 189 ··· 192 192 struct bfqg_stats *stats = &bfqg->stats; 193 193 194 194 if (bfqg_stats_idling(stats)) { 195 - u64 now = ktime_get_ns(); 195 + u64 now = blk_time_get_ns(); 196 196 197 197 if (now > stats->start_idle_time) 198 198 bfq_stat_add(&stats->idle_time, ··· 205 205 { 206 206 struct bfqg_stats *stats = &bfqg->stats; 207 207 208 - stats->start_idle_time = ktime_get_ns(); 208 + stats->start_idle_time = blk_time_get_ns(); 209 209 bfqg_stats_mark_idling(stats); 210 210 } 211 211 ··· 242 242 u64 io_start_time_ns, blk_opf_t opf) 243 243 { 244 244 struct bfqg_stats *stats = &bfqg->stats; 245 - u64 now = ktime_get_ns(); 245 + u64 now = blk_time_get_ns(); 246 246 247 247 if (now > io_start_time_ns) 248 248 blkg_rwstat_add(&stats->service_time, opf,
+14 -14
block/bfq-iosched.c
··· 1005 1005 1006 1006 rq = rq_entry_fifo(bfqq->fifo.next); 1007 1007 1008 - if (rq == last || ktime_get_ns() < rq->fifo_time) 1008 + if (rq == last || blk_time_get_ns() < rq->fifo_time) 1009 1009 return NULL; 1010 1010 1011 1011 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); ··· 1829 1829 * bfq_bfqq_update_budg_for_activation for 1830 1830 * details on the usage of the next variable. 1831 1831 */ 1832 - arrived_in_time = ktime_get_ns() <= 1832 + arrived_in_time = blk_time_get_ns() <= 1833 1833 bfqq->ttime.last_end_request + 1834 1834 bfqd->bfq_slice_idle * 3; 1835 1835 unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); ··· 2208 2208 struct request *next_rq, *prev; 2209 2209 unsigned int old_wr_coeff = bfqq->wr_coeff; 2210 2210 bool interactive = false; 2211 - u64 now_ns = ktime_get_ns(); 2211 + u64 now_ns = blk_time_get_ns(); 2212 2212 2213 2213 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); 2214 2214 bfqq->queued[rq_is_sync(rq)]++; ··· 2262 2262 bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && 2263 2263 time_is_before_eq_jiffies(bfqq->decrease_time_jif + 2264 2264 msecs_to_jiffies(10))) { 2265 - bfqd->last_empty_occupied_ns = ktime_get_ns(); 2265 + bfqd->last_empty_occupied_ns = blk_time_get_ns(); 2266 2266 /* 2267 2267 * Start the state machine for measuring the 2268 2268 * total service time of rq: setting ··· 3294 3294 else 3295 3295 timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; 3296 3296 3297 - bfqd->last_budget_start = ktime_get(); 3297 + bfqd->last_budget_start = blk_time_get(); 3298 3298 3299 3299 bfqq->budget_timeout = jiffies + 3300 3300 bfqd->bfq_timeout * timeout_coeff; ··· 3394 3394 else if (bfqq->wr_coeff > 1) 3395 3395 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); 3396 3396 3397 - bfqd->last_idling_start = ktime_get(); 3397 + bfqd->last_idling_start = blk_time_get(); 3398 3398 bfqd->last_idling_start_jiffies = jiffies; 3399 3399 3400 3400 hrtimer_start(&bfqd->idle_slice_timer, 
ns_to_ktime(sl), ··· 3433 3433 struct request *rq) 3434 3434 { 3435 3435 if (rq != NULL) { /* new rq dispatch now, reset accordingly */ 3436 - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); 3436 + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); 3437 3437 bfqd->peak_rate_samples = 1; 3438 3438 bfqd->sequential_samples = 0; 3439 3439 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = ··· 3590 3590 */ 3591 3591 static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) 3592 3592 { 3593 - u64 now_ns = ktime_get_ns(); 3593 + u64 now_ns = blk_time_get_ns(); 3594 3594 3595 3595 if (bfqd->peak_rate_samples == 0) { /* first dispatch */ 3596 3596 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", ··· 4162 4162 if (compensate) 4163 4163 delta_ktime = bfqd->last_idling_start; 4164 4164 else 4165 - delta_ktime = ktime_get(); 4165 + delta_ktime = blk_time_get(); 4166 4166 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); 4167 4167 delta_usecs = ktime_to_us(delta_ktime); 4168 4168 ··· 5591 5591 struct bfq_io_cq *bic, pid_t pid, int is_sync, 5592 5592 unsigned int act_idx) 5593 5593 { 5594 - u64 now_ns = ktime_get_ns(); 5594 + u64 now_ns = blk_time_get_ns(); 5595 5595 5596 5596 bfqq->actuator_idx = act_idx; 5597 5597 RB_CLEAR_NODE(&bfqq->entity.rb_node); ··· 5903 5903 */ 5904 5904 if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) 5905 5905 return; 5906 - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; 5906 + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; 5907 5907 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); 5908 5908 5909 5909 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ··· 6194 6194 bfq_add_request(rq); 6195 6195 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); 6196 6196 6197 - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; 6197 + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; 6198 6198 
list_add_tail(&rq->queuelist, &bfqq->fifo); 6199 6199 6200 6200 bfq_rq_enqueued(bfqd, bfqq, rq); ··· 6370 6370 bfq_weights_tree_remove(bfqq); 6371 6371 } 6372 6372 6373 - now_ns = ktime_get_ns(); 6373 + now_ns = blk_time_get_ns(); 6374 6374 6375 6375 bfqq->ttime.last_end_request = now_ns; 6376 6376 ··· 6585 6585 static void bfq_update_inject_limit(struct bfq_data *bfqd, 6586 6586 struct bfq_queue *bfqq) 6587 6587 { 6588 - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; 6588 + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; 6589 6589 unsigned int old_limit = bfqq->inject_limit; 6590 6590 6591 6591 if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
+1
block/bio-integrity.c
··· 395 395 iter.tuple_size = bi->tuple_size; 396 396 iter.seed = proc_iter->bi_sector; 397 397 iter.prot_buf = bvec_virt(bip->bip_vec); 398 + iter.pi_offset = bi->pi_offset; 398 399 399 400 __bio_for_each_segment(bv, bio, bviter, *proc_iter) { 400 401 void *kaddr = bvec_kmap_local(&bv);
+19 -26
block/bio.c
··· 16 16 #include <linux/workqueue.h> 17 17 #include <linux/cgroup.h> 18 18 #include <linux/highmem.h> 19 - #include <linux/sched/sysctl.h> 20 19 #include <linux/blk-crypto.h> 21 20 #include <linux/xarray.h> 22 21 ··· 762 763 struct bio_alloc_cache *cache; 763 764 764 765 cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); 765 - if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { 766 - put_cpu(); 767 - bio_free(bio); 768 - return; 769 - } 766 + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) 767 + goto out_free; 770 768 771 - bio_uninit(bio); 772 - 773 - if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { 769 + if (in_task()) { 770 + bio_uninit(bio); 774 771 bio->bi_next = cache->free_list; 772 + /* Not necessary but helps not to iopoll already freed bios */ 775 773 bio->bi_bdev = NULL; 776 774 cache->free_list = bio; 777 775 cache->nr++; 778 - } else { 779 - unsigned long flags; 776 + } else if (in_hardirq()) { 777 + lockdep_assert_irqs_disabled(); 780 778 781 - local_irq_save(flags); 779 + bio_uninit(bio); 782 780 bio->bi_next = cache->free_list_irq; 783 781 cache->free_list_irq = bio; 784 782 cache->nr_irq++; 785 - local_irq_restore(flags); 783 + } else { 784 + goto out_free; 786 785 } 787 786 put_cpu(); 787 + return; 788 + out_free: 789 + put_cpu(); 790 + bio_free(bio); 788 791 } 789 792 790 793 /** ··· 1155 1154 1156 1155 bio_for_each_folio_all(fi, bio) { 1157 1156 struct page *page; 1158 - size_t done = 0; 1157 + size_t nr_pages; 1159 1158 1160 1159 if (mark_dirty) { 1161 1160 folio_lock(fi.folio); ··· 1163 1162 folio_unlock(fi.folio); 1164 1163 } 1165 1164 page = folio_page(fi.folio, fi.offset / PAGE_SIZE); 1165 + nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - 1166 + fi.offset / PAGE_SIZE + 1; 1166 1167 do { 1167 1168 bio_release_page(bio, page++); 1168 - done += PAGE_SIZE; 1169 - } while (done < fi.length); 1169 + } while (--nr_pages != 0); 1170 1170 } 1171 1171 } 1172 1172 EXPORT_SYMBOL_GPL(__bio_release_pages); 
··· 1373 1371 { 1374 1372 DECLARE_COMPLETION_ONSTACK_MAP(done, 1375 1373 bio->bi_bdev->bd_disk->lockdep_map); 1376 - unsigned long hang_check; 1377 1374 1378 1375 bio->bi_private = &done; 1379 1376 bio->bi_end_io = submit_bio_wait_endio; 1380 1377 bio->bi_opf |= REQ_SYNC; 1381 1378 submit_bio(bio); 1382 - 1383 - /* Prevent hang_check timer from firing at us during very long I/O */ 1384 - hang_check = sysctl_hung_task_timeout_secs; 1385 - if (hang_check) 1386 - while (!wait_for_completion_io_timeout(&done, 1387 - hang_check * (HZ/2))) 1388 - ; 1389 - else 1390 - wait_for_completion_io(&done); 1379 + blk_wait_io(&done); 1391 1380 1392 1381 return blk_status_to_errno(bio->bi_status); 1393 1382 }
+1 -1
block/blk-cgroup.c
··· 1846 1846 { 1847 1847 unsigned long pflags; 1848 1848 bool clamp; 1849 - u64 now = ktime_to_ns(ktime_get()); 1849 + u64 now = blk_time_get_ns(); 1850 1850 u64 exp; 1851 1851 u64 delay_nsec = 0; 1852 1852 int tok;
+1
block/blk-cgroup.h
··· 19 19 #include <linux/kthread.h> 20 20 #include <linux/blk-mq.h> 21 21 #include <linux/llist.h> 22 + #include "blk.h" 22 23 23 24 struct blkcg_gq; 24 25 struct blkg_policy_data;
+23 -10
block/blk-core.c
··· 394 394 { 395 395 } 396 396 397 - struct request_queue *blk_alloc_queue(int node_id) 397 + struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) 398 398 { 399 399 struct request_queue *q; 400 + int error; 400 401 401 402 q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, 402 403 node_id); 403 404 if (!q) 404 - return NULL; 405 + return ERR_PTR(-ENOMEM); 405 406 406 407 q->last_merge = NULL; 407 408 408 409 q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); 409 - if (q->id < 0) 410 + if (q->id < 0) { 411 + error = q->id; 410 412 goto fail_q; 413 + } 411 414 412 415 q->stats = blk_alloc_queue_stats(); 413 - if (!q->stats) 416 + if (!q->stats) { 417 + error = -ENOMEM; 414 418 goto fail_id; 419 + } 420 + 421 + error = blk_set_default_limits(lim); 422 + if (error) 423 + goto fail_stats; 424 + q->limits = *lim; 415 425 416 426 q->node = node_id; 417 427 ··· 435 425 mutex_init(&q->debugfs_mutex); 436 426 mutex_init(&q->sysfs_lock); 437 427 mutex_init(&q->sysfs_dir_lock); 428 + mutex_init(&q->limits_lock); 438 429 mutex_init(&q->rq_qos_mutex); 439 430 spin_lock_init(&q->queue_lock); 440 431 ··· 446 435 * Init percpu_ref in atomic mode so that it's faster to shutdown. 447 436 * See blk_register_queue() for details. 
448 437 */ 449 - if (percpu_ref_init(&q->q_usage_counter, 438 + error = percpu_ref_init(&q->q_usage_counter, 450 439 blk_queue_usage_counter_release, 451 - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 440 + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); 441 + if (error) 452 442 goto fail_stats; 453 443 454 - blk_set_default_limits(&q->limits); 455 444 q->nr_requests = BLKDEV_DEFAULT_RQ; 456 445 457 446 return q; ··· 462 451 ida_free(&blk_queue_ida, q->id); 463 452 fail_q: 464 453 kmem_cache_free(blk_requestq_cachep, q); 465 - return NULL; 454 + return ERR_PTR(error); 466 455 } 467 456 468 457 /** ··· 1094 1083 if (tsk->plug) 1095 1084 return; 1096 1085 1086 + plug->cur_ktime = 0; 1097 1087 plug->mq_list = NULL; 1098 1088 plug->cached_rq = NULL; 1099 1089 plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); ··· 1194 1182 */ 1195 1183 if (unlikely(!rq_list_empty(plug->cached_rq))) 1196 1184 blk_mq_free_plug_rqs(plug); 1185 + 1186 + current->flags &= ~PF_BLOCK_TS; 1197 1187 } 1198 1188 1199 1189 /** ··· 1243 1229 if (!kblockd_workqueue) 1244 1230 panic("Failed to create kblockd\n"); 1245 1231 1246 - blk_requestq_cachep = kmem_cache_create("request_queue", 1247 - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 1232 + blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); 1248 1233 1249 1234 blk_debugfs_root = debugfs_create_dir("block", NULL); 1250 1235
+1 -1
block/blk-flush.c
··· 143 143 part_stat_lock(); 144 144 part_stat_inc(part, ios[STAT_FLUSH]); 145 145 part_stat_add(part, nsecs[STAT_FLUSH], 146 - ktime_get_ns() - rq->start_time_ns); 146 + blk_time_get_ns() - rq->start_time_ns); 147 147 part_stat_unlock(); 148 148 } 149 149
+1
block/blk-integrity.c
··· 370 370 bi->profile = template->profile ? template->profile : &nop_profile; 371 371 bi->tuple_size = template->tuple_size; 372 372 bi->tag_size = template->tag_size; 373 + bi->pi_offset = template->pi_offset; 373 374 374 375 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); 375 376
+4 -4
block/blk-iocost.c
··· 829 829 830 830 /* step up/down based on the vrate */ 831 831 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); 832 - now_ns = ktime_get_ns(); 832 + now_ns = blk_time_get_ns(); 833 833 834 834 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { 835 835 if (!ioc->autop_too_fast_at) ··· 1044 1044 unsigned seq; 1045 1045 u64 vrate; 1046 1046 1047 - now->now_ns = ktime_get(); 1047 + now->now_ns = blk_time_get_ns(); 1048 1048 now->now = ktime_to_us(now->now_ns); 1049 1049 vrate = atomic64_read(&ioc->vtime_rate); 1050 1050 ··· 2817 2817 return; 2818 2818 } 2819 2819 2820 - on_q_ns = ktime_get_ns() - rq->alloc_time_ns; 2820 + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; 2821 2821 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; 2822 2822 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); 2823 2823 ··· 2900 2900 ioc->vtime_base_rate = VTIME_PER_USEC; 2901 2901 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); 2902 2902 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); 2903 - ioc->period_at = ktime_to_us(ktime_get()); 2903 + ioc->period_at = ktime_to_us(blk_time_get()); 2904 2904 atomic64_set(&ioc->cur_period, 0); 2905 2905 atomic_set(&ioc->hweight_gen, 0); 2906 2906
+3 -3
block/blk-iolatency.c
··· 609 609 if (!iolat->blkiolat->enabled) 610 610 return; 611 611 612 - now = ktime_to_ns(ktime_get()); 612 + now = blk_time_get_ns(); 613 613 while (blkg && blkg->parent) { 614 614 iolat = blkg_to_lat(blkg); 615 615 if (!iolat) { ··· 661 661 struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); 662 662 struct blkcg_gq *blkg; 663 663 struct cgroup_subsys_state *pos_css; 664 - u64 now = ktime_to_ns(ktime_get()); 664 + u64 now = blk_time_get_ns(); 665 665 666 666 rcu_read_lock(); 667 667 blkg_for_each_descendant_pre(blkg, pos_css, ··· 985 985 struct blkcg_gq *blkg = lat_to_blkg(iolat); 986 986 struct rq_qos *rqos = iolat_rq_qos(blkg->q); 987 987 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); 988 - u64 now = ktime_to_ns(ktime_get()); 988 + u64 now = blk_time_get_ns(); 989 989 int cpu; 990 990 991 991 if (blk_queue_nonrot(blkg->q))
+52 -18
block/blk-lib.c
··· 35 35 return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; 36 36 } 37 37 38 + static void await_bio_endio(struct bio *bio) 39 + { 40 + complete(bio->bi_private); 41 + bio_put(bio); 42 + } 43 + 44 + /* 45 + * await_bio_chain - ends @bio and waits for every chained bio to complete 46 + */ 47 + static void await_bio_chain(struct bio *bio) 48 + { 49 + DECLARE_COMPLETION_ONSTACK_MAP(done, 50 + bio->bi_bdev->bd_disk->lockdep_map); 51 + 52 + bio->bi_private = &done; 53 + bio->bi_end_io = await_bio_endio; 54 + bio_endio(bio); 55 + blk_wait_io(&done); 56 + } 57 + 38 58 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 39 59 sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) 40 60 { ··· 97 77 * is disabled. 98 78 */ 99 79 cond_resched(); 80 + if (fatal_signal_pending(current)) { 81 + await_bio_chain(bio); 82 + return -EINTR; 83 + } 100 84 } 101 85 102 86 *biop = bio; ··· 144 120 struct bio **biop, unsigned flags) 145 121 { 146 122 struct bio *bio = *biop; 147 - unsigned int max_write_zeroes_sectors; 123 + unsigned int max_sectors; 148 124 149 125 if (bdev_read_only(bdev)) 150 126 return -EPERM; 151 127 152 - /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ 153 - max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); 128 + /* Ensure that max_sectors doesn't overflow bi_size */ 129 + max_sectors = bdev_write_zeroes_sectors(bdev); 154 130 155 - if (max_write_zeroes_sectors == 0) 131 + if (max_sectors == 0) 156 132 return -EOPNOTSUPP; 157 133 158 134 while (nr_sects) { 135 + unsigned int len = min_t(sector_t, nr_sects, max_sectors); 136 + 159 137 bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); 160 138 bio->bi_iter.bi_sector = sector; 161 139 if (flags & BLKDEV_ZERO_NOUNMAP) 162 140 bio->bi_opf |= REQ_NOUNMAP; 163 141 164 - if (nr_sects > max_write_zeroes_sectors) { 165 - bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; 166 - nr_sects -= max_write_zeroes_sectors; 167 - sector += 
max_write_zeroes_sectors; 168 - } else { 169 - bio->bi_iter.bi_size = nr_sects << 9; 170 - nr_sects = 0; 171 - } 142 + bio->bi_iter.bi_size = len << SECTOR_SHIFT; 143 + nr_sects -= len; 144 + sector += len; 172 145 cond_resched(); 146 + if (fatal_signal_pending(current)) { 147 + await_bio_chain(bio); 148 + return -EINTR; 149 + } 173 150 } 174 151 175 152 *biop = bio; ··· 215 190 break; 216 191 } 217 192 cond_resched(); 193 + if (fatal_signal_pending(current)) { 194 + await_bio_chain(bio); 195 + return -EINTR; 196 + } 218 197 } 219 198 220 199 *biop = bio; ··· 309 280 bio_put(bio); 310 281 } 311 282 blk_finish_plug(&plug); 312 - if (ret && try_write_zeroes) { 283 + if (ret && ret != -EINTR && try_write_zeroes) { 313 284 if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { 314 285 try_write_zeroes = false; 315 286 goto retry; ··· 351 322 return -EPERM; 352 323 353 324 blk_start_plug(&plug); 354 - for (;;) { 325 + while (nr_sects) { 355 326 unsigned int len = min_t(sector_t, nr_sects, max_sectors); 356 327 357 328 bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); ··· 360 331 361 332 sector += len; 362 333 nr_sects -= len; 363 - if (!nr_sects) { 364 - ret = submit_bio_wait(bio); 365 - bio_put(bio); 334 + cond_resched(); 335 + if (fatal_signal_pending(current)) { 336 + await_bio_chain(bio); 337 + ret = -EINTR; 338 + bio = NULL; 366 339 break; 367 340 } 368 - cond_resched(); 341 + } 342 + if (bio) { 343 + ret = submit_bio_wait(bio); 344 + bio_put(bio); 369 345 } 370 346 blk_finish_plug(&plug); 371 347
+88 -98
block/blk-mq.c
··· 21 21 #include <linux/llist.h> 22 22 #include <linux/cpu.h> 23 23 #include <linux/cache.h> 24 - #include <linux/sched/sysctl.h> 25 24 #include <linux/sched/topology.h> 26 25 #include <linux/sched/signal.h> 27 26 #include <linux/delay.h> ··· 321 322 RB_CLEAR_NODE(&rq->rb_node); 322 323 rq->tag = BLK_MQ_NO_TAG; 323 324 rq->internal_tag = BLK_MQ_NO_TAG; 324 - rq->start_time_ns = ktime_get_ns(); 325 + rq->start_time_ns = blk_time_get_ns(); 325 326 rq->part = NULL; 326 327 blk_crypto_rq_set_defaults(rq); 327 328 } ··· 331 332 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) 332 333 { 333 334 if (blk_mq_need_time_stamp(rq)) 334 - rq->start_time_ns = ktime_get_ns(); 335 + rq->start_time_ns = blk_time_get_ns(); 335 336 else 336 337 rq->start_time_ns = 0; 337 338 ··· 442 443 443 444 /* alloc_time includes depth and tag waits */ 444 445 if (blk_queue_rq_alloc_time(q)) 445 - alloc_time_ns = ktime_get_ns(); 446 + alloc_time_ns = blk_time_get_ns(); 446 447 447 448 if (data->cmd_flags & REQ_NOWAIT) 448 449 data->flags |= BLK_MQ_REQ_NOWAIT; ··· 627 628 628 629 /* alloc_time includes depth and tag waits */ 629 630 if (blk_queue_rq_alloc_time(q)) 630 - alloc_time_ns = ktime_get_ns(); 631 + alloc_time_ns = blk_time_get_ns(); 631 632 632 633 /* 633 634 * If the tag allocator sleeps we could get an allocation for a ··· 1040 1041 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 1041 1042 { 1042 1043 if (blk_mq_need_time_stamp(rq)) 1043 - __blk_mq_end_request_acct(rq, ktime_get_ns()); 1044 + __blk_mq_end_request_acct(rq, blk_time_get_ns()); 1044 1045 1045 1046 blk_mq_finish_request(rq); 1046 1047 ··· 1083 1084 u64 now = 0; 1084 1085 1085 1086 if (iob->need_ts) 1086 - now = ktime_get_ns(); 1087 + now = blk_time_get_ns(); 1087 1088 1088 1089 while ((rq = rq_list_pop(&iob->req_list)) != NULL) { 1089 1090 prefetch(rq->bio); ··· 1166 1167 if (force_irqthreads()) 1167 1168 return false; 1168 1169 1169 - /* same CPU or cache domain? 
Complete locally */ 1170 + /* same CPU or cache domain and capacity? Complete locally */ 1170 1171 if (cpu == rq->mq_ctx->cpu || 1171 1172 (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && 1172 - cpus_share_cache(cpu, rq->mq_ctx->cpu))) 1173 + cpus_share_cache(cpu, rq->mq_ctx->cpu) && 1174 + cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) 1173 1175 return false; 1174 1176 1175 1177 /* don't try to IPI to an offline CPU */ ··· 1254 1254 1255 1255 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && 1256 1256 !blk_rq_is_passthrough(rq)) { 1257 - rq->io_start_time_ns = ktime_get_ns(); 1257 + rq->io_start_time_ns = blk_time_get_ns(); 1258 1258 rq->stats_sectors = blk_rq_sectors(rq); 1259 1259 rq->rq_flags |= RQF_STATS; 1260 1260 rq_qos_issue(q, rq); ··· 1409 1409 blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); 1410 1410 blk_mq_run_hw_queue(hctx, false); 1411 1411 1412 - if (blk_rq_is_poll(rq)) { 1412 + if (blk_rq_is_poll(rq)) 1413 1413 blk_rq_poll_completion(rq, &wait.done); 1414 - } else { 1415 - /* 1416 - * Prevent hang_check timer from firing at us during very long 1417 - * I/O 1418 - */ 1419 - unsigned long hang_check = sysctl_hung_task_timeout_secs; 1420 - 1421 - if (hang_check) 1422 - while (!wait_for_completion_io_timeout(&wait.done, 1423 - hang_check * (HZ/2))) 1424 - ; 1425 - else 1426 - wait_for_completion_io(&wait.done); 1427 - } 1414 + else 1415 + blk_wait_io(&wait.done); 1428 1416 1429 1417 return wait.ret; 1430 1418 } ··· 2880 2892 }; 2881 2893 struct request *rq; 2882 2894 2883 - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) 2884 - return NULL; 2885 - 2886 2895 rq_qos_throttle(q, bio); 2887 2896 2888 2897 if (plug) { ··· 2898 2913 } 2899 2914 2900 2915 /* 2901 - * Check if we can use the passed on request for submitting the passed in bio, 2902 - * and remove it from the request list if it can be used. 2916 + * Check if there is a suitable cached request and return it. 
2903 2917 */ 2904 - static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, 2918 + static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, 2919 + struct request_queue *q, blk_opf_t opf) 2920 + { 2921 + enum hctx_type type = blk_mq_get_hctx_type(opf); 2922 + struct request *rq; 2923 + 2924 + if (!plug) 2925 + return NULL; 2926 + rq = rq_list_peek(&plug->cached_rq); 2927 + if (!rq || rq->q != q) 2928 + return NULL; 2929 + if (type != rq->mq_hctx->type && 2930 + (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) 2931 + return NULL; 2932 + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) 2933 + return NULL; 2934 + return rq; 2935 + } 2936 + 2937 + static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, 2905 2938 struct bio *bio) 2906 2939 { 2907 - enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); 2908 - enum hctx_type hctx_type = rq->mq_hctx->type; 2909 - 2910 2940 WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); 2911 - 2912 - if (type != hctx_type && 2913 - !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) 2914 - return false; 2915 - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) 2916 - return false; 2917 2941 2918 2942 /* 2919 2943 * If any qos ->throttle() end up blocking, we will have flushed the ··· 2935 2941 blk_mq_rq_time_init(rq, 0); 2936 2942 rq->cmd_flags = bio->bi_opf; 2937 2943 INIT_LIST_HEAD(&rq->queuelist); 2938 - return true; 2939 2944 } 2940 2945 2941 2946 /** ··· 2956 2963 struct blk_plug *plug = blk_mq_plug(bio); 2957 2964 const int is_sync = op_is_sync(bio->bi_opf); 2958 2965 struct blk_mq_hw_ctx *hctx; 2959 - struct request *rq = NULL; 2960 2966 unsigned int nr_segs = 1; 2967 + struct request *rq; 2961 2968 blk_status_t ret; 2962 2969 2963 2970 bio = blk_queue_bounce(bio, q); 2964 2971 2965 - if (plug) { 2966 - rq = rq_list_peek(&plug->cached_rq); 2967 - if (rq && rq->q != q) 2968 - rq = NULL; 2969 - } 2970 - if (rq) { 2971 - if 
(unlikely(bio_may_exceed_limits(bio, &q->limits))) { 2972 - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 2973 - if (!bio) 2974 - return; 2975 - } 2976 - if (!bio_integrity_prep(bio)) 2977 - return; 2978 - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) 2979 - return; 2980 - if (blk_mq_use_cached_rq(rq, plug, bio)) 2981 - goto done; 2982 - percpu_ref_get(&q->q_usage_counter); 2983 - } else { 2972 + /* 2973 + * If the plug has a cached request for this queue, try use it. 2974 + * 2975 + * The cached request already holds a q_usage_counter reference and we 2976 + * don't have to acquire a new one if we use it. 2977 + */ 2978 + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); 2979 + if (!rq) { 2984 2980 if (unlikely(bio_queue_enter(bio))) 2985 2981 return; 2986 - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { 2987 - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 2988 - if (!bio) 2989 - goto fail; 2990 - } 2991 - if (!bio_integrity_prep(bio)) 2992 - goto fail; 2993 2982 } 2994 2983 2995 - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); 2996 - if (unlikely(!rq)) { 2997 - fail: 2998 - blk_queue_exit(q); 2999 - return; 2984 + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { 2985 + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 2986 + if (!bio) 2987 + goto queue_exit; 2988 + } 2989 + if (!bio_integrity_prep(bio)) 2990 + goto queue_exit; 2991 + 2992 + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) 2993 + goto queue_exit; 2994 + 2995 + if (!rq) { 2996 + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); 2997 + if (unlikely(!rq)) 2998 + goto queue_exit; 2999 + } else { 3000 + blk_mq_use_cached_rq(rq, plug, bio); 3000 3001 } 3001 3002 3002 - done: 3003 3003 trace_block_getrq(bio); 3004 3004 3005 3005 rq_qos_track(q, rq, bio); ··· 3023 3037 } else { 3024 3038 blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); 3025 3039 } 3040 + return; 3041 + 3042 + queue_exit: 3043 + /* 3044 + * Don't drop the queue 
reference if we were trying to use a cached 3045 + * request and thus didn't acquire one. 3046 + */ 3047 + if (!rq) 3048 + blk_queue_exit(q); 3026 3049 } 3027 3050 3028 3051 #ifdef CONFIG_BLK_MQ_STACKING ··· 3093 3098 blk_mq_run_dispatch_ops(q, 3094 3099 ret = blk_mq_request_issue_directly(rq, true)); 3095 3100 if (ret) 3096 - blk_account_io_done(rq, ktime_get_ns()); 3101 + blk_account_io_done(rq, blk_time_get_ns()); 3097 3102 return ret; 3098 3103 } 3099 3104 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); ··· 4073 4078 blk_mq_sysfs_deinit(q); 4074 4079 } 4075 4080 4076 - static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, 4077 - void *queuedata) 4081 + struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, 4082 + struct queue_limits *lim, void *queuedata) 4078 4083 { 4084 + struct queue_limits default_lim = { }; 4079 4085 struct request_queue *q; 4080 4086 int ret; 4081 4087 4082 - q = blk_alloc_queue(set->numa_node); 4083 - if (!q) 4084 - return ERR_PTR(-ENOMEM); 4088 + q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); 4089 + if (IS_ERR(q)) 4090 + return q; 4085 4091 q->queuedata = queuedata; 4086 4092 ret = blk_mq_init_allocated_queue(set, q); 4087 4093 if (ret) { ··· 4091 4095 } 4092 4096 return q; 4093 4097 } 4094 - 4095 - struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 4096 - { 4097 - return blk_mq_init_queue_data(set, NULL); 4098 - } 4099 - EXPORT_SYMBOL(blk_mq_init_queue); 4098 + EXPORT_SYMBOL(blk_mq_alloc_queue); 4100 4099 4101 4100 /** 4102 4101 * blk_mq_destroy_queue - shutdown a request queue 4103 4102 * @q: request queue to shutdown 4104 4103 * 4105 - * This shuts down a request queue allocated by blk_mq_init_queue(). All future 4104 + * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future 4106 4105 * requests will be failed with -ENODEV. The caller is responsible for dropping 4107 - * the reference from blk_mq_init_queue() by calling blk_put_queue(). 
4106 + * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). 4108 4107 * 4109 4108 * Context: can sleep 4110 4109 */ ··· 4120 4129 } 4121 4130 EXPORT_SYMBOL(blk_mq_destroy_queue); 4122 4131 4123 - struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, 4132 + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, 4133 + struct queue_limits *lim, void *queuedata, 4124 4134 struct lock_class_key *lkclass) 4125 4135 { 4126 4136 struct request_queue *q; 4127 4137 struct gendisk *disk; 4128 4138 4129 - q = blk_mq_init_queue_data(set, queuedata); 4139 + q = blk_mq_alloc_queue(set, lim, queuedata); 4130 4140 if (IS_ERR(q)) 4131 4141 return ERR_CAST(q); 4132 4142 ··· 4381 4389 if (set->nr_maps == 1) 4382 4390 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; 4383 4391 4384 - if (set->ops->map_queues && !is_kdump_kernel()) { 4392 + if (set->ops->map_queues) { 4385 4393 int i; 4386 4394 4387 4395 /* ··· 4480 4488 4481 4489 /* 4482 4490 * If a crashdump is active, then we are potentially in a very 4483 - * memory constrained environment. Limit us to 1 queue and 4484 - * 64 tags to prevent using too much memory. 4491 + * memory constrained environment. Limit us to 64 tags to prevent 4492 + * using too much memory. 4485 4493 */ 4486 - if (is_kdump_kernel()) { 4487 - set->nr_hw_queues = 1; 4488 - set->nr_maps = 1; 4494 + if (is_kdump_kernel()) 4489 4495 set->queue_depth = min(64U, set->queue_depth); 4490 - } 4496 + 4491 4497 /* 4492 4498 * There is no use for more h/w queues than cpus if we just have 4493 4499 * a single map ··· 4515 4525 GFP_KERNEL, set->numa_node); 4516 4526 if (!set->map[i].mq_map) 4517 4527 goto out_free_mq_map; 4518 - set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; 4528 + set->map[i].nr_queues = set->nr_hw_queues; 4519 4529 } 4520 4530 4521 4531 blk_mq_update_queue_map(set);
+264 -65
block/blk-settings.c
··· 26 26 EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); 27 27 28 28 /** 29 - * blk_set_default_limits - reset limits to default values 30 - * @lim: the queue_limits structure to reset 31 - * 32 - * Description: 33 - * Returns a queue_limit struct to its default state. 34 - */ 35 - void blk_set_default_limits(struct queue_limits *lim) 36 - { 37 - lim->max_segments = BLK_MAX_SEGMENTS; 38 - lim->max_discard_segments = 1; 39 - lim->max_integrity_segments = 0; 40 - lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 41 - lim->virt_boundary_mask = 0; 42 - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 43 - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 44 - lim->max_user_sectors = lim->max_dev_sectors = 0; 45 - lim->chunk_sectors = 0; 46 - lim->max_write_zeroes_sectors = 0; 47 - lim->max_zone_append_sectors = 0; 48 - lim->max_discard_sectors = 0; 49 - lim->max_hw_discard_sectors = 0; 50 - lim->max_secure_erase_sectors = 0; 51 - lim->discard_granularity = 512; 52 - lim->discard_alignment = 0; 53 - lim->discard_misaligned = 0; 54 - lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 55 - lim->bounce = BLK_BOUNCE_NONE; 56 - lim->alignment_offset = 0; 57 - lim->io_opt = 0; 58 - lim->misaligned = 0; 59 - lim->zoned = false; 60 - lim->zone_write_granularity = 0; 61 - lim->dma_alignment = 511; 62 - } 63 - 64 - /** 65 29 * blk_set_stacking_limits - set default limits for stacking devices 66 30 * @lim: the queue_limits structure to reset 67 31 * 68 - * Description: 69 - * Returns a queue_limit struct to its default state. Should be used 70 - * by stacking drivers like DM that have no internal limits. 32 + * Prepare queue limits for applying limits from underlying devices using 33 + * blk_stack_limits(). 
71 34 */ 72 35 void blk_set_stacking_limits(struct queue_limits *lim) 73 36 { 74 - blk_set_default_limits(lim); 37 + memset(lim, 0, sizeof(*lim)); 38 + lim->logical_block_size = SECTOR_SIZE; 39 + lim->physical_block_size = SECTOR_SIZE; 40 + lim->io_min = SECTOR_SIZE; 41 + lim->discard_granularity = SECTOR_SIZE; 42 + lim->dma_alignment = SECTOR_SIZE - 1; 43 + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 75 44 76 45 /* Inherit limits from component devices */ 77 46 lim->max_segments = USHRT_MAX; ··· 51 82 lim->max_dev_sectors = UINT_MAX; 52 83 lim->max_write_zeroes_sectors = UINT_MAX; 53 84 lim->max_zone_append_sectors = UINT_MAX; 85 + lim->max_user_discard_sectors = UINT_MAX; 54 86 } 55 87 EXPORT_SYMBOL(blk_set_stacking_limits); 88 + 89 + static void blk_apply_bdi_limits(struct backing_dev_info *bdi, 90 + struct queue_limits *lim) 91 + { 92 + /* 93 + * For read-ahead of large files to be effective, we need to read ahead 94 + * at least twice the optimal I/O size. 95 + */ 96 + bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); 97 + bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; 98 + } 99 + 100 + static int blk_validate_zoned_limits(struct queue_limits *lim) 101 + { 102 + if (!lim->zoned) { 103 + if (WARN_ON_ONCE(lim->max_open_zones) || 104 + WARN_ON_ONCE(lim->max_active_zones) || 105 + WARN_ON_ONCE(lim->zone_write_granularity) || 106 + WARN_ON_ONCE(lim->max_zone_append_sectors)) 107 + return -EINVAL; 108 + return 0; 109 + } 110 + 111 + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) 112 + return -EINVAL; 113 + 114 + if (lim->zone_write_granularity < lim->logical_block_size) 115 + lim->zone_write_granularity = lim->logical_block_size; 116 + 117 + if (lim->max_zone_append_sectors) { 118 + /* 119 + * The Zone Append size is limited by the maximum I/O size 120 + * and the zone size given that it can't span zones. 
121 + */ 122 + lim->max_zone_append_sectors = 123 + min3(lim->max_hw_sectors, 124 + lim->max_zone_append_sectors, 125 + lim->chunk_sectors); 126 + } 127 + 128 + return 0; 129 + } 130 + 131 + /* 132 + * Check that the limits in lim are valid, initialize defaults for unset 133 + * values, and cap values based on others where needed. 134 + */ 135 + static int blk_validate_limits(struct queue_limits *lim) 136 + { 137 + unsigned int max_hw_sectors; 138 + 139 + /* 140 + * Unless otherwise specified, default to 512 byte logical blocks and a 141 + * physical block size equal to the logical block size. 142 + */ 143 + if (!lim->logical_block_size) 144 + lim->logical_block_size = SECTOR_SIZE; 145 + if (lim->physical_block_size < lim->logical_block_size) 146 + lim->physical_block_size = lim->logical_block_size; 147 + 148 + /* 149 + * The minimum I/O size defaults to the physical block size unless 150 + * explicitly overridden. 151 + */ 152 + if (lim->io_min < lim->physical_block_size) 153 + lim->io_min = lim->physical_block_size; 154 + 155 + /* 156 + * max_hw_sectors has a somewhat weird default for historical reason, 157 + * but driver really should set their own instead of relying on this 158 + * value. 159 + * 160 + * The block layer relies on the fact that every driver can 161 + * handle at lest a page worth of data per I/O, and needs the value 162 + * aligned to the logical block size. 163 + */ 164 + if (!lim->max_hw_sectors) 165 + lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 166 + if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) 167 + return -EINVAL; 168 + lim->max_hw_sectors = round_down(lim->max_hw_sectors, 169 + lim->logical_block_size >> SECTOR_SHIFT); 170 + 171 + /* 172 + * The actual max_sectors value is a complex beast and also takes the 173 + * max_dev_sectors value (set by SCSI ULPs) and a user configurable 174 + * value into account. The ->max_sectors value is always calculated 175 + * from these, so directly setting it won't have any effect. 
176 + */ 177 + max_hw_sectors = min_not_zero(lim->max_hw_sectors, 178 + lim->max_dev_sectors); 179 + if (lim->max_user_sectors) { 180 + if (lim->max_user_sectors > max_hw_sectors || 181 + lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) 182 + return -EINVAL; 183 + lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); 184 + } else { 185 + lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); 186 + } 187 + lim->max_sectors = round_down(lim->max_sectors, 188 + lim->logical_block_size >> SECTOR_SHIFT); 189 + 190 + /* 191 + * Random default for the maximum number of segments. Driver should not 192 + * rely on this and set their own. 193 + */ 194 + if (!lim->max_segments) 195 + lim->max_segments = BLK_MAX_SEGMENTS; 196 + 197 + lim->max_discard_sectors = 198 + min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); 199 + 200 + if (!lim->max_discard_segments) 201 + lim->max_discard_segments = 1; 202 + 203 + if (lim->discard_granularity < lim->physical_block_size) 204 + lim->discard_granularity = lim->physical_block_size; 205 + 206 + /* 207 + * By default there is no limit on the segment boundary alignment, 208 + * but if there is one it can't be smaller than the page size as 209 + * that would break all the normal I/O patterns. 210 + */ 211 + if (!lim->seg_boundary_mask) 212 + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 213 + if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) 214 + return -EINVAL; 215 + 216 + /* 217 + * Devices that require a virtual boundary do not support scatter/gather 218 + * I/O natively, but instead require a descriptor list entry for each 219 + * page (which might not be identical to the Linux PAGE_SIZE). Because 220 + * of that they are not limited by our notion of "segment size". 
221 + */ 222 + if (lim->virt_boundary_mask) { 223 + if (WARN_ON_ONCE(lim->max_segment_size && 224 + lim->max_segment_size != UINT_MAX)) 225 + return -EINVAL; 226 + lim->max_segment_size = UINT_MAX; 227 + } else { 228 + /* 229 + * The maximum segment size has an odd historic 64k default that 230 + * drivers probably should override. Just like the I/O size we 231 + * require drivers to at least handle a full page per segment. 232 + */ 233 + if (!lim->max_segment_size) 234 + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 235 + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) 236 + return -EINVAL; 237 + } 238 + 239 + /* 240 + * We require drivers to at least do logical block aligned I/O, but 241 + * historically could not check for that due to the separate calls 242 + * to set the limits. Once the transition is finished the check 243 + * below should be narrowed down to check the logical block size. 244 + */ 245 + if (!lim->dma_alignment) 246 + lim->dma_alignment = SECTOR_SIZE - 1; 247 + if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) 248 + return -EINVAL; 249 + 250 + if (lim->alignment_offset) { 251 + lim->alignment_offset &= (lim->physical_block_size - 1); 252 + lim->misaligned = 0; 253 + } 254 + 255 + return blk_validate_zoned_limits(lim); 256 + } 257 + 258 + /* 259 + * Set the default limits for a newly allocated queue. @lim contains the 260 + * initial limits set by the driver, which could be no limit in which case 261 + * all fields are cleared to zero. 262 + */ 263 + int blk_set_default_limits(struct queue_limits *lim) 264 + { 265 + /* 266 + * Most defaults are set by capping the bounds in blk_validate_limits, 267 + * but max_user_discard_sectors is special and needs an explicit 268 + * initialization to the max value here. 
269 + */ 270 + lim->max_user_discard_sectors = UINT_MAX; 271 + return blk_validate_limits(lim); 272 + } 273 + 274 + /** 275 + * queue_limits_commit_update - commit an atomic update of queue limits 276 + * @q: queue to update 277 + * @lim: limits to apply 278 + * 279 + * Apply the limits in @lim that were obtained from queue_limits_start_update() 280 + * and updated by the caller to @q. 281 + * 282 + * Returns 0 if successful, else a negative error code. 283 + */ 284 + int queue_limits_commit_update(struct request_queue *q, 285 + struct queue_limits *lim) 286 + __releases(q->limits_lock) 287 + { 288 + int error = blk_validate_limits(lim); 289 + 290 + if (!error) { 291 + q->limits = *lim; 292 + if (q->disk) 293 + blk_apply_bdi_limits(q->disk->bdi, lim); 294 + } 295 + mutex_unlock(&q->limits_lock); 296 + return error; 297 + } 298 + EXPORT_SYMBOL_GPL(queue_limits_commit_update); 299 + 300 + /** 301 + * queue_limits_set - apply queue limits to queue 302 + * @q: queue to update 303 + * @lim: limits to apply 304 + * 305 + * Apply the limits in @lim that were freshly initialized to @q. 306 + * To update existing limits use queue_limits_start_update() and 307 + * queue_limits_commit_update() instead. 308 + * 309 + * Returns 0 if successful, else a negative error code. 
310 + */ 311 + int queue_limits_set(struct request_queue *q, struct queue_limits *lim) 312 + { 313 + mutex_lock(&q->limits_lock); 314 + return queue_limits_commit_update(q, lim); 315 + } 316 + EXPORT_SYMBOL_GPL(queue_limits_set); 56 317 57 318 /** 58 319 * blk_queue_bounce_limit - set bounce buffer limit for queue ··· 376 177 void blk_queue_max_discard_sectors(struct request_queue *q, 377 178 unsigned int max_discard_sectors) 378 179 { 379 - q->limits.max_hw_discard_sectors = max_discard_sectors; 380 - q->limits.max_discard_sectors = max_discard_sectors; 180 + struct queue_limits *lim = &q->limits; 181 + 182 + lim->max_hw_discard_sectors = max_discard_sectors; 183 + lim->max_discard_sectors = 184 + min(max_discard_sectors, lim->max_user_discard_sectors); 381 185 } 382 186 EXPORT_SYMBOL(blk_queue_max_discard_sectors); 383 187 ··· 595 393 596 394 void disk_update_readahead(struct gendisk *disk) 597 395 { 598 - struct request_queue *q = disk->queue; 599 - 600 - /* 601 - * For read-ahead of large files to be effective, we need to read ahead 602 - * at least twice the optimal I/O size. 
603 - */ 604 - disk->bdi->ra_pages = 605 - max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); 606 - disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); 396 + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); 607 397 } 608 398 EXPORT_SYMBOL_GPL(disk_update_readahead); 609 399 ··· 883 689 t->zone_write_granularity = max(t->zone_write_granularity, 884 690 b->zone_write_granularity); 885 691 t->zoned = max(t->zoned, b->zoned); 692 + if (!t->zoned) { 693 + t->zone_write_granularity = 0; 694 + t->max_zone_append_sectors = 0; 695 + } 886 696 return ret; 887 697 } 888 698 EXPORT_SYMBOL(blk_stack_limits); 889 699 890 700 /** 891 - * disk_stack_limits - adjust queue limits for stacked drivers 892 - * @disk: MD/DM gendisk (top) 701 + * queue_limits_stack_bdev - adjust queue_limits for stacked devices 702 + * @t: the stacking driver limits (top device) 893 703 * @bdev: the underlying block device (bottom) 894 704 * @offset: offset to beginning of data within component device 705 + * @pfx: prefix to use for warnings logged 895 706 * 896 707 * Description: 897 - * Merges the limits for a top level gendisk and a bottom level 898 - * block_device. 708 + * This function is used by stacking drivers like MD and DM to ensure 709 + * that all component devices have compatible block sizes and 710 + * alignments. The stacking driver must provide a queue_limits 711 + * struct (top) and then iteratively call the stacking function for 712 + * all component (bottom) devices. The stacking function will 713 + * attempt to combine the values and ensure proper alignment. 
899 714 */ 900 - void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 901 - sector_t offset) 715 + void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, 716 + sector_t offset, const char *pfx) 902 717 { 903 - struct request_queue *t = disk->queue; 904 - 905 - if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, 906 - get_start_sect(bdev) + (offset >> 9)) < 0) 718 + if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, 719 + get_start_sect(bdev) + offset)) 907 720 pr_notice("%s: Warning: Device %pg is misaligned\n", 908 - disk->disk_name, bdev); 909 - 910 - disk_update_readahead(disk); 721 + pfx, bdev); 911 722 } 912 - EXPORT_SYMBOL(disk_stack_limits); 723 + EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); 913 724 914 725 /** 915 726 * blk_queue_update_dma_pad - update pad mask
+1 -1
block/blk-stat.c
··· 27 27 /* src is a per-cpu stat, mean isn't initialized */ 28 28 void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) 29 29 { 30 - if (!src->nr_samples) 30 + if (dst->nr_samples + src->nr_samples <= dst->nr_samples) 31 31 return; 32 32 33 33 dst->min = min(dst->min, src->min);
+26 -33
block/blk-sysfs.c
··· 174 174 static ssize_t queue_discard_max_store(struct request_queue *q, 175 175 const char *page, size_t count) 176 176 { 177 - unsigned long max_discard; 178 - ssize_t ret = queue_var_store(&max_discard, page, count); 177 + unsigned long max_discard_bytes; 178 + struct queue_limits lim; 179 + ssize_t ret; 180 + int err; 179 181 182 + ret = queue_var_store(&max_discard_bytes, page, count); 180 183 if (ret < 0) 181 184 return ret; 182 185 183 - if (max_discard & (q->limits.discard_granularity - 1)) 186 + if (max_discard_bytes & (q->limits.discard_granularity - 1)) 184 187 return -EINVAL; 185 188 186 - max_discard >>= 9; 187 - if (max_discard > UINT_MAX) 189 + if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) 188 190 return -EINVAL; 189 191 190 - if (max_discard > q->limits.max_hw_discard_sectors) 191 - max_discard = q->limits.max_hw_discard_sectors; 192 + blk_mq_freeze_queue(q); 193 + lim = queue_limits_start_update(q); 194 + lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; 195 + err = queue_limits_commit_update(q, &lim); 196 + blk_mq_unfreeze_queue(q); 192 197 193 - q->limits.max_discard_sectors = max_discard; 198 + if (err) 199 + return err; 194 200 return ret; 195 201 } 196 202 ··· 232 226 static ssize_t 233 227 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 234 228 { 235 - unsigned long var; 236 - unsigned int max_sectors_kb, 237 - max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, 238 - page_kb = 1 << (PAGE_SHIFT - 10); 239 - ssize_t ret = queue_var_store(&var, page, count); 229 + unsigned long max_sectors_kb; 230 + struct queue_limits lim; 231 + ssize_t ret; 232 + int err; 240 233 234 + ret = queue_var_store(&max_sectors_kb, page, count); 241 235 if (ret < 0) 242 236 return ret; 243 237 244 - max_sectors_kb = (unsigned int)var; 245 - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, 246 - q->limits.max_dev_sectors >> 1); 247 - if (max_sectors_kb == 0) { 248 - q->limits.max_user_sectors = 0; 249 - 
max_sectors_kb = min(max_hw_sectors_kb, 250 - BLK_DEF_MAX_SECTORS_CAP >> 1); 251 - } else { 252 - if (max_sectors_kb > max_hw_sectors_kb || 253 - max_sectors_kb < page_kb) 254 - return -EINVAL; 255 - q->limits.max_user_sectors = max_sectors_kb << 1; 256 - } 257 - 258 - spin_lock_irq(&q->queue_lock); 259 - q->limits.max_sectors = max_sectors_kb << 1; 260 - if (q->disk) 261 - q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); 262 - spin_unlock_irq(&q->queue_lock); 263 - 238 + blk_mq_freeze_queue(q); 239 + lim = queue_limits_start_update(q); 240 + lim.max_user_sectors = max_sectors_kb << 1; 241 + err = queue_limits_commit_update(q, &lim); 242 + blk_mq_unfreeze_queue(q); 243 + if (err) 244 + return err; 264 245 return ret; 265 246 } 266 247
+5 -5
block/blk-throttle.c
··· 1098 1098 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 1099 1099 tg_may_dispatch(tg, bio, NULL)) { 1100 1100 1101 - tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1101 + tg_dispatch_one_bio(tg, READ); 1102 1102 nr_reads++; 1103 1103 1104 1104 if (nr_reads >= max_nr_reads) ··· 1108 1108 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 1109 1109 tg_may_dispatch(tg, bio, NULL)) { 1110 1110 1111 - tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1111 + tg_dispatch_one_bio(tg, WRITE); 1112 1112 nr_writes++; 1113 1113 1114 1114 if (nr_writes >= max_nr_writes) ··· 1815 1815 time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); 1816 1816 ret = tg->latency_target == DFL_LATENCY_TARGET || 1817 1817 tg->idletime_threshold == DFL_IDLE_THRESHOLD || 1818 - (ktime_get_ns() >> 10) - tg->last_finish_time > time || 1818 + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || 1819 1819 tg->avg_idletime > tg->idletime_threshold || 1820 1820 (tg->latency_target && tg->bio_cnt && 1821 1821 tg->bad_bio_cnt * 5 < tg->bio_cnt); ··· 2060 2060 if (last_finish_time == 0) 2061 2061 return; 2062 2062 2063 - now = ktime_get_ns() >> 10; 2063 + now = blk_time_get_ns() >> 10; 2064 2064 if (now <= last_finish_time || 2065 2065 last_finish_time == tg->checked_last_finish_time) 2066 2066 return; ··· 2327 2327 if (!tg->td->limit_valid[LIMIT_LOW]) 2328 2328 return; 2329 2329 2330 - finish_time_ns = ktime_get_ns(); 2330 + finish_time_ns = blk_time_get_ns(); 2331 2331 tg->last_finish_time = finish_time_ns >> 10; 2332 2332 2333 2333 start_time = bio_issue_time(&bio->bi_issue) >> 10;
+3 -3
block/blk-wbt.c
··· 29 29 #include "blk-wbt.h" 30 30 #include "blk-rq-qos.h" 31 31 #include "elevator.h" 32 + #include "blk.h" 32 33 33 34 #define CREATE_TRACE_POINTS 34 35 #include <trace/events/wbt.h> ··· 275 274 276 275 static u64 rwb_sync_issue_lat(struct rq_wb *rwb) 277 276 { 278 - u64 now, issue = READ_ONCE(rwb->sync_issue); 277 + u64 issue = READ_ONCE(rwb->sync_issue); 279 278 280 279 if (!issue || !rwb->sync_cookie) 281 280 return 0; 282 281 283 - now = ktime_to_ns(ktime_get()); 284 - return now - issue; 282 + return blk_time_get_ns() - issue; 285 283 } 286 284 287 285 static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+8 -12
block/blk-zoned.c
··· 11 11 12 12 #include <linux/kernel.h> 13 13 #include <linux/module.h> 14 - #include <linux/rbtree.h> 15 14 #include <linux/blkdev.h> 16 15 #include <linux/blk-mq.h> 17 16 #include <linux/mm.h> ··· 176 177 } 177 178 } 178 179 179 - static int blkdev_zone_reset_all_emulated(struct block_device *bdev, 180 - gfp_t gfp_mask) 180 + static int blkdev_zone_reset_all_emulated(struct block_device *bdev) 181 181 { 182 182 struct gendisk *disk = bdev->bd_disk; 183 183 sector_t capacity = bdev_nr_sectors(bdev); ··· 203 205 } 204 206 205 207 bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, 206 - gfp_mask); 208 + GFP_KERNEL); 207 209 bio->bi_iter.bi_sector = sector; 208 210 sector += zone_sectors; 209 211 ··· 221 223 return ret; 222 224 } 223 225 224 - static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) 226 + static int blkdev_zone_reset_all(struct block_device *bdev) 225 227 { 226 228 struct bio bio; 227 229 ··· 236 238 * @sector: Start sector of the first zone to operate on 237 239 * @nr_sectors: Number of sectors, should be at least the length of one zone and 238 240 * must be zone size aligned. 239 - * @gfp_mask: Memory allocation flags (for bio_alloc) 240 241 * 241 242 * Description: 242 243 * Perform the specified operation on the range of zones specified by ··· 245 248 * or finish request. 
246 249 */ 247 250 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, 248 - sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) 251 + sector_t sector, sector_t nr_sectors) 249 252 { 250 253 struct request_queue *q = bdev_get_queue(bdev); 251 254 sector_t zone_sectors = bdev_zone_sectors(bdev); ··· 282 285 */ 283 286 if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { 284 287 if (!blk_queue_zone_resetall(q)) 285 - return blkdev_zone_reset_all_emulated(bdev, gfp_mask); 286 - return blkdev_zone_reset_all(bdev, gfp_mask); 288 + return blkdev_zone_reset_all_emulated(bdev); 289 + return blkdev_zone_reset_all(bdev); 287 290 } 288 291 289 292 while (sector < end_sector) { 290 - bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); 293 + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); 291 294 bio->bi_iter.bi_sector = sector; 292 295 sector += zone_sectors; 293 296 ··· 416 419 return -ENOTTY; 417 420 } 418 421 419 - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, 420 - GFP_KERNEL); 422 + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); 421 423 422 424 fail: 423 425 if (cmd == BLKRESETZONE)
+83 -2
block/blk.h
··· 4 4 5 5 #include <linux/blk-crypto.h> 6 6 #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ 7 + #include <linux/sched/sysctl.h> 8 + #include <linux/timekeeping.h> 7 9 #include <xen/xen.h> 8 10 #include "blk-crypto-internal.h" 9 11 ··· 70 68 if (blk_try_enter_queue(q, false)) 71 69 return 0; 72 70 return __bio_queue_enter(q, bio); 71 + } 72 + 73 + static inline void blk_wait_io(struct completion *done) 74 + { 75 + /* Prevent hang_check timer from firing at us during very long I/O */ 76 + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; 77 + 78 + if (timeout) 79 + while (!wait_for_completion_io_timeout(done, timeout)) 80 + ; 81 + else 82 + wait_for_completion_io(done); 73 83 } 74 84 75 85 #define BIO_INLINE_VECS 4 ··· 343 329 bool blk_rq_merge_ok(struct request *rq, struct bio *bio); 344 330 enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); 345 331 346 - void blk_set_default_limits(struct queue_limits *lim); 332 + int blk_set_default_limits(struct queue_limits *lim); 347 333 int blk_dev_init(void); 348 334 349 335 /* ··· 461 447 unpin_user_page(page); 462 448 } 463 449 464 - struct request_queue *blk_alloc_queue(int node_id); 450 + struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); 465 451 466 452 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); 467 453 ··· 530 516 return atomic_read(&req->ref); 531 517 } 532 518 519 + static inline u64 blk_time_get_ns(void) 520 + { 521 + struct blk_plug *plug = current->plug; 522 + 523 + if (!plug) 524 + return ktime_get_ns(); 525 + 526 + /* 527 + * 0 could very well be a valid time, but rather than flag "this is 528 + * a valid timestamp" separately, just accept that we'll do an extra 529 + * ktime_get_ns() if we just happen to get 0 as the current time. 
530 + */ 531 + if (!plug->cur_ktime) { 532 + plug->cur_ktime = ktime_get_ns(); 533 + current->flags |= PF_BLOCK_TS; 534 + } 535 + return plug->cur_ktime; 536 + } 537 + 538 + static inline ktime_t blk_time_get(void) 539 + { 540 + return ns_to_ktime(blk_time_get_ns()); 541 + } 542 + 543 + /* 544 + * From most significant bit: 545 + * 1 bit: reserved for other usage, see below 546 + * 12 bits: original size of bio 547 + * 51 bits: issue time of bio 548 + */ 549 + #define BIO_ISSUE_RES_BITS 1 550 + #define BIO_ISSUE_SIZE_BITS 12 551 + #define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) 552 + #define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) 553 + #define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) 554 + #define BIO_ISSUE_SIZE_MASK \ 555 + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) 556 + #define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) 557 + 558 + /* Reserved bit for blk-throtl */ 559 + #define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) 560 + 561 + static inline u64 __bio_issue_time(u64 time) 562 + { 563 + return time & BIO_ISSUE_TIME_MASK; 564 + } 565 + 566 + static inline u64 bio_issue_time(struct bio_issue *issue) 567 + { 568 + return __bio_issue_time(issue->value); 569 + } 570 + 571 + static inline sector_t bio_issue_size(struct bio_issue *issue) 572 + { 573 + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); 574 + } 575 + 576 + static inline void bio_issue_init(struct bio_issue *issue, 577 + sector_t size) 578 + { 579 + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; 580 + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | 581 + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | 582 + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); 583 + } 584 + 533 585 void bdev_release(struct file *bdev_file); 534 586 int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, 535 587 const struct blk_holder_ops *hops, struct file *bdev_file); 536 588 int bdev_permission(dev_t dev, blk_mode_t 
mode, void *holder); 589 + 537 590 #endif /* BLK_INTERNAL_H */
+1 -1
block/bsg-lib.c
··· 383 383 if (blk_mq_alloc_tag_set(set)) 384 384 goto out_tag_set; 385 385 386 - q = blk_mq_init_queue(set); 386 + q = blk_mq_alloc_queue(set, NULL, NULL); 387 387 if (IS_ERR(q)) { 388 388 ret = PTR_ERR(q); 389 389 goto out_queue;
+8 -6
block/genhd.c
··· 1201 1201 return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); 1202 1202 } 1203 1203 1204 - struct class block_class = { 1204 + const struct class block_class = { 1205 1205 .name = "block", 1206 1206 .dev_uevent = block_uevent, 1207 1207 }; ··· 1391 1391 return NULL; 1392 1392 } 1393 1393 1394 - struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) 1394 + struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, 1395 + struct lock_class_key *lkclass) 1395 1396 { 1397 + struct queue_limits default_lim = { }; 1396 1398 struct request_queue *q; 1397 1399 struct gendisk *disk; 1398 1400 1399 - q = blk_alloc_queue(node); 1400 - if (!q) 1401 - return NULL; 1401 + q = blk_alloc_queue(lim ? lim : &default_lim, node); 1402 + if (IS_ERR(q)) 1403 + return ERR_CAST(q); 1402 1404 1403 1405 disk = __alloc_disk_node(q, node, lkclass); 1404 1406 if (!disk) { 1405 1407 blk_put_queue(q); 1406 - return NULL; 1408 + return ERR_PTR(-ENOMEM); 1407 1409 } 1408 1410 set_bit(GD_OWNS_QUEUE, &disk->state); 1409 1411 return disk;
+7 -5
block/holder.c
··· 8 8 int refcnt; 9 9 }; 10 10 11 + static DEFINE_MUTEX(blk_holder_mutex); 12 + 11 13 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, 12 14 struct gendisk *disk) 13 15 { ··· 82 80 kobject_get(bdev->bd_holder_dir); 83 81 mutex_unlock(&bdev->bd_disk->open_mutex); 84 82 85 - mutex_lock(&disk->open_mutex); 83 + mutex_lock(&blk_holder_mutex); 86 84 WARN_ON_ONCE(!bdev->bd_holder); 87 85 88 86 holder = bd_find_holder_disk(bdev, disk); ··· 110 108 goto out_del_symlink; 111 109 list_add(&holder->list, &disk->slave_bdevs); 112 110 113 - mutex_unlock(&disk->open_mutex); 111 + mutex_unlock(&blk_holder_mutex); 114 112 return 0; 115 113 116 114 out_del_symlink: ··· 118 116 out_free_holder: 119 117 kfree(holder); 120 118 out_unlock: 121 - mutex_unlock(&disk->open_mutex); 119 + mutex_unlock(&blk_holder_mutex); 122 120 if (ret) 123 121 kobject_put(bdev->bd_holder_dir); 124 122 return ret; ··· 142 140 if (WARN_ON_ONCE(!disk->slave_dir)) 143 141 return; 144 142 145 - mutex_lock(&disk->open_mutex); 143 + mutex_lock(&blk_holder_mutex); 146 144 holder = bd_find_holder_disk(bdev, disk); 147 145 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { 148 146 del_symlink(disk->slave_dir, bdev_kobj(bdev)); ··· 151 149 list_del_init(&holder->list); 152 150 kfree(holder); 153 151 } 154 - mutex_unlock(&disk->open_mutex); 152 + mutex_unlock(&blk_holder_mutex); 155 153 } 156 154 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+8 -1
block/ioctl.c
··· 18 18 { 19 19 struct gendisk *disk = bdev->bd_disk; 20 20 struct blkpg_partition p; 21 - sector_t start, length; 21 + sector_t start, length, capacity, end; 22 22 23 23 if (!capable(CAP_SYS_ADMIN)) 24 24 return -EACCES; ··· 41 41 42 42 start = p.start >> SECTOR_SHIFT; 43 43 length = p.length >> SECTOR_SHIFT; 44 + capacity = get_capacity(disk); 45 + 46 + if (check_add_overflow(start, length, &end)) 47 + return -EINVAL; 48 + 49 + if (start >= capacity || end > capacity) 50 + return -EINVAL; 44 51 45 52 switch (op) { 46 53 case BLKPG_ADD_PARTITION:
-11
block/partitions/core.c
··· 419 419 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, 420 420 sector_t length) 421 421 { 422 - sector_t capacity = get_capacity(disk), end; 423 422 struct block_device *part; 424 423 int ret; 425 424 426 425 mutex_lock(&disk->open_mutex); 427 - if (check_add_overflow(start, length, &end)) { 428 - ret = -EINVAL; 429 - goto out; 430 - } 431 - 432 - if (start >= capacity || end > capacity) { 433 - ret = -EINVAL; 434 - goto out; 435 - } 436 - 437 426 if (!disk_live(disk)) { 438 427 ret = -ENXIO; 439 428 goto out;
+2
block/partitions/mac.c
··· 20 20 * Code to understand MacOS partition tables. 21 21 */ 22 22 23 + #ifdef CONFIG_PPC_PMAC 23 24 static inline void mac_fix_string(char *stg, int len) 24 25 { 25 26 int i; ··· 28 27 for (i = len - 1; i >= 0 && stg[i] == ' '; i--) 29 28 stg[i] = 0; 30 29 } 30 + #endif 31 31 32 32 int mac_partition(struct parsed_partitions *state) 33 33 {
+7 -9
block/sed-opal.c
··· 1212 1212 static int start_opal_session_cont(struct opal_dev *dev) 1213 1213 { 1214 1214 u32 hsn, tsn; 1215 - int error = 0; 1215 + int error; 1216 1216 1217 1217 error = parse_and_check_status(dev); 1218 1218 if (error) ··· 1354 1354 { 1355 1355 const char *activekey; 1356 1356 size_t keylen; 1357 - int error = 0; 1357 + int error; 1358 1358 1359 1359 error = parse_and_check_status(dev); 1360 1360 if (error) ··· 2157 2157 u8 lr_buffer[OPAL_UID_LENGTH]; 2158 2158 struct opal_lock_unlock *lkul = data; 2159 2159 u8 read_locked = 1, write_locked = 1; 2160 - int err = 0; 2160 + int err; 2161 2161 2162 2162 if (build_locking_range(lr_buffer, sizeof(lr_buffer), 2163 2163 lkul->session.opal_key.lr) < 0) ··· 2580 2580 const struct opal_step discovery0_step = { 2581 2581 opal_discovery0, discv 2582 2582 }; 2583 - int ret = 0; 2583 + int ret; 2584 2584 2585 2585 mutex_lock(&dev->dev_lock); 2586 2586 setup_opal_dev(dev); ··· 3069 3069 { 3070 3070 struct opal_suspend_data *suspend; 3071 3071 bool was_failure = false; 3072 - int ret = 0; 3072 + int ret; 3073 3073 3074 3074 if (!dev) 3075 3075 return false; ··· 3112 3112 { read_table_data, rw_tbl }, 3113 3113 { end_opal_session, } 3114 3114 }; 3115 - int ret = 0; 3116 3115 3117 3116 if (!rw_tbl->size) 3118 - return ret; 3117 + return 0; 3119 3118 3120 3119 return execute_steps(dev, read_table_steps, 3121 3120 ARRAY_SIZE(read_table_steps)); ··· 3128 3129 { write_table_data, rw_tbl }, 3129 3130 { end_opal_session, } 3130 3131 }; 3131 - int ret = 0; 3132 3132 3133 3133 if (!rw_tbl->size) 3134 - return ret; 3134 + return 0; 3135 3135 3136 3136 return execute_steps(dev, write_table_steps, 3137 3137 ARRAY_SIZE(write_table_steps));
+48 -24
block/t10-pi.c
··· 12 12 #include <net/checksum.h> 13 13 #include <asm/unaligned.h> 14 14 15 - typedef __be16 (csum_fn) (void *, unsigned int); 15 + typedef __be16 (csum_fn) (__be16, void *, unsigned int); 16 16 17 - static __be16 t10_pi_crc_fn(void *data, unsigned int len) 17 + static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) 18 18 { 19 - return cpu_to_be16(crc_t10dif(data, len)); 19 + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); 20 20 } 21 21 22 - static __be16 t10_pi_ip_fn(void *data, unsigned int len) 22 + static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) 23 23 { 24 24 return (__force __be16)ip_compute_csum(data, len); 25 25 } ··· 32 32 static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, 33 33 csum_fn *fn, enum t10_dif_type type) 34 34 { 35 + u8 offset = iter->pi_offset; 35 36 unsigned int i; 36 37 37 38 for (i = 0 ; i < iter->data_size ; i += iter->interval) { 38 - struct t10_pi_tuple *pi = iter->prot_buf; 39 + struct t10_pi_tuple *pi = iter->prot_buf + offset; 39 40 40 - pi->guard_tag = fn(iter->data_buf, iter->interval); 41 + pi->guard_tag = fn(0, iter->data_buf, iter->interval); 42 + if (offset) 43 + pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, 44 + offset); 41 45 pi->app_tag = 0; 42 46 43 47 if (type == T10_PI_TYPE1_PROTECTION) ··· 60 56 static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, 61 57 csum_fn *fn, enum t10_dif_type type) 62 58 { 59 + u8 offset = iter->pi_offset; 63 60 unsigned int i; 64 61 65 62 BUG_ON(type == T10_PI_TYPE0_PROTECTION); 66 63 67 64 for (i = 0 ; i < iter->data_size ; i += iter->interval) { 68 - struct t10_pi_tuple *pi = iter->prot_buf; 65 + struct t10_pi_tuple *pi = iter->prot_buf + offset; 69 66 __be16 csum; 70 67 71 68 if (type == T10_PI_TYPE1_PROTECTION || ··· 88 83 goto next; 89 84 } 90 85 91 - csum = fn(iter->data_buf, iter->interval); 86 + csum = fn(0, iter->data_buf, iter->interval); 87 + if (offset) 88 + csum = fn(csum, iter->prot_buf, 
offset); 92 89 93 90 if (pi->guard_tag != csum) { 94 91 pr_err("%s: guard tag error at sector %llu " \ ··· 141 134 */ 142 135 static void t10_pi_type1_prepare(struct request *rq) 143 136 { 144 - const int tuple_sz = rq->q->integrity.tuple_size; 137 + struct blk_integrity *bi = &rq->q->integrity; 138 + const int tuple_sz = bi->tuple_size; 145 139 u32 ref_tag = t10_pi_ref_tag(rq); 140 + u8 offset = bi->pi_offset; 146 141 struct bio *bio; 147 142 148 143 __rq_for_each_bio(bio, rq) { ··· 163 154 164 155 p = bvec_kmap_local(&iv); 165 156 for (j = 0; j < iv.bv_len; j += tuple_sz) { 166 - struct t10_pi_tuple *pi = p; 157 + struct t10_pi_tuple *pi = p + offset; 167 158 168 159 if (be32_to_cpu(pi->ref_tag) == virt) 169 160 pi->ref_tag = cpu_to_be32(ref_tag); ··· 192 183 */ 193 184 static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) 194 185 { 195 - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; 196 - const int tuple_sz = rq->q->integrity.tuple_size; 186 + struct blk_integrity *bi = &rq->q->integrity; 187 + unsigned intervals = nr_bytes >> bi->interval_exp; 188 + const int tuple_sz = bi->tuple_size; 197 189 u32 ref_tag = t10_pi_ref_tag(rq); 190 + u8 offset = bi->pi_offset; 198 191 struct bio *bio; 199 192 200 193 __rq_for_each_bio(bio, rq) { ··· 211 200 212 201 p = bvec_kmap_local(&iv); 213 202 for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { 214 - struct t10_pi_tuple *pi = p; 203 + struct t10_pi_tuple *pi = p + offset; 215 204 216 205 if (be32_to_cpu(pi->ref_tag) == ref_tag) 217 206 pi->ref_tag = cpu_to_be32(virt); ··· 291 280 }; 292 281 EXPORT_SYMBOL(t10_pi_type3_ip); 293 282 294 - static __be64 ext_pi_crc64(void *data, unsigned int len) 283 + static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) 295 284 { 296 - return cpu_to_be64(crc64_rocksoft(data, len)); 285 + return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); 297 286 } 298 287 299 288 static blk_status_t ext_pi_crc64_generate(struct 
blk_integrity_iter *iter, 300 289 enum t10_dif_type type) 301 290 { 291 + u8 offset = iter->pi_offset; 302 292 unsigned int i; 303 293 304 294 for (i = 0 ; i < iter->data_size ; i += iter->interval) { 305 - struct crc64_pi_tuple *pi = iter->prot_buf; 295 + struct crc64_pi_tuple *pi = iter->prot_buf + offset; 306 296 307 - pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval); 297 + pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); 298 + if (offset) 299 + pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), 300 + iter->prot_buf, offset); 308 301 pi->app_tag = 0; 309 302 310 303 if (type == T10_PI_TYPE1_PROTECTION) ··· 334 319 static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, 335 320 enum t10_dif_type type) 336 321 { 322 + u8 offset = iter->pi_offset; 337 323 unsigned int i; 338 324 339 325 for (i = 0; i < iter->data_size; i += iter->interval) { 340 - struct crc64_pi_tuple *pi = iter->prot_buf; 326 + struct crc64_pi_tuple *pi = iter->prot_buf + offset; 341 327 u64 ref, seed; 342 328 __be64 csum; 343 329 ··· 359 343 goto next; 360 344 } 361 345 362 - csum = ext_pi_crc64(iter->data_buf, iter->interval); 346 + csum = ext_pi_crc64(0, iter->data_buf, iter->interval); 347 + if (offset) 348 + csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, 349 + offset); 350 + 363 351 if (pi->guard_tag != csum) { 364 352 pr_err("%s: guard tag error at sector %llu " \ 365 353 "(rcvd %016llx, want %016llx)\n", ··· 393 373 394 374 static void ext_pi_type1_prepare(struct request *rq) 395 375 { 396 - const int tuple_sz = rq->q->integrity.tuple_size; 376 + struct blk_integrity *bi = &rq->q->integrity; 377 + const int tuple_sz = bi->tuple_size; 397 378 u64 ref_tag = ext_pi_ref_tag(rq); 379 + u8 offset = bi->pi_offset; 398 380 struct bio *bio; 399 381 400 382 __rq_for_each_bio(bio, rq) { ··· 415 393 416 394 p = bvec_kmap_local(&iv); 417 395 for (j = 0; j < iv.bv_len; j += tuple_sz) { 418 - struct crc64_pi_tuple *pi = p; 396 + struct 
crc64_pi_tuple *pi = p + offset; 419 397 u64 ref = get_unaligned_be48(pi->ref_tag); 420 398 421 399 if (ref == virt) ··· 433 411 434 412 static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) 435 413 { 436 - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; 437 - const int tuple_sz = rq->q->integrity.tuple_size; 414 + struct blk_integrity *bi = &rq->q->integrity; 415 + unsigned intervals = nr_bytes >> bi->interval_exp; 416 + const int tuple_sz = bi->tuple_size; 438 417 u64 ref_tag = ext_pi_ref_tag(rq); 418 + u8 offset = bi->pi_offset; 439 419 struct bio *bio; 440 420 441 421 __rq_for_each_bio(bio, rq) { ··· 452 428 453 429 p = bvec_kmap_local(&iv); 454 430 for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { 455 - struct crc64_pi_tuple *pi = p; 431 + struct crc64_pi_tuple *pi = p + offset; 456 432 u64 ref = get_unaligned_be48(pi->ref_tag); 457 433 458 434 if (ref == ref_tag)
+1 -1
drivers/base/base.h
··· 207 207 #endif 208 208 209 209 #ifdef CONFIG_BLOCK 210 - extern struct class block_class; 210 + extern const struct class block_class; 211 211 static inline bool is_blockdev(struct device *dev) 212 212 { 213 213 return dev->class == &block_class;
+1 -1
drivers/block/amiflop.c
··· 1779 1779 struct gendisk *disk; 1780 1780 int err; 1781 1781 1782 - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); 1782 + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); 1783 1783 if (IS_ERR(disk)) 1784 1784 return PTR_ERR(disk); 1785 1785
+7 -8
drivers/block/aoe/aoeblk.c
··· 24 24 static struct kmem_cache *buf_pool_cache; 25 25 static struct dentry *aoe_debugfs_dir; 26 26 27 - /* GPFS needs a larger value than the default. */ 28 - static int aoe_maxsectors; 27 + /* random default picked from the historic block max_sectors cap */ 28 + static int aoe_maxsectors = 2560; 29 29 module_param(aoe_maxsectors, int, 0644); 30 30 MODULE_PARM_DESC(aoe_maxsectors, 31 31 "When nonzero, set the maximum number of sectors per I/O request"); ··· 334 334 mempool_t *mp; 335 335 struct blk_mq_tag_set *set; 336 336 sector_t ssize; 337 + struct queue_limits lim = { 338 + .max_hw_sectors = aoe_maxsectors, 339 + .io_opt = SZ_2M, 340 + }; 337 341 ulong flags; 338 342 int late = 0; 339 343 int err; ··· 375 371 goto err_mempool; 376 372 } 377 373 378 - gd = blk_mq_alloc_disk(set, d); 374 + gd = blk_mq_alloc_disk(set, &lim, d); 379 375 if (IS_ERR(gd)) { 380 376 pr_err("aoe: cannot allocate block queue for %ld.%d\n", 381 377 d->aoemajor, d->aoeminor); ··· 388 384 WARN_ON(d->flags & DEVFL_TKILL); 389 385 WARN_ON(d->gd); 390 386 WARN_ON(d->flags & DEVFL_UP); 391 - /* random number picked from the history block max_sectors cap */ 392 - blk_queue_max_hw_sectors(gd->queue, 2560u); 393 - blk_queue_io_opt(gd->queue, SZ_2M); 394 387 d->bufpool = mp; 395 388 d->blkq = gd->queue; 396 389 d->gd = gd; 397 - if (aoe_maxsectors) 398 - blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors); 399 390 gd->major = AOE_MAJOR; 400 391 gd->first_minor = d->sysminor; 401 392 gd->minors = AOE_PARTITIONS;
+6 -6
drivers/block/aoe/aoecmd.c
··· 419 419 rcu_read_lock(); 420 420 for_each_netdev_rcu(&init_net, ifp) { 421 421 dev_hold(ifp); 422 - if (!is_aoe_netif(ifp)) 423 - goto cont; 422 + if (!is_aoe_netif(ifp)) { 423 + dev_put(ifp); 424 + continue; 425 + } 424 426 425 427 skb = new_skb(sizeof *h + sizeof *ch); 426 428 if (skb == NULL) { 427 429 printk(KERN_INFO "aoe: skb alloc failure\n"); 428 - goto cont; 430 + dev_put(ifp); 431 + continue; 429 432 } 430 433 skb_put(skb, sizeof *h + sizeof *ch); 431 434 skb->dev = ifp; ··· 443 440 h->major = cpu_to_be16(aoemajor); 444 441 h->minor = aoeminor; 445 442 h->cmd = AOECMD_CFG; 446 - 447 - cont: 448 - dev_put(ifp); 449 443 } 450 444 rcu_read_unlock(); 451 445 }
+1
drivers/block/aoe/aoenet.c
··· 63 63 pr_warn("aoe: packet could not be sent on %s. %s\n", 64 64 ifp ? ifp->name : "netif", 65 65 "consider increasing tx_queue_len"); 66 + dev_put(ifp); 66 67 spin_lock_irq(&txlock); 67 68 } 68 69 return 0;
+1 -1
drivers/block/ataflop.c
··· 1994 1994 { 1995 1995 struct gendisk *disk; 1996 1996 1997 - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); 1997 + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); 1998 1998 if (IS_ERR(disk)) 1999 1999 return PTR_ERR(disk); 2000 2000
+14 -12
drivers/block/brd.c
··· 318 318 struct gendisk *disk; 319 319 char buf[DISK_NAME_LEN]; 320 320 int err = -ENOMEM; 321 + struct queue_limits lim = { 322 + /* 323 + * This is so fdisk will align partitions on 4k, because of 324 + * direct_access API needing 4k alignment, returning a PFN 325 + * (This is only a problem on very small devices <= 4M, 326 + * otherwise fdisk will align on 1M. Regardless this call 327 + * is harmless) 328 + */ 329 + .physical_block_size = PAGE_SIZE, 330 + }; 321 331 322 332 list_for_each_entry(brd, &brd_devices, brd_list) 323 333 if (brd->brd_number == i) ··· 345 335 debugfs_create_u64(buf, 0444, brd_debugfs_dir, 346 336 &brd->brd_nr_pages); 347 337 348 - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); 349 - if (!disk) 338 + disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 339 + if (IS_ERR(disk)) { 340 + err = PTR_ERR(disk); 350 341 goto out_free_dev; 351 - 342 + } 352 343 disk->major = RAMDISK_MAJOR; 353 344 disk->first_minor = i * max_part; 354 345 disk->minors = max_part; ··· 358 347 strscpy(disk->disk_name, buf, DISK_NAME_LEN); 359 348 set_capacity(disk, rd_size * 2); 360 349 361 - /* 362 - * This is so fdisk will align partitions on 4k, because of 363 - * direct_access API needing 4k alignment, returning a PFN 364 - * (This is only a problem on very small devices <= 4M, 365 - * otherwise fdisk will align on 1M. Regardless this call 366 - * is harmless) 367 - */ 368 - blk_queue_physical_block_size(disk->queue, PAGE_SIZE); 369 - 370 350 /* Tell the block layer that this is not a rotational device */ 371 351 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 372 352 blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
+12 -5
drivers/block/drbd/drbd_main.c
··· 2690 2690 int id; 2691 2691 int vnr = adm_ctx->volume; 2692 2692 enum drbd_ret_code err = ERR_NOMEM; 2693 + struct queue_limits lim = { 2694 + /* 2695 + * Setting the max_hw_sectors to an odd value of 8kibyte here. 2696 + * This triggers a max_bio_size message upon first attach or 2697 + * connect. 2698 + */ 2699 + .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, 2700 + }; 2693 2701 2694 2702 device = minor_to_device(minor); 2695 2703 if (device) ··· 2716 2708 2717 2709 drbd_init_set_defaults(device); 2718 2710 2719 - disk = blk_alloc_disk(NUMA_NO_NODE); 2720 - if (!disk) 2711 + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 2712 + if (IS_ERR(disk)) { 2713 + err = PTR_ERR(disk); 2721 2714 goto out_no_disk; 2715 + } 2722 2716 2723 2717 device->vdisk = disk; 2724 2718 device->rq_queue = disk->queue; ··· 2737 2727 2738 2728 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); 2739 2729 blk_queue_write_cache(disk->queue, true, true); 2740 - /* Setting the max_hw_sectors to an odd value of 8kibyte here 2741 - This triggers a max_bio_size message upon first attach or connect */ 2742 - blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8); 2743 2730 2744 2731 device->md_io.page = alloc_page(GFP_KERNEL); 2745 2732 if (!device->md_io.page)
+112 -120
drivers/block/drbd/drbd_nl.c
··· 1189 1189 return 0; 1190 1190 } 1191 1191 1192 - static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) 1192 + static unsigned int drbd_max_peer_bio_size(struct drbd_device *device) 1193 1193 { 1194 - q->limits.discard_granularity = granularity; 1194 + /* 1195 + * We may ignore peer limits if the peer is modern enough. From 8.3.8 1196 + * onwards the peer can use multiple BIOs for a single peer_request. 1197 + */ 1198 + if (device->state.conn < C_WF_REPORT_PARAMS) 1199 + return device->peer_max_bio_size; 1200 + 1201 + if (first_peer_device(device)->connection->agreed_pro_version < 94) 1202 + return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); 1203 + 1204 + /* 1205 + * Correct old drbd (up to 8.3.7) if it believes it can do more than 1206 + * 32KiB. 1207 + */ 1208 + if (first_peer_device(device)->connection->agreed_pro_version == 94) 1209 + return DRBD_MAX_SIZE_H80_PACKET; 1210 + 1211 + /* 1212 + * drbd 8.3.8 onwards, before 8.4.0 1213 + */ 1214 + if (first_peer_device(device)->connection->agreed_pro_version < 100) 1215 + return DRBD_MAX_BIO_SIZE_P95; 1216 + return DRBD_MAX_BIO_SIZE; 1195 1217 } 1196 1218 1197 1219 static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) ··· 1226 1204 return AL_EXTENT_SIZE >> 9; 1227 1205 } 1228 1206 1229 - static void decide_on_discard_support(struct drbd_device *device, 1207 + static bool drbd_discard_supported(struct drbd_connection *connection, 1230 1208 struct drbd_backing_dev *bdev) 1231 1209 { 1232 - struct drbd_connection *connection = 1233 - first_peer_device(device)->connection; 1234 - struct request_queue *q = device->rq_queue; 1235 - unsigned int max_discard_sectors; 1236 - 1237 1210 if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) 1238 - goto not_supported; 1211 + return false; 1239 1212 1240 1213 if (connection->cstate >= C_CONNECTED && 1241 1214 !(connection->agreed_features & DRBD_FF_TRIM)) { 1242 1215 drbd_info(connection, 1243 
1216 "peer DRBD too old, does not support TRIM: disabling discards\n"); 1244 - goto not_supported; 1217 + return false; 1245 1218 } 1219 + 1220 + return true; 1221 + } 1222 + 1223 + /* This is the workaround for "bio would need to, but cannot, be split" */ 1224 + static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device) 1225 + { 1226 + unsigned int max_segments; 1227 + 1228 + rcu_read_lock(); 1229 + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; 1230 + rcu_read_unlock(); 1231 + 1232 + if (!max_segments) 1233 + return BLK_MAX_SEGMENTS; 1234 + return max_segments; 1235 + } 1236 + 1237 + void drbd_reconsider_queue_parameters(struct drbd_device *device, 1238 + struct drbd_backing_dev *bdev, struct o_qlim *o) 1239 + { 1240 + struct drbd_connection *connection = 1241 + first_peer_device(device)->connection; 1242 + struct request_queue * const q = device->rq_queue; 1243 + unsigned int now = queue_max_hw_sectors(q) << 9; 1244 + struct queue_limits lim; 1245 + struct request_queue *b = NULL; 1246 + unsigned int new; 1247 + 1248 + if (bdev) { 1249 + b = bdev->backing_bdev->bd_disk->queue; 1250 + 1251 + device->local_max_bio_size = 1252 + queue_max_hw_sectors(b) << SECTOR_SHIFT; 1253 + } 1254 + 1255 + /* 1256 + * We may later detach and re-attach on a disconnected Primary. Avoid 1257 + * decreasing the value in this case. 1258 + * 1259 + * We want to store what we know the peer DRBD can handle, not what the 1260 + * peer IO backend can handle. 
1261 + */ 1262 + new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size, 1263 + max(drbd_max_peer_bio_size(device), device->peer_max_bio_size)); 1264 + if (new != now) { 1265 + if (device->state.role == R_PRIMARY && new < now) 1266 + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", 1267 + new, now); 1268 + drbd_info(device, "max BIO size = %u\n", new); 1269 + } 1270 + 1271 + lim = queue_limits_start_update(q); 1272 + if (bdev) { 1273 + blk_set_stacking_limits(&lim); 1274 + lim.max_segments = drbd_backing_dev_max_segments(device); 1275 + } else { 1276 + lim.max_segments = BLK_MAX_SEGMENTS; 1277 + } 1278 + 1279 + lim.max_hw_sectors = new >> SECTOR_SHIFT; 1280 + lim.seg_boundary_mask = PAGE_SIZE - 1; 1246 1281 1247 1282 /* 1248 1283 * We don't care for the granularity, really. ··· 1309 1230 * problem, really. If you care, you need to use devices with similar 1310 1231 * topology on all peers. 1311 1232 */ 1312 - blk_queue_discard_granularity(q, 512); 1313 - max_discard_sectors = drbd_max_discard_sectors(connection); 1314 - blk_queue_max_discard_sectors(q, max_discard_sectors); 1315 - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); 1316 - return; 1233 + if (drbd_discard_supported(connection, bdev)) { 1234 + lim.discard_granularity = 512; 1235 + lim.max_hw_discard_sectors = 1236 + drbd_max_discard_sectors(connection); 1237 + } else { 1238 + lim.discard_granularity = 0; 1239 + lim.max_hw_discard_sectors = 0; 1240 + } 1317 1241 1318 - not_supported: 1319 - blk_queue_discard_granularity(q, 0); 1320 - blk_queue_max_discard_sectors(q, 0); 1321 - } 1242 + if (bdev) 1243 + blk_stack_limits(&lim, &b->limits, 0); 1322 1244 1323 - static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) 1324 - { 1325 - /* Fixup max_write_zeroes_sectors after blk_stack_limits(): 1326 - * if we can handle "zeroes" efficiently on the protocol, 1327 - * we want to do that, even if our backend does not announce 1328 - * max_write_zeroes_sectors itself. 
*/ 1329 - struct drbd_connection *connection = first_peer_device(device)->connection; 1330 - /* If the peer announces WZEROES support, use it. Otherwise, rather 1331 - * send explicit zeroes than rely on some discard-zeroes-data magic. */ 1245 + /* 1246 + * If we can handle "zeroes" efficiently on the protocol, we want to do 1247 + * that, even if our backend does not announce max_write_zeroes_sectors 1248 + * itself. 1249 + */ 1332 1250 if (connection->agreed_features & DRBD_FF_WZEROES) 1333 - q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; 1251 + lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; 1334 1252 else 1335 - q->limits.max_write_zeroes_sectors = 0; 1336 - } 1253 + lim.max_write_zeroes_sectors = 0; 1337 1254 1338 - static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) 1339 - { 1340 - unsigned int max_discard = device->rq_queue->limits.max_discard_sectors; 1341 - unsigned int discard_granularity = 1342 - device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT; 1343 - 1344 - if (discard_granularity > max_discard) { 1345 - blk_queue_discard_granularity(q, 0); 1346 - blk_queue_max_discard_sectors(q, 0); 1347 - } 1348 - } 1349 - 1350 - static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, 1351 - unsigned int max_bio_size, struct o_qlim *o) 1352 - { 1353 - struct request_queue * const q = device->rq_queue; 1354 - unsigned int max_hw_sectors = max_bio_size >> 9; 1355 - unsigned int max_segments = 0; 1356 - struct request_queue *b = NULL; 1357 - struct disk_conf *dc; 1358 - 1359 - if (bdev) { 1360 - b = bdev->backing_bdev->bd_disk->queue; 1361 - 1362 - max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 1363 - rcu_read_lock(); 1364 - dc = rcu_dereference(device->ldev->disk_conf); 1365 - max_segments = dc->max_bio_bvecs; 1366 - rcu_read_unlock(); 1367 - 1368 - blk_set_stacking_limits(&q->limits); 1255 + if ((lim.discard_granularity >> SECTOR_SHIFT) > 1256 + 
lim.max_hw_discard_sectors) { 1257 + lim.discard_granularity = 0; 1258 + lim.max_hw_discard_sectors = 0; 1369 1259 } 1370 1260 1371 - blk_queue_max_hw_sectors(q, max_hw_sectors); 1372 - /* This is the workaround for "bio would need to, but cannot, be split" */ 1373 - blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); 1374 - blk_queue_segment_boundary(q, PAGE_SIZE-1); 1375 - decide_on_discard_support(device, bdev); 1376 - 1377 - if (b) { 1378 - blk_stack_limits(&q->limits, &b->limits, 0); 1379 - disk_update_readahead(device->vdisk); 1380 - } 1381 - fixup_write_zeroes(device, q); 1382 - fixup_discard_support(device, q); 1383 - } 1384 - 1385 - void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) 1386 - { 1387 - unsigned int now, new, local, peer; 1388 - 1389 - now = queue_max_hw_sectors(device->rq_queue) << 9; 1390 - local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ 1391 - peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ 1392 - 1393 - if (bdev) { 1394 - local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; 1395 - device->local_max_bio_size = local; 1396 - } 1397 - local = min(local, DRBD_MAX_BIO_SIZE); 1398 - 1399 - /* We may ignore peer limits if the peer is modern enough. 
1400 - Because new from 8.3.8 onwards the peer can use multiple 1401 - BIOs for a single peer_request */ 1402 - if (device->state.conn >= C_WF_REPORT_PARAMS) { 1403 - if (first_peer_device(device)->connection->agreed_pro_version < 94) 1404 - peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); 1405 - /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ 1406 - else if (first_peer_device(device)->connection->agreed_pro_version == 94) 1407 - peer = DRBD_MAX_SIZE_H80_PACKET; 1408 - else if (first_peer_device(device)->connection->agreed_pro_version < 100) 1409 - peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ 1410 - else 1411 - peer = DRBD_MAX_BIO_SIZE; 1412 - 1413 - /* We may later detach and re-attach on a disconnected Primary. 1414 - * Avoid this setting to jump back in that case. 1415 - * We want to store what we know the peer DRBD can handle, 1416 - * not what the peer IO backend can handle. */ 1417 - if (peer > device->peer_max_bio_size) 1418 - device->peer_max_bio_size = peer; 1419 - } 1420 - new = min(local, peer); 1421 - 1422 - if (device->state.role == R_PRIMARY && new < now) 1423 - drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); 1424 - 1425 - if (new != now) 1426 - drbd_info(device, "max BIO size = %u\n", new); 1427 - 1428 - drbd_setup_queue_param(device, bdev, new, o); 1261 + if (queue_limits_commit_update(q, &lim)) 1262 + drbd_err(device, "setting new queue limits failed\n"); 1429 1263 } 1430 1264 1431 1265 /* Starts the worker thread */
+14 -10
drivers/block/drbd/drbd_state.c
··· 1542 1542 1543 1543 int notify_resource_state_change(struct sk_buff *skb, 1544 1544 unsigned int seq, 1545 - struct drbd_resource_state_change *resource_state_change, 1545 + void *state_change, 1546 1546 enum drbd_notification_type type) 1547 1547 { 1548 + struct drbd_resource_state_change *resource_state_change = state_change; 1548 1549 struct drbd_resource *resource = resource_state_change->resource; 1549 1550 struct resource_info resource_info = { 1550 1551 .res_role = resource_state_change->role[NEW], ··· 1559 1558 1560 1559 int notify_connection_state_change(struct sk_buff *skb, 1561 1560 unsigned int seq, 1562 - struct drbd_connection_state_change *connection_state_change, 1561 + void *state_change, 1563 1562 enum drbd_notification_type type) 1564 1563 { 1565 - struct drbd_connection *connection = connection_state_change->connection; 1564 + struct drbd_connection_state_change *p = state_change; 1565 + struct drbd_connection *connection = p->connection; 1566 1566 struct connection_info connection_info = { 1567 - .conn_connection_state = connection_state_change->cstate[NEW], 1568 - .conn_role = connection_state_change->peer_role[NEW], 1567 + .conn_connection_state = p->cstate[NEW], 1568 + .conn_role = p->peer_role[NEW], 1569 1569 }; 1570 1570 1571 1571 return notify_connection_state(skb, seq, connection, &connection_info, type); ··· 1574 1572 1575 1573 int notify_device_state_change(struct sk_buff *skb, 1576 1574 unsigned int seq, 1577 - struct drbd_device_state_change *device_state_change, 1575 + void *state_change, 1578 1576 enum drbd_notification_type type) 1579 1577 { 1578 + struct drbd_device_state_change *device_state_change = state_change; 1580 1579 struct drbd_device *device = device_state_change->device; 1581 1580 struct device_info device_info = { 1582 1581 .dev_disk_state = device_state_change->disk_state[NEW], ··· 1588 1585 1589 1586 int notify_peer_device_state_change(struct sk_buff *skb, 1590 1587 unsigned int seq, 1591 - struct 
drbd_peer_device_state_change *p, 1588 + void *state_change, 1592 1589 enum drbd_notification_type type) 1593 1590 { 1591 + struct drbd_peer_device_state_change *p = state_change; 1594 1592 struct drbd_peer_device *peer_device = p->peer_device; 1595 1593 struct peer_device_info peer_device_info = { 1596 1594 .peer_repl_state = p->repl_state[NEW], ··· 1609 1605 struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; 1610 1606 bool resource_state_has_changed; 1611 1607 unsigned int n_device, n_connection, n_peer_device, n_peer_devices; 1612 - int (*last_func)(struct sk_buff *, unsigned int, void *, 1613 - enum drbd_notification_type) = NULL; 1608 + int (*last_func)(struct sk_buff *, unsigned int, 1609 + void *, enum drbd_notification_type) = NULL; 1614 1610 void *last_arg = NULL; 1615 1611 1616 1612 #define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) ··· 1620 1616 }) 1621 1617 #define REMEMBER_STATE_CHANGE(func, arg, type) \ 1622 1618 ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ 1623 - last_func = (typeof(last_func))func; \ 1619 + last_func = func; \ 1624 1620 last_arg = arg; \ 1625 1621 }) 1626 1622
+4 -4
drivers/block/drbd/drbd_state_change.h
··· 46 46 47 47 extern int notify_resource_state_change(struct sk_buff *, 48 48 unsigned int, 49 - struct drbd_resource_state_change *, 49 + void *, 50 50 enum drbd_notification_type type); 51 51 extern int notify_connection_state_change(struct sk_buff *, 52 52 unsigned int, 53 - struct drbd_connection_state_change *, 53 + void *, 54 54 enum drbd_notification_type type); 55 55 extern int notify_device_state_change(struct sk_buff *, 56 56 unsigned int, 57 - struct drbd_device_state_change *, 57 + void *, 58 58 enum drbd_notification_type type); 59 59 extern int notify_peer_device_state_change(struct sk_buff *, 60 60 unsigned int, 61 - struct drbd_peer_device_state_change *, 61 + void *, 62 62 enum drbd_notification_type type); 63 63 64 64 #endif /* DRBD_STATE_CHANGE_H */
+11 -6
drivers/block/floppy.c
··· 530 530 static char *floppy_track_buffer; 531 531 static int max_buffer_sectors; 532 532 533 - typedef void (*done_f)(int); 534 533 static const struct cont_t { 535 534 void (*interrupt)(void); 536 535 /* this is called after the interrupt of the 537 536 * main command */ 538 537 void (*redo)(void); /* this is called to retry the operation */ 539 538 void (*error)(void); /* this is called to tally an error */ 540 - done_f done; /* this is called to say if the operation has 539 + void (*done)(int); /* this is called to say if the operation has 541 540 * succeeded/failed */ 542 541 } *cont; 543 542 ··· 981 982 } 982 983 983 984 static void empty(void) 985 + { 986 + } 987 + 988 + static void empty_done(int result) 984 989 { 985 990 } 986 991 ··· 2001 1998 .interrupt = empty, 2002 1999 .redo = do_wakeup, 2003 2000 .error = empty, 2004 - .done = (done_f)empty 2001 + .done = empty_done, 2005 2002 }; 2006 2003 2007 2004 static const struct cont_t intr_cont = { 2008 2005 .interrupt = empty, 2009 2006 .redo = process_fd_request, 2010 2007 .error = empty, 2011 - .done = (done_f)empty 2008 + .done = empty_done, 2012 2009 }; 2013 2010 2014 2011 /* schedules handler, waiting for completion. May be interrupted, will then ··· 4516 4513 4517 4514 static int floppy_alloc_disk(unsigned int drive, unsigned int type) 4518 4515 { 4516 + struct queue_limits lim = { 4517 + .max_hw_sectors = 64, 4518 + }; 4519 4519 struct gendisk *disk; 4520 4520 4521 - disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); 4521 + disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL); 4522 4522 if (IS_ERR(disk)) 4523 4523 return PTR_ERR(disk); 4524 4524 4525 - blk_queue_max_hw_sectors(disk->queue, 64); 4526 4525 disk->major = FLOPPY_MAJOR; 4527 4526 disk->first_minor = TOMINOR(drive) | (type << 2); 4528 4527 disk->minors = 1;
+38 -37
drivers/block/loop.c
··· 750 750 &loop_attribute_group); 751 751 } 752 752 753 - static void loop_config_discard(struct loop_device *lo) 753 + static void loop_config_discard(struct loop_device *lo, 754 + struct queue_limits *lim) 754 755 { 755 756 struct file *file = lo->lo_backing_file; 756 757 struct inode *inode = file->f_mapping->host; 757 - struct request_queue *q = lo->lo_queue; 758 - u32 granularity, max_discard_sectors; 758 + u32 granularity = 0, max_discard_sectors = 0; 759 + struct kstatfs sbuf; 759 760 760 761 /* 761 762 * If the backing device is a block device, mirror its zeroing ··· 776 775 * We use punch hole to reclaim the free space used by the 777 776 * image a.k.a. discard. 778 777 */ 779 - } else if (!file->f_op->fallocate) { 780 - max_discard_sectors = 0; 781 - granularity = 0; 782 - 783 - } else { 784 - struct kstatfs sbuf; 785 - 778 + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { 786 779 max_discard_sectors = UINT_MAX >> 9; 787 - if (!vfs_statfs(&file->f_path, &sbuf)) 788 - granularity = sbuf.f_bsize; 789 - else 790 - max_discard_sectors = 0; 780 + granularity = sbuf.f_bsize; 791 781 } 792 782 793 - if (max_discard_sectors) { 794 - q->limits.discard_granularity = granularity; 795 - blk_queue_max_discard_sectors(q, max_discard_sectors); 796 - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); 797 - } else { 798 - q->limits.discard_granularity = 0; 799 - blk_queue_max_discard_sectors(q, 0); 800 - blk_queue_max_write_zeroes_sectors(q, 0); 801 - } 783 + lim->max_hw_discard_sectors = max_discard_sectors; 784 + lim->max_write_zeroes_sectors = max_discard_sectors; 785 + if (max_discard_sectors) 786 + lim->discard_granularity = granularity; 787 + else 788 + lim->discard_granularity = 0; 802 789 } 803 790 804 791 struct loop_worker { ··· 975 986 return 0; 976 987 } 977 988 989 + static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, 990 + bool update_discard_settings) 991 + { 992 + struct queue_limits lim; 993 
+ 994 + lim = queue_limits_start_update(lo->lo_queue); 995 + lim.logical_block_size = bsize; 996 + lim.physical_block_size = bsize; 997 + lim.io_min = bsize; 998 + if (update_discard_settings) 999 + loop_config_discard(lo, &lim); 1000 + return queue_limits_commit_update(lo->lo_queue, &lim); 1001 + } 1002 + 978 1003 static int loop_configure(struct loop_device *lo, blk_mode_t mode, 979 1004 struct block_device *bdev, 980 1005 const struct loop_config *config) ··· 1086 1083 else 1087 1084 bsize = 512; 1088 1085 1089 - blk_queue_logical_block_size(lo->lo_queue, bsize); 1090 - blk_queue_physical_block_size(lo->lo_queue, bsize); 1091 - blk_queue_io_min(lo->lo_queue, bsize); 1086 + error = loop_reconfigure_limits(lo, bsize, true); 1087 + if (WARN_ON_ONCE(error)) 1088 + goto out_unlock; 1092 1089 1093 - loop_config_discard(lo); 1094 1090 loop_update_rotational(lo); 1095 1091 loop_update_dio(lo); 1096 1092 loop_sysfs_init(lo); ··· 1156 1154 lo->lo_offset = 0; 1157 1155 lo->lo_sizelimit = 0; 1158 1156 memset(lo->lo_file_name, 0, LO_NAME_SIZE); 1159 - blk_queue_logical_block_size(lo->lo_queue, 512); 1160 - blk_queue_physical_block_size(lo->lo_queue, 512); 1161 - blk_queue_io_min(lo->lo_queue, 512); 1157 + loop_reconfigure_limits(lo, 512, false); 1162 1158 invalidate_disk(lo->lo_disk); 1163 1159 loop_sysfs_exit(lo); 1164 1160 /* let user-space know about this change */ ··· 1488 1488 invalidate_bdev(lo->lo_device); 1489 1489 1490 1490 blk_mq_freeze_queue(lo->lo_queue); 1491 - blk_queue_logical_block_size(lo->lo_queue, arg); 1492 - blk_queue_physical_block_size(lo->lo_queue, arg); 1493 - blk_queue_io_min(lo->lo_queue, arg); 1491 + err = loop_reconfigure_limits(lo, arg, false); 1494 1492 loop_update_dio(lo); 1495 1493 blk_mq_unfreeze_queue(lo->lo_queue); 1496 1494 ··· 1980 1982 1981 1983 static int loop_add(int i) 1982 1984 { 1985 + struct queue_limits lim = { 1986 + /* 1987 + * Random number picked from the historic block max_sectors cap. 
1988 + */ 1989 + .max_hw_sectors = 2560u, 1990 + }; 1983 1991 struct loop_device *lo; 1984 1992 struct gendisk *disk; 1985 1993 int err; ··· 2029 2025 if (err) 2030 2026 goto out_free_idr; 2031 2027 2032 - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); 2028 + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); 2033 2029 if (IS_ERR(disk)) { 2034 2030 err = PTR_ERR(disk); 2035 2031 goto out_cleanup_tags; 2036 2032 } 2037 2033 lo->lo_queue = lo->lo_disk->queue; 2038 - 2039 - /* random number picked from the history block max_sectors cap */ 2040 - blk_queue_max_hw_sectors(lo->lo_queue, 2560u); 2041 2034 2042 2035 /* 2043 2036 * By default, we do buffer IO, so it doesn't make sense to enable
+7 -6
drivers/block/mtip32xx/mtip32xx.c
··· 3401 3401 */ 3402 3402 static int mtip_block_initialize(struct driver_data *dd) 3403 3403 { 3404 + struct queue_limits lim = { 3405 + .physical_block_size = 4096, 3406 + .max_hw_sectors = 0xffff, 3407 + .max_segments = MTIP_MAX_SG, 3408 + .max_segment_size = 0x400000, 3409 + }; 3404 3410 int rv = 0, wait_for_rebuild = 0; 3405 3411 sector_t capacity; 3406 3412 unsigned int index = 0; ··· 3437 3431 goto block_queue_alloc_tag_error; 3438 3432 } 3439 3433 3440 - dd->disk = blk_mq_alloc_disk(&dd->tags, dd); 3434 + dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd); 3441 3435 if (IS_ERR(dd->disk)) { 3442 3436 dev_err(&dd->pdev->dev, 3443 3437 "Unable to allocate request queue\n"); ··· 3487 3481 /* Set device limits. */ 3488 3482 blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); 3489 3483 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); 3490 - blk_queue_max_segments(dd->queue, MTIP_MAX_SG); 3491 - blk_queue_physical_block_size(dd->queue, 4096); 3492 - blk_queue_max_hw_sectors(dd->queue, 0xffff); 3493 - blk_queue_max_segment_size(dd->queue, 0x400000); 3494 3484 dma_set_max_seg_size(&dd->pdev->dev, 0x400000); 3495 - blk_queue_io_min(dd->queue, 4096); 3496 3485 3497 3486 /* Set the capacity of the device in 512 byte sectors. */ 3498 3487 if (!(mtip_hw_get_capacity(dd, &capacity))) {
+8 -4
drivers/block/n64cart.c
··· 114 114 */ 115 115 static int __init n64cart_probe(struct platform_device *pdev) 116 116 { 117 + struct queue_limits lim = { 118 + .physical_block_size = 4096, 119 + .logical_block_size = 4096, 120 + }; 117 121 struct gendisk *disk; 118 122 int err = -ENOMEM; 119 123 ··· 135 131 if (IS_ERR(reg_base)) 136 132 return PTR_ERR(reg_base); 137 133 138 - disk = blk_alloc_disk(NUMA_NO_NODE); 139 - if (!disk) 134 + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 135 + if (IS_ERR(disk)) { 136 + err = PTR_ERR(disk); 140 137 goto out; 138 + } 141 139 142 140 disk->first_minor = 0; 143 141 disk->flags = GENHD_FL_NO_PART; ··· 151 145 set_disk_ro(disk, 1); 152 146 153 147 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 154 - blk_queue_physical_block_size(disk->queue, 4096); 155 - blk_queue_logical_block_size(disk->queue, 4096); 156 148 157 149 err = add_disk(disk); 158 150 if (err)
+38 -11
drivers/block/nbd.c
··· 316 316 nsock->sent = 0; 317 317 } 318 318 319 - static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, 319 + static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, 320 320 loff_t blksize) 321 321 { 322 + struct queue_limits lim; 323 + int error; 324 + 322 325 if (!blksize) 323 326 blksize = 1u << NBD_DEF_BLKSIZE_BITS; 324 327 ··· 337 334 if (!nbd->pid) 338 335 return 0; 339 336 337 + lim = queue_limits_start_update(nbd->disk->queue); 340 338 if (nbd->config->flags & NBD_FLAG_SEND_TRIM) 341 - blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); 342 - blk_queue_logical_block_size(nbd->disk->queue, blksize); 343 - blk_queue_physical_block_size(nbd->disk->queue, blksize); 339 + lim.max_hw_discard_sectors = UINT_MAX; 340 + else 341 + lim.max_hw_discard_sectors = 0; 342 + lim.logical_block_size = blksize; 343 + lim.physical_block_size = blksize; 344 + error = queue_limits_commit_update(nbd->disk->queue, &lim); 345 + if (error) 346 + return error; 344 347 345 348 if (max_part) 346 349 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 347 350 if (!set_capacity_and_notify(nbd->disk, bytesize >> 9)) 348 351 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); 349 352 return 0; 353 + } 354 + 355 + static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, 356 + loff_t blksize) 357 + { 358 + int error; 359 + 360 + blk_mq_freeze_queue(nbd->disk->queue); 361 + error = __nbd_set_size(nbd, bytesize, blksize); 362 + blk_mq_unfreeze_queue(nbd->disk->queue); 363 + 364 + return error; 350 365 } 351 366 352 367 static void nbd_complete_rq(struct request *req) ··· 1372 1351 nbd->config = NULL; 1373 1352 1374 1353 nbd->tag_set.timeout = 0; 1375 - blk_queue_max_discard_sectors(nbd->disk->queue, 0); 1376 1354 1377 1355 mutex_unlock(&nbd->config_lock); 1378 1356 nbd_put(nbd); ··· 1803 1783 1804 1784 static struct nbd_device *nbd_dev_add(int index, unsigned int refs) 1805 1785 { 1786 + struct queue_limits lim = { 1787 + .max_hw_sectors = 65536, 1788 + 
.max_user_sectors = 256, 1789 + .max_segments = USHRT_MAX, 1790 + .max_segment_size = UINT_MAX, 1791 + }; 1806 1792 struct nbd_device *nbd; 1807 1793 struct gendisk *disk; 1808 1794 int err = -ENOMEM; ··· 1849 1823 if (err < 0) 1850 1824 goto out_free_tags; 1851 1825 1852 - disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); 1826 + disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL); 1853 1827 if (IS_ERR(disk)) { 1854 1828 err = PTR_ERR(disk); 1855 1829 goto out_free_idr; ··· 1869 1843 * Tell the block layer that we are not a rotational device 1870 1844 */ 1871 1845 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 1872 - blk_queue_max_discard_sectors(disk->queue, 0); 1873 - blk_queue_max_segment_size(disk->queue, UINT_MAX); 1874 - blk_queue_max_segments(disk->queue, USHRT_MAX); 1875 - blk_queue_max_hw_sectors(disk->queue, 65536); 1876 - disk->queue->limits.max_sectors = 256; 1877 1846 1878 1847 mutex_init(&nbd->config_lock); 1879 1848 refcount_set(&nbd->config_refs, 0); ··· 2454 2433 } 2455 2434 2456 2435 dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); 2436 + if (!dev_list) { 2437 + nlmsg_free(reply); 2438 + ret = -EMSGSIZE; 2439 + goto out; 2440 + } 2441 + 2457 2442 if (index == -1) { 2458 2443 ret = idr_for_each(&nbd_index_idr, &status_cb, reply); 2459 2444 if (ret) {
+146 -405
drivers/block/null_blk/main.c
··· 115 115 MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); 116 116 #endif 117 117 118 + /* 119 + * Historic queue modes. 120 + * 121 + * These days nothing but NULL_Q_MQ is actually supported, but we keep it the 122 + * enum for error reporting. 123 + */ 124 + enum { 125 + NULL_Q_BIO = 0, 126 + NULL_Q_RQ = 1, 127 + NULL_Q_MQ = 2, 128 + }; 129 + 118 130 static int g_queue_mode = NULL_Q_MQ; 119 131 120 132 static int null_param_store_val(const char *str, int *val, int min, int max) ··· 177 165 module_param_named(blocking, g_blocking, bool, 0444); 178 166 MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 179 167 180 - static bool shared_tags; 181 - module_param(shared_tags, bool, 0444); 168 + static bool g_shared_tags; 169 + module_param_named(shared_tags, g_shared_tags, bool, 0444); 182 170 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); 183 171 184 172 static bool g_shared_tag_bitmap; ··· 438 426 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); 439 427 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); 440 428 NULLB_DEVICE_ATTR(no_sched, bool, NULL); 429 + NULLB_DEVICE_ATTR(shared_tags, bool, NULL); 441 430 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); 442 431 443 432 static ssize_t nullb_device_power_show(struct config_item *item, char *page) ··· 584 571 &nullb_device_attr_zone_offline, 585 572 &nullb_device_attr_virt_boundary, 586 573 &nullb_device_attr_no_sched, 574 + &nullb_device_attr_shared_tags, 587 575 &nullb_device_attr_shared_tag_bitmap, 588 576 NULL, 589 577 }; ··· 667 653 "badblocks,blocking,blocksize,cache_size," 668 654 "completion_nsec,discard,home_node,hw_queue_depth," 669 655 "irqmode,max_sectors,mbps,memory_backed,no_sched," 670 - "poll_queues,power,queue_mode,shared_tag_bitmap,size," 671 - "submit_queues,use_per_node_hctx,virt_boundary,zoned," 672 - "zone_capacity,zone_max_active,zone_max_open," 673 - 
"zone_nr_conv,zone_offline,zone_readonly,zone_size\n"); 656 + "poll_queues,power,queue_mode,shared_tag_bitmap," 657 + "shared_tags,size,submit_queues,use_per_node_hctx," 658 + "virt_boundary,zoned,zone_capacity,zone_max_active," 659 + "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," 660 + "zone_size\n"); 674 661 } 675 662 676 663 CONFIGFS_ATTR_RO(memb_group_, features); ··· 753 738 dev->zone_max_active = g_zone_max_active; 754 739 dev->virt_boundary = g_virt_boundary; 755 740 dev->no_sched = g_no_sched; 741 + dev->shared_tags = g_shared_tags; 756 742 dev->shared_tag_bitmap = g_shared_tag_bitmap; 757 743 return dev; 758 744 } ··· 768 752 kfree(dev); 769 753 } 770 754 771 - static void put_tag(struct nullb_queue *nq, unsigned int tag) 772 - { 773 - clear_bit_unlock(tag, nq->tag_map); 774 - 775 - if (waitqueue_active(&nq->wait)) 776 - wake_up(&nq->wait); 777 - } 778 - 779 - static unsigned int get_tag(struct nullb_queue *nq) 780 - { 781 - unsigned int tag; 782 - 783 - do { 784 - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); 785 - if (tag >= nq->queue_depth) 786 - return -1U; 787 - } while (test_and_set_bit_lock(tag, nq->tag_map)); 788 - 789 - return tag; 790 - } 791 - 792 - static void free_cmd(struct nullb_cmd *cmd) 793 - { 794 - put_tag(cmd->nq, cmd->tag); 795 - } 796 - 797 - static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); 798 - 799 - static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) 800 - { 801 - struct nullb_cmd *cmd; 802 - unsigned int tag; 803 - 804 - tag = get_tag(nq); 805 - if (tag != -1U) { 806 - cmd = &nq->cmds[tag]; 807 - cmd->tag = tag; 808 - cmd->error = BLK_STS_OK; 809 - cmd->nq = nq; 810 - if (nq->dev->irqmode == NULL_IRQ_TIMER) { 811 - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, 812 - HRTIMER_MODE_REL); 813 - cmd->timer.function = null_cmd_timer_expired; 814 - } 815 - return cmd; 816 - } 817 - 818 - return NULL; 819 - } 820 - 821 - static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct 
bio *bio) 822 - { 823 - struct nullb_cmd *cmd; 824 - DEFINE_WAIT(wait); 825 - 826 - do { 827 - /* 828 - * This avoids multiple return statements, multiple calls to 829 - * __alloc_cmd() and a fast path call to prepare_to_wait(). 830 - */ 831 - cmd = __alloc_cmd(nq); 832 - if (cmd) { 833 - cmd->bio = bio; 834 - return cmd; 835 - } 836 - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); 837 - io_schedule(); 838 - finish_wait(&nq->wait, &wait); 839 - } while (1); 840 - } 841 - 842 - static void end_cmd(struct nullb_cmd *cmd) 843 - { 844 - int queue_mode = cmd->nq->dev->queue_mode; 845 - 846 - switch (queue_mode) { 847 - case NULL_Q_MQ: 848 - blk_mq_end_request(cmd->rq, cmd->error); 849 - return; 850 - case NULL_Q_BIO: 851 - cmd->bio->bi_status = cmd->error; 852 - bio_endio(cmd->bio); 853 - break; 854 - } 855 - 856 - free_cmd(cmd); 857 - } 858 - 859 755 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 860 756 { 861 - end_cmd(container_of(timer, struct nullb_cmd, timer)); 757 + struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); 862 758 759 + blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); 863 760 return HRTIMER_NORESTART; 864 761 } 865 762 ··· 785 856 786 857 static void null_complete_rq(struct request *rq) 787 858 { 788 - end_cmd(blk_mq_rq_to_pdu(rq)); 859 + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); 860 + 861 + blk_mq_end_request(rq, cmd->error); 789 862 } 790 863 791 864 static struct nullb_page *null_alloc_page(void) ··· 1204 1273 1205 1274 static int null_handle_rq(struct nullb_cmd *cmd) 1206 1275 { 1207 - struct request *rq = cmd->rq; 1276 + struct request *rq = blk_mq_rq_from_pdu(cmd); 1208 1277 struct nullb *nullb = cmd->nq->dev->nullb; 1209 1278 int err; 1210 1279 unsigned int len; ··· 1229 1298 return 0; 1230 1299 } 1231 1300 1232 - static int null_handle_bio(struct nullb_cmd *cmd) 1233 - { 1234 - struct bio *bio = cmd->bio; 1235 - struct nullb *nullb = cmd->nq->dev->nullb; 1236 - int err; 1237 - 
unsigned int len; 1238 - sector_t sector = bio->bi_iter.bi_sector; 1239 - struct bio_vec bvec; 1240 - struct bvec_iter iter; 1241 - 1242 - spin_lock_irq(&nullb->lock); 1243 - bio_for_each_segment(bvec, bio, iter) { 1244 - len = bvec.bv_len; 1245 - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1246 - op_is_write(bio_op(bio)), sector, 1247 - bio->bi_opf & REQ_FUA); 1248 - if (err) { 1249 - spin_unlock_irq(&nullb->lock); 1250 - return err; 1251 - } 1252 - sector += len >> SECTOR_SHIFT; 1253 - } 1254 - spin_unlock_irq(&nullb->lock); 1255 - return 0; 1256 - } 1257 - 1258 - static void null_stop_queue(struct nullb *nullb) 1259 - { 1260 - struct request_queue *q = nullb->q; 1261 - 1262 - if (nullb->dev->queue_mode == NULL_Q_MQ) 1263 - blk_mq_stop_hw_queues(q); 1264 - } 1265 - 1266 - static void null_restart_queue_async(struct nullb *nullb) 1267 - { 1268 - struct request_queue *q = nullb->q; 1269 - 1270 - if (nullb->dev->queue_mode == NULL_Q_MQ) 1271 - blk_mq_start_stopped_hw_queues(q, true); 1272 - } 1273 - 1274 1301 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) 1275 1302 { 1276 1303 struct nullb_device *dev = cmd->nq->dev; 1277 1304 struct nullb *nullb = dev->nullb; 1278 1305 blk_status_t sts = BLK_STS_OK; 1279 - struct request *rq = cmd->rq; 1306 + struct request *rq = blk_mq_rq_from_pdu(cmd); 1280 1307 1281 1308 if (!hrtimer_active(&nullb->bw_timer)) 1282 1309 hrtimer_restart(&nullb->bw_timer); 1283 1310 1284 1311 if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { 1285 - null_stop_queue(nullb); 1312 + blk_mq_stop_hw_queues(nullb->q); 1286 1313 /* race with timer */ 1287 1314 if (atomic_long_read(&nullb->cur_bytes) > 0) 1288 - null_restart_queue_async(nullb); 1315 + blk_mq_start_stopped_hw_queues(nullb->q, true); 1289 1316 /* requeue request */ 1290 1317 sts = BLK_STS_DEV_RESOURCE; 1291 1318 } ··· 1270 1381 sector_t nr_sectors) 1271 1382 { 1272 1383 struct nullb_device *dev = cmd->nq->dev; 1273 - int err; 
1274 1384 1275 1385 if (op == REQ_OP_DISCARD) 1276 1386 return null_handle_discard(dev, sector, nr_sectors); 1387 + return errno_to_blk_status(null_handle_rq(cmd)); 1277 1388 1278 - if (dev->queue_mode == NULL_Q_BIO) 1279 - err = null_handle_bio(cmd); 1280 - else 1281 - err = null_handle_rq(cmd); 1282 - 1283 - return errno_to_blk_status(err); 1284 1389 } 1285 1390 1286 1391 static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) 1287 1392 { 1393 + struct request *rq = blk_mq_rq_from_pdu(cmd); 1288 1394 struct nullb_device *dev = cmd->nq->dev; 1289 1395 struct bio *bio; 1290 1396 1291 - if (dev->memory_backed) 1292 - return; 1293 - 1294 - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { 1295 - zero_fill_bio(cmd->bio); 1296 - } else if (req_op(cmd->rq) == REQ_OP_READ) { 1297 - __rq_for_each_bio(bio, cmd->rq) 1397 + if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { 1398 + __rq_for_each_bio(bio, rq) 1298 1399 zero_fill_bio(bio); 1299 1400 } 1300 1401 } 1301 1402 1302 1403 static inline void nullb_complete_cmd(struct nullb_cmd *cmd) 1303 1404 { 1405 + struct request *rq = blk_mq_rq_from_pdu(cmd); 1406 + 1304 1407 /* 1305 1408 * Since root privileges are required to configure the null_blk 1306 1409 * driver, it is fine that this driver does not initialize the ··· 1306 1425 /* Complete IO by inline, softirq or timer */ 1307 1426 switch (cmd->nq->dev->irqmode) { 1308 1427 case NULL_IRQ_SOFTIRQ: 1309 - switch (cmd->nq->dev->queue_mode) { 1310 - case NULL_Q_MQ: 1311 - blk_mq_complete_request(cmd->rq); 1312 - break; 1313 - case NULL_Q_BIO: 1314 - /* 1315 - * XXX: no proper submitting cpu information available. 
1316 - */ 1317 - end_cmd(cmd); 1318 - break; 1319 - } 1428 + blk_mq_complete_request(rq); 1320 1429 break; 1321 1430 case NULL_IRQ_NONE: 1322 - end_cmd(cmd); 1431 + blk_mq_end_request(rq, cmd->error); 1323 1432 break; 1324 1433 case NULL_IRQ_TIMER: 1325 1434 null_cmd_end_timer(cmd); ··· 1370 1499 return HRTIMER_NORESTART; 1371 1500 1372 1501 atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); 1373 - null_restart_queue_async(nullb); 1502 + blk_mq_start_stopped_hw_queues(nullb->q, true); 1374 1503 1375 1504 hrtimer_forward_now(&nullb->bw_timer, timer_interval); 1376 1505 ··· 1385 1514 nullb->bw_timer.function = nullb_bwtimer_fn; 1386 1515 atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); 1387 1516 hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); 1388 - } 1389 - 1390 - static struct nullb_queue *nullb_to_queue(struct nullb *nullb) 1391 - { 1392 - int index = 0; 1393 - 1394 - if (nullb->nr_queues != 1) 1395 - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); 1396 - 1397 - return &nullb->queues[index]; 1398 - } 1399 - 1400 - static void null_submit_bio(struct bio *bio) 1401 - { 1402 - sector_t sector = bio->bi_iter.bi_sector; 1403 - sector_t nr_sectors = bio_sectors(bio); 1404 - struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; 1405 - struct nullb_queue *nq = nullb_to_queue(nullb); 1406 - 1407 - null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); 1408 1517 } 1409 1518 1410 1519 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ··· 1506 1655 blk_rq_sectors(req)); 1507 1656 if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, 1508 1657 blk_mq_end_request_batch)) 1509 - end_cmd(cmd); 1658 + blk_mq_end_request(req, cmd->error); 1510 1659 nr++; 1511 1660 } 1512 1661 ··· 1562 1711 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1563 1712 cmd->timer.function = null_cmd_timer_expired; 1564 1713 } 1565 - cmd->rq = rq; 1566 1714 cmd->error = BLK_STS_OK; 
1567 1715 cmd->nq = nq; 1568 1716 cmd->fake_timeout = should_timeout_request(rq) || ··· 1620 1770 *rqlist = requeue_list; 1621 1771 } 1622 1772 1623 - static void cleanup_queue(struct nullb_queue *nq) 1624 - { 1625 - bitmap_free(nq->tag_map); 1626 - kfree(nq->cmds); 1627 - } 1628 - 1629 - static void cleanup_queues(struct nullb *nullb) 1630 - { 1631 - int i; 1632 - 1633 - for (i = 0; i < nullb->nr_queues; i++) 1634 - cleanup_queue(&nullb->queues[i]); 1635 - 1636 - kfree(nullb->queues); 1637 - } 1638 - 1639 - static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1640 - { 1641 - struct nullb_queue *nq = hctx->driver_data; 1642 - struct nullb *nullb = nq->dev->nullb; 1643 - 1644 - nullb->nr_queues--; 1645 - } 1646 - 1647 1773 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 1648 1774 { 1649 - init_waitqueue_head(&nq->wait); 1650 - nq->queue_depth = nullb->queue_depth; 1651 1775 nq->dev = nullb->dev; 1652 1776 INIT_LIST_HEAD(&nq->poll_list); 1653 1777 spin_lock_init(&nq->poll_lock); ··· 1639 1815 nq = &nullb->queues[hctx_idx]; 1640 1816 hctx->driver_data = nq; 1641 1817 null_init_queue(nullb, nq); 1642 - nullb->nr_queues++; 1643 1818 1644 1819 return 0; 1645 1820 } ··· 1651 1828 .poll = null_poll, 1652 1829 .map_queues = null_map_queues, 1653 1830 .init_hctx = null_init_hctx, 1654 - .exit_hctx = null_exit_hctx, 1655 1831 }; 1656 1832 1657 1833 static void null_del_dev(struct nullb *nullb) ··· 1671 1849 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1672 1850 hrtimer_cancel(&nullb->bw_timer); 1673 1851 atomic_long_set(&nullb->cur_bytes, LONG_MAX); 1674 - null_restart_queue_async(nullb); 1852 + blk_mq_start_stopped_hw_queues(nullb->q, true); 1675 1853 } 1676 1854 1677 1855 put_disk(nullb->disk); 1678 - if (dev->queue_mode == NULL_Q_MQ && 1679 - nullb->tag_set == &nullb->__tag_set) 1856 + if (nullb->tag_set == &nullb->__tag_set) 1680 1857 blk_mq_free_tag_set(nullb->tag_set); 1681 - cleanup_queues(nullb); 1858 + 
kfree(nullb->queues); 1682 1859 if (null_cache_active(nullb)) 1683 1860 null_free_device_storage(nullb->dev, true); 1684 1861 kfree(nullb); 1685 1862 dev->nullb = NULL; 1686 1863 } 1687 1864 1688 - static void null_config_discard(struct nullb *nullb) 1865 + static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) 1689 1866 { 1690 1867 if (nullb->dev->discard == false) 1691 1868 return; ··· 1701 1880 return; 1702 1881 } 1703 1882 1704 - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); 1883 + lim->max_hw_discard_sectors = UINT_MAX >> 9; 1705 1884 } 1706 1885 1707 - static const struct block_device_operations null_bio_ops = { 1708 - .owner = THIS_MODULE, 1709 - .submit_bio = null_submit_bio, 1710 - .report_zones = null_report_zones, 1711 - }; 1712 - 1713 - static const struct block_device_operations null_rq_ops = { 1886 + static const struct block_device_operations null_ops = { 1714 1887 .owner = THIS_MODULE, 1715 1888 .report_zones = null_report_zones, 1716 1889 }; 1717 - 1718 - static int setup_commands(struct nullb_queue *nq) 1719 - { 1720 - struct nullb_cmd *cmd; 1721 - int i; 1722 - 1723 - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); 1724 - if (!nq->cmds) 1725 - return -ENOMEM; 1726 - 1727 - nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); 1728 - if (!nq->tag_map) { 1729 - kfree(nq->cmds); 1730 - return -ENOMEM; 1731 - } 1732 - 1733 - for (i = 0; i < nq->queue_depth; i++) { 1734 - cmd = &nq->cmds[i]; 1735 - cmd->tag = -1U; 1736 - } 1737 - 1738 - return 0; 1739 - } 1740 1890 1741 1891 static int setup_queues(struct nullb *nullb) 1742 1892 { ··· 1721 1929 if (!nullb->queues) 1722 1930 return -ENOMEM; 1723 1931 1724 - nullb->queue_depth = nullb->dev->hw_queue_depth; 1725 1932 return 0; 1726 1933 } 1727 1934 1728 - static int init_driver_queues(struct nullb *nullb) 1935 + static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) 1729 1936 { 1730 - struct nullb_queue *nq; 1731 - int i, ret = 0; 
1732 - 1733 - for (i = 0; i < nullb->dev->submit_queues; i++) { 1734 - nq = &nullb->queues[i]; 1735 - 1736 - null_init_queue(nullb, nq); 1737 - 1738 - ret = setup_commands(nq); 1739 - if (ret) 1740 - return ret; 1741 - nullb->nr_queues++; 1742 - } 1743 - return 0; 1744 - } 1745 - 1746 - static int null_gendisk_register(struct nullb *nullb) 1747 - { 1748 - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; 1749 - struct gendisk *disk = nullb->disk; 1750 - 1751 - set_capacity(disk, size); 1752 - 1753 - disk->major = null_major; 1754 - disk->first_minor = nullb->index; 1755 - disk->minors = 1; 1756 - if (queue_is_mq(nullb->q)) 1757 - disk->fops = &null_rq_ops; 1758 - else 1759 - disk->fops = &null_bio_ops; 1760 - disk->private_data = nullb; 1761 - strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 1762 - 1763 - if (nullb->dev->zoned) { 1764 - int ret = null_register_zoned_dev(nullb); 1765 - 1766 - if (ret) 1767 - return ret; 1768 - } 1769 - 1770 - return add_disk(disk); 1771 - } 1772 - 1773 - static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) 1774 - { 1775 - unsigned int flags = BLK_MQ_F_SHOULD_MERGE; 1776 - int hw_queues, numa_node; 1777 - unsigned int queue_depth; 1778 - int poll_queues; 1779 - 1780 - if (nullb) { 1781 - hw_queues = nullb->dev->submit_queues; 1782 - poll_queues = nullb->dev->poll_queues; 1783 - queue_depth = nullb->dev->hw_queue_depth; 1784 - numa_node = nullb->dev->home_node; 1785 - if (nullb->dev->no_sched) 1786 - flags |= BLK_MQ_F_NO_SCHED; 1787 - if (nullb->dev->shared_tag_bitmap) 1788 - flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1789 - if (nullb->dev->blocking) 1790 - flags |= BLK_MQ_F_BLOCKING; 1791 - } else { 1792 - hw_queues = g_submit_queues; 1793 - poll_queues = g_poll_queues; 1794 - queue_depth = g_hw_queue_depth; 1795 - numa_node = g_home_node; 1796 - if (g_no_sched) 1797 - flags |= BLK_MQ_F_NO_SCHED; 1798 - if (g_shared_tag_bitmap) 1799 - flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1800 - if 
(g_blocking) 1801 - flags |= BLK_MQ_F_BLOCKING; 1802 - } 1803 - 1804 1937 set->ops = &null_mq_ops; 1805 - set->cmd_size = sizeof(struct nullb_cmd); 1806 - set->flags = flags; 1807 - set->driver_data = nullb; 1808 - set->nr_hw_queues = hw_queues; 1809 - set->queue_depth = queue_depth; 1810 - set->numa_node = numa_node; 1938 + set->cmd_size = sizeof(struct nullb_cmd); 1939 + set->timeout = 5 * HZ; 1940 + set->nr_maps = 1; 1811 1941 if (poll_queues) { 1812 1942 set->nr_hw_queues += poll_queues; 1813 - set->nr_maps = 3; 1814 - } else { 1815 - set->nr_maps = 1; 1943 + set->nr_maps += 2; 1944 + } 1945 + return blk_mq_alloc_tag_set(set); 1946 + } 1947 + 1948 + static int null_init_global_tag_set(void) 1949 + { 1950 + int error; 1951 + 1952 + if (tag_set.ops) 1953 + return 0; 1954 + 1955 + tag_set.nr_hw_queues = g_submit_queues; 1956 + tag_set.queue_depth = g_hw_queue_depth; 1957 + tag_set.numa_node = g_home_node; 1958 + tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 1959 + if (g_no_sched) 1960 + tag_set.flags |= BLK_MQ_F_NO_SCHED; 1961 + if (g_shared_tag_bitmap) 1962 + tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1963 + if (g_blocking) 1964 + tag_set.flags |= BLK_MQ_F_BLOCKING; 1965 + 1966 + error = null_init_tag_set(&tag_set, g_poll_queues); 1967 + if (error) 1968 + tag_set.ops = NULL; 1969 + return error; 1970 + } 1971 + 1972 + static int null_setup_tagset(struct nullb *nullb) 1973 + { 1974 + if (nullb->dev->shared_tags) { 1975 + nullb->tag_set = &tag_set; 1976 + return null_init_global_tag_set(); 1816 1977 } 1817 1978 1818 - return blk_mq_alloc_tag_set(set); 1979 + nullb->tag_set = &nullb->__tag_set; 1980 + nullb->tag_set->driver_data = nullb; 1981 + nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; 1982 + nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; 1983 + nullb->tag_set->numa_node = nullb->dev->home_node; 1984 + nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; 1985 + if (nullb->dev->no_sched) 1986 + nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; 1987 + if 
(nullb->dev->shared_tag_bitmap) 1988 + nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1989 + if (nullb->dev->blocking) 1990 + nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; 1991 + return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); 1819 1992 } 1820 1993 1821 1994 static int null_validate_conf(struct nullb_device *dev) ··· 1789 2032 pr_err("legacy IO path is no longer available\n"); 1790 2033 return -EINVAL; 1791 2034 } 2035 + if (dev->queue_mode == NULL_Q_BIO) { 2036 + pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); 2037 + dev->queue_mode = NULL_Q_MQ; 2038 + } 1792 2039 1793 2040 dev->blocksize = round_down(dev->blocksize, 512); 1794 2041 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1795 2042 1796 - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 2043 + if (dev->use_per_node_hctx) { 1797 2044 if (dev->submit_queues != nr_online_nodes) 1798 2045 dev->submit_queues = nr_online_nodes; 1799 2046 } else if (dev->submit_queues > nr_cpu_ids) ··· 1809 2048 if (dev->poll_queues > g_poll_queues) 1810 2049 dev->poll_queues = g_poll_queues; 1811 2050 dev->prev_poll_queues = dev->poll_queues; 1812 - 1813 - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); 1814 2051 dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); 1815 2052 1816 2053 /* Do memory allocation, so set blocking */ ··· 1819 2060 dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, 1820 2061 dev->cache_size); 1821 2062 dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); 1822 - /* can not stop a queue */ 1823 - if (dev->queue_mode == NULL_Q_BIO) 1824 - dev->mbps = 0; 1825 2063 1826 2064 if (dev->zoned && 1827 2065 (!dev->zone_size || !is_power_of_2(dev->zone_size))) { ··· 1858 2102 1859 2103 static int null_add_dev(struct nullb_device *dev) 1860 2104 { 2105 + struct queue_limits lim = { 2106 + .logical_block_size = dev->blocksize, 2107 + .physical_block_size = dev->blocksize, 2108 + 
.max_hw_sectors = dev->max_sectors, 2109 + }; 2110 + 1861 2111 struct nullb *nullb; 1862 2112 int rv; 1863 2113 ··· 1885 2123 if (rv) 1886 2124 goto out_free_nullb; 1887 2125 1888 - if (dev->queue_mode == NULL_Q_MQ) { 1889 - if (shared_tags) { 1890 - nullb->tag_set = &tag_set; 1891 - rv = 0; 1892 - } else { 1893 - nullb->tag_set = &nullb->__tag_set; 1894 - rv = null_init_tag_set(nullb, nullb->tag_set); 1895 - } 2126 + rv = null_setup_tagset(nullb); 2127 + if (rv) 2128 + goto out_cleanup_queues; 1896 2129 2130 + if (dev->virt_boundary) 2131 + lim.virt_boundary_mask = PAGE_SIZE - 1; 2132 + null_config_discard(nullb, &lim); 2133 + if (dev->zoned) { 2134 + rv = null_init_zoned_dev(dev, &lim); 1897 2135 if (rv) 1898 - goto out_cleanup_queues; 1899 - 1900 - nullb->tag_set->timeout = 5 * HZ; 1901 - nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); 1902 - if (IS_ERR(nullb->disk)) { 1903 - rv = PTR_ERR(nullb->disk); 1904 2136 goto out_cleanup_tags; 1905 - } 1906 - nullb->q = nullb->disk->queue; 1907 - } else if (dev->queue_mode == NULL_Q_BIO) { 1908 - rv = -ENOMEM; 1909 - nullb->disk = blk_alloc_disk(nullb->dev->home_node); 1910 - if (!nullb->disk) 1911 - goto out_cleanup_queues; 1912 - 1913 - nullb->q = nullb->disk->queue; 1914 - rv = init_driver_queues(nullb); 1915 - if (rv) 1916 - goto out_cleanup_disk; 1917 2137 } 2138 + 2139 + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); 2140 + if (IS_ERR(nullb->disk)) { 2141 + rv = PTR_ERR(nullb->disk); 2142 + goto out_cleanup_zone; 2143 + } 2144 + nullb->q = nullb->disk->queue; 1918 2145 1919 2146 if (dev->mbps) { 1920 2147 set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); ··· 1915 2164 blk_queue_write_cache(nullb->q, true, true); 1916 2165 } 1917 2166 1918 - if (dev->zoned) { 1919 - rv = null_init_zoned_dev(dev, nullb->q); 1920 - if (rv) 1921 - goto out_cleanup_disk; 1922 - } 1923 - 1924 2167 nullb->q->queuedata = nullb; 1925 2168 blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); 1926 2169 ··· 1922 2177 rv = 
ida_alloc(&nullb_indexes, GFP_KERNEL); 1923 2178 if (rv < 0) { 1924 2179 mutex_unlock(&lock); 1925 - goto out_cleanup_zone; 2180 + goto out_cleanup_disk; 1926 2181 } 1927 2182 nullb->index = rv; 1928 2183 dev->index = rv; 1929 2184 mutex_unlock(&lock); 1930 - 1931 - blk_queue_logical_block_size(nullb->q, dev->blocksize); 1932 - blk_queue_physical_block_size(nullb->q, dev->blocksize); 1933 - if (dev->max_sectors) 1934 - blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); 1935 - 1936 - if (dev->virt_boundary) 1937 - blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1); 1938 - 1939 - null_config_discard(nullb); 1940 2185 1941 2186 if (config_item_name(&dev->group.cg_item)) { 1942 2187 /* Use configfs dir name as the device name */ ··· 1936 2201 sprintf(nullb->disk_name, "nullb%d", nullb->index); 1937 2202 } 1938 2203 1939 - rv = null_gendisk_register(nullb); 2204 + set_capacity(nullb->disk, 2205 + ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); 2206 + nullb->disk->major = null_major; 2207 + nullb->disk->first_minor = nullb->index; 2208 + nullb->disk->minors = 1; 2209 + nullb->disk->fops = &null_ops; 2210 + nullb->disk->private_data = nullb; 2211 + strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 2212 + 2213 + if (nullb->dev->zoned) { 2214 + rv = null_register_zoned_dev(nullb); 2215 + if (rv) 2216 + goto out_ida_free; 2217 + } 2218 + 2219 + rv = add_disk(nullb->disk); 1940 2220 if (rv) 1941 2221 goto out_ida_free; 1942 2222 ··· 1970 2220 out_cleanup_disk: 1971 2221 put_disk(nullb->disk); 1972 2222 out_cleanup_tags: 1973 - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) 2223 + if (nullb->tag_set == &nullb->__tag_set) 1974 2224 blk_mq_free_tag_set(nullb->tag_set); 1975 2225 out_cleanup_queues: 1976 - cleanup_queues(nullb); 2226 + kfree(nullb->queues); 1977 2227 out_free_nullb: 1978 2228 kfree(nullb); 1979 2229 dev->nullb = NULL; ··· 2049 2299 return -EINVAL; 2050 2300 } 2051 2301 2052 - if (g_queue_mode == NULL_Q_MQ && 
g_use_per_node_hctx) { 2302 + if (g_use_per_node_hctx) { 2053 2303 if (g_submit_queues != nr_online_nodes) { 2054 2304 pr_warn("submit_queues param is set to %u.\n", 2055 2305 nr_online_nodes); ··· 2061 2311 g_submit_queues = 1; 2062 2312 } 2063 2313 2064 - if (g_queue_mode == NULL_Q_MQ && shared_tags) { 2065 - ret = null_init_tag_set(NULL, &tag_set); 2066 - if (ret) 2067 - return ret; 2068 - } 2069 - 2070 2314 config_group_init(&nullb_subsys.su_group); 2071 2315 mutex_init(&nullb_subsys.su_mutex); 2072 2316 2073 2317 ret = configfs_register_subsystem(&nullb_subsys); 2074 2318 if (ret) 2075 - goto err_tagset; 2319 + return ret; 2076 2320 2077 2321 mutex_init(&lock); 2078 2322 ··· 2093 2349 unregister_blkdev(null_major, "nullb"); 2094 2350 err_conf: 2095 2351 configfs_unregister_subsystem(&nullb_subsys); 2096 - err_tagset: 2097 - if (g_queue_mode == NULL_Q_MQ && shared_tags) 2098 - blk_mq_free_tag_set(&tag_set); 2099 2352 return ret; 2100 2353 } 2101 2354 ··· 2111 2370 } 2112 2371 mutex_unlock(&lock); 2113 2372 2114 - if (g_queue_mode == NULL_Q_MQ && shared_tags) 2373 + if (tag_set.ops) 2115 2374 blk_mq_free_tag_set(&tag_set); 2116 2375 } 2117 2376
+3 -21
drivers/block/null_blk/null_blk.h
··· 16 16 #include <linux/mutex.h> 17 17 18 18 struct nullb_cmd { 19 - union { 20 - struct request *rq; 21 - struct bio *bio; 22 - }; 23 - unsigned int tag; 24 19 blk_status_t error; 25 20 bool fake_timeout; 26 21 struct nullb_queue *nq; ··· 23 28 }; 24 29 25 30 struct nullb_queue { 26 - unsigned long *tag_map; 27 - wait_queue_head_t wait; 28 - unsigned int queue_depth; 29 31 struct nullb_device *dev; 30 32 unsigned int requeue_selection; 31 33 32 34 struct list_head poll_list; 33 35 spinlock_t poll_lock; 34 - 35 - struct nullb_cmd *cmds; 36 36 }; 37 37 38 38 struct nullb_zone { ··· 48 58 sector_t wp; 49 59 unsigned int len; 50 60 unsigned int capacity; 51 - }; 52 - 53 - /* Queue modes */ 54 - enum { 55 - NULL_Q_BIO = 0, 56 - NULL_Q_RQ = 1, 57 - NULL_Q_MQ = 2, 58 61 }; 59 62 60 63 struct nullb_device { ··· 102 119 bool zoned; /* if device is zoned */ 103 120 bool virt_boundary; /* virtual boundary on/off for the device */ 104 121 bool no_sched; /* no IO scheduler for the device */ 122 + bool shared_tags; /* share tag set between devices for blk-mq */ 105 123 bool shared_tag_bitmap; /* use hostwide shared tags */ 106 124 }; 107 125 ··· 114 130 struct gendisk *disk; 115 131 struct blk_mq_tag_set *tag_set; 116 132 struct blk_mq_tag_set __tag_set; 117 - unsigned int queue_depth; 118 133 atomic_long_t cur_bytes; 119 134 struct hrtimer bw_timer; 120 135 unsigned long cache_flush_pos; 121 136 spinlock_t lock; 122 137 123 138 struct nullb_queue *queues; 124 - unsigned int nr_queues; 125 139 char disk_name[DISK_NAME_LEN]; 126 140 }; 127 141 ··· 129 147 sector_t sector, unsigned int nr_sectors); 130 148 131 149 #ifdef CONFIG_BLK_DEV_ZONED 132 - int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); 150 + int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); 133 151 int null_register_zoned_dev(struct nullb *nullb); 134 152 void null_free_zoned_dev(struct nullb_device *dev); 135 153 int null_report_zones(struct gendisk *disk, 
sector_t sector, ··· 142 160 size_t count, enum blk_zone_cond cond); 143 161 #else 144 162 static inline int null_init_zoned_dev(struct nullb_device *dev, 145 - struct request_queue *q) 163 + struct queue_limits *lim) 146 164 { 147 165 pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); 148 166 return -EINVAL;
+3 -2
drivers/block/null_blk/trace.h
··· 41 41 __field(unsigned int, zone_cond) 42 42 ), 43 43 TP_fast_assign( 44 - __entry->op = req_op(cmd->rq); 44 + __entry->op = req_op(blk_mq_rq_from_pdu(cmd)); 45 45 __entry->zone_no = zone_no; 46 46 __entry->zone_cond = zone_cond; 47 - __assign_disk_name(__entry->disk, cmd->rq->q->disk); 47 + __assign_disk_name(__entry->disk, 48 + blk_mq_rq_from_pdu(cmd)->q->disk); 48 49 ), 49 50 TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", 50 51 __print_disk_name(__entry->disk),
+9 -16
drivers/block/null_blk/zoned.c
··· 58 58 mutex_unlock(&zone->mutex); 59 59 } 60 60 61 - int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) 61 + int null_init_zoned_dev(struct nullb_device *dev, 62 + struct queue_limits *lim) 62 63 { 63 64 sector_t dev_capacity_sects, zone_capacity_sects; 64 65 struct nullb_zone *zone; ··· 152 151 sector += dev->zone_size_sects; 153 152 } 154 153 154 + lim->zoned = true; 155 + lim->chunk_sectors = dev->zone_size_sects; 156 + lim->max_zone_append_sectors = dev->zone_size_sects; 157 + lim->max_open_zones = dev->zone_max_open; 158 + lim->max_active_zones = dev->zone_max_active; 155 159 return 0; 156 160 } 157 161 158 162 int null_register_zoned_dev(struct nullb *nullb) 159 163 { 160 - struct nullb_device *dev = nullb->dev; 161 164 struct request_queue *q = nullb->q; 162 165 163 - disk_set_zoned(nullb->disk); 164 166 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); 165 167 blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); 166 - blk_queue_chunk_sectors(q, dev->zone_size_sects); 167 168 nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); 168 - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); 169 - disk_set_max_open_zones(nullb->disk, dev->zone_max_open); 170 - disk_set_max_active_zones(nullb->disk, dev->zone_max_active); 171 - 172 - if (queue_is_mq(q)) 173 - return blk_revalidate_disk_zones(nullb->disk, NULL); 174 - 175 - return 0; 169 + return blk_revalidate_disk_zones(nullb->disk, NULL); 176 170 } 177 171 178 172 void null_free_zoned_dev(struct nullb_device *dev) ··· 390 394 */ 391 395 if (append) { 392 396 sector = zone->wp; 393 - if (dev->queue_mode == NULL_Q_MQ) 394 - cmd->rq->__sector = sector; 395 - else 396 - cmd->bio->bi_iter.bi_sector = sector; 397 + blk_mq_rq_from_pdu(cmd)->__sector = sector; 397 398 } else if (sector != zone->wp) { 398 399 ret = BLK_STS_IOERR; 399 400 goto unlock;
+18 -23
drivers/block/pktcdvd.c
··· 828 828 */ 829 829 static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) 830 830 { 831 + /* 832 + * Some CDRW drives can not handle writes larger than one packet, 833 + * even if the size is a multiple of the packet size. 834 + */ 835 + bio->bi_opf |= REQ_NOMERGE; 836 + 831 837 spin_lock(&pd->iosched.lock); 832 838 if (bio_data_dir(bio) == READ) 833 839 bio_list_add(&pd->iosched.read_queue, bio); ··· 2197 2191 ret = pkt_open_write(pd); 2198 2192 if (ret) 2199 2193 goto out_putdev; 2200 - /* 2201 - * Some CDRW drives can not handle writes larger than one packet, 2202 - * even if the size is a multiple of the packet size. 2203 - */ 2204 - blk_queue_max_hw_sectors(q, pd->settings.size); 2205 2194 set_bit(PACKET_WRITABLE, &pd->flags); 2206 2195 } else { 2207 2196 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); ··· 2339 2338 pkt_queue_bio(pd, cloned_bio); 2340 2339 } 2341 2340 2342 - static void pkt_make_request_write(struct request_queue *q, struct bio *bio) 2341 + static void pkt_make_request_write(struct bio *bio) 2343 2342 { 2344 - struct pktcdvd_device *pd = q->queuedata; 2343 + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; 2345 2344 sector_t zone; 2346 2345 struct packet_data *pkt; 2347 2346 int was_empty, blocked_bio; ··· 2433 2432 2434 2433 static void pkt_submit_bio(struct bio *bio) 2435 2434 { 2436 - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; 2435 + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; 2437 2436 struct device *ddev = disk_to_dev(pd->disk); 2438 2437 struct bio *split; 2439 2438 ··· 2477 2476 split = bio; 2478 2477 } 2479 2478 2480 - pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); 2479 + pkt_make_request_write(split); 2481 2480 } while (split != bio); 2482 2481 2483 2482 return; 2484 2483 end_io: 2485 2484 bio_io_error(bio); 2486 - } 2487 - 2488 - static void pkt_init_queue(struct pktcdvd_device *pd) 2489 - { 2490 - struct request_queue *q = pd->disk->queue; 2491 
- 2492 - blk_queue_logical_block_size(q, CD_FRAMESIZE); 2493 - blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); 2494 - q->queuedata = pd; 2495 2485 } 2496 2486 2497 2487 static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) ··· 2527 2535 2528 2536 pd->bdev_file = bdev_file; 2529 2537 set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE); 2530 - 2531 - pkt_init_queue(pd); 2532 2538 2533 2539 atomic_set(&pd->cdrw.pending_bios, 0); 2534 2540 pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); ··· 2624 2634 */ 2625 2635 static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) 2626 2636 { 2637 + struct queue_limits lim = { 2638 + .max_hw_sectors = PACKET_MAX_SECTORS, 2639 + .logical_block_size = CD_FRAMESIZE, 2640 + }; 2627 2641 int idx; 2628 2642 int ret = -ENOMEM; 2629 2643 struct pktcdvd_device *pd; ··· 2667 2673 pd->write_congestion_on = write_congestion_on; 2668 2674 pd->write_congestion_off = write_congestion_off; 2669 2675 2670 - ret = -ENOMEM; 2671 - disk = blk_alloc_disk(NUMA_NO_NODE); 2672 - if (!disk) 2676 + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 2677 + if (IS_ERR(disk)) { 2678 + ret = PTR_ERR(disk); 2673 2679 goto out_mem; 2680 + } 2674 2681 pd->disk = disk; 2675 2682 disk->major = pktdev_major; 2676 2683 disk->first_minor = idx;
+9 -8
drivers/block/ps3disk.c
··· 382 382 struct ps3disk_private *priv; 383 383 int error; 384 384 unsigned int devidx; 385 + struct queue_limits lim = { 386 + .logical_block_size = dev->blk_size, 387 + .max_hw_sectors = dev->bounce_size >> 9, 388 + .max_segments = -1, 389 + .max_segment_size = dev->bounce_size, 390 + .dma_alignment = dev->blk_size - 1, 391 + }; 392 + 385 393 struct request_queue *queue; 386 394 struct gendisk *gendisk; 387 395 ··· 439 431 if (error) 440 432 goto fail_teardown; 441 433 442 - gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); 434 + gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev); 443 435 if (IS_ERR(gendisk)) { 444 436 dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", 445 437 __func__, __LINE__); ··· 449 441 450 442 queue = gendisk->queue; 451 443 452 - blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); 453 - blk_queue_dma_alignment(queue, dev->blk_size-1); 454 - blk_queue_logical_block_size(queue, dev->blk_size); 455 - 456 444 blk_queue_write_cache(queue, true, false); 457 - 458 - blk_queue_max_segments(queue, -1); 459 - blk_queue_max_segment_size(queue, dev->bounce_size); 460 445 461 446 priv->gendisk = gendisk; 462 447 gendisk->major = ps3disk_major;
+3 -3
drivers/block/ps3vram.c
··· 730 730 731 731 ps3vram_proc_init(dev); 732 732 733 - gendisk = blk_alloc_disk(NUMA_NO_NODE); 734 - if (!gendisk) { 733 + gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE); 734 + if (IS_ERR(gendisk)) { 735 735 dev_err(&dev->core, "blk_alloc_disk failed\n"); 736 - error = -ENOMEM; 736 + error = PTR_ERR(gendisk); 737 737 goto out_cache_cleanup; 738 738 } 739 739
+16 -15
drivers/block/rbd.c
··· 575 575 }; 576 576 __ATTRIBUTE_GROUPS(rbd_bus); 577 577 578 - static struct bus_type rbd_bus_type = { 578 + static const struct bus_type rbd_bus_type = { 579 579 .name = "rbd", 580 580 .bus_groups = rbd_bus_groups, 581 581 }; ··· 4952 4952 struct request_queue *q; 4953 4953 unsigned int objset_bytes = 4954 4954 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 4955 + struct queue_limits lim = { 4956 + .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, 4957 + .max_user_sectors = objset_bytes >> SECTOR_SHIFT, 4958 + .io_min = rbd_dev->opts->alloc_size, 4959 + .io_opt = rbd_dev->opts->alloc_size, 4960 + .max_segments = USHRT_MAX, 4961 + .max_segment_size = UINT_MAX, 4962 + }; 4955 4963 int err; 4956 4964 4957 4965 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); ··· 4974 4966 if (err) 4975 4967 return err; 4976 4968 4977 - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); 4969 + if (rbd_dev->opts->trim) { 4970 + lim.discard_granularity = rbd_dev->opts->alloc_size; 4971 + lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT; 4972 + lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; 4973 + } 4974 + 4975 + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); 4978 4976 if (IS_ERR(disk)) { 4979 4977 err = PTR_ERR(disk); 4980 4978 goto out_tag_set; ··· 5000 4986 5001 4987 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 5002 4988 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 5003 - 5004 - blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 5005 - q->limits.max_sectors = queue_max_hw_sectors(q); 5006 - blk_queue_max_segments(q, USHRT_MAX); 5007 - blk_queue_max_segment_size(q, UINT_MAX); 5008 - blk_queue_io_min(q, rbd_dev->opts->alloc_size); 5009 - blk_queue_io_opt(q, rbd_dev->opts->alloc_size); 5010 - 5011 - if (rbd_dev->opts->trim) { 5012 - q->limits.discard_granularity = rbd_dev->opts->alloc_size; 5013 - blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 5014 - blk_queue_max_write_zeroes_sectors(q, 
objset_bytes >> SECTOR_SHIFT); 5015 - } 5016 4989 5017 4990 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 5018 4991 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
+25 -39
drivers/block/rnbd/rnbd-clt.c
··· 1329 1329 } 1330 1330 } 1331 1331 1332 - static void setup_request_queue(struct rnbd_clt_dev *dev, 1333 - struct rnbd_msg_open_rsp *rsp) 1334 - { 1335 - blk_queue_logical_block_size(dev->queue, 1336 - le16_to_cpu(rsp->logical_block_size)); 1337 - blk_queue_physical_block_size(dev->queue, 1338 - le16_to_cpu(rsp->physical_block_size)); 1339 - blk_queue_max_hw_sectors(dev->queue, 1340 - dev->sess->max_io_size / SECTOR_SIZE); 1341 - 1342 - /* 1343 - * we don't support discards to "discontiguous" segments 1344 - * in on request 1345 - */ 1346 - blk_queue_max_discard_segments(dev->queue, 1); 1347 - 1348 - blk_queue_max_discard_sectors(dev->queue, 1349 - le32_to_cpu(rsp->max_discard_sectors)); 1350 - dev->queue->limits.discard_granularity = 1351 - le32_to_cpu(rsp->discard_granularity); 1352 - dev->queue->limits.discard_alignment = 1353 - le32_to_cpu(rsp->discard_alignment); 1354 - if (le16_to_cpu(rsp->secure_discard)) 1355 - blk_queue_max_secure_erase_sectors(dev->queue, 1356 - le32_to_cpu(rsp->max_discard_sectors)); 1357 - blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); 1358 - blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); 1359 - blk_queue_max_segments(dev->queue, dev->sess->max_segments); 1360 - blk_queue_io_opt(dev->queue, dev->sess->max_io_size); 1361 - blk_queue_virt_boundary(dev->queue, SZ_4K - 1); 1362 - blk_queue_write_cache(dev->queue, 1363 - !!(rsp->cache_policy & RNBD_WRITEBACK), 1364 - !!(rsp->cache_policy & RNBD_FUA)); 1365 - blk_queue_max_write_zeroes_sectors(dev->queue, 1366 - le32_to_cpu(rsp->max_write_zeroes_sectors)); 1367 - } 1368 - 1369 1332 static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, 1370 1333 struct rnbd_msg_open_rsp *rsp, int idx) 1371 1334 { ··· 1366 1403 static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, 1367 1404 struct rnbd_msg_open_rsp *rsp) 1368 1405 { 1406 + struct queue_limits lim = { 1407 + .logical_block_size = le16_to_cpu(rsp->logical_block_size), 1408 + .physical_block_size = 
le16_to_cpu(rsp->physical_block_size), 1409 + .io_opt = dev->sess->max_io_size, 1410 + .max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE, 1411 + .max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors), 1412 + .discard_granularity = le32_to_cpu(rsp->discard_granularity), 1413 + .discard_alignment = le32_to_cpu(rsp->discard_alignment), 1414 + .max_segments = dev->sess->max_segments, 1415 + .virt_boundary_mask = SZ_4K - 1, 1416 + .max_write_zeroes_sectors = 1417 + le32_to_cpu(rsp->max_write_zeroes_sectors), 1418 + }; 1369 1419 int idx = dev->clt_device_id; 1370 1420 1371 1421 dev->size = le64_to_cpu(rsp->nsectors) * 1372 1422 le16_to_cpu(rsp->logical_block_size); 1373 1423 1374 - dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); 1424 + if (rsp->secure_discard) { 1425 + lim.max_secure_erase_sectors = 1426 + le32_to_cpu(rsp->max_discard_sectors); 1427 + } 1428 + 1429 + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev); 1375 1430 if (IS_ERR(dev->gd)) 1376 1431 return PTR_ERR(dev->gd); 1377 1432 dev->queue = dev->gd->queue; 1378 1433 rnbd_init_mq_hw_queues(dev); 1379 1434 1380 - setup_request_queue(dev, rsp); 1435 + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); 1436 + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); 1437 + blk_queue_write_cache(dev->queue, 1438 + !!(rsp->cache_policy & RNBD_WRITEBACK), 1439 + !!(rsp->cache_policy & RNBD_FUA)); 1440 + 1381 1441 return rnbd_clt_setup_gen_disk(dev, rsp, idx); 1382 1442 } 1383 1443
+9 -9
drivers/block/sunvdc.c
··· 784 784 785 785 static int probe_disk(struct vdc_port *port) 786 786 { 787 + struct queue_limits lim = { 788 + .physical_block_size = port->vdisk_phys_blksz, 789 + .max_hw_sectors = port->max_xfer_size, 790 + /* Each segment in a request is up to an aligned page in size. */ 791 + .seg_boundary_mask = PAGE_SIZE - 1, 792 + .max_segment_size = PAGE_SIZE, 793 + .max_segments = port->ring_cookies, 794 + }; 787 795 struct request_queue *q; 788 796 struct gendisk *g; 789 797 int err; ··· 832 824 if (err) 833 825 return err; 834 826 835 - g = blk_mq_alloc_disk(&port->tag_set, port); 827 + g = blk_mq_alloc_disk(&port->tag_set, &lim, port); 836 828 if (IS_ERR(g)) { 837 829 printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", 838 830 port->vio.name); ··· 843 835 port->disk = g; 844 836 q = g->queue; 845 837 846 - /* Each segment in a request is up to an aligned page in size. */ 847 - blk_queue_segment_boundary(q, PAGE_SIZE - 1); 848 - blk_queue_max_segment_size(q, PAGE_SIZE); 849 - 850 - blk_queue_max_segments(q, port->ring_cookies); 851 - blk_queue_max_hw_sectors(q, port->max_xfer_size); 852 838 g->major = vdc_major; 853 839 g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; 854 840 g->minors = 1 << PARTITION_SHIFT; ··· 873 871 break; 874 872 } 875 873 } 876 - 877 - blk_queue_physical_block_size(q, port->vdisk_phys_blksz); 878 874 879 875 pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", 880 876 g->disk_name,
+3 -5
drivers/block/swim.c
··· 820 820 goto exit_put_disks; 821 821 822 822 swd->unit[drive].disk = 823 - blk_mq_alloc_disk(&swd->unit[drive].tag_set, 823 + blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, 824 824 &swd->unit[drive]); 825 825 if (IS_ERR(swd->unit[drive].disk)) { 826 826 blk_mq_free_tag_set(&swd->unit[drive].tag_set); ··· 916 916 return ret; 917 917 } 918 918 919 - static int swim_remove(struct platform_device *dev) 919 + static void swim_remove(struct platform_device *dev) 920 920 { 921 921 struct swim_priv *swd = platform_get_drvdata(dev); 922 922 int drive; ··· 937 937 release_mem_region(res->start, resource_size(res)); 938 938 939 939 kfree(swd); 940 - 941 - return 0; 942 940 } 943 941 944 942 static struct platform_driver swim_driver = { 945 943 .probe = swim_probe, 946 - .remove = swim_remove, 944 + .remove_new = swim_remove, 947 945 .driver = { 948 946 .name = CARDNAME, 949 947 },
+1 -1
drivers/block/swim3.c
··· 1210 1210 if (rc) 1211 1211 goto out_unregister; 1212 1212 1213 - disk = blk_mq_alloc_disk(&fs->tag_set, fs); 1213 + disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); 1214 1214 if (IS_ERR(disk)) { 1215 1215 rc = PTR_ERR(disk); 1216 1216 goto out_free_tag_set;
+54 -57
drivers/block/ublk_drv.c
··· 246 246 return 0; 247 247 } 248 248 249 - static int ublk_dev_param_zoned_apply(struct ublk_device *ub) 249 + static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 250 250 { 251 - const struct ublk_param_zoned *p = &ub->params.zoned; 252 - 253 - disk_set_zoned(ub->ub_disk); 254 251 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); 255 252 blk_queue_required_elevator_features(ub->ub_disk->queue, 256 253 ELEVATOR_F_ZBD_SEQ_WRITE); 257 - disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); 258 - disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); 259 - blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); 260 - 261 254 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); 262 - 263 - return 0; 264 255 } 265 256 266 257 /* Based on virtblk_alloc_report_buffer */ ··· 423 432 return -EOPNOTSUPP; 424 433 } 425 434 426 - static int ublk_dev_param_zoned_apply(struct ublk_device *ub) 435 + static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 427 436 { 428 - return -EOPNOTSUPP; 429 437 } 430 438 431 439 static int ublk_revalidate_disk_zones(struct ublk_device *ub) ··· 488 498 struct request_queue *q = ub->ub_disk->queue; 489 499 const struct ublk_param_basic *p = &ub->params.basic; 490 500 491 - blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); 492 - blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); 493 - blk_queue_io_min(q, 1 << p->io_min_shift); 494 - blk_queue_io_opt(q, 1 << p->io_opt_shift); 495 - 496 501 blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, 497 502 p->attrs & UBLK_ATTR_FUA); 498 503 if (p->attrs & UBLK_ATTR_ROTATIONAL) ··· 495 510 else 496 511 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 497 512 498 - blk_queue_max_hw_sectors(q, p->max_sectors); 499 - blk_queue_chunk_sectors(q, p->chunk_sectors); 500 - blk_queue_virt_boundary(q, p->virt_boundary_mask); 501 - 502 513 if (p->attrs & UBLK_ATTR_READ_ONLY) 503 514 set_disk_ro(ub->ub_disk, true); 504 515 505 516 
set_capacity(ub->ub_disk, p->dev_sectors); 506 - } 507 - 508 - static void ublk_dev_param_discard_apply(struct ublk_device *ub) 509 - { 510 - struct request_queue *q = ub->ub_disk->queue; 511 - const struct ublk_param_discard *p = &ub->params.discard; 512 - 513 - q->limits.discard_alignment = p->discard_alignment; 514 - q->limits.discard_granularity = p->discard_granularity; 515 - blk_queue_max_discard_sectors(q, p->max_discard_sectors); 516 - blk_queue_max_write_zeroes_sectors(q, 517 - p->max_write_zeroes_sectors); 518 - blk_queue_max_discard_segments(q, p->max_discard_segments); 519 517 } 520 518 521 519 static int ublk_validate_params(const struct ublk_device *ub) ··· 544 576 return 0; 545 577 } 546 578 547 - static int ublk_apply_params(struct ublk_device *ub) 579 + static void ublk_apply_params(struct ublk_device *ub) 548 580 { 549 - if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 550 - return -EINVAL; 551 - 552 581 ublk_dev_param_basic_apply(ub); 553 582 554 - if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) 555 - ublk_dev_param_discard_apply(ub); 556 - 557 583 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 558 - return ublk_dev_param_zoned_apply(ub); 559 - 560 - return 0; 584 + ublk_dev_param_zoned_apply(ub); 561 585 } 562 586 563 587 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) ··· 605 645 return ubq->flags & UBLK_F_NEED_GET_DATA; 606 646 } 607 647 608 - static struct ublk_device *ublk_get_device(struct ublk_device *ub) 648 + /* Called in slow path only, keep it noinline for trace purpose */ 649 + static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) 609 650 { 610 651 if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) 611 652 return ub; 612 653 return NULL; 613 654 } 614 655 615 - static void ublk_put_device(struct ublk_device *ub) 656 + /* Called in slow path only, keep it noinline for trace purpose */ 657 + static noinline void ublk_put_device(struct ublk_device *ub) 616 658 { 617 659 
put_device(&ub->cdev_dev); 618 660 } ··· 673 711 struct ublk_device *ub = disk->private_data; 674 712 675 713 clear_bit(UB_STATE_USED, &ub->state); 676 - put_device(&ub->cdev_dev); 714 + ublk_put_device(ub); 677 715 } 678 716 679 717 static void ublk_store_owner_uid_gid(unsigned int *owner_uid, ··· 2144 2182 cancel_work_sync(&ub->stop_work); 2145 2183 cancel_work_sync(&ub->quiesce_work); 2146 2184 cdev_device_del(&ub->cdev, &ub->cdev_dev); 2147 - put_device(&ub->cdev_dev); 2185 + ublk_put_device(ub); 2148 2186 ublks_added--; 2149 2187 } 2150 2188 ··· 2167 2205 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) 2168 2206 { 2169 2207 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2208 + const struct ublk_param_basic *p = &ub->params.basic; 2170 2209 int ublksrv_pid = (int)header->data[0]; 2210 + struct queue_limits lim = { 2211 + .logical_block_size = 1 << p->logical_bs_shift, 2212 + .physical_block_size = 1 << p->physical_bs_shift, 2213 + .io_min = 1 << p->io_min_shift, 2214 + .io_opt = 1 << p->io_opt_shift, 2215 + .max_hw_sectors = p->max_sectors, 2216 + .chunk_sectors = p->chunk_sectors, 2217 + .virt_boundary_mask = p->virt_boundary_mask, 2218 + 2219 + }; 2171 2220 struct gendisk *disk; 2172 2221 int ret = -EINVAL; 2173 2222 2174 2223 if (ublksrv_pid <= 0) 2175 2224 return -EINVAL; 2225 + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 2226 + return -EINVAL; 2227 + 2228 + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 2229 + const struct ublk_param_discard *pd = &ub->params.discard; 2230 + 2231 + lim.discard_alignment = pd->discard_alignment; 2232 + lim.discard_granularity = pd->discard_granularity; 2233 + lim.max_hw_discard_sectors = pd->max_discard_sectors; 2234 + lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; 2235 + lim.max_discard_segments = pd->max_discard_segments; 2236 + } 2237 + 2238 + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { 2239 + const struct ublk_param_zoned *p = 
&ub->params.zoned; 2240 + 2241 + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 2242 + return -EOPNOTSUPP; 2243 + 2244 + lim.zoned = true; 2245 + lim.max_active_zones = p->max_active_zones; 2246 + lim.max_open_zones = p->max_open_zones; 2247 + lim.max_zone_append_sectors = p->max_zone_append_sectors; 2248 + } 2176 2249 2177 2250 if (wait_for_completion_interruptible(&ub->completion) != 0) 2178 2251 return -EINTR; ··· 2219 2222 goto out_unlock; 2220 2223 } 2221 2224 2222 - disk = blk_mq_alloc_disk(&ub->tag_set, NULL); 2225 + disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); 2223 2226 if (IS_ERR(disk)) { 2224 2227 ret = PTR_ERR(disk); 2225 2228 goto out_unlock; ··· 2231 2234 ub->dev_info.ublksrv_pid = ublksrv_pid; 2232 2235 ub->ub_disk = disk; 2233 2236 2234 - ret = ublk_apply_params(ub); 2235 - if (ret) 2236 - goto out_put_disk; 2237 + ublk_apply_params(ub); 2237 2238 2238 2239 /* don't probe partitions if any one ubq daemon is un-trusted */ 2239 2240 if (ub->nr_privileged_daemon != ub->nr_queues_ready) 2240 2241 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2241 2242 2242 - get_device(&ub->cdev_dev); 2243 + ublk_get_device(ub); 2243 2244 ub->dev_info.state = UBLK_S_DEV_LIVE; 2244 2245 2245 2246 if (ublk_dev_is_zoned(ub)) { ··· 2257 2262 ub->dev_info.state = UBLK_S_DEV_DEAD; 2258 2263 ublk_put_device(ub); 2259 2264 } 2260 - out_put_disk: 2261 2265 if (ret) 2262 2266 put_disk(disk); 2263 2267 out_unlock: ··· 2468 2474 return ptr == NULL; 2469 2475 } 2470 2476 2471 - static int ublk_ctrl_del_dev(struct ublk_device **p_ub) 2477 + static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) 2472 2478 { 2473 2479 struct ublk_device *ub = *p_ub; 2474 2480 int idx = ub->ub_number; ··· 2502 2508 * - the device number is freed already, we will not find this 2503 2509 * device via ublk_get_device_from_id() 2504 2510 */ 2505 - if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 2511 + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 2506 2512 
return -EINTR; 2507 2513 return 0; 2508 2514 } ··· 2901 2907 ret = ublk_ctrl_add_dev(cmd); 2902 2908 break; 2903 2909 case UBLK_CMD_DEL_DEV: 2904 - ret = ublk_ctrl_del_dev(&ub); 2910 + ret = ublk_ctrl_del_dev(&ub, true); 2911 + break; 2912 + case UBLK_U_CMD_DEL_DEV_ASYNC: 2913 + ret = ublk_ctrl_del_dev(&ub, false); 2905 2914 break; 2906 2915 case UBLK_CMD_GET_QUEUE_AFFINITY: 2907 2916 ret = ublk_ctrl_get_queue_affinity(ub, cmd);
+227 -220
drivers/block/virtio_blk.c
··· 720 720 return ret; 721 721 } 722 722 723 - static int virtblk_probe_zoned_device(struct virtio_device *vdev, 724 - struct virtio_blk *vblk, 725 - struct request_queue *q) 723 + static int virtblk_read_zoned_limits(struct virtio_blk *vblk, 724 + struct queue_limits *lim) 726 725 { 726 + struct virtio_device *vdev = vblk->vdev; 727 727 u32 v, wg; 728 728 729 729 dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); 730 730 731 - disk_set_zoned(vblk->disk); 732 - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); 731 + lim->zoned = true; 733 732 734 733 virtio_cread(vdev, struct virtio_blk_config, 735 734 zoned.max_open_zones, &v); 736 - disk_set_max_open_zones(vblk->disk, v); 735 + lim->max_open_zones = v; 737 736 dev_dbg(&vdev->dev, "max open zones = %u\n", v); 738 737 739 738 virtio_cread(vdev, struct virtio_blk_config, 740 739 zoned.max_active_zones, &v); 741 - disk_set_max_active_zones(vblk->disk, v); 740 + lim->max_active_zones = v; 742 741 dev_dbg(&vdev->dev, "max active zones = %u\n", v); 743 742 744 743 virtio_cread(vdev, struct virtio_blk_config, ··· 746 747 dev_warn(&vdev->dev, "zero write granularity reported\n"); 747 748 return -ENODEV; 748 749 } 749 - blk_queue_physical_block_size(q, wg); 750 - blk_queue_io_min(q, wg); 750 + lim->physical_block_size = wg; 751 + lim->io_min = wg; 751 752 752 753 dev_dbg(&vdev->dev, "write granularity = %u\n", wg); 753 754 ··· 763 764 vblk->zone_sectors); 764 765 return -ENODEV; 765 766 } 766 - blk_queue_chunk_sectors(q, vblk->zone_sectors); 767 + lim->chunk_sectors = vblk->zone_sectors; 767 768 dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors); 768 769 769 770 if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { 770 771 dev_warn(&vblk->vdev->dev, 771 772 "ignoring negotiated F_DISCARD for zoned device\n"); 772 - blk_queue_max_discard_sectors(q, 0); 773 + lim->max_hw_discard_sectors = 0; 773 774 } 774 775 775 776 virtio_cread(vdev, struct virtio_blk_config, ··· 784 785 wg, v); 785 786 return -ENODEV; 
786 787 } 787 - blk_queue_max_zone_append_sectors(q, v); 788 + lim->max_zone_append_sectors = v; 788 789 dev_dbg(&vdev->dev, "max append sectors = %u\n", v); 789 790 790 - return blk_revalidate_disk_zones(vblk->disk, NULL); 791 + return 0; 791 792 } 792 - 793 793 #else 794 - 795 794 /* 796 - * Zoned block device support is not configured in this kernel. 797 - * Host-managed zoned devices can't be supported, but others are 798 - * good to go as regular block devices. 795 + * Zoned block device support is not configured in this kernel, host-managed 796 + * zoned devices can't be supported. 799 797 */ 800 798 #define virtblk_report_zones NULL 801 - 802 - static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, 803 - struct virtio_blk *vblk, struct request_queue *q) 799 + static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk, 800 + struct queue_limits *lim) 804 801 { 805 - dev_err(&vdev->dev, 802 + dev_err(&vblk->vdev->dev, 806 803 "virtio_blk: zoned devices are not supported"); 807 804 return -EOPNOTSUPP; 808 805 } ··· 1243 1248 static unsigned int virtblk_queue_depth; 1244 1249 module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); 1245 1250 1246 - static int virtblk_probe(struct virtio_device *vdev) 1251 + static int virtblk_read_limits(struct virtio_blk *vblk, 1252 + struct queue_limits *lim) 1247 1253 { 1248 - struct virtio_blk *vblk; 1249 - struct request_queue *q; 1250 - int err, index; 1251 - 1254 + struct virtio_device *vdev = vblk->vdev; 1252 1255 u32 v, blk_size, max_size, sg_elems, opt_io_size; 1253 1256 u32 max_discard_segs = 0; 1254 1257 u32 discard_granularity = 0; 1255 1258 u16 min_io_size; 1256 1259 u8 physical_block_exp, alignment_offset; 1257 - unsigned int queue_depth; 1258 1260 size_t max_dma_size; 1259 - 1260 - if (!vdev->config->get) { 1261 - dev_err(&vdev->dev, "%s failure: config access disabled\n", 1262 - __func__); 1263 - return -EINVAL; 1264 - } 1265 - 1266 - err = ida_alloc_range(&vd_index_ida, 0, 
1267 - minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); 1268 - if (err < 0) 1269 - goto out; 1270 - index = err; 1261 + int err; 1271 1262 1272 1263 /* We need to know how many segments before we allocate. */ 1273 1264 err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, ··· 1266 1285 1267 1286 /* Prevent integer overflows and honor max vq size */ 1268 1287 sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); 1288 + 1289 + /* We can handle whatever the host told us to handle. */ 1290 + lim->max_segments = sg_elems; 1291 + 1292 + /* No real sector limit. */ 1293 + lim->max_hw_sectors = UINT_MAX; 1294 + 1295 + max_dma_size = virtio_max_dma_size(vdev); 1296 + max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size; 1297 + 1298 + /* Host can optionally specify maximum segment size and number of 1299 + * segments. */ 1300 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX, 1301 + struct virtio_blk_config, size_max, &v); 1302 + if (!err) 1303 + max_size = min(max_size, v); 1304 + 1305 + lim->max_segment_size = max_size; 1306 + 1307 + /* Host can optionally specify the block size of the device */ 1308 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, 1309 + struct virtio_blk_config, blk_size, 1310 + &blk_size); 1311 + if (!err) { 1312 + err = blk_validate_block_size(blk_size); 1313 + if (err) { 1314 + dev_err(&vdev->dev, 1315 + "virtio_blk: invalid block size: 0x%x\n", 1316 + blk_size); 1317 + return err; 1318 + } 1319 + 1320 + lim->logical_block_size = blk_size; 1321 + } else 1322 + blk_size = lim->logical_block_size; 1323 + 1324 + /* Use topology information if available */ 1325 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1326 + struct virtio_blk_config, physical_block_exp, 1327 + &physical_block_exp); 1328 + if (!err && physical_block_exp) 1329 + lim->physical_block_size = blk_size * (1 << physical_block_exp); 1330 + 1331 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1332 + struct virtio_blk_config, 
alignment_offset, 1333 + &alignment_offset); 1334 + if (!err && alignment_offset) 1335 + lim->alignment_offset = blk_size * alignment_offset; 1336 + 1337 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1338 + struct virtio_blk_config, min_io_size, 1339 + &min_io_size); 1340 + if (!err && min_io_size) 1341 + lim->io_min = blk_size * min_io_size; 1342 + 1343 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1344 + struct virtio_blk_config, opt_io_size, 1345 + &opt_io_size); 1346 + if (!err && opt_io_size) 1347 + lim->io_opt = blk_size * opt_io_size; 1348 + 1349 + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { 1350 + virtio_cread(vdev, struct virtio_blk_config, 1351 + discard_sector_alignment, &discard_granularity); 1352 + 1353 + virtio_cread(vdev, struct virtio_blk_config, 1354 + max_discard_sectors, &v); 1355 + lim->max_hw_discard_sectors = v ? v : UINT_MAX; 1356 + 1357 + virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, 1358 + &max_discard_segs); 1359 + } 1360 + 1361 + if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { 1362 + virtio_cread(vdev, struct virtio_blk_config, 1363 + max_write_zeroes_sectors, &v); 1364 + lim->max_write_zeroes_sectors = v ? v : UINT_MAX; 1365 + } 1366 + 1367 + /* The discard and secure erase limits are combined since the Linux 1368 + * block layer uses the same limit for both commands. 1369 + * 1370 + * If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features 1371 + * are negotiated, we will use the minimum between the limits. 1372 + * 1373 + * discard sector alignment is set to the minimum between discard_sector_alignment 1374 + * and secure_erase_sector_alignment. 1375 + * 1376 + * max discard sectors is set to the minimum between max_discard_seg and 1377 + * max_secure_erase_seg. 
1378 + */ 1379 + if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { 1380 + 1381 + virtio_cread(vdev, struct virtio_blk_config, 1382 + secure_erase_sector_alignment, &v); 1383 + 1384 + /* secure_erase_sector_alignment should not be zero, the device should set a 1385 + * valid number of sectors. 1386 + */ 1387 + if (!v) { 1388 + dev_err(&vdev->dev, 1389 + "virtio_blk: secure_erase_sector_alignment can't be 0\n"); 1390 + return -EINVAL; 1391 + } 1392 + 1393 + discard_granularity = min_not_zero(discard_granularity, v); 1394 + 1395 + virtio_cread(vdev, struct virtio_blk_config, 1396 + max_secure_erase_sectors, &v); 1397 + 1398 + /* max_secure_erase_sectors should not be zero, the device should set a 1399 + * valid number of sectors. 1400 + */ 1401 + if (!v) { 1402 + dev_err(&vdev->dev, 1403 + "virtio_blk: max_secure_erase_sectors can't be 0\n"); 1404 + return -EINVAL; 1405 + } 1406 + 1407 + lim->max_secure_erase_sectors = v; 1408 + 1409 + virtio_cread(vdev, struct virtio_blk_config, 1410 + max_secure_erase_seg, &v); 1411 + 1412 + /* max_secure_erase_seg should not be zero, the device should set a 1413 + * valid number of segments 1414 + */ 1415 + if (!v) { 1416 + dev_err(&vdev->dev, 1417 + "virtio_blk: max_secure_erase_seg can't be 0\n"); 1418 + return -EINVAL; 1419 + } 1420 + 1421 + max_discard_segs = min_not_zero(max_discard_segs, v); 1422 + } 1423 + 1424 + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) || 1425 + virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { 1426 + /* max_discard_seg and discard_granularity will be 0 only 1427 + * if max_discard_seg and discard_sector_alignment fields in the virtio 1428 + * config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated. 1429 + * In this case, we use default values. 
1430 + */ 1431 + if (!max_discard_segs) 1432 + max_discard_segs = sg_elems; 1433 + 1434 + lim->max_discard_segments = 1435 + min(max_discard_segs, MAX_DISCARD_SEGMENTS); 1436 + 1437 + if (discard_granularity) 1438 + lim->discard_granularity = 1439 + discard_granularity << SECTOR_SHIFT; 1440 + else 1441 + lim->discard_granularity = blk_size; 1442 + } 1443 + 1444 + if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { 1445 + u8 model; 1446 + 1447 + virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); 1448 + switch (model) { 1449 + case VIRTIO_BLK_Z_NONE: 1450 + case VIRTIO_BLK_Z_HA: 1451 + /* treat host-aware devices as non-zoned */ 1452 + return 0; 1453 + case VIRTIO_BLK_Z_HM: 1454 + err = virtblk_read_zoned_limits(vblk, lim); 1455 + if (err) 1456 + return err; 1457 + break; 1458 + default: 1459 + dev_err(&vdev->dev, "unsupported zone model %d\n", model); 1460 + return -EINVAL; 1461 + } 1462 + } 1463 + 1464 + return 0; 1465 + } 1466 + 1467 + static int virtblk_probe(struct virtio_device *vdev) 1468 + { 1469 + struct virtio_blk *vblk; 1470 + struct queue_limits lim = { }; 1471 + int err, index; 1472 + unsigned int queue_depth; 1473 + 1474 + if (!vdev->config->get) { 1475 + dev_err(&vdev->dev, "%s failure: config access disabled\n", 1476 + __func__); 1477 + return -EINVAL; 1478 + } 1479 + 1480 + err = ida_alloc_range(&vd_index_ida, 0, 1481 + minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); 1482 + if (err < 0) 1483 + goto out; 1484 + index = err; 1269 1485 1270 1486 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 1271 1487 if (!vblk) { ··· 1508 1330 if (err) 1509 1331 goto out_free_vq; 1510 1332 1511 - vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); 1333 + err = virtblk_read_limits(vblk, &lim); 1334 + if (err) 1335 + goto out_free_tags; 1336 + 1337 + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk); 1512 1338 if (IS_ERR(vblk->disk)) { 1513 1339 err = PTR_ERR(vblk->disk); 1514 1340 goto out_free_tags; 1515 1341 } 1516 - q = 
vblk->disk->queue; 1517 1342 1518 1343 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 1519 1344 ··· 1534 1353 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 1535 1354 set_disk_ro(vblk->disk, 1); 1536 1355 1537 - /* We can handle whatever the host told us to handle. */ 1538 - blk_queue_max_segments(q, sg_elems); 1539 - 1540 - /* No real sector limit. */ 1541 - blk_queue_max_hw_sectors(q, UINT_MAX); 1542 - 1543 - max_dma_size = virtio_max_dma_size(vdev); 1544 - max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size; 1545 - 1546 - /* Host can optionally specify maximum segment size and number of 1547 - * segments. */ 1548 - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX, 1549 - struct virtio_blk_config, size_max, &v); 1550 - if (!err) 1551 - max_size = min(max_size, v); 1552 - 1553 - blk_queue_max_segment_size(q, max_size); 1554 - 1555 - /* Host can optionally specify the block size of the device */ 1556 - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, 1557 - struct virtio_blk_config, blk_size, 1558 - &blk_size); 1559 - if (!err) { 1560 - err = blk_validate_block_size(blk_size); 1561 - if (err) { 1562 - dev_err(&vdev->dev, 1563 - "virtio_blk: invalid block size: 0x%x\n", 1564 - blk_size); 1565 - goto out_cleanup_disk; 1566 - } 1567 - 1568 - blk_queue_logical_block_size(q, blk_size); 1569 - } else 1570 - blk_size = queue_logical_block_size(q); 1571 - 1572 - /* Use topology information if available */ 1573 - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1574 - struct virtio_blk_config, physical_block_exp, 1575 - &physical_block_exp); 1576 - if (!err && physical_block_exp) 1577 - blk_queue_physical_block_size(q, 1578 - blk_size * (1 << physical_block_exp)); 1579 - 1580 - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1581 - struct virtio_blk_config, alignment_offset, 1582 - &alignment_offset); 1583 - if (!err && alignment_offset) 1584 - blk_queue_alignment_offset(q, blk_size * alignment_offset); 1585 - 1586 
- err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1587 - struct virtio_blk_config, min_io_size, 1588 - &min_io_size); 1589 - if (!err && min_io_size) 1590 - blk_queue_io_min(q, blk_size * min_io_size); 1591 - 1592 - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, 1593 - struct virtio_blk_config, opt_io_size, 1594 - &opt_io_size); 1595 - if (!err && opt_io_size) 1596 - blk_queue_io_opt(q, blk_size * opt_io_size); 1597 - 1598 - if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { 1599 - virtio_cread(vdev, struct virtio_blk_config, 1600 - discard_sector_alignment, &discard_granularity); 1601 - 1602 - virtio_cread(vdev, struct virtio_blk_config, 1603 - max_discard_sectors, &v); 1604 - blk_queue_max_discard_sectors(q, v ? v : UINT_MAX); 1605 - 1606 - virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, 1607 - &max_discard_segs); 1608 - } 1609 - 1610 - if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { 1611 - virtio_cread(vdev, struct virtio_blk_config, 1612 - max_write_zeroes_sectors, &v); 1613 - blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX); 1614 - } 1615 - 1616 - /* The discard and secure erase limits are combined since the Linux 1617 - * block layer uses the same limit for both commands. 1618 - * 1619 - * If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features 1620 - * are negotiated, we will use the minimum between the limits. 1621 - * 1622 - * discard sector alignment is set to the minimum between discard_sector_alignment 1623 - * and secure_erase_sector_alignment. 1624 - * 1625 - * max discard sectors is set to the minimum between max_discard_seg and 1626 - * max_secure_erase_seg. 1627 - */ 1628 - if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { 1629 - 1630 - virtio_cread(vdev, struct virtio_blk_config, 1631 - secure_erase_sector_alignment, &v); 1632 - 1633 - /* secure_erase_sector_alignment should not be zero, the device should set a 1634 - * valid number of sectors. 
1635 - */ 1636 - if (!v) { 1637 - dev_err(&vdev->dev, 1638 - "virtio_blk: secure_erase_sector_alignment can't be 0\n"); 1639 - err = -EINVAL; 1640 - goto out_cleanup_disk; 1641 - } 1642 - 1643 - discard_granularity = min_not_zero(discard_granularity, v); 1644 - 1645 - virtio_cread(vdev, struct virtio_blk_config, 1646 - max_secure_erase_sectors, &v); 1647 - 1648 - /* max_secure_erase_sectors should not be zero, the device should set a 1649 - * valid number of sectors. 1650 - */ 1651 - if (!v) { 1652 - dev_err(&vdev->dev, 1653 - "virtio_blk: max_secure_erase_sectors can't be 0\n"); 1654 - err = -EINVAL; 1655 - goto out_cleanup_disk; 1656 - } 1657 - 1658 - blk_queue_max_secure_erase_sectors(q, v); 1659 - 1660 - virtio_cread(vdev, struct virtio_blk_config, 1661 - max_secure_erase_seg, &v); 1662 - 1663 - /* max_secure_erase_seg should not be zero, the device should set a 1664 - * valid number of segments 1665 - */ 1666 - if (!v) { 1667 - dev_err(&vdev->dev, 1668 - "virtio_blk: max_secure_erase_seg can't be 0\n"); 1669 - err = -EINVAL; 1670 - goto out_cleanup_disk; 1671 - } 1672 - 1673 - max_discard_segs = min_not_zero(max_discard_segs, v); 1674 - } 1675 - 1676 - if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) || 1677 - virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { 1678 - /* max_discard_seg and discard_granularity will be 0 only 1679 - * if max_discard_seg and discard_sector_alignment fields in the virtio 1680 - * config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated. 1681 - * In this case, we use default values. 
1682 - */ 1683 - if (!max_discard_segs) 1684 - max_discard_segs = sg_elems; 1685 - 1686 - blk_queue_max_discard_segments(q, 1687 - min(max_discard_segs, MAX_DISCARD_SEGMENTS)); 1688 - 1689 - if (discard_granularity) 1690 - q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT; 1691 - else 1692 - q->limits.discard_granularity = blk_size; 1693 - } 1694 - 1695 1356 virtblk_update_capacity(vblk, false); 1696 1357 virtio_device_ready(vdev); 1697 1358 ··· 1541 1518 * All steps that follow use the VQs therefore they need to be 1542 1519 * placed after the virtio_device_ready() call above. 1543 1520 */ 1544 - if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { 1545 - u8 model; 1546 - 1547 - virtio_cread(vdev, struct virtio_blk_config, zoned.model, 1548 - &model); 1549 - switch (model) { 1550 - case VIRTIO_BLK_Z_NONE: 1551 - case VIRTIO_BLK_Z_HA: 1552 - /* Present the host-aware device as non-zoned */ 1553 - break; 1554 - case VIRTIO_BLK_Z_HM: 1555 - err = virtblk_probe_zoned_device(vdev, vblk, q); 1556 - if (err) 1557 - goto out_cleanup_disk; 1558 - break; 1559 - default: 1560 - dev_err(&vdev->dev, "unsupported zone model %d\n", 1561 - model); 1562 - err = -EINVAL; 1521 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { 1522 + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); 1523 + err = blk_revalidate_disk_zones(vblk->disk, NULL); 1524 + if (err) 1563 1525 goto out_cleanup_disk; 1564 - } 1565 1526 } 1566 1527 1567 1528 err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+27 -26
drivers/block/xen-blkfront.c
··· 941 941 .complete = blkif_complete_rq, 942 942 }; 943 943 944 - static void blkif_set_queue_limits(struct blkfront_info *info) 944 + static void blkif_set_queue_limits(const struct blkfront_info *info, 945 + struct queue_limits *lim) 945 946 { 946 - struct request_queue *rq = info->rq; 947 - struct gendisk *gd = info->gd; 948 947 unsigned int segments = info->max_indirect_segments ? : 949 948 BLKIF_MAX_SEGMENTS_PER_REQUEST; 950 949 951 - blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); 952 - 953 950 if (info->feature_discard) { 954 - blk_queue_max_discard_sectors(rq, get_capacity(gd)); 955 - rq->limits.discard_granularity = info->discard_granularity ?: 956 - info->physical_sector_size; 957 - rq->limits.discard_alignment = info->discard_alignment; 951 + lim->max_hw_discard_sectors = UINT_MAX; 952 + if (info->discard_granularity) 953 + lim->discard_granularity = info->discard_granularity; 954 + lim->discard_alignment = info->discard_alignment; 958 955 if (info->feature_secdiscard) 959 - blk_queue_max_secure_erase_sectors(rq, 960 - get_capacity(gd)); 956 + lim->max_secure_erase_sectors = UINT_MAX; 961 957 } 962 958 963 959 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 964 - blk_queue_logical_block_size(rq, info->sector_size); 965 - blk_queue_physical_block_size(rq, info->physical_sector_size); 966 - blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); 960 + lim->logical_block_size = info->sector_size; 961 + lim->physical_block_size = info->physical_sector_size; 962 + lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512; 967 963 968 964 /* Each segment in a request is up to an aligned page in size. */ 969 - blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 970 - blk_queue_max_segment_size(rq, PAGE_SIZE); 965 + lim->seg_boundary_mask = PAGE_SIZE - 1; 966 + lim->max_segment_size = PAGE_SIZE; 971 967 972 968 /* Ensure a merged request will fit in a single I/O ring slot. 
*/ 973 - blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); 969 + lim->max_segments = segments / GRANTS_PER_PSEG; 974 970 975 971 /* Make sure buffer addresses are sector-aligned. */ 976 - blk_queue_dma_alignment(rq, 511); 972 + lim->dma_alignment = 511; 977 973 } 978 974 979 975 static const char *flush_info(struct blkfront_info *info) ··· 1066 1070 struct blkfront_info *info, u16 sector_size, 1067 1071 unsigned int physical_sector_size) 1068 1072 { 1073 + struct queue_limits lim = {}; 1069 1074 struct gendisk *gd; 1070 1075 int nr_minors = 1; 1071 1076 int err; ··· 1133 1136 if (err) 1134 1137 goto out_release_minors; 1135 1138 1136 - gd = blk_mq_alloc_disk(&info->tag_set, info); 1139 + blkif_set_queue_limits(info, &lim); 1140 + gd = blk_mq_alloc_disk(&info->tag_set, &lim, info); 1137 1141 if (IS_ERR(gd)) { 1138 1142 err = PTR_ERR(gd); 1139 1143 goto out_free_tag_set; 1140 1144 } 1145 + blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue); 1141 1146 1142 1147 strcpy(gd->disk_name, DEV_NAME); 1143 1148 ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); ··· 1161 1162 info->gd = gd; 1162 1163 info->sector_size = sector_size; 1163 1164 info->physical_sector_size = physical_sector_size; 1164 - blkif_set_queue_limits(info); 1165 1165 1166 1166 xlvbd_flush(info); 1167 1167 ··· 2004 2006 2005 2007 static int blkif_recover(struct blkfront_info *info) 2006 2008 { 2009 + struct queue_limits lim; 2007 2010 unsigned int r_index; 2008 2011 struct request *req, *n; 2009 2012 int rc; 2010 2013 struct bio *bio; 2011 - unsigned int segs; 2012 2014 struct blkfront_ring_info *rinfo; 2013 2015 2016 + lim = queue_limits_start_update(info->rq); 2014 2017 blkfront_gather_backend_features(info); 2015 - /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ 2016 - blkif_set_queue_limits(info); 2017 - segs = info->max_indirect_segments ? 
: BLKIF_MAX_SEGMENTS_PER_REQUEST; 2018 - blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); 2018 + blkif_set_queue_limits(info, &lim); 2019 + rc = queue_limits_commit_update(info->rq, &lim); 2020 + if (rc) 2021 + return rc; 2019 2022 2020 2023 for_each_rinfo(info, rinfo, r_index) { 2021 2024 rc = blkfront_setup_indirect(rinfo); ··· 2036 2037 list_for_each_entry_safe(req, n, &info->requests, queuelist) { 2037 2038 /* Requeue pending requests (flush or discard) */ 2038 2039 list_del_init(&req->queuelist); 2039 - BUG_ON(req->nr_phys_segments > segs); 2040 + BUG_ON(req->nr_phys_segments > 2041 + (info->max_indirect_segments ? : 2042 + BLKIF_MAX_SEGMENTS_PER_REQUEST)); 2040 2043 blk_mq_requeue_request(req, false); 2041 2044 } 2042 2045 blk_mq_start_stopped_hw_queues(info->rq, true);
+1 -1
drivers/block/z2ram.c
··· 318 318 struct gendisk *disk; 319 319 int err; 320 320 321 - disk = blk_mq_alloc_disk(&tag_set, NULL); 321 + disk = blk_mq_alloc_disk(&tag_set, NULL, NULL); 322 322 if (IS_ERR(disk)) 323 323 return PTR_ERR(disk); 324 324
+25 -26
drivers/block/zram/zram_drv.c
··· 2177 2177 */ 2178 2178 static int zram_add(void) 2179 2179 { 2180 + struct queue_limits lim = { 2181 + .logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE, 2182 + /* 2183 + * To ensure that we always get PAGE_SIZE aligned and 2184 + * n*PAGE_SIZED sized I/O requests. 2185 + */ 2186 + .physical_block_size = PAGE_SIZE, 2187 + .io_min = PAGE_SIZE, 2188 + .io_opt = PAGE_SIZE, 2189 + .max_hw_discard_sectors = UINT_MAX, 2190 + /* 2191 + * zram_bio_discard() will clear all logical blocks if logical 2192 + * block size is identical with physical block size(PAGE_SIZE). 2193 + * But if it is different, we will skip discarding some parts of 2194 + * logical blocks in the part of the request range which isn't 2195 + * aligned to physical block size. So we can't ensure that all 2196 + * discarded logical blocks are zeroed. 2197 + */ 2198 + #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE 2199 + .max_write_zeroes_sectors = UINT_MAX, 2200 + #endif 2201 + }; 2180 2202 struct zram *zram; 2181 2203 int ret, device_id; 2182 2204 ··· 2217 2195 #endif 2218 2196 2219 2197 /* gendisk structure */ 2220 - zram->disk = blk_alloc_disk(NUMA_NO_NODE); 2221 - if (!zram->disk) { 2198 + zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 2199 + if (IS_ERR(zram->disk)) { 2222 2200 pr_err("Error allocating disk structure for device %d\n", 2223 2201 device_id); 2224 - ret = -ENOMEM; 2202 + ret = PTR_ERR(zram->disk); 2225 2203 goto out_free_idr; 2226 2204 } 2227 2205 ··· 2238 2216 /* zram devices sort of resembles non-rotational disks */ 2239 2217 blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); 2240 2218 blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); 2241 - 2242 - /* 2243 - * To ensure that we always get PAGE_SIZE aligned 2244 - * and n*PAGE_SIZED sized I/O requests. 
2245 - */ 2246 - blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); 2247 - blk_queue_logical_block_size(zram->disk->queue, 2248 - ZRAM_LOGICAL_BLOCK_SIZE); 2249 - blk_queue_io_min(zram->disk->queue, PAGE_SIZE); 2250 - blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); 2251 - blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); 2252 - 2253 - /* 2254 - * zram_bio_discard() will clear all logical blocks if logical block 2255 - * size is identical with physical block size(PAGE_SIZE). But if it is 2256 - * different, we will skip discarding some parts of logical blocks in 2257 - * the part of the request range which isn't aligned to physical block 2258 - * size. So we can't ensure that all discarded logical blocks are 2259 - * zeroed. 2260 - */ 2261 - if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) 2262 - blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); 2263 - 2264 2219 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); 2265 2220 ret = device_add_disk(NULL, zram->disk, zram_disk_groups); 2266 2221 if (ret)
+10 -10
drivers/cdrom/gdrom.c
··· 724 724 725 725 static int probe_gdrom_setupqueue(void) 726 726 { 727 - blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR); 728 - /* using DMA so memory will need to be contiguous */ 729 - blk_queue_max_segments(gd.gdrom_rq, 1); 730 - /* set a large max size to get most from DMA */ 731 - blk_queue_max_segment_size(gd.gdrom_rq, 0x40000); 732 727 gd.disk->queue = gd.gdrom_rq; 733 728 return gdrom_init_dma_mode(); 734 729 } ··· 738 743 */ 739 744 static int probe_gdrom(struct platform_device *devptr) 740 745 { 746 + struct queue_limits lim = { 747 + .logical_block_size = GDROM_HARD_SECTOR, 748 + /* using DMA so memory will need to be contiguous */ 749 + .max_segments = 1, 750 + /* set a large max size to get most from DMA */ 751 + .max_segment_size = 0x40000, 752 + }; 741 753 int err; 742 754 743 755 /* ··· 780 778 if (err) 781 779 goto probe_fail_free_cd_info; 782 780 783 - gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); 781 + gd.disk = blk_mq_alloc_disk(&gd.tag_set, &lim, NULL); 784 782 if (IS_ERR(gd.disk)) { 785 783 err = PTR_ERR(gd.disk); 786 784 goto probe_fail_free_tag_set; ··· 831 829 return err; 832 830 } 833 831 834 - static int remove_gdrom(struct platform_device *devptr) 832 + static void remove_gdrom(struct platform_device *devptr) 835 833 { 836 834 blk_mq_free_tag_set(&gd.tag_set); 837 835 free_irq(HW_EVENT_GDROM_CMD, &gd); ··· 842 840 unregister_cdrom(gd.cd_info); 843 841 kfree(gd.cd_info); 844 842 kfree(gd.toc); 845 - 846 - return 0; 847 843 } 848 844 849 845 static struct platform_driver gdrom_driver = { 850 846 .probe = probe_gdrom, 851 - .remove = remove_gdrom, 847 + .remove_new = remove_gdrom, 852 848 .driver = { 853 849 .name = GDROM_DEV_NAME, 854 850 },
+30 -29
drivers/md/bcache/super.c
··· 900 900 struct request_queue *q; 901 901 const size_t max_stripes = min_t(size_t, INT_MAX, 902 902 SIZE_MAX / sizeof(atomic_t)); 903 + struct queue_limits lim = { 904 + .max_hw_sectors = UINT_MAX, 905 + .max_sectors = UINT_MAX, 906 + .max_segment_size = UINT_MAX, 907 + .max_segments = BIO_MAX_VECS, 908 + .max_hw_discard_sectors = UINT_MAX, 909 + .io_min = block_size, 910 + .logical_block_size = block_size, 911 + .physical_block_size = block_size, 912 + }; 903 913 uint64_t n; 904 914 int idx; 905 915 916 + if (cached_bdev) { 917 + d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT; 918 + lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev)); 919 + } 906 920 if (!d->stripe_size) 907 921 d->stripe_size = 1 << 31; 908 922 else if (d->stripe_size < BCH_MIN_STRIPE_SZ) ··· 949 935 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) 950 936 goto out_ida_remove; 951 937 952 - d->disk = blk_alloc_disk(NUMA_NO_NODE); 953 - if (!d->disk) 938 + if (lim.logical_block_size > PAGE_SIZE && cached_bdev) { 939 + /* 940 + * This should only happen with BCACHE_SB_VERSION_BDEV. 941 + * Block/page size is checked for BCACHE_SB_VERSION_CDEV. 
942 + */ 943 + pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", 944 + idx, lim.logical_block_size, 945 + PAGE_SIZE, bdev_logical_block_size(cached_bdev)); 946 + 947 + /* This also adjusts physical block size/min io size if needed */ 948 + lim.logical_block_size = bdev_logical_block_size(cached_bdev); 949 + } 950 + 951 + d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 952 + if (IS_ERR(d->disk)) 954 953 goto out_bioset_exit; 955 954 956 955 set_capacity(d->disk, sectors); ··· 976 949 d->disk->private_data = d; 977 950 978 951 q = d->disk->queue; 979 - q->limits.max_hw_sectors = UINT_MAX; 980 - q->limits.max_sectors = UINT_MAX; 981 - q->limits.max_segment_size = UINT_MAX; 982 - q->limits.max_segments = BIO_MAX_VECS; 983 - blk_queue_max_discard_sectors(q, UINT_MAX); 984 - q->limits.io_min = block_size; 985 - q->limits.logical_block_size = block_size; 986 - q->limits.physical_block_size = block_size; 987 - 988 - if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) { 989 - /* 990 - * This should only happen with BCACHE_SB_VERSION_BDEV. 991 - * Block/page size is checked for BCACHE_SB_VERSION_CDEV. 
992 - */ 993 - pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", 994 - d->disk->disk_name, q->limits.logical_block_size, 995 - PAGE_SIZE, bdev_logical_block_size(cached_bdev)); 996 - 997 - /* This also adjusts physical block size/min io size if needed */ 998 - blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev)); 999 - } 1000 952 1001 953 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); 1002 954 ··· 1422 1416 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1423 1417 } 1424 1418 1425 - dc->disk.stripe_size = q->limits.io_opt >> 9; 1426 - 1427 - if (dc->disk.stripe_size) 1419 + if (bdev_io_opt(dc->bdev)) 1428 1420 dc->partial_stripes_expensive = 1429 1421 q->limits.raid_partial_stripes_expensive; 1430 1422 ··· 1431 1427 dc->bdev, &bcache_cached_ops); 1432 1428 if (ret) 1433 1429 return ret; 1434 - 1435 - blk_queue_io_opt(dc->disk.disk->queue, 1436 - max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); 1437 1430 1438 1431 atomic_set(&dc->io_errors, 0); 1439 1432 dc->io_disable = false;
+72 -21
drivers/md/dm-raid.c
··· 213 213 #define RT_FLAG_RS_IN_SYNC 6 214 214 #define RT_FLAG_RS_RESYNCING 7 215 215 #define RT_FLAG_RS_GROW 8 216 + #define RT_FLAG_RS_FROZEN 9 216 217 217 218 /* Array elements of 64 bit needed for rebuild/failed disk bits */ 218 219 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) ··· 3241 3240 rs->md.ro = 1; 3242 3241 rs->md.in_sync = 1; 3243 3242 3244 - /* Keep array frozen until resume. */ 3245 - set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 3246 - 3247 3243 /* Has to be held on running the array */ 3248 3244 mddev_suspend_and_lock_nointr(&rs->md); 3245 + 3246 + /* Keep array frozen until resume. */ 3247 + md_frozen_sync_thread(&rs->md); 3248 + 3249 3249 r = md_run(&rs->md); 3250 3250 rs->md.in_sync = 0; /* Assume already marked dirty */ 3251 3251 if (r) { ··· 3341 3339 if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) 3342 3340 return DM_MAPIO_REQUEUE; 3343 3341 3344 - md_handle_request(mddev, bio); 3342 + if (unlikely(!md_handle_request(mddev, bio))) 3343 + return DM_MAPIO_REQUEUE; 3345 3344 3346 3345 return DM_MAPIO_SUBMITTED; 3347 3346 } ··· 3721 3718 { 3722 3719 struct raid_set *rs = ti->private; 3723 3720 struct mddev *mddev = &rs->md; 3721 + int ret = 0; 3724 3722 3725 3723 if (!mddev->pers || !mddev->pers->sync_request) 3726 3724 return -EINVAL; 3727 3725 3728 - if (!strcasecmp(argv[0], "frozen")) 3729 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3730 - else 3731 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3726 + if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) || 3727 + test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags)) 3728 + return -EBUSY; 3732 3729 3733 - if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { 3734 - if (mddev->sync_thread) { 3735 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3736 - md_reap_sync_thread(mddev); 3737 - } 3738 - } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) 3730 + if (!strcasecmp(argv[0], "frozen")) { 
3731 + ret = mddev_lock(mddev); 3732 + if (ret) 3733 + return ret; 3734 + 3735 + md_frozen_sync_thread(mddev); 3736 + mddev_unlock(mddev); 3737 + } else if (!strcasecmp(argv[0], "idle")) { 3738 + ret = mddev_lock(mddev); 3739 + if (ret) 3740 + return ret; 3741 + 3742 + md_idle_sync_thread(mddev); 3743 + mddev_unlock(mddev); 3744 + } 3745 + 3746 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3747 + if (decipher_sync_action(mddev, mddev->recovery) != st_idle) 3739 3748 return -EBUSY; 3740 3749 else if (!strcasecmp(argv[0], "resync")) 3741 3750 ; /* MD_RECOVERY_NEEDED set below */ ··· 3806 3791 blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); 3807 3792 } 3808 3793 3794 + static void raid_presuspend(struct dm_target *ti) 3795 + { 3796 + struct raid_set *rs = ti->private; 3797 + struct mddev *mddev = &rs->md; 3798 + 3799 + /* 3800 + * From now on, disallow raid_message() to change sync_thread until 3801 + * resume, raid_postsuspend() is too late. 3802 + */ 3803 + set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); 3804 + 3805 + if (!reshape_interrupted(mddev)) 3806 + return; 3807 + 3808 + /* 3809 + * For raid456, if reshape is interrupted, IO across reshape position 3810 + * will never make progress, while caller will wait for IO to be done. 3811 + * Inform raid456 to handle those IO to prevent deadlock. 3812 + */ 3813 + if (mddev->pers && mddev->pers->prepare_suspend) 3814 + mddev->pers->prepare_suspend(mddev); 3815 + } 3816 + 3817 + static void raid_presuspend_undo(struct dm_target *ti) 3818 + { 3819 + struct raid_set *rs = ti->private; 3820 + 3821 + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); 3822 + } 3823 + 3809 3824 static void raid_postsuspend(struct dm_target *ti) 3810 3825 { 3811 3826 struct raid_set *rs = ti->private; 3812 3827 3813 3828 if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { 3814 - /* Writes have to be stopped before suspending to avoid deadlocks. 
*/ 3815 - if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) 3816 - md_stop_writes(&rs->md); 3817 - 3829 + /* 3830 + * sync_thread must be stopped during suspend, and writes have 3831 + * to be stopped before suspending to avoid deadlocks. 3832 + */ 3833 + md_stop_writes(&rs->md); 3818 3834 mddev_suspend(&rs->md, false); 3819 3835 } 3820 3836 } ··· 4058 4012 } 4059 4013 4060 4014 /* Check for any resize/reshape on @rs and adjust/initiate */ 4061 - /* Be prepared for mddev_resume() in raid_resume() */ 4062 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4063 4015 if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { 4064 4016 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4065 4017 mddev->resync_min = mddev->recovery_cp; ··· 4091 4047 * Take this opportunity to check whether any failed 4092 4048 * devices are reachable again. 4093 4049 */ 4050 + mddev_lock_nointr(mddev); 4094 4051 attempt_restore_of_faulty_devices(rs); 4052 + mddev_unlock(mddev); 4095 4053 } 4096 4054 4097 4055 if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { ··· 4101 4055 if (mddev->delta_disks < 0) 4102 4056 rs_set_capacity(rs); 4103 4057 4058 + WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); 4059 + WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4060 + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); 4104 4061 mddev_lock_nointr(mddev); 4105 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4106 4062 mddev->ro = 0; 4107 4063 mddev->in_sync = 0; 4064 + md_unfrozen_sync_thread(mddev); 4108 4065 mddev_unlock_and_resume(mddev); 4109 4066 } 4110 4067 } ··· 4123 4074 .message = raid_message, 4124 4075 .iterate_devices = raid_iterate_devices, 4125 4076 .io_hints = raid_io_hints, 4077 + .presuspend = raid_presuspend, 4078 + .presuspend_undo = raid_presuspend_undo, 4126 4079 .postsuspend = raid_postsuspend, 4127 4080 .preresume = raid_preresume, 4128 4081 .resume = raid_resume,
+12 -15
drivers/md/dm-table.c
··· 1963 1963 bool wc = false, fua = false; 1964 1964 int r; 1965 1965 1966 - /* 1967 - * Copy table's limits to the DM device's request_queue 1968 - */ 1969 - q->limits = *limits; 1970 - 1971 1966 if (dm_table_supports_nowait(t)) 1972 1967 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); 1973 1968 else 1974 1969 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); 1975 1970 1976 1971 if (!dm_table_supports_discards(t)) { 1977 - q->limits.max_discard_sectors = 0; 1978 - q->limits.max_hw_discard_sectors = 0; 1979 - q->limits.discard_granularity = 0; 1980 - q->limits.discard_alignment = 0; 1981 - q->limits.discard_misaligned = 0; 1972 + limits->max_hw_discard_sectors = 0; 1973 + limits->discard_granularity = 0; 1974 + limits->discard_alignment = 0; 1975 + limits->discard_misaligned = 0; 1982 1976 } 1983 1977 1978 + if (!dm_table_supports_write_zeroes(t)) 1979 + limits->max_write_zeroes_sectors = 0; 1980 + 1984 1981 if (!dm_table_supports_secure_erase(t)) 1985 - q->limits.max_secure_erase_sectors = 0; 1982 + limits->max_secure_erase_sectors = 0; 1983 + 1984 + r = queue_limits_set(q, limits); 1985 + if (r) 1986 + return r; 1986 1987 1987 1988 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { 1988 1989 wc = true; ··· 2007 2006 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 2008 2007 else 2009 2008 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 2010 - 2011 - if (!dm_table_supports_write_zeroes(t)) 2012 - q->limits.max_write_zeroes_sectors = 0; 2013 2009 2014 2010 dm_table_verify_integrity(t); 2015 2011 ··· 2045 2047 } 2046 2048 2047 2049 dm_update_crypto_profile(q, t); 2048 - disk_update_readahead(t->md->disk); 2049 2050 2050 2051 /* 2051 2052 * Check for request-based device is left to
+4 -1
drivers/md/dm-zoned-metadata.c
··· 1655 1655 1656 1656 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { 1657 1657 struct dmz_dev *dev = zone->dev; 1658 + unsigned int noio_flag; 1658 1659 1660 + noio_flag = memalloc_noio_save(); 1659 1661 ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, 1660 1662 dmz_start_sect(zmd, zone), 1661 - zmd->zone_nr_sectors, GFP_NOIO); 1663 + zmd->zone_nr_sectors); 1664 + memalloc_noio_restore(noio_flag); 1662 1665 if (ret) { 1663 1666 dmz_dev_err(dev, "Reset zone %u failed %d", 1664 1667 zone->id, ret);
+2 -2
drivers/md/dm.c
··· 2101 2101 * established. If request-based table is loaded: blk-mq will 2102 2102 * override accordingly. 2103 2103 */ 2104 - md->disk = blk_alloc_disk(md->numa_node_id); 2105 - if (!md->disk) 2104 + md->disk = blk_alloc_disk(NULL, md->numa_node_id); 2105 + if (IS_ERR(md->disk)) 2106 2106 goto bad; 2107 2107 md->queue = md->disk->queue; 2108 2108
+9 -9
drivers/md/md-bitmap.c
··· 234 234 sector_t doff; 235 235 236 236 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; 237 - if (pg_index == store->file_pages - 1) { 237 + /* we compare length (page numbers), not page offset. */ 238 + if ((pg_index - store->sb_index) == store->file_pages - 1) { 238 239 unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); 239 240 240 241 if (last_page_size == 0) ··· 439 438 struct page *page = store->filemap[pg_index]; 440 439 441 440 if (mddev_is_clustered(bitmap->mddev)) { 442 - pg_index += bitmap->cluster_slot * 443 - DIV_ROUND_UP(store->bytes, PAGE_SIZE); 441 + /* go to node bitmap area starting point */ 442 + pg_index += store->sb_index; 444 443 } 445 444 446 445 if (store->file) ··· 953 952 unsigned long index = file_page_index(store, chunk); 954 953 unsigned long node_offset = 0; 955 954 955 + index += store->sb_index; 956 956 if (mddev_is_clustered(bitmap->mddev)) 957 957 node_offset = bitmap->cluster_slot * store->file_pages; 958 958 ··· 984 982 unsigned long index = file_page_index(store, chunk); 985 983 unsigned long node_offset = 0; 986 984 985 + index += store->sb_index; 987 986 if (mddev_is_clustered(bitmap->mddev)) 988 987 node_offset = bitmap->cluster_slot * store->file_pages; 989 988 ··· 1046 1043 if (dirty || need_write) { 1047 1044 if (!writing) { 1048 1045 md_bitmap_wait_writes(bitmap); 1049 - if (bitmap->mddev->queue) 1050 - blk_add_trace_msg(bitmap->mddev->queue, 1051 - "md bitmap_unplug"); 1046 + mddev_add_trace_msg(bitmap->mddev, 1047 + "md bitmap_unplug"); 1052 1048 } 1053 1049 clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); 1054 1050 filemap_write_page(bitmap, i, false); ··· 1318 1316 } 1319 1317 bitmap->allclean = 1; 1320 1318 1321 - if (bitmap->mddev->queue) 1322 - blk_add_trace_msg(bitmap->mddev->queue, 1323 - "md bitmap_daemon_work"); 1319 + mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work"); 1324 1320 1325 1321 /* Any file-page which is PENDING now needs to be written. 
1326 1322 * So set NEEDWRITE now, then after we make any last-minute changes
-17
drivers/md/md-linear.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef _LINEAR_H 3 - #define _LINEAR_H 4 - 5 - struct dev_info { 6 - struct md_rdev *rdev; 7 - sector_t end_sector; 8 - }; 9 - 10 - struct linear_conf 11 - { 12 - struct rcu_head rcu; 13 - sector_t array_sectors; 14 - int raid_disks; /* a copy of mddev->raid_disks */ 15 - struct dev_info disks[] __counted_by(raid_disks); 16 - }; 17 - #endif
-32
drivers/md/md-multipath.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef _MULTIPATH_H 3 - #define _MULTIPATH_H 4 - 5 - struct multipath_info { 6 - struct md_rdev *rdev; 7 - }; 8 - 9 - struct mpconf { 10 - struct mddev *mddev; 11 - struct multipath_info *multipaths; 12 - int raid_disks; 13 - spinlock_t device_lock; 14 - struct list_head retry_list; 15 - 16 - mempool_t pool; 17 - }; 18 - 19 - /* 20 - * this is our 'private' 'collective' MULTIPATH buffer head. 21 - * it contains information about what kind of IO operations were started 22 - * for this MULTIPATH operation, and about their status: 23 - */ 24 - 25 - struct multipath_bh { 26 - struct mddev *mddev; 27 - struct bio *master_bio; 28 - struct bio bio; 29 - int path; 30 - struct list_head retry_list; 31 - }; 32 - #endif
+232 -168
drivers/md/md.c
··· 65 65 #include <linux/percpu-refcount.h> 66 66 #include <linux/part_stat.h> 67 67 68 - #include <trace/events/block.h> 69 68 #include "md.h" 70 69 #include "md-bitmap.h" 71 70 #include "md-cluster.h" ··· 97 98 static void mddev_detach(struct mddev *mddev); 98 99 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); 99 100 static void md_wakeup_thread_directly(struct md_thread __rcu *thread); 100 - 101 - enum md_ro_state { 102 - MD_RDWR, 103 - MD_RDONLY, 104 - MD_AUTO_READ, 105 - MD_MAX_STATE 106 - }; 107 - 108 - static bool md_is_rdwr(struct mddev *mddev) 109 - { 110 - return (mddev->ro == MD_RDWR); 111 - } 112 101 113 102 /* 114 103 * Default number of read corrections we'll attempt on an rdev ··· 365 378 return true; 366 379 } 367 380 368 - void md_handle_request(struct mddev *mddev, struct bio *bio) 381 + bool md_handle_request(struct mddev *mddev, struct bio *bio) 369 382 { 370 383 check_suspended: 371 384 if (is_suspended(mddev, bio)) { ··· 373 386 /* Bail out if REQ_NOWAIT is set for the bio */ 374 387 if (bio->bi_opf & REQ_NOWAIT) { 375 388 bio_wouldblock_error(bio); 376 - return; 389 + return true; 377 390 } 378 391 for (;;) { 379 392 prepare_to_wait(&mddev->sb_wait, &__wait, ··· 389 402 390 403 if (!mddev->pers->make_request(mddev, bio)) { 391 404 percpu_ref_put(&mddev->active_io); 405 + if (!mddev->gendisk && mddev->pers->prepare_suspend) 406 + return false; 392 407 goto check_suspended; 393 408 } 394 409 395 410 percpu_ref_put(&mddev->active_io); 411 + return true; 396 412 } 397 413 EXPORT_SYMBOL(md_handle_request); 398 414 ··· 518 528 return __mddev_resume(mddev, true); 519 529 } 520 530 EXPORT_SYMBOL_GPL(mddev_resume); 531 + 532 + /* sync bdev before setting device to readonly or stopping raid*/ 533 + static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 534 + { 535 + mutex_lock(&mddev->open_mutex); 536 + if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 537 + 
mutex_unlock(&mddev->open_mutex); 538 + return -EBUSY; 539 + } 540 + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 541 + mutex_unlock(&mddev->open_mutex); 542 + return -EBUSY; 543 + } 544 + mutex_unlock(&mddev->open_mutex); 545 + 546 + sync_blockdev(mddev->gendisk->part0); 547 + return 0; 548 + } 521 549 522 550 /* 523 551 * Generic flush handling for md ··· 2414 2406 2415 2407 if (list_empty(&mddev->disks)) 2416 2408 return 0; /* nothing to do */ 2417 - if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2409 + if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) 2418 2410 return 0; /* shouldn't register, or already is */ 2419 2411 rdev_for_each(rdev, mddev) { 2420 2412 /* skip spares and non-functional disks */ ··· 2467 2459 { 2468 2460 struct blk_integrity *bi_mddev; 2469 2461 2470 - if (!mddev->gendisk) 2462 + if (mddev_is_dm(mddev)) 2471 2463 return 0; 2472 2464 2473 2465 bi_mddev = blk_get_integrity(mddev->gendisk); ··· 2574 2566 fail: 2575 2567 pr_warn("md: failed to register dev-%s for %s\n", 2576 2568 b, mdname(mddev)); 2569 + mddev_destroy_serial_pool(mddev, rdev); 2577 2570 return err; 2578 2571 } 2579 2572 ··· 2604 2595 list_del_rcu(&rdev->same_set); 2605 2596 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2606 2597 mddev_destroy_serial_pool(rdev->mddev, rdev); 2607 - rdev->mddev = NULL; 2598 + WRITE_ONCE(rdev->mddev, NULL); 2608 2599 sysfs_remove_link(&rdev->kobj, "block"); 2609 2600 sysfs_put(rdev->sysfs_state); 2610 2601 sysfs_put(rdev->sysfs_unack_badblocks); ··· 2860 2851 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2861 2852 mdname(mddev), mddev->in_sync); 2862 2853 2863 - if (mddev->queue) 2864 - blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2854 + mddev_add_trace_msg(mddev, "md md_update_sb"); 2865 2855 rewrite: 2866 2856 md_bitmap_update_sb(mddev->bitmap); 2867 2857 rdev_for_each(rdev, mddev) { ··· 2941 2933 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2942 2934 set_bit(MD_RECOVERY_NEEDED, 
&mddev->recovery); 2943 2935 md_new_event(); 2944 - md_wakeup_thread(mddev->thread); 2945 2936 return 0; 2946 2937 } 2947 2938 ··· 3055 3048 3056 3049 if (err == 0) { 3057 3050 md_kick_rdev_from_array(rdev); 3058 - if (mddev->pers) { 3051 + if (mddev->pers) 3059 3052 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3060 - md_wakeup_thread(mddev->thread); 3061 - } 3062 3053 md_new_event(); 3063 3054 } 3064 3055 } ··· 3086 3081 clear_bit(BlockedBadBlocks, &rdev->flags); 3087 3082 wake_up(&rdev->blocked_wait); 3088 3083 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3089 - md_wakeup_thread(rdev->mddev->thread); 3090 3084 3091 3085 err = 0; 3092 3086 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { ··· 3123 3119 !test_bit(Replacement, &rdev->flags)) 3124 3120 set_bit(WantReplacement, &rdev->flags); 3125 3121 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3126 - md_wakeup_thread(rdev->mddev->thread); 3127 3122 err = 0; 3128 3123 } else if (cmd_match(buf, "-want_replacement")) { 3129 3124 /* Clearing 'want_replacement' is always allowed. ··· 3252 3249 if (rdev->raid_disk >= 0) 3253 3250 return -EBUSY; 3254 3251 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3255 - md_wakeup_thread(rdev->mddev->thread); 3256 3252 } else if (rdev->mddev->pers) { 3257 3253 /* Activating a spare .. or possibly reactivating 3258 3254 * if we ever get bitmaps working here. 
··· 3345 3343 if (kstrtoull(buf, 10, &new_offset) < 0) 3346 3344 return -EINVAL; 3347 3345 3348 - if (mddev->sync_thread || 3349 - test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3346 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3350 3347 return -EBUSY; 3351 3348 if (new_offset == rdev->data_offset) 3352 3349 /* reset is always permitted */ ··· 3676 3675 struct kernfs_node *kn = NULL; 3677 3676 bool suspend = false; 3678 3677 ssize_t rv; 3679 - struct mddev *mddev = rdev->mddev; 3678 + struct mddev *mddev = READ_ONCE(rdev->mddev); 3680 3679 3681 3680 if (!entry->store) 3682 3681 return -EIO; ··· 4018 4017 */ 4019 4018 4020 4019 rv = -EBUSY; 4021 - if (mddev->sync_thread || 4022 - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4020 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4023 4021 mddev->reshape_position != MaxSector || 4024 4022 mddev->sysfs_active) 4025 4023 goto out_unlock; ··· 4168 4168 mddev->in_sync = 1; 4169 4169 del_timer_sync(&mddev->safemode_timer); 4170 4170 } 4171 - blk_set_stacking_limits(&mddev->queue->limits); 4172 4171 pers->run(mddev); 4173 4172 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4174 4173 if (!mddev->thread) ··· 4474 4475 return sprintf(page, "%s\n", array_states[st]); 4475 4476 } 4476 4477 4477 - static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4478 - static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4478 + static int do_md_stop(struct mddev *mddev, int ro); 4479 + static int md_set_readonly(struct mddev *mddev); 4479 4480 static int restart_array(struct mddev *mddev); 4480 4481 4481 4482 static ssize_t ··· 4492 4493 case broken: /* cannot be set */ 4493 4494 case bad_word: 4494 4495 return -EINVAL; 4496 + case clear: 4497 + case readonly: 4498 + case inactive: 4499 + case read_auto: 4500 + if (!mddev->pers || !md_is_rdwr(mddev)) 4501 + break; 4502 + /* write sysfs will not open mddev and opener should be 0 */ 4503 + err = 
mddev_set_closing_and_sync_blockdev(mddev, 0); 4504 + if (err) 4505 + return err; 4506 + break; 4495 4507 default: 4496 4508 break; 4497 4509 } ··· 4536 4526 case inactive: 4537 4527 /* stop an active array, return 0 otherwise */ 4538 4528 if (mddev->pers) 4539 - err = do_md_stop(mddev, 2, NULL); 4529 + err = do_md_stop(mddev, 2); 4540 4530 break; 4541 4531 case clear: 4542 - err = do_md_stop(mddev, 0, NULL); 4532 + err = do_md_stop(mddev, 0); 4543 4533 break; 4544 4534 case readonly: 4545 4535 if (mddev->pers) 4546 - err = md_set_readonly(mddev, NULL); 4536 + err = md_set_readonly(mddev); 4547 4537 else { 4548 4538 mddev->ro = MD_RDONLY; 4549 4539 set_disk_ro(mddev->gendisk, 1); ··· 4553 4543 case read_auto: 4554 4544 if (mddev->pers) { 4555 4545 if (md_is_rdwr(mddev)) 4556 - err = md_set_readonly(mddev, NULL); 4546 + err = md_set_readonly(mddev); 4557 4547 else if (mddev->ro == MD_RDONLY) 4558 4548 err = restart_array(mddev); 4559 4549 if (err == 0) { ··· 4602 4592 sysfs_notify_dirent_safe(mddev->sysfs_state); 4603 4593 } 4604 4594 mddev_unlock(mddev); 4595 + 4596 + if (st == readonly || st == read_auto || st == inactive || 4597 + (err && st == clear)) 4598 + clear_bit(MD_CLOSING, &mddev->flags); 4599 + 4605 4600 return err ?: len; 4606 4601 } 4607 4602 static struct md_sysfs_entry md_array_state = ··· 4933 4918 if (locked) 4934 4919 mddev_lock_nointr(mddev); 4935 4920 } 4921 + 4922 + void md_idle_sync_thread(struct mddev *mddev) 4923 + { 4924 + lockdep_assert_held(&mddev->reconfig_mutex); 4925 + 4926 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4927 + stop_sync_thread(mddev, true, true); 4928 + } 4929 + EXPORT_SYMBOL_GPL(md_idle_sync_thread); 4930 + 4931 + void md_frozen_sync_thread(struct mddev *mddev) 4932 + { 4933 + lockdep_assert_held(&mddev->reconfig_mutex); 4934 + 4935 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4936 + stop_sync_thread(mddev, true, false); 4937 + } 4938 + EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 4939 + 4940 + void 
md_unfrozen_sync_thread(struct mddev *mddev) 4941 + { 4942 + lockdep_assert_held(&mddev->reconfig_mutex); 4943 + 4944 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4945 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4946 + md_wakeup_thread(mddev->thread); 4947 + sysfs_notify_dirent_safe(mddev->sysfs_action); 4948 + } 4949 + EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 4936 4950 4937 4951 static void idle_sync_thread(struct mddev *mddev) 4938 4952 { ··· 5754 5710 5755 5711 int mdp_major = 0; 5756 5712 5713 + /* stack the limit for all rdevs into lim */ 5714 + void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) 5715 + { 5716 + struct md_rdev *rdev; 5717 + 5718 + rdev_for_each(rdev, mddev) { 5719 + queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5720 + mddev->gendisk->disk_name); 5721 + } 5722 + } 5723 + EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5724 + 5725 + /* apply the extra stacking limits from a new rdev into mddev */ 5726 + int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5727 + { 5728 + struct queue_limits lim; 5729 + 5730 + if (mddev_is_dm(mddev)) 5731 + return 0; 5732 + 5733 + lim = queue_limits_start_update(mddev->gendisk->queue); 5734 + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5735 + mddev->gendisk->disk_name); 5736 + return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5737 + } 5738 + EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5739 + 5740 + /* update the optimal I/O size after a reshape */ 5741 + void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5742 + { 5743 + struct queue_limits lim; 5744 + 5745 + if (mddev_is_dm(mddev)) 5746 + return; 5747 + 5748 + /* don't bother updating io_opt if we can't suspend the array */ 5749 + if (mddev_suspend(mddev, false) < 0) 5750 + return; 5751 + lim = queue_limits_start_update(mddev->gendisk->queue); 5752 + lim.io_opt = lim.io_min * nr_stripes; 5753 + queue_limits_commit_update(mddev->gendisk->queue, &lim); 
5754 + mddev_resume(mddev); 5755 + } 5756 + EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5757 + 5757 5758 static void mddev_delayed_delete(struct work_struct *ws) 5758 5759 { 5759 5760 struct mddev *mddev = container_of(ws, struct mddev, del_work); ··· 5863 5774 */ 5864 5775 mddev->hold_active = UNTIL_STOP; 5865 5776 5866 - error = -ENOMEM; 5867 - disk = blk_alloc_disk(NUMA_NO_NODE); 5868 - if (!disk) 5777 + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 5778 + if (IS_ERR(disk)) { 5779 + error = PTR_ERR(disk); 5869 5780 goto out_free_mddev; 5781 + } 5870 5782 5871 5783 disk->major = MAJOR(mddev->unit); 5872 5784 disk->first_minor = unit << shift; ··· 5881 5791 disk->fops = &md_fops; 5882 5792 disk->private_data = mddev; 5883 5793 5884 - mddev->queue = disk->queue; 5885 - blk_set_stacking_limits(&mddev->queue->limits); 5886 - blk_queue_write_cache(mddev->queue, true, true); 5794 + blk_queue_write_cache(disk->queue, true, true); 5887 5795 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5888 5796 mddev->gendisk = disk; 5889 5797 error = add_disk(disk); ··· 6023 5935 invalidate_bdev(rdev->bdev); 6024 5936 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6025 5937 mddev->ro = MD_RDONLY; 6026 - if (mddev->gendisk) 5938 + if (!mddev_is_dm(mddev)) 6027 5939 set_disk_ro(mddev->gendisk, 1); 6028 5940 } 6029 5941 ··· 6126 6038 pr_warn("True protection against single-disk failure might be compromised.\n"); 6127 6039 } 6128 6040 6129 - mddev->recovery = 0; 6041 + /* dm-raid expect sync_thread to be frozen until resume */ 6042 + if (mddev->gendisk) 6043 + mddev->recovery = 0; 6044 + 6130 6045 /* may be over-ridden by personality */ 6131 6046 mddev->resync_max_sectors = mddev->dev_sectors; 6132 6047 ··· 6185 6094 } 6186 6095 } 6187 6096 6188 - if (mddev->queue) { 6097 + if (!mddev_is_dm(mddev)) { 6098 + struct request_queue *q = mddev->gendisk->queue; 6189 6099 bool nonrot = true; 6190 6100 6191 6101 rdev_for_each(rdev, mddev) { ··· 6198 6106 if (mddev->degraded) 6199 6107 nonrot = false; 
6200 6108 if (nonrot) 6201 - blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6109 + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 6202 6110 else 6203 - blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6204 - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6111 + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 6112 + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); 6205 6113 6206 6114 /* Set the NOWAIT flags if all underlying devices support it */ 6207 6115 if (nowait) 6208 - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6116 + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); 6209 6117 } 6210 6118 if (pers->sync_request) { 6211 6119 if (mddev->kobj.sd && ··· 6284 6192 /* run start up tasks that require md_thread */ 6285 6193 md_start(mddev); 6286 6194 6287 - md_wakeup_thread(mddev->thread); 6288 6195 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6289 6196 6290 6197 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); ··· 6304 6213 6305 6214 if (mddev->pers->start) { 6306 6215 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6307 - md_wakeup_thread(mddev->thread); 6308 6216 ret = mddev->pers->start(mddev); 6309 6217 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6310 6218 md_wakeup_thread(mddev->sync_thread); ··· 6348 6258 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6349 6259 /* Kick recovery or resync if necessary */ 6350 6260 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6351 - md_wakeup_thread(mddev->thread); 6352 6261 md_wakeup_thread(mddev->sync_thread); 6353 6262 sysfs_notify_dirent_safe(mddev->sysfs_state); 6354 6263 return 0; ··· 6367 6278 mddev->persistent = 0; 6368 6279 mddev->level = LEVEL_NONE; 6369 6280 mddev->clevel[0] = 0; 6370 - mddev->flags = 0; 6281 + /* 6282 + * Don't clear MD_CLOSING, or mddev can be opened again. 6283 + * 'hold_active != 0' means mddev is still in the creation 6284 + * process and will be used later. 
6285 + */ 6286 + if (mddev->hold_active) 6287 + mddev->flags = 0; 6288 + else 6289 + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6371 6290 mddev->sb_flags = 0; 6372 6291 mddev->ro = MD_RDWR; 6373 6292 mddev->metadata_type[0] = 0; ··· 6412 6315 6413 6316 static void __md_stop_writes(struct mddev *mddev) 6414 6317 { 6415 - stop_sync_thread(mddev, true, false); 6416 6318 del_timer_sync(&mddev->safemode_timer); 6417 6319 6418 6320 if (mddev->pers && mddev->pers->quiesce) { ··· 6436 6340 void md_stop_writes(struct mddev *mddev) 6437 6341 { 6438 6342 mddev_lock_nointr(mddev); 6343 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6344 + stop_sync_thread(mddev, true, false); 6439 6345 __md_stop_writes(mddev); 6440 6346 mddev_unlock(mddev); 6441 6347 } ··· 6451 6353 mddev->pers->quiesce(mddev, 0); 6452 6354 } 6453 6355 md_unregister_thread(mddev, &mddev->thread); 6454 - if (mddev->queue) 6455 - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6356 + 6357 + /* the unplug fn references 'conf' */ 6358 + if (!mddev_is_dm(mddev)) 6359 + blk_sync_queue(mddev->gendisk->queue); 6456 6360 } 6457 6361 6458 6362 static void __md_stop(struct mddev *mddev) ··· 6491 6391 6492 6392 EXPORT_SYMBOL_GPL(md_stop); 6493 6393 6494 - static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6394 + /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6395 + static int md_set_readonly(struct mddev *mddev) 6495 6396 { 6496 6397 int err = 0; 6497 6398 int did_freeze = 0; ··· 6503 6402 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6504 6403 did_freeze = 1; 6505 6404 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6506 - md_wakeup_thread(mddev->thread); 6507 6405 } 6508 6406 6509 6407 stop_sync_thread(mddev, false, false); ··· 6510 6410 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6511 6411 mddev_lock_nointr(mddev); 6512 6412 6513 - mutex_lock(&mddev->open_mutex); 6514 - if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6515 - 
mddev->sync_thread || 6516 - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6413 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6517 6414 pr_warn("md: %s still in use.\n",mdname(mddev)); 6518 6415 err = -EBUSY; 6519 6416 goto out; 6520 6417 } 6521 6418 6522 - if (mddev->pers) { 6523 - __md_stop_writes(mddev); 6419 + __md_stop_writes(mddev); 6524 6420 6525 - if (mddev->ro == MD_RDONLY) { 6526 - err = -ENXIO; 6527 - goto out; 6528 - } 6529 - 6530 - mddev->ro = MD_RDONLY; 6531 - set_disk_ro(mddev->gendisk, 1); 6421 + if (mddev->ro == MD_RDONLY) { 6422 + err = -ENXIO; 6423 + goto out; 6532 6424 } 6533 6425 6426 + mddev->ro = MD_RDONLY; 6427 + set_disk_ro(mddev->gendisk, 1); 6428 + 6534 6429 out: 6535 - if ((mddev->pers && !err) || did_freeze) { 6430 + if (!err || did_freeze) { 6536 6431 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6537 6432 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6538 - md_wakeup_thread(mddev->thread); 6539 6433 sysfs_notify_dirent_safe(mddev->sysfs_state); 6540 6434 } 6541 6435 6542 - mutex_unlock(&mddev->open_mutex); 6543 6436 return err; 6544 6437 } 6545 6438 ··· 6540 6447 * 0 - completely stop and dis-assemble array 6541 6448 * 2 - stop but do not disassemble array 6542 6449 */ 6543 - static int do_md_stop(struct mddev *mddev, int mode, 6544 - struct block_device *bdev) 6450 + static int do_md_stop(struct mddev *mddev, int mode) 6545 6451 { 6546 6452 struct gendisk *disk = mddev->gendisk; 6547 6453 struct md_rdev *rdev; ··· 6549 6457 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6550 6458 did_freeze = 1; 6551 6459 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6552 - md_wakeup_thread(mddev->thread); 6553 6460 } 6554 6461 6555 6462 stop_sync_thread(mddev, true, false); 6556 6463 6557 - mutex_lock(&mddev->open_mutex); 6558 - if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6559 - mddev->sysfs_active || 6560 - mddev->sync_thread || 6464 + if (mddev->sysfs_active || 6561 6465 test_bit(MD_RECOVERY_RUNNING, 
&mddev->recovery)) { 6562 6466 pr_warn("md: %s still in use.\n",mdname(mddev)); 6563 - mutex_unlock(&mddev->open_mutex); 6564 6467 if (did_freeze) { 6565 6468 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6566 6469 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6567 - md_wakeup_thread(mddev->thread); 6568 6470 } 6569 6471 return -EBUSY; 6570 6472 } ··· 6577 6491 sysfs_unlink_rdev(mddev, rdev); 6578 6492 6579 6493 set_capacity_and_notify(disk, 0); 6580 - mutex_unlock(&mddev->open_mutex); 6581 6494 mddev->changed = 1; 6582 6495 6583 6496 if (!md_is_rdwr(mddev)) 6584 6497 mddev->ro = MD_RDWR; 6585 - } else 6586 - mutex_unlock(&mddev->open_mutex); 6498 + } 6587 6499 /* 6588 6500 * Free resources if final stop 6589 6501 */ ··· 6627 6543 err = do_md_run(mddev); 6628 6544 if (err) { 6629 6545 pr_warn("md: do_md_run() returned %d\n", err); 6630 - do_md_stop(mddev, 0, NULL); 6546 + do_md_stop(mddev, 0); 6631 6547 } 6632 6548 } 6633 6549 ··· 7097 7013 7098 7014 md_kick_rdev_from_array(rdev); 7099 7015 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7100 - if (mddev->thread) 7101 - md_wakeup_thread(mddev->thread); 7102 - else 7016 + if (!mddev->thread) 7103 7017 md_update_sb(mddev, 1); 7104 7018 md_new_event(); 7105 7019 ··· 7172 7090 if (!bdev_nowait(rdev->bdev)) { 7173 7091 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7174 7092 mdname(mddev), rdev->bdev); 7175 - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7093 + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); 7176 7094 } 7177 7095 /* 7178 7096 * Kick recovery, maybe this spare has to be added to the 7179 7097 * array immediately. 7180 7098 */ 7181 7099 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7182 - md_wakeup_thread(mddev->thread); 7183 7100 md_new_event(); 7184 7101 return 0; 7185 7102 ··· 7392 7311 * of each device. If num_sectors is zero, we find the largest size 7393 7312 * that fits. 
7394 7313 */ 7395 - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7396 - mddev->sync_thread) 7314 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7397 7315 return -EBUSY; 7398 7316 if (!md_is_rdwr(mddev)) 7399 7317 return -EROFS; ··· 7409 7329 if (!rv) { 7410 7330 if (mddev_is_clustered(mddev)) 7411 7331 md_cluster_ops->update_size(mddev, old_dev_sectors); 7412 - else if (mddev->queue) { 7332 + else if (!mddev_is_dm(mddev)) 7413 7333 set_capacity_and_notify(mddev->gendisk, 7414 7334 mddev->array_sectors); 7415 - } 7416 7335 } 7417 7336 return rv; 7418 7337 } ··· 7428 7349 if (raid_disks <= 0 || 7429 7350 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7430 7351 return -EINVAL; 7431 - if (mddev->sync_thread || 7432 - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7352 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7433 7353 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7434 7354 mddev->reshape_position != MaxSector) 7435 7355 return -EBUSY; ··· 7624 7546 return 0; 7625 7547 } 7626 7548 7627 - static inline bool md_ioctl_valid(unsigned int cmd) 7549 + static inline int md_ioctl_valid(unsigned int cmd) 7628 7550 { 7629 7551 switch (cmd) { 7630 - case ADD_NEW_DISK: 7631 7552 case GET_ARRAY_INFO: 7632 - case GET_BITMAP_FILE: 7633 7553 case GET_DISK_INFO: 7554 + case RAID_VERSION: 7555 + return 0; 7556 + case ADD_NEW_DISK: 7557 + case GET_BITMAP_FILE: 7634 7558 case HOT_ADD_DISK: 7635 7559 case HOT_REMOVE_DISK: 7636 - case RAID_VERSION: 7637 7560 case RESTART_ARRAY_RW: 7638 7561 case RUN_ARRAY: 7639 7562 case SET_ARRAY_INFO: ··· 7643 7564 case STOP_ARRAY: 7644 7565 case STOP_ARRAY_RO: 7645 7566 case CLUSTERED_DISK_NACK: 7646 - return true; 7567 + if (!capable(CAP_SYS_ADMIN)) 7568 + return -EACCES; 7569 + return 0; 7647 7570 default: 7648 - return false; 7571 + return -ENOTTY; 7649 7572 } 7650 7573 } 7651 7574 ··· 7705 7624 int err = 0; 7706 7625 void __user *argp = (void __user *)arg; 7707 7626 struct mddev *mddev = NULL; 7708 - 
bool did_set_md_closing = false; 7709 7627 7710 - if (!md_ioctl_valid(cmd)) 7711 - return -ENOTTY; 7712 - 7713 - switch (cmd) { 7714 - case RAID_VERSION: 7715 - case GET_ARRAY_INFO: 7716 - case GET_DISK_INFO: 7717 - break; 7718 - default: 7719 - if (!capable(CAP_SYS_ADMIN)) 7720 - return -EACCES; 7721 - } 7628 + err = md_ioctl_valid(cmd); 7629 + if (err) 7630 + return err; 7722 7631 7723 7632 /* 7724 7633 * Commands dealing with the RAID driver but not any 7725 7634 * particular array: 7726 7635 */ 7727 - switch (cmd) { 7728 - case RAID_VERSION: 7729 - err = get_version(argp); 7730 - goto out; 7731 - default:; 7732 - } 7636 + if (cmd == RAID_VERSION) 7637 + return get_version(argp); 7733 7638 7734 7639 /* 7735 7640 * Commands creating/starting a new array: ··· 7723 7656 7724 7657 mddev = bdev->bd_disk->private_data; 7725 7658 7726 - if (!mddev) { 7727 - BUG(); 7728 - goto out; 7729 - } 7730 - 7731 7659 /* Some actions do not requires the mutex */ 7732 7660 switch (cmd) { 7733 7661 case GET_ARRAY_INFO: 7734 7662 if (!mddev->raid_disks && !mddev->external) 7735 - err = -ENODEV; 7736 - else 7737 - err = get_array_info(mddev, argp); 7738 - goto out; 7663 + return -ENODEV; 7664 + return get_array_info(mddev, argp); 7739 7665 7740 7666 case GET_DISK_INFO: 7741 7667 if (!mddev->raid_disks && !mddev->external) 7742 - err = -ENODEV; 7743 - else 7744 - err = get_disk_info(mddev, argp); 7745 - goto out; 7668 + return -ENODEV; 7669 + return get_disk_info(mddev, argp); 7746 7670 7747 7671 case SET_DISK_FAULTY: 7748 - err = set_disk_faulty(mddev, new_decode_dev(arg)); 7749 - goto out; 7672 + return set_disk_faulty(mddev, new_decode_dev(arg)); 7750 7673 7751 7674 case GET_BITMAP_FILE: 7752 - err = get_bitmap_file(mddev, argp); 7753 - goto out; 7754 - 7675 + return get_bitmap_file(mddev, argp); 7755 7676 } 7756 7677 7757 7678 if (cmd == HOT_REMOVE_DISK) ··· 7752 7697 /* Need to flush page cache, and ensure no-one else opens 7753 7698 * and writes 7754 7699 */ 7755 - 
mutex_lock(&mddev->open_mutex); 7756 - if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7757 - mutex_unlock(&mddev->open_mutex); 7758 - err = -EBUSY; 7759 - goto out; 7760 - } 7761 - if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7762 - mutex_unlock(&mddev->open_mutex); 7763 - err = -EBUSY; 7764 - goto out; 7765 - } 7766 - did_set_md_closing = true; 7767 - mutex_unlock(&mddev->open_mutex); 7768 - sync_blockdev(bdev); 7700 + err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7701 + if (err) 7702 + return err; 7769 7703 } 7770 7704 7771 7705 if (!md_is_rdwr(mddev)) ··· 7795 7751 goto unlock; 7796 7752 7797 7753 case STOP_ARRAY: 7798 - err = do_md_stop(mddev, 0, bdev); 7754 + err = do_md_stop(mddev, 0); 7799 7755 goto unlock; 7800 7756 7801 7757 case STOP_ARRAY_RO: 7802 - err = md_set_readonly(mddev, bdev); 7758 + if (mddev->pers) 7759 + err = md_set_readonly(mddev); 7803 7760 goto unlock; 7804 7761 7805 7762 case HOT_REMOVE_DISK: ··· 7895 7850 mddev_unlock(mddev); 7896 7851 7897 7852 out: 7898 - if(did_set_md_closing) 7853 + if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7899 7854 clear_bit(MD_CLOSING, &mddev->flags); 7900 7855 return err; 7901 7856 } ··· 8732 8687 8733 8688 bio_chain(discard_bio, bio); 8734 8689 bio_clone_blkg_association(discard_bio, bio); 8735 - if (mddev->gendisk) 8736 - trace_block_bio_remap(discard_bio, 8737 - disk_devt(mddev->gendisk), 8738 - bio->bi_iter.bi_sector); 8690 + mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8739 8691 submit_bio_noacct(discard_bio); 8740 8692 } 8741 8693 EXPORT_SYMBOL_GPL(md_submit_discard_bio); ··· 8778 8736 md_clone_bio(mddev, bio); 8779 8737 } 8780 8738 EXPORT_SYMBOL_GPL(md_account_bio); 8739 + 8740 + void md_free_cloned_bio(struct bio *bio) 8741 + { 8742 + struct md_io_clone *md_io_clone = bio->bi_private; 8743 + struct bio *orig_bio = md_io_clone->orig_bio; 8744 + struct mddev *mddev = md_io_clone->mddev; 8745 + 8746 + if (bio->bi_status && !orig_bio->bi_status) 8747 + 
orig_bio->bi_status = bio->bi_status; 8748 + 8749 + if (md_io_clone->start_time) 8750 + bio_end_io_acct(orig_bio, md_io_clone->start_time); 8751 + 8752 + bio_put(bio); 8753 + percpu_ref_put(&mddev->active_io); 8754 + } 8755 + EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8781 8756 8782 8757 /* md_allow_write(mddev) 8783 8758 * Calling this ensures that the array is marked 'active' so that writes ··· 9229 9170 mddev->delta_disks > 0 && 9230 9171 mddev->pers->finish_reshape && 9231 9172 mddev->pers->size && 9232 - mddev->queue) { 9173 + !mddev_is_dm(mddev)) { 9233 9174 mddev_lock_nointr(mddev); 9234 9175 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9235 9176 mddev_unlock(mddev); ··· 9329 9270 { 9330 9271 struct md_rdev *rdev; 9331 9272 9332 - rdev_for_each(rdev, mddev) 9333 - if (rdev_removeable(rdev) || rdev_addable(rdev)) 9273 + rcu_read_lock(); 9274 + rdev_for_each_rcu(rdev, mddev) { 9275 + if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9276 + rcu_read_unlock(); 9334 9277 return true; 9278 + } 9279 + } 9280 + rcu_read_unlock(); 9335 9281 return false; 9336 9282 } 9337 9283
+74 -3
drivers/md/md.h
··· 18 18 #include <linux/timer.h> 19 19 #include <linux/wait.h> 20 20 #include <linux/workqueue.h> 21 + #include <trace/events/block.h> 21 22 #include "md-cluster.h" 22 23 23 24 #define MaxSector (~(sector_t)0) ··· 208 207 * check if there is collision between raid1 209 208 * serial bios. 210 209 */ 210 + Nonrot, /* non-rotational device (SSD) */ 211 211 }; 212 212 213 213 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, ··· 224 222 } 225 223 return 0; 226 224 } 225 + 226 + static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, 227 + int sectors) 228 + { 229 + sector_t first_bad; 230 + int bad_sectors; 231 + 232 + return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); 233 + } 234 + 227 235 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 228 236 int is_new); 229 237 extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ··· 480 468 struct timer_list safemode_timer; 481 469 struct percpu_ref writes_pending; 482 470 int sync_checkers; /* # of threads checking writes_pending */ 483 - struct request_queue *queue; /* for plugging ... */ 484 471 485 472 struct bitmap *bitmap; /* the bitmap for the device */ 486 473 struct { ··· 569 558 MD_RESYNCING_REMOTE, /* remote node is running resync thread */ 570 559 }; 571 560 561 + enum md_ro_state { 562 + MD_RDWR, 563 + MD_RDONLY, 564 + MD_AUTO_READ, 565 + MD_MAX_STATE 566 + }; 567 + 568 + static inline bool md_is_rdwr(struct mddev *mddev) 569 + { 570 + return (mddev->ro == MD_RDWR); 571 + } 572 + 573 + static inline bool reshape_interrupted(struct mddev *mddev) 574 + { 575 + /* reshape never start */ 576 + if (mddev->reshape_position == MaxSector) 577 + return false; 578 + 579 + /* interrupted */ 580 + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 581 + return true; 582 + 583 + /* running reshape will be interrupted soon. 
*/ 584 + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 585 + test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 586 + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 587 + return true; 588 + 589 + return false; 590 + } 591 + 572 592 static inline int __must_check mddev_lock(struct mddev *mddev) 573 593 { 574 594 return mutex_lock_interruptible(&mddev->reconfig_mutex); ··· 659 617 int (*start_reshape) (struct mddev *mddev); 660 618 void (*finish_reshape) (struct mddev *mddev); 661 619 void (*update_reshape_pos) (struct mddev *mddev); 620 + void (*prepare_suspend) (struct mddev *mddev); 662 621 /* quiesce suspends or resumes internal processing. 663 622 * 1 - stop new actions and wait for action io to complete 664 623 * 0 - return to normal behaviour ··· 793 750 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 794 751 struct bio *bio, sector_t start, sector_t size); 795 752 void md_account_bio(struct mddev *mddev, struct bio **bio); 753 + void md_free_cloned_bio(struct bio *bio); 796 754 797 755 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); 798 756 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, ··· 822 778 extern int md_rdev_init(struct md_rdev *rdev); 823 779 extern void md_rdev_clear(struct md_rdev *rdev); 824 780 825 - extern void md_handle_request(struct mddev *mddev, struct bio *bio); 781 + extern bool md_handle_request(struct mddev *mddev, struct bio *bio); 826 782 extern int mddev_suspend(struct mddev *mddev, bool interruptible); 827 783 extern void mddev_resume(struct mddev *mddev); 784 + extern void md_idle_sync_thread(struct mddev *mddev); 785 + extern void md_frozen_sync_thread(struct mddev *mddev); 786 + extern void md_unfrozen_sync_thread(struct mddev *mddev); 828 787 829 788 extern void md_reload_sb(struct mddev *mddev, int raid_disk); 830 789 extern void md_update_sb(struct mddev *mddev, int force); ··· 868 821 { 869 822 if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 870 823 
!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) 871 - mddev->queue->limits.max_write_zeroes_sectors = 0; 824 + mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; 872 825 } 873 826 874 827 static inline int mddev_suspend_and_lock(struct mddev *mddev) ··· 907 860 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); 908 861 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); 909 862 int do_md_run(struct mddev *mddev); 863 + void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); 864 + int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); 865 + void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); 910 866 911 867 extern const struct block_device_operations md_fops; 868 + 869 + /* 870 + * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. 871 + */ 872 + static inline bool mddev_is_dm(struct mddev *mddev) 873 + { 874 + return !mddev->gendisk; 875 + } 876 + 877 + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, 878 + sector_t sector) 879 + { 880 + if (!mddev_is_dm(mddev)) 881 + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); 882 + } 883 + 884 + #define mddev_add_trace_msg(mddev, fmt, args...) \ 885 + do { \ 886 + if (!mddev_is_dm(mddev)) \ 887 + blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ 888 + } while (0) 912 889 913 890 #endif /* _MD_MD_H */
+22 -20
drivers/md/raid0.c
··· 379 379 free_conf(mddev, conf); 380 380 } 381 381 382 + static int raid0_set_limits(struct mddev *mddev) 383 + { 384 + struct queue_limits lim; 385 + 386 + blk_set_stacking_limits(&lim); 387 + lim.max_hw_sectors = mddev->chunk_sectors; 388 + lim.max_write_zeroes_sectors = mddev->chunk_sectors; 389 + lim.io_min = mddev->chunk_sectors << 9; 390 + lim.io_opt = lim.io_min * mddev->raid_disks; 391 + mddev_stack_rdev_limits(mddev, &lim); 392 + return queue_limits_set(mddev->gendisk->queue, &lim); 393 + } 394 + 382 395 static int raid0_run(struct mddev *mddev) 383 396 { 384 397 struct r0conf *conf; ··· 412 399 mddev->private = conf; 413 400 } 414 401 conf = mddev->private; 415 - if (mddev->queue) { 416 - struct md_rdev *rdev; 417 - 418 - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 419 - blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); 420 - 421 - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 422 - blk_queue_io_opt(mddev->queue, 423 - (mddev->chunk_sectors << 9) * mddev->raid_disks); 424 - 425 - rdev_for_each(rdev, mddev) { 426 - disk_stack_limits(mddev->gendisk, rdev->bdev, 427 - rdev->data_offset << 9); 428 - } 402 + if (!mddev_is_dm(mddev)) { 403 + ret = raid0_set_limits(mddev); 404 + if (ret) 405 + goto out_free_conf; 429 406 } 430 407 431 408 /* calculate array device size */ ··· 429 426 430 427 ret = md_integrity_register(mddev); 431 428 if (ret) 432 - free_conf(mddev, conf); 433 - 429 + goto out_free_conf; 430 + return 0; 431 + out_free_conf: 432 + free_conf(mddev, conf); 434 433 return ret; 435 434 } 436 435 ··· 583 578 bio_set_dev(bio, tmp_dev->bdev); 584 579 bio->bi_iter.bi_sector = sector + zone->dev_start + 585 580 tmp_dev->data_offset; 586 - 587 - if (mddev->gendisk) 588 - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), 589 - bio_sector); 581 + mddev_trace_remap(mddev, bio, bio_sector); 590 582 mddev_check_write_zeroes(mddev, bio); 591 583 submit_bio_noacct(bio); 592 584 }
+69
drivers/md/raid1-10.c
··· 227 227 228 228 return false; 229 229 } 230 + 231 + /** 232 + * raid1_check_read_range() - check a given read range for bad blocks, 233 + * available read length is returned; 234 + * @rdev: the rdev to read; 235 + * @this_sector: read position; 236 + * @len: read length; 237 + * 238 + * helper function for read_balance() 239 + * 240 + * 1) If there are no bad blocks in the range, @len is returned; 241 + * 2) If the range are all bad blocks, 0 is returned; 242 + * 3) If there are partial bad blocks: 243 + * - If the bad block range starts after @this_sector, the length of first 244 + * good region is returned; 245 + * - If the bad block range starts before @this_sector, 0 is returned and 246 + * the @len is updated to the offset into the region before we get to the 247 + * good blocks; 248 + */ 249 + static inline int raid1_check_read_range(struct md_rdev *rdev, 250 + sector_t this_sector, int *len) 251 + { 252 + sector_t first_bad; 253 + int bad_sectors; 254 + 255 + /* no bad block overlap */ 256 + if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) 257 + return *len; 258 + 259 + /* 260 + * bad block range starts offset into our range so we can return the 261 + * number of sectors before the bad blocks start. 262 + */ 263 + if (first_bad > this_sector) 264 + return first_bad - this_sector; 265 + 266 + /* read range is fully consumed by bad blocks. */ 267 + if (this_sector + *len <= first_bad + bad_sectors) 268 + return 0; 269 + 270 + /* 271 + * final case, bad block range starts before or at the start of our 272 + * range but does not cover our entire range so we still return 0 but 273 + * update the length with the number of sectors before we get to the 274 + * good ones. 275 + */ 276 + *len = first_bad + bad_sectors - this_sector; 277 + return 0; 278 + } 279 + 280 + /* 281 + * Check if read should choose the first rdev. 282 + * 283 + * Balance on the whole device if no resync is going on (recovery is ok) or 284 + * below the resync window. 
Otherwise, take the first readable disk. 285 + */ 286 + static inline bool raid1_should_read_first(struct mddev *mddev, 287 + sector_t this_sector, int len) 288 + { 289 + if ((mddev->recovery_cp < this_sector + len)) 290 + return true; 291 + 292 + if (mddev_is_clustered(mddev) && 293 + md_cluster_ops->area_resyncing(mddev, READ, this_sector, 294 + this_sector + len)) 295 + return true; 296 + 297 + return false; 298 + }
+378 -265
drivers/md/raid1.c
··· 46 46 static void allow_barrier(struct r1conf *conf, sector_t sector_nr); 47 47 static void lower_barrier(struct r1conf *conf, sector_t sector_nr); 48 48 49 - #define raid1_log(md, fmt, args...) \ 50 - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) 51 - 52 49 #define RAID_1_10_NAME "raid1" 53 50 #include "raid1-10.c" 54 51 ··· 495 498 * to user-side. So if something waits for IO, then it 496 499 * will wait for the 'master' bio. 497 500 */ 498 - sector_t first_bad; 499 - int bad_sectors; 500 - 501 501 r1_bio->bios[mirror] = NULL; 502 502 to_put = bio; 503 503 /* ··· 510 516 set_bit(R1BIO_Uptodate, &r1_bio->state); 511 517 512 518 /* Maybe we can clear some bad blocks. */ 513 - if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 514 - &first_bad, &bad_sectors) && !discard_error) { 519 + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && 520 + !discard_error) { 515 521 r1_bio->bios[mirror] = IO_MADE_GOOD; 516 522 set_bit(R1BIO_MadeGood, &r1_bio->state); 517 523 } ··· 576 582 return len; 577 583 } 578 584 579 - /* 580 - * This routine returns the disk from which the requested read should 581 - * be done. There is a per-array 'next expected sequential IO' sector 582 - * number - if this matches on the next IO then we use the last disk. 583 - * There is also a per-disk 'last know head position' sector that is 584 - * maintained from IRQ contexts, both the normal and the resync IO 585 - * completion handlers update this position correctly. If there is no 586 - * perfect sequential match then we pick the disk whose head is closest. 587 - * 588 - * If there are 2 mirrors in the same 2 devices, performance degrades 589 - * because position is mirror, not device based. 590 - * 591 - * The rdev for the device selected will have nr_pending incremented. 
592 - */ 593 - static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) 585 + static void update_read_sectors(struct r1conf *conf, int disk, 586 + sector_t this_sector, int len) 594 587 { 595 - const sector_t this_sector = r1_bio->sector; 596 - int sectors; 597 - int best_good_sectors; 598 - int best_disk, best_dist_disk, best_pending_disk; 599 - int has_nonrot_disk; 588 + struct raid1_info *info = &conf->mirrors[disk]; 589 + 590 + atomic_inc(&info->rdev->nr_pending); 591 + if (info->next_seq_sect != this_sector) 592 + info->seq_start = this_sector; 593 + info->next_seq_sect = this_sector + len; 594 + } 595 + 596 + static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, 597 + int *max_sectors) 598 + { 599 + sector_t this_sector = r1_bio->sector; 600 + int len = r1_bio->sectors; 600 601 int disk; 601 - sector_t best_dist; 602 - unsigned int min_pending; 603 - struct md_rdev *rdev; 604 - int choose_first; 605 - int choose_next_idle; 606 - 607 - /* 608 - * Check if we can balance. We can balance on the whole 609 - * device if no resync is going on, or below the resync window. 610 - * We take the first readable disk when above the resync window. 
611 - */ 612 - retry: 613 - sectors = r1_bio->sectors; 614 - best_disk = -1; 615 - best_dist_disk = -1; 616 - best_dist = MaxSector; 617 - best_pending_disk = -1; 618 - min_pending = UINT_MAX; 619 - best_good_sectors = 0; 620 - has_nonrot_disk = 0; 621 - choose_next_idle = 0; 622 - clear_bit(R1BIO_FailFast, &r1_bio->state); 623 - 624 - if ((conf->mddev->recovery_cp < this_sector + sectors) || 625 - (mddev_is_clustered(conf->mddev) && 626 - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 627 - this_sector + sectors))) 628 - choose_first = 1; 629 - else 630 - choose_first = 0; 631 602 632 603 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 633 - sector_t dist; 634 - sector_t first_bad; 635 - int bad_sectors; 636 - unsigned int pending; 637 - bool nonrot; 604 + struct md_rdev *rdev; 605 + int read_len; 606 + 607 + if (r1_bio->bios[disk] == IO_BLOCKED) 608 + continue; 638 609 639 610 rdev = conf->mirrors[disk].rdev; 640 - if (r1_bio->bios[disk] == IO_BLOCKED 641 - || rdev == NULL 642 - || test_bit(Faulty, &rdev->flags)) 643 - continue; 644 - if (!test_bit(In_sync, &rdev->flags) && 645 - rdev->recovery_offset < this_sector + sectors) 646 - continue; 647 - if (test_bit(WriteMostly, &rdev->flags)) { 648 - /* Don't balance among write-mostly, just 649 - * use the first as a last resort */ 650 - if (best_dist_disk < 0) { 651 - if (is_badblock(rdev, this_sector, sectors, 652 - &first_bad, &bad_sectors)) { 653 - if (first_bad <= this_sector) 654 - /* Cannot use this */ 655 - continue; 656 - best_good_sectors = first_bad - this_sector; 657 - } else 658 - best_good_sectors = sectors; 659 - best_dist_disk = disk; 660 - best_pending_disk = disk; 661 - } 662 - continue; 663 - } 664 - /* This is a reasonable device to use. It might 665 - * even be best. 
666 - */ 667 - if (is_badblock(rdev, this_sector, sectors, 668 - &first_bad, &bad_sectors)) { 669 - if (best_dist < MaxSector) 670 - /* already have a better device */ 671 - continue; 672 - if (first_bad <= this_sector) { 673 - /* cannot read here. If this is the 'primary' 674 - * device, then we must not read beyond 675 - * bad_sectors from another device.. 676 - */ 677 - bad_sectors -= (this_sector - first_bad); 678 - if (choose_first && sectors > bad_sectors) 679 - sectors = bad_sectors; 680 - if (best_good_sectors > sectors) 681 - best_good_sectors = sectors; 682 - 683 - } else { 684 - sector_t good_sectors = first_bad - this_sector; 685 - if (good_sectors > best_good_sectors) { 686 - best_good_sectors = good_sectors; 687 - best_disk = disk; 688 - } 689 - if (choose_first) 690 - break; 691 - } 692 - continue; 693 - } else { 694 - if ((sectors > best_good_sectors) && (best_disk >= 0)) 695 - best_disk = -1; 696 - best_good_sectors = sectors; 697 - } 698 - 699 - if (best_disk >= 0) 700 - /* At least two disks to choose from so failfast is OK */ 701 - set_bit(R1BIO_FailFast, &r1_bio->state); 702 - 703 - nonrot = bdev_nonrot(rdev->bdev); 704 - has_nonrot_disk |= nonrot; 705 - pending = atomic_read(&rdev->nr_pending); 706 - dist = abs(this_sector - conf->mirrors[disk].head_position); 707 - if (choose_first) { 708 - best_disk = disk; 709 - break; 710 - } 711 - /* Don't change to another disk for sequential reads */ 712 - if (conf->mirrors[disk].next_seq_sect == this_sector 713 - || dist == 0) { 714 - int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; 715 - struct raid1_info *mirror = &conf->mirrors[disk]; 716 - 717 - best_disk = disk; 718 - /* 719 - * If buffered sequential IO size exceeds optimal 720 - * iosize, check if there is idle disk. If yes, choose 721 - * the idle disk. read_balance could already choose an 722 - * idle disk before noticing it's a sequential IO in 723 - * this disk. 
This doesn't matter because this disk 724 - * will idle, next time it will be utilized after the 725 - * first disk has IO size exceeds optimal iosize. In 726 - * this way, iosize of the first disk will be optimal 727 - * iosize at least. iosize of the second disk might be 728 - * small, but not a big deal since when the second disk 729 - * starts IO, the first disk is likely still busy. 730 - */ 731 - if (nonrot && opt_iosize > 0 && 732 - mirror->seq_start != MaxSector && 733 - mirror->next_seq_sect > opt_iosize && 734 - mirror->next_seq_sect - opt_iosize >= 735 - mirror->seq_start) { 736 - choose_next_idle = 1; 737 - continue; 738 - } 739 - break; 740 - } 741 - 742 - if (choose_next_idle) 611 + if (!rdev || test_bit(Faulty, &rdev->flags)) 743 612 continue; 744 613 745 - if (min_pending > pending) { 746 - min_pending = pending; 747 - best_pending_disk = disk; 748 - } 749 - 750 - if (dist < best_dist) { 751 - best_dist = dist; 752 - best_dist_disk = disk; 614 + /* choose the first disk even if it has some bad blocks. */ 615 + read_len = raid1_check_read_range(rdev, this_sector, &len); 616 + if (read_len > 0) { 617 + update_read_sectors(conf, disk, this_sector, read_len); 618 + *max_sectors = read_len; 619 + return disk; 753 620 } 754 621 } 622 + 623 + return -1; 624 + } 625 + 626 + static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, 627 + int *max_sectors) 628 + { 629 + sector_t this_sector = r1_bio->sector; 630 + int best_disk = -1; 631 + int best_len = 0; 632 + int disk; 633 + 634 + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 635 + struct md_rdev *rdev; 636 + int len; 637 + int read_len; 638 + 639 + if (r1_bio->bios[disk] == IO_BLOCKED) 640 + continue; 641 + 642 + rdev = conf->mirrors[disk].rdev; 643 + if (!rdev || test_bit(Faulty, &rdev->flags) || 644 + test_bit(WriteMostly, &rdev->flags)) 645 + continue; 646 + 647 + /* keep track of the disk with the most readable sectors. 
*/ 648 + len = r1_bio->sectors; 649 + read_len = raid1_check_read_range(rdev, this_sector, &len); 650 + if (read_len > best_len) { 651 + best_disk = disk; 652 + best_len = read_len; 653 + } 654 + } 655 + 656 + if (best_disk != -1) { 657 + *max_sectors = best_len; 658 + update_read_sectors(conf, best_disk, this_sector, best_len); 659 + } 660 + 661 + return best_disk; 662 + } 663 + 664 + static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, 665 + int *max_sectors) 666 + { 667 + sector_t this_sector = r1_bio->sector; 668 + int bb_disk = -1; 669 + int bb_read_len = 0; 670 + int disk; 671 + 672 + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 673 + struct md_rdev *rdev; 674 + int len; 675 + int read_len; 676 + 677 + if (r1_bio->bios[disk] == IO_BLOCKED) 678 + continue; 679 + 680 + rdev = conf->mirrors[disk].rdev; 681 + if (!rdev || test_bit(Faulty, &rdev->flags) || 682 + !test_bit(WriteMostly, &rdev->flags)) 683 + continue; 684 + 685 + /* there are no bad blocks, we can use this disk */ 686 + len = r1_bio->sectors; 687 + read_len = raid1_check_read_range(rdev, this_sector, &len); 688 + if (read_len == r1_bio->sectors) { 689 + update_read_sectors(conf, disk, this_sector, read_len); 690 + return disk; 691 + } 692 + 693 + /* 694 + * there are partial bad blocks, choose the rdev with largest 695 + * read length. 696 + */ 697 + if (read_len > bb_read_len) { 698 + bb_disk = disk; 699 + bb_read_len = read_len; 700 + } 701 + } 702 + 703 + if (bb_disk != -1) { 704 + *max_sectors = bb_read_len; 705 + update_read_sectors(conf, bb_disk, this_sector, bb_read_len); 706 + } 707 + 708 + return bb_disk; 709 + } 710 + 711 + static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio) 712 + { 713 + /* TODO: address issues with this check and concurrency. 
*/ 714 + return conf->mirrors[disk].next_seq_sect == r1_bio->sector || 715 + conf->mirrors[disk].head_position == r1_bio->sector; 716 + } 717 + 718 + /* 719 + * If buffered sequential IO size exceeds optimal iosize, check if there is idle 720 + * disk. If yes, choose the idle disk. 721 + */ 722 + static bool should_choose_next(struct r1conf *conf, int disk) 723 + { 724 + struct raid1_info *mirror = &conf->mirrors[disk]; 725 + int opt_iosize; 726 + 727 + if (!test_bit(Nonrot, &mirror->rdev->flags)) 728 + return false; 729 + 730 + opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9; 731 + return opt_iosize > 0 && mirror->seq_start != MaxSector && 732 + mirror->next_seq_sect > opt_iosize && 733 + mirror->next_seq_sect - opt_iosize >= mirror->seq_start; 734 + } 735 + 736 + static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) 737 + { 738 + if (!rdev || test_bit(Faulty, &rdev->flags)) 739 + return false; 740 + 741 + /* still in recovery */ 742 + if (!test_bit(In_sync, &rdev->flags) && 743 + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) 744 + return false; 745 + 746 + /* don't read from slow disk unless have to */ 747 + if (test_bit(WriteMostly, &rdev->flags)) 748 + return false; 749 + 750 + /* don't split IO for bad blocks unless have to */ 751 + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors)) 752 + return false; 753 + 754 + return true; 755 + } 756 + 757 + struct read_balance_ctl { 758 + sector_t closest_dist; 759 + int closest_dist_disk; 760 + int min_pending; 761 + int min_pending_disk; 762 + int sequential_disk; 763 + int readable_disks; 764 + }; 765 + 766 + static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio) 767 + { 768 + int disk; 769 + struct read_balance_ctl ctl = { 770 + .closest_dist_disk = -1, 771 + .closest_dist = MaxSector, 772 + .min_pending_disk = -1, 773 + .min_pending = UINT_MAX, 774 + .sequential_disk = -1, 775 + }; 776 + 777 + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 778 + 
struct md_rdev *rdev; 779 + sector_t dist; 780 + unsigned int pending; 781 + 782 + if (r1_bio->bios[disk] == IO_BLOCKED) 783 + continue; 784 + 785 + rdev = conf->mirrors[disk].rdev; 786 + if (!rdev_readable(rdev, r1_bio)) 787 + continue; 788 + 789 + /* At least two disks to choose from so failfast is OK */ 790 + if (ctl.readable_disks++ == 1) 791 + set_bit(R1BIO_FailFast, &r1_bio->state); 792 + 793 + pending = atomic_read(&rdev->nr_pending); 794 + dist = abs(r1_bio->sector - conf->mirrors[disk].head_position); 795 + 796 + /* Don't change to another disk for sequential reads */ 797 + if (is_sequential(conf, disk, r1_bio)) { 798 + if (!should_choose_next(conf, disk)) 799 + return disk; 800 + 801 + /* 802 + * Add 'pending' to avoid choosing this disk if 803 + * there is other idle disk. 804 + */ 805 + pending++; 806 + /* 807 + * If there is no other idle disk, this disk 808 + * will be chosen. 809 + */ 810 + ctl.sequential_disk = disk; 811 + } 812 + 813 + if (ctl.min_pending > pending) { 814 + ctl.min_pending = pending; 815 + ctl.min_pending_disk = disk; 816 + } 817 + 818 + if (ctl.closest_dist > dist) { 819 + ctl.closest_dist = dist; 820 + ctl.closest_dist_disk = disk; 821 + } 822 + } 823 + 824 + /* 825 + * sequential IO size exceeds optimal iosize, however, there is no other 826 + * idle disk, so choose the sequential disk. 827 + */ 828 + if (ctl.sequential_disk != -1 && ctl.min_pending != 0) 829 + return ctl.sequential_disk; 755 830 756 831 /* 757 832 * If all disks are rotational, choose the closest disk. If any disk is ··· 828 765 * disk is rotational, which might/might not be optimal for raids with 829 766 * mixed ratation/non-rotational disks depending on workload. 
830 767 */ 831 - if (best_disk == -1) { 832 - if (has_nonrot_disk || min_pending == 0) 833 - best_disk = best_pending_disk; 834 - else 835 - best_disk = best_dist_disk; 768 + if (ctl.min_pending_disk != -1 && 769 + (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0)) 770 + return ctl.min_pending_disk; 771 + else 772 + return ctl.closest_dist_disk; 773 + } 774 + 775 + /* 776 + * This routine returns the disk from which the requested read should be done. 777 + * 778 + * 1) If resync is in progress, find the first usable disk and use it even if it 779 + * has some bad blocks. 780 + * 781 + * 2) Now that there is no resync, loop through all disks and skipping slow 782 + * disks and disks with bad blocks for now. Only pay attention to key disk 783 + * choice. 784 + * 785 + * 3) If we've made it this far, now look for disks with bad blocks and choose 786 + * the one with most number of sectors. 787 + * 788 + * 4) If we are all the way at the end, we have no choice but to use a disk even 789 + * if it is write mostly. 790 + * 791 + * The rdev for the device selected will have nr_pending incremented. 
792 + */ 793 + static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, 794 + int *max_sectors) 795 + { 796 + int disk; 797 + 798 + clear_bit(R1BIO_FailFast, &r1_bio->state); 799 + 800 + if (raid1_should_read_first(conf->mddev, r1_bio->sector, 801 + r1_bio->sectors)) 802 + return choose_first_rdev(conf, r1_bio, max_sectors); 803 + 804 + disk = choose_best_rdev(conf, r1_bio); 805 + if (disk >= 0) { 806 + *max_sectors = r1_bio->sectors; 807 + update_read_sectors(conf, disk, r1_bio->sector, 808 + r1_bio->sectors); 809 + return disk; 836 810 } 837 811 838 - if (best_disk >= 0) { 839 - rdev = conf->mirrors[best_disk].rdev; 840 - if (!rdev) 841 - goto retry; 842 - atomic_inc(&rdev->nr_pending); 843 - sectors = best_good_sectors; 812 + /* 813 + * If we are here it means we didn't find a perfectly good disk so 814 + * now spend a bit more time trying to find one with the most good 815 + * sectors. 816 + */ 817 + disk = choose_bb_rdev(conf, r1_bio, max_sectors); 818 + if (disk >= 0) 819 + return disk; 844 820 845 - if (conf->mirrors[best_disk].next_seq_sect != this_sector) 846 - conf->mirrors[best_disk].seq_start = this_sector; 847 - 848 - conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; 849 - } 850 - *max_sectors = sectors; 851 - 852 - return best_disk; 821 + return choose_slow_rdev(conf, r1_bio, max_sectors); 853 822 } 854 823 855 824 static void wake_up_barrier(struct r1conf *conf) ··· 1193 1098 */ 1194 1099 spin_lock_irq(&conf->resync_lock); 1195 1100 conf->array_frozen = 1; 1196 - raid1_log(conf->mddev, "wait freeze"); 1101 + mddev_add_trace_msg(conf->mddev, "raid1 wait freeze"); 1197 1102 wait_event_lock_irq_cmd( 1198 1103 conf->wait_barrier, 1199 1104 get_unqueued_pending(conf) == extra, ··· 1382 1287 * Reading from a write-mostly device must take care not to 1383 1288 * over-take any writes that are 'behind' 1384 1289 */ 1385 - raid1_log(mddev, "wait behind writes"); 1290 + mddev_add_trace_msg(mddev, "raid1 wait behind writes"); 1386 1291 
wait_event(bitmap->behind_wait, 1387 1292 atomic_read(&bitmap->behind_writes) == 0); 1388 1293 } ··· 1415 1320 test_bit(R1BIO_FailFast, &r1_bio->state)) 1416 1321 read_bio->bi_opf |= MD_FAILFAST; 1417 1322 read_bio->bi_private = r1_bio; 1418 - 1419 - if (mddev->gendisk) 1420 - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1421 - r1_bio->sector); 1422 - 1323 + mddev_trace_remap(mddev, read_bio, r1_bio->sector); 1423 1324 submit_bio_noacct(read_bio); 1424 1325 } 1425 1326 ··· 1565 1474 bio_wouldblock_error(bio); 1566 1475 return; 1567 1476 } 1568 - raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); 1477 + mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", 1478 + blocked_rdev->raid_disk); 1569 1479 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1570 1480 wait_barrier(conf, bio->bi_iter.bi_sector, false); 1571 1481 goto retry_write; ··· 1649 1557 mbio->bi_private = r1_bio; 1650 1558 1651 1559 atomic_inc(&r1_bio->remaining); 1652 - 1653 - if (mddev->gendisk) 1654 - trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), 1655 - r1_bio->sector); 1560 + mddev_trace_remap(mddev, mbio, r1_bio->sector); 1656 1561 /* flush_pending_writes() needs access to the rdev so...*/ 1657 1562 mbio->bi_bdev = (void *)rdev; 1658 1563 if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { ··· 1849 1760 return count; 1850 1761 } 1851 1762 1763 + static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, 1764 + bool replacement) 1765 + { 1766 + struct raid1_info *info = conf->mirrors + disk; 1767 + 1768 + if (replacement) 1769 + info += conf->raid_disks; 1770 + 1771 + if (info->rdev) 1772 + return false; 1773 + 1774 + if (bdev_nonrot(rdev->bdev)) { 1775 + set_bit(Nonrot, &rdev->flags); 1776 + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); 1777 + } 1778 + 1779 + rdev->raid_disk = disk; 1780 + info->head_position = 0; 1781 + info->seq_start = MaxSector; 1782 + WRITE_ONCE(info->rdev, rdev); 1783 + 1784 + return true; 1785 + 
} 1786 + 1787 + static bool raid1_remove_conf(struct r1conf *conf, int disk) 1788 + { 1789 + struct raid1_info *info = conf->mirrors + disk; 1790 + struct md_rdev *rdev = info->rdev; 1791 + 1792 + if (!rdev || test_bit(In_sync, &rdev->flags) || 1793 + atomic_read(&rdev->nr_pending)) 1794 + return false; 1795 + 1796 + /* Only remove non-faulty devices if recovery is not possible. */ 1797 + if (!test_bit(Faulty, &rdev->flags) && 1798 + rdev->mddev->recovery_disabled != conf->recovery_disabled && 1799 + rdev->mddev->degraded < conf->raid_disks) 1800 + return false; 1801 + 1802 + if (test_and_clear_bit(Nonrot, &rdev->flags)) 1803 + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1); 1804 + 1805 + WRITE_ONCE(info->rdev, NULL); 1806 + return true; 1807 + } 1808 + 1852 1809 static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1853 1810 { 1854 1811 struct r1conf *conf = mddev->private; ··· 1926 1791 for (mirror = first; mirror <= last; mirror++) { 1927 1792 p = conf->mirrors + mirror; 1928 1793 if (!p->rdev) { 1929 - if (mddev->gendisk) 1930 - disk_stack_limits(mddev->gendisk, rdev->bdev, 1931 - rdev->data_offset << 9); 1794 + err = mddev_stack_new_rdev(mddev, rdev); 1795 + if (err) 1796 + return err; 1932 1797 1933 - p->head_position = 0; 1934 - rdev->raid_disk = mirror; 1935 - err = 0; 1798 + raid1_add_conf(conf, rdev, mirror, false); 1936 1799 /* As all devices are equivalent, we don't need a full recovery 1937 1800 * if this was recently any drive of the array 1938 1801 */ 1939 1802 if (rdev->saved_raid_disk < 0) 1940 1803 conf->fullsync = 1; 1941 - WRITE_ONCE(p->rdev, rdev); 1942 1804 break; 1943 1805 } 1944 1806 if (test_bit(WantReplacement, &p->rdev->flags) && ··· 1945 1813 1946 1814 if (err && repl_slot >= 0) { 1947 1815 /* Add this device as a replacement */ 1948 - p = conf->mirrors + repl_slot; 1949 1816 clear_bit(In_sync, &rdev->flags); 1950 1817 set_bit(Replacement, &rdev->flags); 1951 - rdev->raid_disk = repl_slot; 1818 + 
raid1_add_conf(conf, rdev, repl_slot, true); 1952 1819 err = 0; 1953 1820 conf->fullsync = 1; 1954 - WRITE_ONCE(p[conf->raid_disks].rdev, rdev); 1955 1821 } 1956 1822 1957 1823 print_conf(conf); ··· 1966 1836 if (unlikely(number >= conf->raid_disks)) 1967 1837 goto abort; 1968 1838 1969 - if (rdev != p->rdev) 1970 - p = conf->mirrors + conf->raid_disks + number; 1839 + if (rdev != p->rdev) { 1840 + number += conf->raid_disks; 1841 + p = conf->mirrors + number; 1842 + } 1971 1843 1972 1844 print_conf(conf); 1973 1845 if (rdev == p->rdev) { 1974 - if (test_bit(In_sync, &rdev->flags) || 1975 - atomic_read(&rdev->nr_pending)) { 1846 + if (!raid1_remove_conf(conf, number)) { 1976 1847 err = -EBUSY; 1977 1848 goto abort; 1978 1849 } 1979 - /* Only remove non-faulty devices if recovery 1980 - * is not possible. 1981 - */ 1982 - if (!test_bit(Faulty, &rdev->flags) && 1983 - mddev->recovery_disabled != conf->recovery_disabled && 1984 - mddev->degraded < conf->raid_disks) { 1985 - err = -EBUSY; 1986 - goto abort; 1987 - } 1988 - WRITE_ONCE(p->rdev, NULL); 1989 - if (conf->mirrors[conf->raid_disks + number].rdev) { 1850 + 1851 + if (number < conf->raid_disks && 1852 + conf->mirrors[conf->raid_disks + number].rdev) { 1990 1853 /* We just removed a device that is being replaced. 1991 1854 * Move down the replacement. We drain all IO before 1992 1855 * doing this to avoid confusion. 
··· 2067 1944 struct r1bio *r1_bio = get_resync_r1bio(bio); 2068 1945 struct mddev *mddev = r1_bio->mddev; 2069 1946 struct r1conf *conf = mddev->private; 2070 - sector_t first_bad; 2071 - int bad_sectors; 2072 1947 struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; 2073 1948 2074 1949 if (!uptodate) { ··· 2076 1955 set_bit(MD_RECOVERY_NEEDED, & 2077 1956 mddev->recovery); 2078 1957 set_bit(R1BIO_WriteError, &r1_bio->state); 2079 - } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 2080 - &first_bad, &bad_sectors) && 2081 - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, 2082 - r1_bio->sector, 2083 - r1_bio->sectors, 2084 - &first_bad, &bad_sectors) 2085 - ) 1958 + } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && 1959 + !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev, 1960 + r1_bio->sector, r1_bio->sectors)) { 2086 1961 set_bit(R1BIO_MadeGood, &r1_bio->state); 1962 + } 2087 1963 2088 1964 put_sync_write_buf(r1_bio, uptodate); 2089 1965 } ··· 2397 2279 s = PAGE_SIZE >> 9; 2398 2280 2399 2281 do { 2400 - sector_t first_bad; 2401 - int bad_sectors; 2402 - 2403 2282 rdev = conf->mirrors[d].rdev; 2404 2283 if (rdev && 2405 2284 (test_bit(In_sync, &rdev->flags) || 2406 2285 (!test_bit(Faulty, &rdev->flags) && 2407 2286 rdev->recovery_offset >= sect + s)) && 2408 - is_badblock(rdev, sect, s, 2409 - &first_bad, &bad_sectors) == 0) { 2287 + rdev_has_badblock(rdev, sect, s) == 0) { 2410 2288 atomic_inc(&rdev->nr_pending); 2411 2289 if (sync_page_io(rdev, sect, s<<9, 2412 2290 conf->tmppage, REQ_OP_READ, false)) ··· 3120 3006 3121 3007 err = -EINVAL; 3122 3008 spin_lock_init(&conf->device_lock); 3009 + conf->raid_disks = mddev->raid_disks; 3123 3010 rdev_for_each(rdev, mddev) { 3124 3011 int disk_idx = rdev->raid_disk; 3125 - if (disk_idx >= mddev->raid_disks 3126 - || disk_idx < 0) 3127 - continue; 3128 - if (test_bit(Replacement, &rdev->flags)) 3129 - disk = conf->mirrors + mddev->raid_disks + disk_idx; 3130 
- else 3131 - disk = conf->mirrors + disk_idx; 3132 3012 3133 - if (disk->rdev) 3013 + if (disk_idx >= conf->raid_disks || disk_idx < 0) 3014 + continue; 3015 + 3016 + if (!raid1_add_conf(conf, rdev, disk_idx, 3017 + test_bit(Replacement, &rdev->flags))) 3134 3018 goto abort; 3135 - disk->rdev = rdev; 3136 - disk->head_position = 0; 3137 - disk->seq_start = MaxSector; 3138 3019 } 3139 - conf->raid_disks = mddev->raid_disks; 3140 3020 conf->mddev = mddev; 3141 3021 INIT_LIST_HEAD(&conf->retry_list); 3142 3022 INIT_LIST_HEAD(&conf->bio_end_io_list); ··· 3194 3086 return ERR_PTR(err); 3195 3087 } 3196 3088 3089 + static int raid1_set_limits(struct mddev *mddev) 3090 + { 3091 + struct queue_limits lim; 3092 + 3093 + blk_set_stacking_limits(&lim); 3094 + lim.max_write_zeroes_sectors = 0; 3095 + mddev_stack_rdev_limits(mddev, &lim); 3096 + return queue_limits_set(mddev->gendisk->queue, &lim); 3097 + } 3098 + 3197 3099 static void raid1_free(struct mddev *mddev, void *priv); 3198 3100 static int raid1_run(struct mddev *mddev) 3199 3101 { 3200 3102 struct r1conf *conf; 3201 3103 int i; 3202 - struct md_rdev *rdev; 3203 3104 int ret; 3204 3105 3205 3106 if (mddev->level != 1) { ··· 3235 3118 if (IS_ERR(conf)) 3236 3119 return PTR_ERR(conf); 3237 3120 3238 - if (mddev->queue) 3239 - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 3240 - 3241 - rdev_for_each(rdev, mddev) { 3242 - if (!mddev->gendisk) 3243 - continue; 3244 - disk_stack_limits(mddev->gendisk, rdev->bdev, 3245 - rdev->data_offset << 9); 3121 + if (!mddev_is_dm(mddev)) { 3122 + ret = raid1_set_limits(mddev); 3123 + if (ret) 3124 + goto abort; 3246 3125 } 3247 3126 3248 3127 mddev->degraded = 0;
+1
drivers/md/raid1.h
··· 71 71 * allow for replacements. 72 72 */ 73 73 int raid_disks; 74 + int nonrot_disks; 74 75 75 76 spinlock_t device_lock; 76 77
+58 -85
drivers/md/raid10.c
··· 76 76 static void end_reshape_write(struct bio *bio); 77 77 static void end_reshape(struct r10conf *conf); 78 78 79 - #define raid10_log(md, fmt, args...) \ 80 - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) 81 - 82 79 #include "raid1-10.c" 83 80 84 81 #define NULL_CMD ··· 515 518 * The 'master' represents the composite IO operation to 516 519 * user-side. So if something waits for IO, then it will 517 520 * wait for the 'master' bio. 518 - */ 519 - sector_t first_bad; 520 - int bad_sectors; 521 - 522 - /* 521 + * 523 522 * Do not set R10BIO_Uptodate if the current device is 524 523 * rebuilding or Faulty. This is because we cannot use 525 524 * such device for properly reading the data back (we could ··· 528 535 set_bit(R10BIO_Uptodate, &r10_bio->state); 529 536 530 537 /* Maybe we can clear some bad blocks. */ 531 - if (is_badblock(rdev, 532 - r10_bio->devs[slot].addr, 533 - r10_bio->sectors, 534 - &first_bad, &bad_sectors) && !discard_error) { 538 + if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, 539 + r10_bio->sectors) && 540 + !discard_error) { 535 541 bio_put(bio); 536 542 if (repl) 537 543 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; ··· 745 753 best_good_sectors = 0; 746 754 do_balance = 1; 747 755 clear_bit(R10BIO_FailFast, &r10_bio->state); 748 - /* 749 - * Check if we can balance. We can balance on the whole 750 - * device if no resync is going on (recovery is ok), or below 751 - * the resync window. We take the first readable disk when 752 - * above the resync window. 
753 - */ 754 - if ((conf->mddev->recovery_cp < MaxSector 755 - && (this_sector + sectors >= conf->next_resync)) || 756 - (mddev_is_clustered(conf->mddev) && 757 - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 758 - this_sector + sectors))) 756 + 757 + if (raid1_should_read_first(conf->mddev, this_sector, sectors)) 759 758 do_balance = 0; 760 759 761 760 for (slot = 0; slot < conf->copies ; slot++) { ··· 1016 1033 ret = false; 1017 1034 } else { 1018 1035 conf->nr_waiting++; 1019 - raid10_log(conf->mddev, "wait barrier"); 1036 + mddev_add_trace_msg(conf->mddev, "raid10 wait barrier"); 1020 1037 wait_event_barrier(conf, stop_waiting_barrier(conf)); 1021 1038 conf->nr_waiting--; 1022 1039 } ··· 1135 1152 bio_wouldblock_error(bio); 1136 1153 return false; 1137 1154 } 1138 - raid10_log(conf->mddev, "wait reshape"); 1155 + mddev_add_trace_msg(conf->mddev, "raid10 wait reshape"); 1139 1156 wait_event(conf->wait_barrier, 1140 1157 conf->reshape_progress <= bio->bi_iter.bi_sector || 1141 1158 conf->reshape_progress >= bio->bi_iter.bi_sector + ··· 1232 1249 test_bit(R10BIO_FailFast, &r10_bio->state)) 1233 1250 read_bio->bi_opf |= MD_FAILFAST; 1234 1251 read_bio->bi_private = r10_bio; 1235 - 1236 - if (mddev->gendisk) 1237 - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 1238 - r10_bio->sector); 1252 + mddev_trace_remap(mddev, read_bio, r10_bio->sector); 1239 1253 submit_bio_noacct(read_bio); 1240 1254 return; 1241 1255 } ··· 1268 1288 && enough(conf, devnum)) 1269 1289 mbio->bi_opf |= MD_FAILFAST; 1270 1290 mbio->bi_private = r10_bio; 1271 - 1272 - if (conf->mddev->gendisk) 1273 - trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 1274 - r10_bio->sector); 1291 + mddev_trace_remap(mddev, mbio, r10_bio->sector); 1275 1292 /* flush_pending_writes() needs access to the rdev so...*/ 1276 1293 mbio->bi_bdev = (void *)rdev; 1277 1294 ··· 1307 1330 } 1308 1331 1309 1332 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1310 - sector_t 
first_bad; 1311 1333 sector_t dev_sector = r10_bio->devs[i].addr; 1312 - int bad_sectors; 1313 - int is_bad; 1314 1334 1315 1335 /* 1316 1336 * Discard request doesn't care the write result ··· 1316 1342 if (!r10_bio->sectors) 1317 1343 continue; 1318 1344 1319 - is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 1320 - &first_bad, &bad_sectors); 1321 - if (is_bad < 0) { 1345 + if (rdev_has_badblock(rdev, dev_sector, 1346 + r10_bio->sectors) < 0) { 1322 1347 /* 1323 1348 * Mustn't write here until the bad block 1324 1349 * is acknowledged ··· 1333 1360 if (unlikely(blocked_rdev)) { 1334 1361 /* Have to wait for this device to get unblocked, then retry */ 1335 1362 allow_barrier(conf); 1336 - raid10_log(conf->mddev, "%s wait rdev %d blocked", 1337 - __func__, blocked_rdev->raid_disk); 1363 + mddev_add_trace_msg(conf->mddev, 1364 + "raid10 %s wait rdev %d blocked", 1365 + __func__, blocked_rdev->raid_disk); 1338 1366 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1339 1367 wait_barrier(conf, false); 1340 1368 goto retry_wait; ··· 1390 1416 bio_wouldblock_error(bio); 1391 1417 return; 1392 1418 } 1393 - raid10_log(conf->mddev, "wait reshape metadata"); 1419 + mddev_add_trace_msg(conf->mddev, 1420 + "raid10 wait reshape metadata"); 1394 1421 wait_event(mddev->sb_wait, 1395 1422 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 1396 1423 ··· 2106 2131 continue; 2107 2132 } 2108 2133 2109 - if (mddev->gendisk) 2110 - disk_stack_limits(mddev->gendisk, rdev->bdev, 2111 - rdev->data_offset << 9); 2112 - 2134 + err = mddev_stack_new_rdev(mddev, rdev); 2135 + if (err) 2136 + return err; 2113 2137 p->head_position = 0; 2114 2138 p->recovery_disabled = mddev->recovery_disabled - 1; 2115 2139 rdev->raid_disk = mirror; ··· 2124 2150 clear_bit(In_sync, &rdev->flags); 2125 2151 set_bit(Replacement, &rdev->flags); 2126 2152 rdev->raid_disk = repl_slot; 2127 - err = 0; 2128 - if (mddev->gendisk) 2129 - disk_stack_limits(mddev->gendisk, rdev->bdev, 2130 - rdev->data_offset << 
9); 2153 + err = mddev_stack_new_rdev(mddev, rdev); 2154 + if (err) 2155 + return err; 2131 2156 conf->fullsync = 1; 2132 2157 WRITE_ONCE(p->replacement, rdev); 2133 2158 } ··· 2263 2290 struct mddev *mddev = r10_bio->mddev; 2264 2291 struct r10conf *conf = mddev->private; 2265 2292 int d; 2266 - sector_t first_bad; 2267 - int bad_sectors; 2268 2293 int slot; 2269 2294 int repl; 2270 2295 struct md_rdev *rdev = NULL; ··· 2283 2312 &rdev->mddev->recovery); 2284 2313 set_bit(R10BIO_WriteError, &r10_bio->state); 2285 2314 } 2286 - } else if (is_badblock(rdev, 2287 - r10_bio->devs[slot].addr, 2288 - r10_bio->sectors, 2289 - &first_bad, &bad_sectors)) 2315 + } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, 2316 + r10_bio->sectors)) { 2290 2317 set_bit(R10BIO_MadeGood, &r10_bio->state); 2318 + } 2291 2319 2292 2320 rdev_dec_pending(rdev, mddev); 2293 2321 ··· 2567 2597 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2568 2598 int sectors, struct page *page, enum req_op op) 2569 2599 { 2570 - sector_t first_bad; 2571 - int bad_sectors; 2572 - 2573 - if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2574 - && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) 2600 + if (rdev_has_badblock(rdev, sector, sectors) && 2601 + (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) 2575 2602 return -1; 2576 2603 if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) 2577 2604 /* success */ ··· 2625 2658 s = PAGE_SIZE >> 9; 2626 2659 2627 2660 do { 2628 - sector_t first_bad; 2629 - int bad_sectors; 2630 - 2631 2661 d = r10_bio->devs[sl].devnum; 2632 2662 rdev = conf->mirrors[d].rdev; 2633 2663 if (rdev && 2634 2664 test_bit(In_sync, &rdev->flags) && 2635 2665 !test_bit(Faulty, &rdev->flags) && 2636 - is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2637 - &first_bad, &bad_sectors) == 0) { 2666 + rdev_has_badblock(rdev, 2667 + r10_bio->devs[sl].addr + sect, 2668 + s) == 0) { 2638 2669 
atomic_inc(&rdev->nr_pending); 2639 2670 success = sync_page_io(rdev, 2640 2671 r10_bio->devs[sl].addr + ··· 3967 4002 return ERR_PTR(err); 3968 4003 } 3969 4004 3970 - static void raid10_set_io_opt(struct r10conf *conf) 4005 + static unsigned int raid10_nr_stripes(struct r10conf *conf) 3971 4006 { 3972 - int raid_disks = conf->geo.raid_disks; 4007 + unsigned int raid_disks = conf->geo.raid_disks; 3973 4008 3974 - if (!(conf->geo.raid_disks % conf->geo.near_copies)) 3975 - raid_disks /= conf->geo.near_copies; 3976 - blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 3977 - raid_disks); 4009 + if (conf->geo.raid_disks % conf->geo.near_copies) 4010 + return raid_disks; 4011 + return raid_disks / conf->geo.near_copies; 4012 + } 4013 + 4014 + static int raid10_set_queue_limits(struct mddev *mddev) 4015 + { 4016 + struct r10conf *conf = mddev->private; 4017 + struct queue_limits lim; 4018 + 4019 + blk_set_stacking_limits(&lim); 4020 + lim.max_write_zeroes_sectors = 0; 4021 + lim.io_min = mddev->chunk_sectors << 9; 4022 + lim.io_opt = lim.io_min * raid10_nr_stripes(conf); 4023 + mddev_stack_rdev_limits(mddev, &lim); 4024 + return queue_limits_set(mddev->gendisk->queue, &lim); 3978 4025 } 3979 4026 3980 4027 static int raid10_run(struct mddev *mddev) ··· 3998 4021 sector_t size; 3999 4022 sector_t min_offset_diff = 0; 4000 4023 int first = 1; 4024 + int ret = -EIO; 4001 4025 4002 4026 if (mddev->private == NULL) { 4003 4027 conf = setup_conf(mddev); ··· 4023 4045 " raid10\n"); 4024 4046 goto out_free_conf; 4025 4047 } 4026 - } 4027 - 4028 - if (mddev->queue) { 4029 - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 4030 - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 4031 - raid10_set_io_opt(conf); 4032 4048 } 4033 4049 4034 4050 rdev_for_each(rdev, mddev) { ··· 4053 4081 if (first || diff < min_offset_diff) 4054 4082 min_offset_diff = diff; 4055 4083 4056 - if (mddev->gendisk) 4057 - disk_stack_limits(mddev->gendisk, rdev->bdev, 
4058 - rdev->data_offset << 9); 4059 - 4060 4084 disk->head_position = 0; 4061 4085 first = 0; 4086 + } 4087 + 4088 + if (!mddev_is_dm(conf->mddev)) { 4089 + ret = raid10_set_queue_limits(mddev); 4090 + if (ret) 4091 + goto out_free_conf; 4062 4092 } 4063 4093 4064 4094 /* need to check that every block has at least one working mirror */ ··· 4159 4185 raid10_free_conf(conf); 4160 4186 mddev->private = NULL; 4161 4187 out: 4162 - return -EIO; 4188 + return ret; 4163 4189 } 4164 4190 4165 4191 static void raid10_free(struct mddev *mddev, void *priv) ··· 4928 4954 conf->reshape_safe = MaxSector; 4929 4955 spin_unlock_irq(&conf->device_lock); 4930 4956 4931 - if (conf->mddev->queue) 4932 - raid10_set_io_opt(conf); 4957 + mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf)); 4933 4958 conf->fullsync = 0; 4934 4959 } 4935 4960
+2 -1
drivers/md/raid5-ppl.c
··· 1393 1393 ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); 1394 1394 ppl_conf->block_size = 512; 1395 1395 } else { 1396 - ppl_conf->block_size = queue_logical_block_size(mddev->queue); 1396 + ppl_conf->block_size = 1397 + queue_logical_block_size(mddev->gendisk->queue); 1397 1398 } 1398 1399 1399 1400 for (i = 0; i < ppl_conf->count; i++) {
+149 -126
drivers/md/raid5.c
··· 36 36 */ 37 37 38 38 #include <linux/blkdev.h> 39 + #include <linux/delay.h> 39 40 #include <linux/kthread.h> 40 41 #include <linux/raid/pq.h> 41 42 #include <linux/async_tx.h> ··· 761 760 STRIPE_RETRY, 762 761 STRIPE_SCHEDULE_AND_RETRY, 763 762 STRIPE_FAIL, 763 + STRIPE_WAIT_RESHAPE, 764 764 }; 765 765 766 766 struct stripe_request_ctx { ··· 1212 1210 */ 1213 1211 while (op_is_write(op) && rdev && 1214 1212 test_bit(WriteErrorSeen, &rdev->flags)) { 1215 - sector_t first_bad; 1216 - int bad_sectors; 1217 - int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 1218 - &first_bad, &bad_sectors); 1213 + int bad = rdev_has_badblock(rdev, sh->sector, 1214 + RAID5_STRIPE_SECTORS(conf)); 1219 1215 if (!bad) 1220 1216 break; 1221 1217 ··· 1295 1295 if (rrdev) 1296 1296 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1297 1297 1298 - if (conf->mddev->gendisk) 1299 - trace_block_bio_remap(bi, 1300 - disk_devt(conf->mddev->gendisk), 1301 - sh->dev[i].sector); 1298 + mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector); 1302 1299 if (should_defer && op_is_write(op)) 1303 1300 bio_list_add(&pending_bios, bi); 1304 1301 else ··· 1339 1342 */ 1340 1343 if (op == REQ_OP_DISCARD) 1341 1344 rbi->bi_vcnt = 0; 1342 - if (conf->mddev->gendisk) 1343 - trace_block_bio_remap(rbi, 1344 - disk_devt(conf->mddev->gendisk), 1345 - sh->dev[i].sector); 1345 + mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector); 1346 1346 if (should_defer && op_is_write(op)) 1347 1347 bio_list_add(&pending_bios, rbi); 1348 1348 else ··· 2406 2412 atomic_inc(&conf->active_stripes); 2407 2413 2408 2414 raid5_release_stripe(sh); 2409 - conf->max_nr_stripes++; 2415 + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1); 2410 2416 return 1; 2411 2417 } 2412 2418 ··· 2416 2422 size_t namelen = sizeof(conf->cache_name[0]); 2417 2423 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2418 2424 2419 - if (conf->mddev->gendisk) 2420 - snprintf(conf->cache_name[0], namelen, 2421 - 
"raid%d-%s", conf->level, mdname(conf->mddev)); 2422 - else 2425 + if (mddev_is_dm(conf->mddev)) 2423 2426 snprintf(conf->cache_name[0], namelen, 2424 2427 "raid%d-%p", conf->level, conf->mddev); 2428 + else 2429 + snprintf(conf->cache_name[0], namelen, 2430 + "raid%d-%s", conf->level, mdname(conf->mddev)); 2425 2431 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2426 2432 2427 2433 conf->active_name = 0; ··· 2701 2707 shrink_buffers(sh); 2702 2708 free_stripe(conf->slab_cache, sh); 2703 2709 atomic_dec(&conf->active_stripes); 2704 - conf->max_nr_stripes--; 2710 + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1); 2705 2711 return 1; 2706 2712 } 2707 2713 ··· 2849 2855 struct r5conf *conf = sh->raid_conf; 2850 2856 int disks = sh->disks, i; 2851 2857 struct md_rdev *rdev; 2852 - sector_t first_bad; 2853 - int bad_sectors; 2854 2858 int replacement = 0; 2855 2859 2856 2860 for (i = 0 ; i < disks; i++) { ··· 2880 2888 if (replacement) { 2881 2889 if (bi->bi_status) 2882 2890 md_error(conf->mddev, rdev); 2883 - else if (is_badblock(rdev, sh->sector, 2884 - RAID5_STRIPE_SECTORS(conf), 2885 - &first_bad, &bad_sectors)) 2891 + else if (rdev_has_badblock(rdev, sh->sector, 2892 + RAID5_STRIPE_SECTORS(conf))) 2886 2893 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2887 2894 } else { 2888 2895 if (bi->bi_status) { ··· 2891 2900 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2892 2901 set_bit(MD_RECOVERY_NEEDED, 2893 2902 &rdev->mddev->recovery); 2894 - } else if (is_badblock(rdev, sh->sector, 2895 - RAID5_STRIPE_SECTORS(conf), 2896 - &first_bad, &bad_sectors)) { 2903 + } else if (rdev_has_badblock(rdev, sh->sector, 2904 + RAID5_STRIPE_SECTORS(conf))) { 2897 2905 set_bit(R5_MadeGood, &sh->dev[i].flags); 2898 2906 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2899 2907 /* That was a successful write so make ··· 4195 4205 set_bit(STRIPE_HANDLE, &sh->state); 4196 4206 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == 
PARITY_PREFER_RMW)) && rmw > 0) { 4197 4207 /* prefer read-modify-write, but need to get some data */ 4198 - if (conf->mddev->queue) 4199 - blk_add_trace_msg(conf->mddev->queue, 4200 - "raid5 rmw %llu %d", 4201 - (unsigned long long)sh->sector, rmw); 4208 + mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", 4209 + sh->sector, rmw); 4210 + 4202 4211 for (i = disks; i--; ) { 4203 4212 struct r5dev *dev = &sh->dev[i]; 4204 4213 if (test_bit(R5_InJournal, &dev->flags) && ··· 4274 4285 set_bit(STRIPE_DELAYED, &sh->state); 4275 4286 } 4276 4287 } 4277 - if (rcw && conf->mddev->queue) 4278 - blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 4279 - (unsigned long long)sh->sector, 4280 - rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 4288 + if (rcw && !mddev_is_dm(conf->mddev)) 4289 + blk_add_trace_msg(conf->mddev->gendisk->queue, 4290 + "raid5 rcw %llu %d %d %d", 4291 + (unsigned long long)sh->sector, rcw, qread, 4292 + test_bit(STRIPE_DELAYED, &sh->state)); 4281 4293 } 4282 4294 4283 4295 if (rcw > disks && rmw > disks && ··· 4664 4674 /* Now to look around and see what can be done */ 4665 4675 for (i=disks; i--; ) { 4666 4676 struct md_rdev *rdev; 4667 - sector_t first_bad; 4668 - int bad_sectors; 4669 4677 int is_bad = 0; 4670 4678 4671 4679 dev = &sh->dev[i]; ··· 4707 4719 rdev = conf->disks[i].replacement; 4708 4720 if (rdev && !test_bit(Faulty, &rdev->flags) && 4709 4721 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && 4710 - !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 4711 - &first_bad, &bad_sectors)) 4722 + !rdev_has_badblock(rdev, sh->sector, 4723 + RAID5_STRIPE_SECTORS(conf))) 4712 4724 set_bit(R5_ReadRepl, &dev->flags); 4713 4725 else { 4714 4726 if (rdev && !test_bit(Faulty, &rdev->flags)) ··· 4721 4733 if (rdev && test_bit(Faulty, &rdev->flags)) 4722 4734 rdev = NULL; 4723 4735 if (rdev) { 4724 - is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 4725 - &first_bad, &bad_sectors); 4736 + 
is_bad = rdev_has_badblock(rdev, sh->sector, 4737 + RAID5_STRIPE_SECTORS(conf)); 4726 4738 if (s->blocked_rdev == NULL 4727 4739 && (test_bit(Blocked, &rdev->flags) 4728 4740 || is_bad < 0)) { ··· 5451 5463 struct r5conf *conf = mddev->private; 5452 5464 struct bio *align_bio; 5453 5465 struct md_rdev *rdev; 5454 - sector_t sector, end_sector, first_bad; 5455 - int bad_sectors, dd_idx; 5466 + sector_t sector, end_sector; 5467 + int dd_idx; 5456 5468 bool did_inc; 5457 5469 5458 5470 if (!in_chunk_boundary(mddev, raid_bio)) { ··· 5481 5493 5482 5494 atomic_inc(&rdev->nr_pending); 5483 5495 5484 - if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, 5485 - &bad_sectors)) { 5496 + if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) { 5486 5497 rdev_dec_pending(rdev, mddev); 5487 5498 return 0; 5488 5499 } ··· 5517 5530 spin_unlock_irq(&conf->device_lock); 5518 5531 } 5519 5532 5520 - if (mddev->gendisk) 5521 - trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), 5522 - raid_bio->bi_iter.bi_sector); 5533 + mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector); 5523 5534 submit_bio_noacct(align_bio); 5524 5535 return 1; 5525 5536 } ··· 5686 5701 } 5687 5702 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5688 5703 NR_STRIPE_HASH_LOCKS); 5689 - if (mddev->queue) 5690 - trace_block_unplug(mddev->queue, cnt, !from_schedule); 5704 + if (!mddev_is_dm(mddev)) 5705 + trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule); 5691 5706 kfree(cb); 5692 5707 } 5693 5708 ··· 5931 5946 if (ahead_of_reshape(mddev, logical_sector, 5932 5947 conf->reshape_safe)) { 5933 5948 spin_unlock_irq(&conf->device_lock); 5934 - return STRIPE_SCHEDULE_AND_RETRY; 5949 + ret = STRIPE_SCHEDULE_AND_RETRY; 5950 + goto out; 5935 5951 } 5936 5952 } 5937 5953 spin_unlock_irq(&conf->device_lock); ··· 6011 6025 6012 6026 out_release: 6013 6027 raid5_release_stripe(sh); 6028 + out: 6029 + if (ret == STRIPE_SCHEDULE_AND_RETRY && 
reshape_interrupted(mddev)) { 6030 + bi->bi_status = BLK_STS_RESOURCE; 6031 + ret = STRIPE_WAIT_RESHAPE; 6032 + pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress"); 6033 + } 6014 6034 return ret; 6015 6035 } 6016 6036 ··· 6138 6146 while (1) { 6139 6147 res = make_stripe_request(mddev, conf, &ctx, logical_sector, 6140 6148 bi); 6141 - if (res == STRIPE_FAIL) 6149 + if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) 6142 6150 break; 6143 6151 6144 6152 if (res == STRIPE_RETRY) ··· 6176 6184 6177 6185 if (rw == WRITE) 6178 6186 md_write_end(mddev); 6187 + if (res == STRIPE_WAIT_RESHAPE) { 6188 + md_free_cloned_bio(bi); 6189 + return false; 6190 + } 6191 + 6179 6192 bio_endio(bi); 6180 6193 return true; 6181 6194 } ··· 6770 6773 spin_unlock_irq(&conf->device_lock); 6771 6774 md_check_recovery(mddev); 6772 6775 spin_lock_irq(&conf->device_lock); 6776 + 6777 + /* 6778 + * Waiting on MD_SB_CHANGE_PENDING below may deadlock 6779 + * seeing md_check_recovery() is needed to clear 6780 + * the flag when using mdmon. 
6781 + */ 6782 + continue; 6773 6783 } 6784 + 6785 + wait_event_lock_irq(mddev->sb_wait, 6786 + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6787 + conf->device_lock); 6774 6788 } 6775 6789 pr_debug("%d stripes handled\n", handled); 6776 6790 ··· 6828 6820 if (size <= 16 || size > 32768) 6829 6821 return -EINVAL; 6830 6822 6831 - conf->min_nr_stripes = size; 6823 + WRITE_ONCE(conf->min_nr_stripes, size); 6832 6824 mutex_lock(&conf->cache_size_mutex); 6833 6825 while (size < conf->max_nr_stripes && 6834 6826 drop_one_stripe(conf)) ··· 6840 6832 mutex_lock(&conf->cache_size_mutex); 6841 6833 while (size > conf->max_nr_stripes) 6842 6834 if (!grow_one_stripe(conf, GFP_KERNEL)) { 6843 - conf->min_nr_stripes = conf->max_nr_stripes; 6835 + WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes); 6844 6836 result = -ENOMEM; 6845 6837 break; 6846 6838 } ··· 6975 6967 pr_debug("md/raid: change stripe_size from %lu to %lu\n", 6976 6968 conf->stripe_size, new); 6977 6969 6978 - if (mddev->sync_thread || 6979 - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6980 - mddev->reshape_position != MaxSector || 6981 - mddev->sysfs_active) { 6970 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6971 + mddev->reshape_position != MaxSector || mddev->sysfs_active) { 6982 6972 err = -EBUSY; 6983 6973 goto out_unlock; 6984 6974 } ··· 7090 7084 if (!conf) 7091 7085 err = -ENODEV; 7092 7086 else if (new != conf->skip_copy) { 7093 - struct request_queue *q = mddev->queue; 7087 + struct request_queue *q = mddev->gendisk->queue; 7094 7088 7095 7089 conf->skip_copy = new; 7096 7090 if (new) ··· 7396 7390 struct shrink_control *sc) 7397 7391 { 7398 7392 struct r5conf *conf = shrink->private_data; 7393 + int max_stripes = READ_ONCE(conf->max_nr_stripes); 7394 + int min_stripes = READ_ONCE(conf->min_nr_stripes); 7399 7395 7400 - if (conf->max_nr_stripes < conf->min_nr_stripes) 7396 + if (max_stripes < min_stripes) 7401 7397 /* unlikely, but not impossible */ 7402 7398 return 0; 7403 
- return conf->max_nr_stripes - conf->min_nr_stripes; 7399 + return max_stripes - min_stripes; 7404 7400 } 7405 7401 7406 7402 static struct r5conf *setup_conf(struct mddev *mddev) ··· 7692 7684 return 0; 7693 7685 } 7694 7686 7695 - static void raid5_set_io_opt(struct r5conf *conf) 7687 + static int raid5_set_limits(struct mddev *mddev) 7696 7688 { 7697 - blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * 7698 - (conf->raid_disks - conf->max_degraded)); 7689 + struct r5conf *conf = mddev->private; 7690 + struct queue_limits lim; 7691 + int data_disks, stripe; 7692 + struct md_rdev *rdev; 7693 + 7694 + /* 7695 + * The read-ahead size must cover two whole stripes, which is 7696 + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices. 7697 + */ 7698 + data_disks = conf->previous_raid_disks - conf->max_degraded; 7699 + 7700 + /* 7701 + * We can only discard a whole stripe. It doesn't make sense to 7702 + * discard data disk but write parity disk 7703 + */ 7704 + stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); 7705 + 7706 + blk_set_stacking_limits(&lim); 7707 + lim.io_min = mddev->chunk_sectors << 9; 7708 + lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); 7709 + lim.raid_partial_stripes_expensive = 1; 7710 + lim.discard_granularity = stripe; 7711 + lim.max_write_zeroes_sectors = 0; 7712 + mddev_stack_rdev_limits(mddev, &lim); 7713 + rdev_for_each(rdev, mddev) 7714 + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, 7715 + mddev->gendisk->disk_name); 7716 + 7717 + /* 7718 + * Zeroing is required for discard, otherwise data could be lost. 7719 + * 7720 + * Consider a scenario: discard a stripe (the stripe could be 7721 + * inconsistent if discard_zeroes_data is 0); write one disk of the 7722 + * stripe (the stripe could be inconsistent again depending on which 7723 + * disks are used to calculate parity); the disk is broken; The stripe 7724 + * data of this disk is lost. 
7725 + * 7726 + * We only allow DISCARD if the sysadmin has confirmed that only safe 7727 + * devices are in use by setting a module parameter. A better idea 7728 + * might be to turn DISCARD into WRITE_ZEROES requests, as that is 7729 + * required to be safe. 7730 + */ 7731 + if (!devices_handle_discard_safely || 7732 + lim.max_discard_sectors < (stripe >> 9) || 7733 + lim.discard_granularity < stripe) 7734 + lim.max_hw_discard_sectors = 0; 7735 + 7736 + /* 7737 + * Requests require having a bitmap for each stripe. 7738 + * Limit the max sectors based on this. 7739 + */ 7740 + lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); 7741 + 7742 + /* No restrictions on the number of segments in the request */ 7743 + lim.max_segments = USHRT_MAX; 7744 + 7745 + return queue_limits_set(mddev->gendisk->queue, &lim); 7699 7746 } 7700 7747 7701 7748 static int raid5_run(struct mddev *mddev) ··· 7763 7700 int i; 7764 7701 long long min_offset_diff = 0; 7765 7702 int first = 1; 7703 + int ret = -EIO; 7766 7704 7767 7705 if (mddev->recovery_cp != MaxSector) 7768 7706 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", ··· 8012 7948 mdname(mddev)); 8013 7949 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 8014 7950 8015 - if (mddev->queue) { 8016 - int chunk_size; 8017 - /* read-ahead size must cover two whole stripes, which 8018 - * is 2 * (datadisks) * chunksize where 'n' is the 8019 - * number of raid devices 8020 - */ 8021 - int data_disks = conf->previous_raid_disks - conf->max_degraded; 8022 - int stripe = data_disks * 8023 - ((mddev->chunk_sectors << 9) / PAGE_SIZE); 8024 - 8025 - chunk_size = mddev->chunk_sectors << 9; 8026 - blk_queue_io_min(mddev->queue, chunk_size); 8027 - raid5_set_io_opt(conf); 8028 - mddev->queue->limits.raid_partial_stripes_expensive = 1; 8029 - /* 8030 - * We can only discard a whole stripe. 
It doesn't make sense to 8031 - * discard data disk but write parity disk 8032 - */ 8033 - stripe = stripe * PAGE_SIZE; 8034 - stripe = roundup_pow_of_two(stripe); 8035 - mddev->queue->limits.discard_granularity = stripe; 8036 - 8037 - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 8038 - 8039 - rdev_for_each(rdev, mddev) { 8040 - disk_stack_limits(mddev->gendisk, rdev->bdev, 8041 - rdev->data_offset << 9); 8042 - disk_stack_limits(mddev->gendisk, rdev->bdev, 8043 - rdev->new_data_offset << 9); 8044 - } 8045 - 8046 - /* 8047 - * zeroing is required, otherwise data 8048 - * could be lost. Consider a scenario: discard a stripe 8049 - * (the stripe could be inconsistent if 8050 - * discard_zeroes_data is 0); write one disk of the 8051 - * stripe (the stripe could be inconsistent again 8052 - * depending on which disks are used to calculate 8053 - * parity); the disk is broken; The stripe data of this 8054 - * disk is lost. 8055 - * 8056 - * We only allow DISCARD if the sysadmin has confirmed that 8057 - * only safe devices are in use by setting a module parameter. 8058 - * A better idea might be to turn DISCARD into WRITE_ZEROES 8059 - * requests, as that is required to be safe. 8060 - */ 8061 - if (!devices_handle_discard_safely || 8062 - mddev->queue->limits.max_discard_sectors < (stripe >> 9) || 8063 - mddev->queue->limits.discard_granularity < stripe) 8064 - blk_queue_max_discard_sectors(mddev->queue, 0); 8065 - 8066 - /* 8067 - * Requests require having a bitmap for each stripe. 8068 - * Limit the max sectors based on this. 
8069 - */ 8070 - blk_queue_max_hw_sectors(mddev->queue, 8071 - RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); 8072 - 8073 - /* No restrictions on the number of segments in the request */ 8074 - blk_queue_max_segments(mddev->queue, USHRT_MAX); 7951 + if (!mddev_is_dm(mddev)) { 7952 + ret = raid5_set_limits(mddev); 7953 + if (ret) 7954 + goto abort; 8075 7955 } 8076 7956 8077 7957 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) ··· 8028 8020 free_conf(conf); 8029 8021 mddev->private = NULL; 8030 8022 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 8031 - return -EIO; 8023 + return ret; 8032 8024 } 8033 8025 8034 8026 static void raid5_free(struct mddev *mddev, void *priv) ··· 8539 8531 spin_unlock_irq(&conf->device_lock); 8540 8532 wake_up(&conf->wait_for_overlap); 8541 8533 8542 - if (conf->mddev->queue) 8543 - raid5_set_io_opt(conf); 8534 + mddev_update_io_opt(conf->mddev, 8535 + conf->raid_disks - conf->max_degraded); 8544 8536 } 8545 8537 } 8546 8538 ··· 8917 8909 return r5l_start(conf->log); 8918 8910 } 8919 8911 8912 + /* 8913 + * This is only used for dm-raid456, caller already frozen sync_thread, hence 8914 + * if rehsape is still in progress, io that is waiting for reshape can never be 8915 + * done now, hence wake up and handle those IO. 
8916 + */ 8917 + static void raid5_prepare_suspend(struct mddev *mddev) 8918 + { 8919 + struct r5conf *conf = mddev->private; 8920 + 8921 + wake_up(&conf->wait_for_overlap); 8922 + } 8923 + 8920 8924 static struct md_personality raid6_personality = 8921 8925 { 8922 8926 .name = "raid6", ··· 8952 8932 .quiesce = raid5_quiesce, 8953 8933 .takeover = raid6_takeover, 8954 8934 .change_consistency_policy = raid5_change_consistency_policy, 8935 + .prepare_suspend = raid5_prepare_suspend, 8955 8936 }; 8956 8937 static struct md_personality raid5_personality = 8957 8938 { ··· 8977 8956 .quiesce = raid5_quiesce, 8978 8957 .takeover = raid5_takeover, 8979 8958 .change_consistency_policy = raid5_change_consistency_policy, 8959 + .prepare_suspend = raid5_prepare_suspend, 8980 8960 }; 8981 8961 8982 8962 static struct md_personality raid4_personality = ··· 9003 8981 .quiesce = raid5_quiesce, 9004 8982 .takeover = raid4_takeover, 9005 8983 .change_consistency_policy = raid5_change_consistency_policy, 8984 + .prepare_suspend = raid5_prepare_suspend, 9006 8985 }; 9007 8986 9008 8987 static int __init raid5_init(void)
+7 -7
drivers/memstick/core/ms_block.c
··· 2078 2078 static int msb_init_disk(struct memstick_dev *card) 2079 2079 { 2080 2080 struct msb_data *msb = memstick_get_drvdata(card); 2081 + struct queue_limits lim = { 2082 + .logical_block_size = msb->page_size, 2083 + .max_hw_sectors = MS_BLOCK_MAX_PAGES, 2084 + .max_segments = MS_BLOCK_MAX_SEGS, 2085 + .max_segment_size = MS_BLOCK_MAX_PAGES * msb->page_size, 2086 + }; 2081 2087 int rc; 2082 2088 unsigned long capacity; 2083 2089 ··· 2099 2093 if (rc) 2100 2094 goto out_release_id; 2101 2095 2102 - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); 2096 + msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card); 2103 2097 if (IS_ERR(msb->disk)) { 2104 2098 rc = PTR_ERR(msb->disk); 2105 2099 goto out_free_tag_set; 2106 2100 } 2107 2101 msb->queue = msb->disk->queue; 2108 - 2109 - blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); 2110 - blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); 2111 - blk_queue_max_segment_size(msb->queue, 2112 - MS_BLOCK_MAX_PAGES * msb->page_size); 2113 - blk_queue_logical_block_size(msb->queue, msb->page_size); 2114 2102 2115 2103 sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); 2116 2104 msb->disk->fops = &msb_bdops;
+7 -8
drivers/memstick/core/mspro_block.c
··· 1103 1103 static int mspro_block_init_disk(struct memstick_dev *card) 1104 1104 { 1105 1105 struct mspro_block_data *msb = memstick_get_drvdata(card); 1106 + struct queue_limits lim = { 1107 + .logical_block_size = msb->page_size, 1108 + .max_hw_sectors = MSPRO_BLOCK_MAX_PAGES, 1109 + .max_segments = MSPRO_BLOCK_MAX_SEGS, 1110 + .max_segment_size = MSPRO_BLOCK_MAX_PAGES * msb->page_size, 1111 + }; 1106 1112 struct mspro_devinfo *dev_info = NULL; 1107 1113 struct mspro_sys_info *sys_info = NULL; 1108 1114 struct mspro_sys_attr *s_attr = NULL; ··· 1144 1138 if (rc) 1145 1139 goto out_release_id; 1146 1140 1147 - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); 1141 + msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card); 1148 1142 if (IS_ERR(msb->disk)) { 1149 1143 rc = PTR_ERR(msb->disk); 1150 1144 goto out_free_tag_set; 1151 1145 } 1152 1146 msb->queue = msb->disk->queue; 1153 - 1154 - blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES); 1155 - blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); 1156 - blk_queue_max_segment_size(msb->queue, 1157 - MSPRO_BLOCK_MAX_PAGES * msb->page_size); 1158 1147 1159 1148 msb->disk->major = major; 1160 1149 msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT; ··· 1158 1157 msb->disk->private_data = msb; 1159 1158 1160 1159 sprintf(msb->disk->disk_name, "mspblk%d", disk_id); 1161 - 1162 - blk_queue_logical_block_size(msb->queue, msb->page_size); 1163 1160 1164 1161 capacity = be16_to_cpu(sys_info->user_block_count); 1165 1162 capacity *= be16_to_cpu(sys_info->block_size);
+56 -49
drivers/mmc/core/queue.c
··· 174 174 return sg; 175 175 } 176 176 177 - static void mmc_queue_setup_discard(struct request_queue *q, 178 - struct mmc_card *card) 177 + static void mmc_queue_setup_discard(struct mmc_card *card, 178 + struct queue_limits *lim) 179 179 { 180 180 unsigned max_discard; 181 181 ··· 183 183 if (!max_discard) 184 184 return; 185 185 186 - blk_queue_max_discard_sectors(q, max_discard); 187 - q->limits.discard_granularity = card->pref_erase << 9; 186 + lim->max_hw_discard_sectors = max_discard; 187 + if (mmc_can_secure_erase_trim(card)) 188 + lim->max_secure_erase_sectors = max_discard; 189 + if (mmc_can_trim(card) && card->erased_byte == 0) 190 + lim->max_write_zeroes_sectors = max_discard; 191 + 188 192 /* granularity must not be greater than max. discard */ 189 193 if (card->pref_erase > max_discard) 190 - q->limits.discard_granularity = SECTOR_SIZE; 191 - if (mmc_can_secure_erase_trim(card)) 192 - blk_queue_max_secure_erase_sectors(q, max_discard); 193 - if (mmc_can_trim(card) && card->erased_byte == 0) 194 - blk_queue_max_write_zeroes_sectors(q, max_discard); 194 + lim->discard_granularity = SECTOR_SIZE; 195 + else 196 + lim->discard_granularity = card->pref_erase << 9; 195 197 } 196 198 197 199 static unsigned short mmc_get_max_segments(struct mmc_host *host) ··· 343 341 .timeout = mmc_mq_timed_out, 344 342 }; 345 343 346 - static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) 344 + static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, 345 + struct mmc_card *card) 347 346 { 348 347 struct mmc_host *host = card->host; 349 - unsigned block_size = 512; 348 + struct queue_limits lim = { }; 349 + struct gendisk *disk; 350 + 351 + if (mmc_can_erase(card)) 352 + mmc_queue_setup_discard(card, &lim); 353 + 354 + if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask) 355 + lim.bounce = BLK_BOUNCE_HIGH; 356 + 357 + lim.max_hw_sectors = min(host->max_blk_count, host->max_req_size / 512); 358 + 359 + if (mmc_card_mmc(card) && 
card->ext_csd.data_sector_size) 360 + lim.logical_block_size = card->ext_csd.data_sector_size; 361 + else 362 + lim.logical_block_size = 512; 363 + 364 + WARN_ON_ONCE(lim.logical_block_size != 512 && 365 + lim.logical_block_size != 4096); 366 + 367 + /* 368 + * Setting a virt_boundary implicity sets a max_segment_size, so try 369 + * to set the hardware one here. 370 + */ 371 + if (host->can_dma_map_merge) { 372 + lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host)); 373 + lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS; 374 + } else { 375 + lim.max_segment_size = 376 + round_down(host->max_seg_size, lim.logical_block_size); 377 + lim.max_segments = host->max_segs; 378 + } 379 + 380 + disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq); 381 + if (IS_ERR(disk)) 382 + return disk; 383 + mq->queue = disk->queue; 384 + 385 + if (mmc_host_is_spi(host) && host->use_spi_crc) 386 + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); 387 + blk_queue_rq_timeout(mq->queue, 60 * HZ); 350 388 351 389 blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); 352 390 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); 353 - if (mmc_can_erase(card)) 354 - mmc_queue_setup_discard(mq->queue, card); 355 - 356 - if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask) 357 - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH); 358 - blk_queue_max_hw_sectors(mq->queue, 359 - min(host->max_blk_count, host->max_req_size / 512)); 360 - if (host->can_dma_map_merge) 361 - WARN(!blk_queue_can_use_dma_map_merging(mq->queue, 362 - mmc_dev(host)), 363 - "merging was advertised but not possible"); 364 - blk_queue_max_segments(mq->queue, mmc_get_max_segments(host)); 365 - 366 - if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) { 367 - block_size = card->ext_csd.data_sector_size; 368 - WARN_ON(block_size != 512 && block_size != 4096); 369 - } 370 - 371 - blk_queue_logical_block_size(mq->queue, block_size); 372 - /* 373 - * After blk_queue_can_use_dma_map_merging() was called with 
succeed, 374 - * since it calls blk_queue_virt_boundary(), the mmc should not call 375 - * both blk_queue_max_segment_size(). 376 - */ 377 - if (!host->can_dma_map_merge) 378 - blk_queue_max_segment_size(mq->queue, 379 - round_down(host->max_seg_size, block_size)); 380 391 381 392 dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); 382 393 ··· 401 386 init_waitqueue_head(&mq->wait); 402 387 403 388 mmc_crypto_setup_queue(mq->queue, host); 389 + return disk; 404 390 } 405 391 406 392 static inline bool mmc_merge_capable(struct mmc_host *host) ··· 463 447 return ERR_PTR(ret); 464 448 465 449 466 - disk = blk_mq_alloc_disk(&mq->tag_set, mq); 467 - if (IS_ERR(disk)) { 450 + disk = mmc_alloc_disk(mq, card); 451 + if (IS_ERR(disk)) 468 452 blk_mq_free_tag_set(&mq->tag_set); 469 - return disk; 470 - } 471 - mq->queue = disk->queue; 472 - 473 - if (mmc_host_is_spi(host) && host->use_spi_crc) 474 - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); 475 - blk_queue_rq_timeout(mq->queue, 60 * HZ); 476 - 477 - mmc_setup_queue(mq, card); 478 453 return disk; 479 454 } 480 455
+6 -6
drivers/mtd/mtd_blkdevs.c
··· 277 277 { 278 278 struct mtd_blktrans_ops *tr = new->tr; 279 279 struct mtd_blktrans_dev *d; 280 + struct queue_limits lim = { }; 280 281 int last_devnum = -1; 281 282 struct gendisk *gd; 282 283 int ret; ··· 332 331 BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); 333 332 if (ret) 334 333 goto out_kfree_tag_set; 334 + 335 + lim.logical_block_size = tr->blksize; 336 + if (tr->discard) 337 + lim.max_hw_discard_sectors = UINT_MAX; 335 338 336 339 /* Create gendisk */ 337 - gd = blk_mq_alloc_disk(new->tag_set, new); 340 + gd = blk_mq_alloc_disk(new->tag_set, &lim, new); 338 341 if (IS_ERR(gd)) { 339 342 ret = PTR_ERR(gd); 340 343 goto out_free_tag_set; ··· 376 371 if (tr->flush) 377 372 blk_queue_write_cache(new->rq, true, false); 378 373 379 - blk_queue_logical_block_size(new->rq, tr->blksize); 380 - 381 374 blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); 382 375 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); 383 - 384 - if (tr->discard) 385 - blk_queue_max_discard_sectors(new->rq, UINT_MAX); 386 376 387 377 gd->queue = new->rq; 388 378
+4 -2
drivers/mtd/ubi/block.c
··· 348 348 349 349 int ubiblock_create(struct ubi_volume_info *vi) 350 350 { 351 + struct queue_limits lim = { 352 + .max_segments = UBI_MAX_SG_COUNT, 353 + }; 351 354 struct ubiblock *dev; 352 355 struct gendisk *gd; 353 356 u64 disk_capacity; ··· 396 393 397 394 398 395 /* Initialize the gendisk of this ubiblock device */ 399 - gd = blk_mq_alloc_disk(&dev->tag_set, dev); 396 + gd = blk_mq_alloc_disk(&dev->tag_set, &lim, dev); 400 397 if (IS_ERR(gd)) { 401 398 ret = PTR_ERR(gd); 402 399 goto out_free_tags; ··· 419 416 dev->gd = gd; 420 417 421 418 dev->rq = gd->queue; 422 - blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT); 423 419 424 420 list_add_tail(&dev->list, &ubiblock_devices); 425 421
+8 -6
drivers/nvdimm/btt.c
··· 1496 1496 { 1497 1497 struct nd_btt *nd_btt = btt->nd_btt; 1498 1498 struct nd_namespace_common *ndns = nd_btt->ndns; 1499 - int rc = -ENOMEM; 1499 + struct queue_limits lim = { 1500 + .logical_block_size = btt->sector_size, 1501 + .max_hw_sectors = UINT_MAX, 1502 + }; 1503 + int rc; 1500 1504 1501 - btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); 1502 - if (!btt->btt_disk) 1503 - return -ENOMEM; 1505 + btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); 1506 + if (IS_ERR(btt->btt_disk)) 1507 + return PTR_ERR(btt->btt_disk); 1504 1508 1505 1509 nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); 1506 1510 btt->btt_disk->first_minor = 0; 1507 1511 btt->btt_disk->fops = &btt_fops; 1508 1512 btt->btt_disk->private_data = btt; 1509 1513 1510 - blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); 1511 - blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); 1512 1514 blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); 1513 1515 blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); 1514 1516
+8 -6
drivers/nvdimm/pmem.c
··· 451 451 { 452 452 struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); 453 453 struct nd_region *nd_region = to_nd_region(dev->parent); 454 + struct queue_limits lim = { 455 + .logical_block_size = pmem_sector_size(ndns), 456 + .physical_block_size = PAGE_SIZE, 457 + .max_hw_sectors = UINT_MAX, 458 + }; 454 459 int nid = dev_to_node(dev), fua; 455 460 struct resource *res = &nsio->res; 456 461 struct range bb_range; ··· 502 497 return -EBUSY; 503 498 } 504 499 505 - disk = blk_alloc_disk(nid); 506 - if (!disk) 507 - return -ENOMEM; 500 + disk = blk_alloc_disk(&lim, nid); 501 + if (IS_ERR(disk)) 502 + return PTR_ERR(disk); 508 503 q = disk->queue; 509 504 510 505 pmem->disk = disk; ··· 544 539 pmem->virt_addr = addr; 545 540 546 541 blk_queue_write_cache(q, true, fua); 547 - blk_queue_physical_block_size(q, PAGE_SIZE); 548 - blk_queue_logical_block_size(q, pmem_sector_size(ndns)); 549 - blk_queue_max_hw_sectors(q, UINT_MAX); 550 542 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 551 543 blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); 552 544 if (pmem->pfn_flags & PFN_MAP)
+1 -1
drivers/nvme/host/apple.c
··· 1516 1516 goto put_dev; 1517 1517 } 1518 1518 1519 - anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset); 1519 + anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL); 1520 1520 if (IS_ERR(anv->ctrl.admin_q)) { 1521 1521 ret = -ENOMEM; 1522 1522 goto put_dev;
+251 -225
drivers/nvme/host/core.c
··· 114 114 115 115 static DEFINE_IDA(nvme_instance_ida); 116 116 static dev_t nvme_ctrl_base_chr_devt; 117 - static struct class *nvme_class; 118 - static struct class *nvme_subsys_class; 117 + static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env); 118 + static const struct class nvme_class = { 119 + .name = "nvme", 120 + .dev_uevent = nvme_class_uevent, 121 + }; 122 + 123 + static const struct class nvme_subsys_class = { 124 + .name = "nvme-subsystem", 125 + }; 119 126 120 127 static DEFINE_IDA(nvme_ns_chr_minor_ida); 121 128 static dev_t nvme_ns_chr_devt; 122 - static struct class *nvme_ns_chr_class; 129 + static const struct class nvme_ns_chr_class = { 130 + .name = "nvme-generic", 131 + }; 123 132 124 133 static void nvme_put_subsystem(struct nvme_subsystem *subsys); 125 134 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, ··· 1407 1398 1408 1399 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 1409 1400 sizeof(struct nvme_id_ctrl)); 1410 - if (error) 1401 + if (error) { 1411 1402 kfree(*id); 1403 + *id = NULL; 1404 + } 1412 1405 return error; 1413 1406 } 1414 1407 ··· 1539 1528 if (error) { 1540 1529 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); 1541 1530 kfree(*id); 1531 + *id = NULL; 1542 1532 } 1543 1533 return error; 1544 1534 } ··· 1739 1727 return 0; 1740 1728 } 1741 1729 1742 - #ifdef CONFIG_BLK_DEV_INTEGRITY 1743 - static void nvme_init_integrity(struct gendisk *disk, 1744 - struct nvme_ns_head *head, u32 max_integrity_segments) 1730 + static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) 1745 1731 { 1746 1732 struct blk_integrity integrity = { }; 1733 + 1734 + blk_integrity_unregister(disk); 1735 + 1736 + if (!head->ms) 1737 + return true; 1738 + 1739 + /* 1740 + * PI can always be supported as we can ask the controller to simply 1741 + * insert/strip it, which is not possible for other kinds of metadata. 
1742 + */ 1743 + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) || 1744 + !(head->features & NVME_NS_METADATA_SUPPORTED)) 1745 + return nvme_ns_has_pi(head); 1747 1746 1748 1747 switch (head->pi_type) { 1749 1748 case NVME_NS_DPS_PI_TYPE3: ··· 1798 1775 } 1799 1776 1800 1777 integrity.tuple_size = head->ms; 1778 + integrity.pi_offset = head->pi_offset; 1801 1779 blk_integrity_register(disk, &integrity); 1802 - blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); 1780 + return true; 1803 1781 } 1804 - #else 1805 - static void nvme_init_integrity(struct gendisk *disk, 1806 - struct nvme_ns_head *head, u32 max_integrity_segments) 1807 - { 1808 - } 1809 - #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1810 1782 1811 - static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, 1812 - struct nvme_ns_head *head) 1783 + static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) 1813 1784 { 1814 - struct request_queue *queue = disk->queue; 1815 - u32 max_discard_sectors; 1816 - 1817 - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) { 1818 - max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); 1819 - } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { 1820 - max_discard_sectors = UINT_MAX; 1821 - } else { 1822 - blk_queue_max_discard_sectors(queue, 0); 1823 - return; 1824 - } 1785 + struct nvme_ctrl *ctrl = ns->ctrl; 1825 1786 1826 1787 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < 1827 1788 NVME_DSM_MAX_RANGES); 1828 1789 1829 - /* 1830 - * If discard is already enabled, don't reset queue limits. 1831 - * 1832 - * This works around the fact that the block layer can't cope well with 1833 - * updating the hardware limits when overridden through sysfs. This is 1834 - * harmless because discard limits in NVMe are purely advisory. 
1835 - */ 1836 - if (queue->limits.max_discard_sectors) 1837 - return; 1838 - 1839 - blk_queue_max_discard_sectors(queue, max_discard_sectors); 1840 - if (ctrl->dmrl) 1841 - blk_queue_max_discard_segments(queue, ctrl->dmrl); 1790 + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) 1791 + lim->max_hw_discard_sectors = 1792 + nvme_lba_to_sect(ns->head, ctrl->dmrsl); 1793 + else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) 1794 + lim->max_hw_discard_sectors = UINT_MAX; 1842 1795 else 1843 - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); 1844 - queue->limits.discard_granularity = queue_logical_block_size(queue); 1796 + lim->max_hw_discard_sectors = 0; 1845 1797 1846 - if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 1847 - blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); 1798 + lim->discard_granularity = lim->logical_block_size; 1799 + 1800 + if (ctrl->dmrl) 1801 + lim->max_discard_segments = ctrl->dmrl; 1802 + else 1803 + lim->max_discard_segments = NVME_DSM_MAX_RANGES; 1848 1804 } 1849 1805 1850 1806 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) ··· 1834 1832 a->csi == b->csi; 1835 1833 } 1836 1834 1837 - static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, 1838 - struct nvme_id_ns *id) 1835 + static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid, 1836 + struct nvme_id_ns_nvm **nvmp) 1839 1837 { 1840 - bool first = id->dps & NVME_NS_DPS_PI_FIRST; 1841 - unsigned lbaf = nvme_lbaf_index(id->flbas); 1842 - struct nvme_command c = { }; 1838 + struct nvme_command c = { 1839 + .identify.opcode = nvme_admin_identify, 1840 + .identify.nsid = cpu_to_le32(nsid), 1841 + .identify.cns = NVME_ID_CNS_CS_NS, 1842 + .identify.csi = NVME_CSI_NVM, 1843 + }; 1843 1844 struct nvme_id_ns_nvm *nvm; 1844 - int ret = 0; 1845 - u32 elbaf; 1846 - 1847 - head->pi_size = 0; 1848 - head->ms = le16_to_cpu(id->lbaf[lbaf].ms); 1849 - if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { 1850 - 
head->pi_size = sizeof(struct t10_pi_tuple); 1851 - head->guard_type = NVME_NVM_NS_16B_GUARD; 1852 - goto set_pi; 1853 - } 1845 + int ret; 1854 1846 1855 1847 nvm = kzalloc(sizeof(*nvm), GFP_KERNEL); 1856 1848 if (!nvm) 1857 1849 return -ENOMEM; 1858 1850 1859 - c.identify.opcode = nvme_admin_identify; 1860 - c.identify.nsid = cpu_to_le32(head->ns_id); 1861 - c.identify.cns = NVME_ID_CNS_CS_NS; 1862 - c.identify.csi = NVME_CSI_NVM; 1863 - 1864 1851 ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm)); 1865 1852 if (ret) 1866 - goto free_data; 1853 + kfree(nvm); 1854 + else 1855 + *nvmp = nvm; 1856 + return ret; 1857 + } 1867 1858 1868 - elbaf = le32_to_cpu(nvm->elbaf[lbaf]); 1859 + static void nvme_configure_pi_elbas(struct nvme_ns_head *head, 1860 + struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) 1861 + { 1862 + u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); 1869 1863 1870 1864 /* no support for storage tag formats right now */ 1871 1865 if (nvme_elbaf_sts(elbaf)) 1872 - goto free_data; 1866 + return; 1873 1867 1874 1868 head->guard_type = nvme_elbaf_guard_type(elbaf); 1875 1869 switch (head->guard_type) { ··· 1878 1880 default: 1879 1881 break; 1880 1882 } 1881 - 1882 - free_data: 1883 - kfree(nvm); 1884 - set_pi: 1885 - if (head->pi_size && (first || head->ms == head->pi_size)) 1886 - head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; 1887 - else 1888 - head->pi_type = 0; 1889 - 1890 - return ret; 1891 1883 } 1892 1884 1893 - static int nvme_configure_metadata(struct nvme_ctrl *ctrl, 1894 - struct nvme_ns_head *head, struct nvme_id_ns *id) 1885 + static void nvme_configure_metadata(struct nvme_ctrl *ctrl, 1886 + struct nvme_ns_head *head, struct nvme_id_ns *id, 1887 + struct nvme_id_ns_nvm *nvm) 1895 1888 { 1896 - int ret; 1897 - 1898 - ret = nvme_init_ms(ctrl, head, id); 1899 - if (ret) 1900 - return ret; 1901 - 1902 1889 head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); 1890 + head->pi_type = 0; 1891 + 
head->pi_size = 0; 1892 + head->pi_offset = 0; 1893 + head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms); 1903 1894 if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 1904 - return 0; 1895 + return; 1896 + 1897 + if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { 1898 + nvme_configure_pi_elbas(head, id, nvm); 1899 + } else { 1900 + head->pi_size = sizeof(struct t10_pi_tuple); 1901 + head->guard_type = NVME_NVM_NS_16B_GUARD; 1902 + } 1903 + 1904 + if (head->pi_size && head->ms >= head->pi_size) 1905 + head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; 1906 + if (!(id->dps & NVME_NS_DPS_PI_FIRST)) 1907 + head->pi_offset = head->ms - head->pi_size; 1905 1908 1906 1909 if (ctrl->ops->flags & NVME_F_FABRICS) { 1907 1910 /* ··· 1911 1912 * remap the separate metadata buffer from the block layer. 1912 1913 */ 1913 1914 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) 1914 - return 0; 1915 + return; 1915 1916 1916 1917 head->features |= NVME_NS_EXT_LBAS; 1917 1918 ··· 1938 1939 else 1939 1940 head->features |= NVME_NS_METADATA_SUPPORTED; 1940 1941 } 1941 - return 0; 1942 1942 } 1943 1943 1944 - static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 1945 - struct request_queue *q) 1944 + static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) 1946 1945 { 1947 - bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; 1948 - 1949 - if (ctrl->max_hw_sectors) { 1950 - u32 max_segments = 1951 - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; 1952 - 1953 - max_segments = min_not_zero(max_segments, ctrl->max_segments); 1954 - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 1955 - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 1956 - } 1957 - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); 1958 - blk_queue_dma_alignment(q, 3); 1959 - blk_queue_write_cache(q, vwc, vwc); 1946 + return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; 1960 1947 } 1961 1948 1962 - static void nvme_update_disk_info(struct 
nvme_ctrl *ctrl, struct gendisk *disk, 1963 - struct nvme_ns_head *head, struct nvme_id_ns *id) 1949 + static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, 1950 + struct queue_limits *lim) 1964 1951 { 1965 - sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze)); 1952 + lim->max_hw_sectors = ctrl->max_hw_sectors; 1953 + lim->max_segments = min_t(u32, USHRT_MAX, 1954 + min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments)); 1955 + lim->max_integrity_segments = ctrl->max_integrity_segments; 1956 + lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1; 1957 + lim->max_segment_size = UINT_MAX; 1958 + lim->dma_alignment = 3; 1959 + } 1960 + 1961 + static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, 1962 + struct queue_limits *lim) 1963 + { 1964 + struct nvme_ns_head *head = ns->head; 1966 1965 u32 bs = 1U << head->lba_shift; 1967 1966 u32 atomic_bs, phys_bs, io_opt = 0; 1967 + bool valid = true; 1968 1968 1969 1969 /* 1970 1970 * The block layer can't support LBA sizes larger than the page size ··· 1971 1973 * allow block I/O. 1972 1974 */ 1973 1975 if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { 1974 - capacity = 0; 1975 1976 bs = (1 << 9); 1977 + valid = false; 1976 1978 } 1977 - 1978 - blk_integrity_unregister(disk); 1979 1979 1980 1980 atomic_bs = phys_bs = bs; 1981 1981 if (id->nabo == 0) { ··· 1985 1989 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) 1986 1990 atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; 1987 1991 else 1988 - atomic_bs = (1 + ctrl->subsys->awupf) * bs; 1992 + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; 1989 1993 } 1990 1994 1991 1995 if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { ··· 1995 1999 io_opt = bs * (1 + le16_to_cpu(id->nows)); 1996 2000 } 1997 2001 1998 - blk_queue_logical_block_size(disk->queue, bs); 1999 2002 /* 2000 2003 * Linux filesystems assume writing a single physical block is 2001 2004 * an atomic operation. 
Hence limit the physical block size to the 2002 2005 * value of the Atomic Write Unit Power Fail parameter. 2003 2006 */ 2004 - blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); 2005 - blk_queue_io_min(disk->queue, phys_bs); 2006 - blk_queue_io_opt(disk->queue, io_opt); 2007 - 2008 - /* 2009 - * Register a metadata profile for PI, or the plain non-integrity NVMe 2010 - * metadata masquerading as Type 0 if supported, otherwise reject block 2011 - * I/O to namespaces with metadata except when the namespace supports 2012 - * PI, as it can strip/insert in that case. 2013 - */ 2014 - if (head->ms) { 2015 - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && 2016 - (head->features & NVME_NS_METADATA_SUPPORTED)) 2017 - nvme_init_integrity(disk, head, 2018 - ctrl->max_integrity_segments); 2019 - else if (!nvme_ns_has_pi(head)) 2020 - capacity = 0; 2021 - } 2022 - 2023 - set_capacity_and_notify(disk, capacity); 2024 - 2025 - nvme_config_discard(ctrl, disk, head); 2026 - blk_queue_max_write_zeroes_sectors(disk->queue, 2027 - ctrl->max_zeroes_sectors); 2007 + lim->logical_block_size = bs; 2008 + lim->physical_block_size = min(phys_bs, atomic_bs); 2009 + lim->io_min = phys_bs; 2010 + lim->io_opt = io_opt; 2011 + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 2012 + lim->max_write_zeroes_sectors = UINT_MAX; 2013 + else 2014 + lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; 2015 + return valid; 2028 2016 } 2029 2017 2030 2018 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info) ··· 2022 2042 return !disk_live(disk); 2023 2043 } 2024 2044 2025 - static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) 2045 + static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id, 2046 + struct queue_limits *lim) 2026 2047 { 2027 2048 struct nvme_ctrl *ctrl = ns->ctrl; 2028 2049 u32 iob; ··· 2051 2070 return; 2052 2071 } 2053 2072 2054 - blk_queue_chunk_sectors(ns->queue, iob); 2073 + 
lim->chunk_sectors = iob; 2055 2074 } 2056 2075 2057 2076 static int nvme_update_ns_info_generic(struct nvme_ns *ns, 2058 2077 struct nvme_ns_info *info) 2059 2078 { 2079 + struct queue_limits lim; 2080 + int ret; 2081 + 2060 2082 blk_mq_freeze_queue(ns->disk->queue); 2061 - nvme_set_queue_limits(ns->ctrl, ns->queue); 2083 + lim = queue_limits_start_update(ns->disk->queue); 2084 + nvme_set_ctrl_limits(ns->ctrl, &lim); 2085 + ret = queue_limits_commit_update(ns->disk->queue, &lim); 2062 2086 set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); 2063 2087 blk_mq_unfreeze_queue(ns->disk->queue); 2064 2088 2065 - if (nvme_ns_head_multipath(ns->head)) { 2066 - blk_mq_freeze_queue(ns->head->disk->queue); 2067 - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); 2068 - nvme_mpath_revalidate_paths(ns); 2069 - blk_stack_limits(&ns->head->disk->queue->limits, 2070 - &ns->queue->limits, 0); 2071 - ns->head->disk->flags |= GENHD_FL_HIDDEN; 2072 - blk_mq_unfreeze_queue(ns->head->disk->queue); 2073 - } 2074 - 2075 2089 /* Hide the block-interface for these devices */ 2076 - ns->disk->flags |= GENHD_FL_HIDDEN; 2077 - set_bit(NVME_NS_READY, &ns->flags); 2078 - 2079 - return 0; 2090 + if (!ret) 2091 + ret = -ENODEV; 2092 + return ret; 2080 2093 } 2081 2094 2082 2095 static int nvme_update_ns_info_block(struct nvme_ns *ns, 2083 2096 struct nvme_ns_info *info) 2084 2097 { 2098 + bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; 2099 + struct queue_limits lim; 2100 + struct nvme_id_ns_nvm *nvm = NULL; 2085 2101 struct nvme_id_ns *id; 2102 + sector_t capacity; 2086 2103 unsigned lbaf; 2087 2104 int ret; 2088 2105 ··· 2092 2113 /* namespace not allocated or attached */ 2093 2114 info->is_removed = true; 2094 2115 ret = -ENODEV; 2095 - goto error; 2116 + goto out; 2117 + } 2118 + 2119 + if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { 2120 + ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); 2121 + if (ret < 0) 2122 + goto out; 2096 2123 } 2097 2124 2098 2125 
blk_mq_freeze_queue(ns->disk->queue); 2099 2126 lbaf = nvme_lbaf_index(id->flbas); 2100 2127 ns->head->lba_shift = id->lbaf[lbaf].ds; 2101 2128 ns->head->nuse = le64_to_cpu(id->nuse); 2102 - nvme_set_queue_limits(ns->ctrl, ns->queue); 2129 + capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); 2103 2130 2104 - ret = nvme_configure_metadata(ns->ctrl, ns->head, id); 2105 - if (ret < 0) { 2106 - blk_mq_unfreeze_queue(ns->disk->queue); 2107 - goto out; 2108 - } 2109 - nvme_set_chunk_sectors(ns, id); 2110 - nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id); 2111 - 2112 - if (ns->head->ids.csi == NVME_CSI_ZNS) { 2113 - ret = nvme_update_zone_info(ns, lbaf); 2131 + lim = queue_limits_start_update(ns->disk->queue); 2132 + nvme_set_ctrl_limits(ns->ctrl, &lim); 2133 + nvme_configure_metadata(ns->ctrl, ns->head, id, nvm); 2134 + nvme_set_chunk_sectors(ns, id, &lim); 2135 + if (!nvme_update_disk_info(ns, id, &lim)) 2136 + capacity = 0; 2137 + nvme_config_discard(ns, &lim); 2138 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 2139 + ns->head->ids.csi == NVME_CSI_ZNS) { 2140 + ret = nvme_update_zone_info(ns, lbaf, &lim); 2114 2141 if (ret) { 2115 2142 blk_mq_unfreeze_queue(ns->disk->queue); 2116 2143 goto out; 2117 2144 } 2118 2145 } 2146 + ret = queue_limits_commit_update(ns->disk->queue, &lim); 2147 + if (ret) { 2148 + blk_mq_unfreeze_queue(ns->disk->queue); 2149 + goto out; 2150 + } 2151 + 2152 + /* 2153 + * Register a metadata profile for PI, or the plain non-integrity NVMe 2154 + * metadata masquerading as Type 0 if supported, otherwise reject block 2155 + * I/O to namespaces with metadata except when the namespace supports 2156 + * PI, as it can strip/insert in that case. 
2157 + */ 2158 + if (!nvme_init_integrity(ns->disk, ns->head)) 2159 + capacity = 0; 2160 + 2161 + set_capacity_and_notify(ns->disk, capacity); 2119 2162 2120 2163 /* 2121 2164 * Only set the DEAC bit if the device guarantees that reads from ··· 2148 2147 if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) 2149 2148 ns->head->features |= NVME_NS_DEAC; 2150 2149 set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); 2150 + blk_queue_write_cache(ns->disk->queue, vwc, vwc); 2151 2151 set_bit(NVME_NS_READY, &ns->flags); 2152 2152 blk_mq_unfreeze_queue(ns->disk->queue); 2153 2153 2154 2154 if (blk_queue_is_zoned(ns->queue)) { 2155 - ret = nvme_revalidate_zones(ns); 2155 + ret = blk_revalidate_disk_zones(ns->disk, NULL); 2156 2156 if (ret && !nvme_first_scan(ns->disk)) 2157 2157 goto out; 2158 2158 } 2159 2159 2160 - if (nvme_ns_head_multipath(ns->head)) { 2161 - blk_mq_freeze_queue(ns->head->disk->queue); 2162 - nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id); 2163 - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); 2164 - nvme_mpath_revalidate_paths(ns); 2165 - blk_stack_limits(&ns->head->disk->queue->limits, 2166 - &ns->queue->limits, 0); 2167 - disk_update_readahead(ns->head->disk); 2168 - blk_mq_unfreeze_queue(ns->head->disk->queue); 2169 - } 2170 - 2171 2160 ret = 0; 2172 2161 out: 2162 + kfree(nvm); 2163 + kfree(id); 2164 + return ret; 2165 + } 2166 + 2167 + static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) 2168 + { 2169 + bool unsupported = false; 2170 + int ret; 2171 + 2172 + switch (info->ids.csi) { 2173 + case NVME_CSI_ZNS: 2174 + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { 2175 + dev_info(ns->ctrl->device, 2176 + "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", 2177 + info->nsid); 2178 + ret = nvme_update_ns_info_generic(ns, info); 2179 + break; 2180 + } 2181 + ret = nvme_update_ns_info_block(ns, info); 2182 + break; 2183 + case NVME_CSI_NVM: 2184 + ret = 
nvme_update_ns_info_block(ns, info); 2185 + break; 2186 + default: 2187 + dev_info(ns->ctrl->device, 2188 + "block device for nsid %u not supported (csi %u)\n", 2189 + info->nsid, info->ids.csi); 2190 + ret = nvme_update_ns_info_generic(ns, info); 2191 + break; 2192 + } 2193 + 2173 2194 /* 2174 2195 * If probing fails due an unsupported feature, hide the block device, 2175 2196 * but still allow other access. ··· 2199 2176 if (ret == -ENODEV) { 2200 2177 ns->disk->flags |= GENHD_FL_HIDDEN; 2201 2178 set_bit(NVME_NS_READY, &ns->flags); 2179 + unsupported = true; 2202 2180 ret = 0; 2203 2181 } 2204 2182 2205 - error: 2206 - kfree(id); 2207 - return ret; 2208 - } 2183 + if (!ret && nvme_ns_head_multipath(ns->head)) { 2184 + struct queue_limits lim; 2209 2185 2210 - static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) 2211 - { 2212 - switch (info->ids.csi) { 2213 - case NVME_CSI_ZNS: 2214 - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { 2215 - dev_info(ns->ctrl->device, 2216 - "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", 2217 - info->nsid); 2218 - return nvme_update_ns_info_generic(ns, info); 2219 - } 2220 - return nvme_update_ns_info_block(ns, info); 2221 - case NVME_CSI_NVM: 2222 - return nvme_update_ns_info_block(ns, info); 2223 - default: 2224 - dev_info(ns->ctrl->device, 2225 - "block device for nsid %u not supported (csi %u)\n", 2226 - info->nsid, info->ids.csi); 2227 - return nvme_update_ns_info_generic(ns, info); 2186 + blk_mq_freeze_queue(ns->head->disk->queue); 2187 + if (unsupported) 2188 + ns->head->disk->flags |= GENHD_FL_HIDDEN; 2189 + else 2190 + nvme_init_integrity(ns->head->disk, ns->head); 2191 + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); 2192 + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); 2193 + nvme_mpath_revalidate_paths(ns); 2194 + 2195 + lim = queue_limits_start_update(ns->head->disk->queue); 2196 + queue_limits_stack_bdev(&lim, ns->disk->part0, 0, 2197 + 
ns->head->disk->disk_name); 2198 + ret = queue_limits_commit_update(ns->head->disk->queue, &lim); 2199 + blk_mq_unfreeze_queue(ns->head->disk->queue); 2228 2200 } 2201 + 2202 + return ret; 2229 2203 } 2230 2204 2231 2205 #ifdef CONFIG_BLK_SED_OPAL ··· 2897 2877 subsys->awupf = le16_to_cpu(id->awupf); 2898 2878 nvme_mpath_default_iopolicy(subsys); 2899 2879 2900 - subsys->dev.class = nvme_subsys_class; 2880 + subsys->dev.class = &nvme_subsys_class; 2901 2881 subsys->dev.release = nvme_release_subsystem; 2902 2882 subsys->dev.groups = nvme_subsys_attrs_groups; 2903 2883 dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); ··· 3137 3117 return -EINVAL; 3138 3118 } 3139 3119 3120 + if (!ctrl->maxcmd) { 3121 + dev_err(ctrl->device, "Maximum outstanding commands is 0\n"); 3122 + return -EINVAL; 3123 + } 3124 + 3140 3125 return 0; 3141 3126 } 3142 3127 3143 3128 static int nvme_init_identify(struct nvme_ctrl *ctrl) 3144 3129 { 3130 + struct queue_limits lim; 3145 3131 struct nvme_id_ctrl *id; 3146 3132 u32 max_hw_sectors; 3147 3133 bool prev_apst_enabled; ··· 3214 3188 ctrl->max_hw_sectors = 3215 3189 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); 3216 3190 3217 - nvme_set_queue_limits(ctrl, ctrl->admin_q); 3191 + lim = queue_limits_start_update(ctrl->admin_q); 3192 + nvme_set_ctrl_limits(ctrl, &lim); 3193 + ret = queue_limits_commit_update(ctrl->admin_q, &lim); 3194 + if (ret) 3195 + goto out_free; 3196 + 3218 3197 ctrl->sgls = le32_to_cpu(id->sgls); 3219 3198 ctrl->kas = le16_to_cpu(id->kas); 3220 3199 ctrl->max_namespaces = le32_to_cpu(id->mnan); ··· 3451 3420 if (minor < 0) 3452 3421 return minor; 3453 3422 cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); 3454 - cdev_device->class = nvme_ns_chr_class; 3423 + cdev_device->class = &nvme_ns_chr_class; 3455 3424 cdev_device->release = nvme_cdev_rel; 3456 3425 device_initialize(cdev_device); 3457 3426 cdev_init(cdev, fops); ··· 3723 3692 if (!ns) 3724 3693 return; 3725 3694 3726 - disk = 
blk_mq_alloc_disk(ctrl->tagset, ns); 3695 + disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); 3727 3696 if (IS_ERR(disk)) 3728 3697 goto out_free_ns; 3729 3698 disk->fops = &nvme_bdev_ops; ··· 4384 4353 int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, 4385 4354 const struct blk_mq_ops *ops, unsigned int cmd_size) 4386 4355 { 4356 + struct queue_limits lim = {}; 4387 4357 int ret; 4388 4358 4389 4359 memset(set, 0, sizeof(*set)); ··· 4404 4372 if (ret) 4405 4373 return ret; 4406 4374 4407 - ctrl->admin_q = blk_mq_init_queue(set); 4375 + ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL); 4408 4376 if (IS_ERR(ctrl->admin_q)) { 4409 4377 ret = PTR_ERR(ctrl->admin_q); 4410 4378 goto out_free_tagset; 4411 4379 } 4412 4380 4413 4381 if (ctrl->ops->flags & NVME_F_FABRICS) { 4414 - ctrl->fabrics_q = blk_mq_init_queue(set); 4382 + ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL); 4415 4383 if (IS_ERR(ctrl->fabrics_q)) { 4416 4384 ret = PTR_ERR(ctrl->fabrics_q); 4417 4385 goto out_cleanup_admin_q; ··· 4475 4443 return ret; 4476 4444 4477 4445 if (ctrl->ops->flags & NVME_F_FABRICS) { 4478 - ctrl->connect_q = blk_mq_init_queue(set); 4446 + ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); 4479 4447 if (IS_ERR(ctrl->connect_q)) { 4480 4448 ret = PTR_ERR(ctrl->connect_q); 4481 4449 goto out_free_tag_set; ··· 4645 4613 ctrl->device = &ctrl->ctrl_device; 4646 4614 ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), 4647 4615 ctrl->instance); 4648 - ctrl->device->class = nvme_class; 4616 + ctrl->device->class = &nvme_class; 4649 4617 ctrl->device->parent = ctrl->dev; 4650 4618 if (ops->dev_attr_groups) 4651 4619 ctrl->device->groups = ops->dev_attr_groups; ··· 4878 4846 if (result < 0) 4879 4847 goto destroy_delete_wq; 4880 4848 4881 - nvme_class = class_create("nvme"); 4882 - if (IS_ERR(nvme_class)) { 4883 - result = PTR_ERR(nvme_class); 4849 + result = class_register(&nvme_class); 4850 + if (result) 4884 4851 goto 
unregister_chrdev; 4885 - } 4886 - nvme_class->dev_uevent = nvme_class_uevent; 4887 4852 4888 - nvme_subsys_class = class_create("nvme-subsystem"); 4889 - if (IS_ERR(nvme_subsys_class)) { 4890 - result = PTR_ERR(nvme_subsys_class); 4853 + result = class_register(&nvme_subsys_class); 4854 + if (result) 4891 4855 goto destroy_class; 4892 - } 4893 4856 4894 4857 result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, 4895 4858 "nvme-generic"); 4896 4859 if (result < 0) 4897 4860 goto destroy_subsys_class; 4898 4861 4899 - nvme_ns_chr_class = class_create("nvme-generic"); 4900 - if (IS_ERR(nvme_ns_chr_class)) { 4901 - result = PTR_ERR(nvme_ns_chr_class); 4862 + result = class_register(&nvme_ns_chr_class); 4863 + if (result) 4902 4864 goto unregister_generic_ns; 4903 - } 4865 + 4904 4866 result = nvme_init_auth(); 4905 4867 if (result) 4906 4868 goto destroy_ns_chr; 4907 4869 return 0; 4908 4870 4909 4871 destroy_ns_chr: 4910 - class_destroy(nvme_ns_chr_class); 4872 + class_unregister(&nvme_ns_chr_class); 4911 4873 unregister_generic_ns: 4912 4874 unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); 4913 4875 destroy_subsys_class: 4914 - class_destroy(nvme_subsys_class); 4876 + class_unregister(&nvme_subsys_class); 4915 4877 destroy_class: 4916 - class_destroy(nvme_class); 4878 + class_unregister(&nvme_class); 4917 4879 unregister_chrdev: 4918 4880 unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); 4919 4881 destroy_delete_wq: ··· 4923 4897 static void __exit nvme_core_exit(void) 4924 4898 { 4925 4899 nvme_exit_auth(); 4926 - class_destroy(nvme_ns_chr_class); 4927 - class_destroy(nvme_subsys_class); 4928 - class_destroy(nvme_class); 4900 + class_unregister(&nvme_ns_chr_class); 4901 + class_unregister(&nvme_subsys_class); 4902 + class_unregister(&nvme_class); 4929 4903 unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); 4930 4904 unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); 4931 4905 destroy_workqueue(nvme_delete_wq);
+12 -10
drivers/nvme/host/fabrics.c
··· 638 638 } 639 639 640 640 key = key_lookup(key_id); 641 - if (!IS_ERR(key)) 641 + if (IS_ERR(key)) 642 642 pr_err("key id %08x not found\n", key_id); 643 643 else 644 644 pr_debug("Using key id %08x\n", key_id); ··· 1319 1319 return ERR_PTR(ret); 1320 1320 } 1321 1321 1322 - static struct class *nvmf_class; 1322 + static const struct class nvmf_class = { 1323 + .name = "nvme-fabrics", 1324 + }; 1325 + 1323 1326 static struct device *nvmf_device; 1324 1327 static DEFINE_MUTEX(nvmf_dev_mutex); 1325 1328 ··· 1442 1439 if (!nvmf_default_host) 1443 1440 return -ENOMEM; 1444 1441 1445 - nvmf_class = class_create("nvme-fabrics"); 1446 - if (IS_ERR(nvmf_class)) { 1442 + ret = class_register(&nvmf_class); 1443 + if (ret) { 1447 1444 pr_err("couldn't register class nvme-fabrics\n"); 1448 - ret = PTR_ERR(nvmf_class); 1449 1445 goto out_free_host; 1450 1446 } 1451 1447 1452 1448 nvmf_device = 1453 - device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); 1449 + device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); 1454 1450 if (IS_ERR(nvmf_device)) { 1455 1451 pr_err("couldn't create nvme-fabrics device!\n"); 1456 1452 ret = PTR_ERR(nvmf_device); ··· 1465 1463 return 0; 1466 1464 1467 1465 out_destroy_device: 1468 - device_destroy(nvmf_class, MKDEV(0, 0)); 1466 + device_destroy(&nvmf_class, MKDEV(0, 0)); 1469 1467 out_destroy_class: 1470 - class_destroy(nvmf_class); 1468 + class_unregister(&nvmf_class); 1471 1469 out_free_host: 1472 1470 nvmf_host_put(nvmf_default_host); 1473 1471 return ret; ··· 1476 1474 static void __exit nvmf_exit(void) 1477 1475 { 1478 1476 misc_deregister(&nvmf_misc); 1479 - device_destroy(nvmf_class, MKDEV(0, 0)); 1480 - class_destroy(nvmf_class); 1477 + device_destroy(&nvmf_class, MKDEV(0, 0)); 1478 + class_unregister(&nvmf_class); 1481 1479 nvmf_host_put(nvmf_default_host); 1482 1480 1483 1481 BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
+9 -8
drivers/nvme/host/multipath.c
··· 516 516 517 517 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) 518 518 { 519 + struct queue_limits lim; 519 520 bool vwc = false; 520 521 521 522 mutex_init(&head->lock); ··· 533 532 !nvme_is_unique_nsid(ctrl, head) || !multipath) 534 533 return 0; 535 534 536 - head->disk = blk_alloc_disk(ctrl->numa_node); 537 - if (!head->disk) 538 - return -ENOMEM; 535 + blk_set_stacking_limits(&lim); 536 + lim.dma_alignment = 3; 537 + if (head->ids.csi != NVME_CSI_ZNS) 538 + lim.max_zone_append_sectors = 0; 539 + 540 + head->disk = blk_alloc_disk(&lim, ctrl->numa_node); 541 + if (IS_ERR(head->disk)) 542 + return PTR_ERR(head->disk); 539 543 head->disk->fops = &nvme_ns_head_ops; 540 544 head->disk->private_data = head; 541 545 sprintf(head->disk->disk_name, "nvme%dn%d", ··· 558 552 if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && 559 553 ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) 560 554 blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); 561 - 562 - /* set to a default value of 512 until the disk is validated */ 563 - blk_queue_logical_block_size(head->disk->queue, 512); 564 - blk_set_stacking_limits(&head->disk->queue->limits); 565 - blk_queue_dma_alignment(head->disk->queue, 3); 566 555 567 556 /* we need to propagate up the VMC settings */ 568 557 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+3 -9
drivers/nvme/host/nvme.h
··· 464 464 u16 ms; 465 465 u16 pi_size; 466 466 u8 pi_type; 467 + u8 pi_offset; 467 468 u8 guard_type; 468 469 u16 sgs; 469 470 u32 sws; ··· 1036 1035 } 1037 1036 #endif /* CONFIG_NVME_MULTIPATH */ 1038 1037 1039 - int nvme_revalidate_zones(struct nvme_ns *ns); 1040 1038 int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, 1041 1039 unsigned int nr_zones, report_zones_cb cb, void *data); 1040 + int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, 1041 + struct queue_limits *lim); 1042 1042 #ifdef CONFIG_BLK_DEV_ZONED 1043 - int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); 1044 1043 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, 1045 1044 struct nvme_command *cmnd, 1046 1045 enum nvme_zone_mgmt_action action); ··· 1050 1049 enum nvme_zone_mgmt_action action) 1051 1050 { 1052 1051 return BLK_STS_NOTSUPP; 1053 - } 1054 - 1055 - static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) 1056 - { 1057 - dev_warn(ns->ctrl->device, 1058 - "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); 1059 - return -EPROTONOSUPPORT; 1060 1052 } 1061 1053 #endif 1062 1054
+10 -4
drivers/nvme/host/rdma.c
··· 1006 1006 { 1007 1007 int ret; 1008 1008 bool changed; 1009 + u16 max_queue_size; 1009 1010 1010 1011 ret = nvme_rdma_configure_admin_queue(ctrl, new); 1011 1012 if (ret) ··· 1031 1030 ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); 1032 1031 } 1033 1032 1034 - if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { 1033 + if (ctrl->ctrl.max_integrity_segments) 1034 + max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE; 1035 + else 1036 + max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; 1037 + 1038 + if (ctrl->ctrl.sqsize + 1 > max_queue_size) { 1035 1039 dev_warn(ctrl->ctrl.device, 1036 - "ctrl sqsize %u > max queue size %u, clamping down\n", 1037 - ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); 1038 - ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; 1040 + "ctrl sqsize %u > max queue size %u, clamping down\n", 1041 + ctrl->ctrl.sqsize + 1, max_queue_size); 1042 + ctrl->ctrl.sqsize = max_queue_size - 1; 1039 1043 } 1040 1044 1041 1045 if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+2 -5
drivers/nvme/host/sysfs.c
··· 221 221 222 222 ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id); 223 223 if (ret) 224 - goto out_free_id; 224 + return ret; 225 225 226 226 ns->head->nuse = le64_to_cpu(id->nuse); 227 - 228 - out_free_id: 229 227 kfree(id); 230 - 231 - return ret; 228 + return 0; 232 229 } 233 230 234 231 static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
+8 -16
drivers/nvme/host/zns.c
··· 7 7 #include <linux/vmalloc.h> 8 8 #include "nvme.h" 9 9 10 - int nvme_revalidate_zones(struct nvme_ns *ns) 11 - { 12 - struct request_queue *q = ns->queue; 13 - 14 - blk_queue_chunk_sectors(q, ns->head->zsze); 15 - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); 16 - 17 - return blk_revalidate_disk_zones(ns->disk, NULL); 18 - } 19 - 20 10 static int nvme_set_max_append(struct nvme_ctrl *ctrl) 21 11 { 22 12 struct nvme_command c = { }; ··· 35 45 return 0; 36 46 } 37 47 38 - int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) 48 + int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, 49 + struct queue_limits *lim) 39 50 { 40 51 struct nvme_effects_log *log = ns->head->effects; 41 - struct request_queue *q = ns->queue; 42 52 struct nvme_command c = { }; 43 53 struct nvme_id_ns_zns *id; 44 54 int status; ··· 99 109 goto free_data; 100 110 } 101 111 102 - disk_set_zoned(ns->disk); 103 - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); 104 - disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1); 105 - disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1); 112 + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); 113 + lim->zoned = 1; 114 + lim->max_open_zones = le32_to_cpu(id->mor) + 1; 115 + lim->max_active_zones = le32_to_cpu(id->mar) + 1; 116 + lim->chunk_sectors = ns->head->zsze; 117 + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; 106 118 free_data: 107 119 kfree(id); 108 120 return status;
+1 -1
drivers/nvme/target/admin-cmd.c
··· 428 428 id->cqes = (0x4 << 4) | 0x4; 429 429 430 430 /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ 431 - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); 431 + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); 432 432 433 433 id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); 434 434 id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
+28
drivers/nvme/target/configfs.c
··· 273 273 274 274 CONFIGFS_ATTR(nvmet_, param_inline_data_size); 275 275 276 + static ssize_t nvmet_param_max_queue_size_show(struct config_item *item, 277 + char *page) 278 + { 279 + struct nvmet_port *port = to_nvmet_port(item); 280 + 281 + return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size); 282 + } 283 + 284 + static ssize_t nvmet_param_max_queue_size_store(struct config_item *item, 285 + const char *page, size_t count) 286 + { 287 + struct nvmet_port *port = to_nvmet_port(item); 288 + int ret; 289 + 290 + if (nvmet_is_port_enabled(port, __func__)) 291 + return -EACCES; 292 + ret = kstrtoint(page, 0, &port->max_queue_size); 293 + if (ret) { 294 + pr_err("Invalid value '%s' for max_queue_size\n", page); 295 + return -EINVAL; 296 + } 297 + return count; 298 + } 299 + 300 + CONFIGFS_ATTR(nvmet_, param_max_queue_size); 301 + 276 302 #ifdef CONFIG_BLK_DEV_INTEGRITY 277 303 static ssize_t nvmet_param_pi_enable_show(struct config_item *item, 278 304 char *page) ··· 1885 1859 &nvmet_attr_addr_trtype, 1886 1860 &nvmet_attr_addr_tsas, 1887 1861 &nvmet_attr_param_inline_data_size, 1862 + &nvmet_attr_param_max_queue_size, 1888 1863 #ifdef CONFIG_BLK_DEV_INTEGRITY 1889 1864 &nvmet_attr_param_pi_enable, 1890 1865 #endif ··· 1944 1917 INIT_LIST_HEAD(&port->subsystems); 1945 1918 INIT_LIST_HEAD(&port->referrals); 1946 1919 port->inline_data_size = -1; /* < 0 == let the transport choose */ 1920 + port->max_queue_size = -1; /* < 0 == let the transport choose */ 1947 1921 1948 1922 port->disc_addr.portid = cpu_to_le16(portid); 1949 1923 port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX;
+16 -2
drivers/nvme/target/core.c
··· 358 358 if (port->inline_data_size < 0) 359 359 port->inline_data_size = 0; 360 360 361 + /* 362 + * If the transport didn't set the max_queue_size properly, then clamp 363 + * it to the target limits. Also set default values in case the 364 + * transport didn't set it at all. 365 + */ 366 + if (port->max_queue_size < 0) 367 + port->max_queue_size = NVMET_MAX_QUEUE_SIZE; 368 + else 369 + port->max_queue_size = clamp_t(int, port->max_queue_size, 370 + NVMET_MIN_QUEUE_SIZE, 371 + NVMET_MAX_QUEUE_SIZE); 372 + 361 373 port->enabled = true; 362 374 port->tr_ops = ops; 363 375 return 0; ··· 1235 1223 ctrl->cap |= (15ULL << 24); 1236 1224 /* maximum queue entries supported: */ 1237 1225 if (ctrl->ops->get_max_queue_size) 1238 - ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1; 1226 + ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl), 1227 + ctrl->port->max_queue_size) - 1; 1239 1228 else 1240 - ctrl->cap |= NVMET_QUEUE_SIZE - 1; 1229 + ctrl->cap |= ctrl->port->max_queue_size - 1; 1241 1230 1242 1231 if (nvmet_is_passthru_subsys(ctrl->subsys)) 1243 1232 nvmet_passthrough_override_cap(ctrl); ··· 1424 1411 1425 1412 kref_init(&ctrl->ref); 1426 1413 ctrl->subsys = subsys; 1414 + ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; 1427 1415 nvmet_init_cap(ctrl); 1428 1416 WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); 1429 1417
+1 -1
drivers/nvme/target/discovery.c
··· 282 282 id->lpa = (1 << 2); 283 283 284 284 /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ 285 - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); 285 + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); 286 286 287 287 id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ 288 288 if (ctrl->ops->flags & NVMF_KEYED_SGLS)
+2 -3
drivers/nvme/target/fabrics-cmd.c
··· 157 157 return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; 158 158 } 159 159 160 - if (sqsize > mqes) { 160 + /* for fabrics, this value applies to only the I/O Submission Queues */ 161 + if (qid && sqsize > mqes) { 161 162 pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", 162 163 sqsize, mqes, ctrl->cntlid); 163 164 req->error_loc = offsetof(struct nvmf_connect_command, sqsize); ··· 251 250 le32_to_cpu(c->kato), &ctrl); 252 251 if (status) 253 252 goto out; 254 - 255 - ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; 256 253 257 254 uuid_copy(&ctrl->hostid, &d->hostid); 258 255
+9 -8
drivers/nvme/target/fcloop.c
··· 1556 1556 NULL, 1557 1557 }; 1558 1558 1559 - static struct class *fcloop_class; 1559 + static const struct class fcloop_class = { 1560 + .name = "fcloop", 1561 + }; 1560 1562 static struct device *fcloop_device; 1561 1563 1562 1564 ··· 1566 1564 { 1567 1565 int ret; 1568 1566 1569 - fcloop_class = class_create("fcloop"); 1570 - if (IS_ERR(fcloop_class)) { 1567 + ret = class_register(&fcloop_class); 1568 + if (ret) { 1571 1569 pr_err("couldn't register class fcloop\n"); 1572 - ret = PTR_ERR(fcloop_class); 1573 1570 return ret; 1574 1571 } 1575 1572 1576 1573 fcloop_device = device_create_with_groups( 1577 - fcloop_class, NULL, MKDEV(0, 0), NULL, 1574 + &fcloop_class, NULL, MKDEV(0, 0), NULL, 1578 1575 fcloop_dev_attr_groups, "ctl"); 1579 1576 if (IS_ERR(fcloop_device)) { 1580 1577 pr_err("couldn't create ctl device!\n"); ··· 1586 1585 return 0; 1587 1586 1588 1587 out_destroy_class: 1589 - class_destroy(fcloop_class); 1588 + class_unregister(&fcloop_class); 1590 1589 return ret; 1591 1590 } 1592 1591 ··· 1644 1643 1645 1644 put_device(fcloop_device); 1646 1645 1647 - device_destroy(fcloop_class, MKDEV(0, 0)); 1648 - class_destroy(fcloop_class); 1646 + device_destroy(&fcloop_class, MKDEV(0, 0)); 1647 + class_unregister(&fcloop_class); 1649 1648 } 1650 1649 1651 1650 module_init(fcloop_init);
+4 -2
drivers/nvme/target/nvmet.h
··· 163 163 void *priv; 164 164 bool enabled; 165 165 int inline_data_size; 166 + int max_queue_size; 166 167 const struct nvmet_fabrics_ops *tr_ops; 167 168 bool pi_enable; 168 169 }; ··· 544 543 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, 545 544 u8 event_info, u8 log_page); 546 545 547 - #define NVMET_QUEUE_SIZE 1024 546 + #define NVMET_MIN_QUEUE_SIZE 16 547 + #define NVMET_MAX_QUEUE_SIZE 1024 548 548 #define NVMET_NR_QUEUES 128 549 - #define NVMET_MAX_CMD NVMET_QUEUE_SIZE 549 + #define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1) 550 550 551 551 /* 552 552 * Nice round number that makes a list of nsids fit into a page.
+1 -1
drivers/nvme/target/passthru.c
··· 132 132 133 133 id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); 134 134 id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); 135 - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); 135 + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); 136 136 137 137 /* don't support fuse commands */ 138 138 id->fuses = 0;
+10
drivers/nvme/target/rdma.c
··· 1956 1956 nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; 1957 1957 } 1958 1958 1959 + if (nport->max_queue_size < 0) { 1960 + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; 1961 + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { 1962 + pr_warn("max_queue_size %u is too large, reducing to %u\n", 1963 + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); 1964 + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; 1965 + } 1966 + 1959 1967 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, 1960 1968 nport->disc_addr.trsvcid, &port->addr); 1961 1969 if (ret) { ··· 2023 2015 2024 2016 static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) 2025 2017 { 2018 + if (ctrl->pi_support) 2019 + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; 2026 2020 return NVME_RDMA_MAX_QUEUE_SIZE; 2027 2021 } 2028 2022
+2 -3
drivers/nvme/target/zns.c
··· 456 456 switch (zsa_req_op(req->cmd->zms.zsa)) { 457 457 case REQ_OP_ZONE_RESET: 458 458 ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, 459 - get_capacity(req->ns->bdev->bd_disk), 460 - GFP_KERNEL); 459 + get_capacity(req->ns->bdev->bd_disk)); 461 460 if (ret < 0) 462 461 return blkdev_zone_mgmt_errno_to_nvme_status(ret); 463 462 break; ··· 507 508 goto out; 508 509 } 509 510 510 - ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL); 511 + ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors); 511 512 if (ret < 0) 512 513 status = blkdev_zone_mgmt_errno_to_nvme_status(ret); 513 514
+86 -96
drivers/s390/block/dasd.c
··· 8 8 * Copyright IBM Corp. 1999, 2009 9 9 */ 10 10 11 - #define KMSG_COMPONENT "dasd" 12 - #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 13 - 14 11 #include <linux/kmod.h> 15 12 #include <linux/init.h> 16 13 #include <linux/interrupt.h> ··· 26 29 #include <asm/idals.h> 27 30 #include <asm/itcw.h> 28 31 #include <asm/diag.h> 29 - 30 - /* This is ugly... */ 31 - #define PRINTK_HEADER "dasd:" 32 32 33 33 #include "dasd_int.h" 34 34 /* ··· 307 313 */ 308 314 static int dasd_state_basic_to_ready(struct dasd_device *device) 309 315 { 310 - int rc; 311 - struct dasd_block *block; 312 - struct gendisk *disk; 316 + struct dasd_block *block = device->block; 317 + struct queue_limits lim; 318 + int rc = 0; 313 319 314 - rc = 0; 315 - block = device->block; 316 320 /* make disk known with correct capacity */ 317 - if (block) { 318 - if (block->base->discipline->do_analysis != NULL) 319 - rc = block->base->discipline->do_analysis(block); 320 - if (rc) { 321 - if (rc != -EAGAIN) { 322 - device->state = DASD_STATE_UNFMT; 323 - disk = device->block->gdp; 324 - kobject_uevent(&disk_to_dev(disk)->kobj, 325 - KOBJ_CHANGE); 326 - goto out; 327 - } 328 - return rc; 329 - } 330 - if (device->discipline->setup_blk_queue) 331 - device->discipline->setup_blk_queue(block); 332 - set_capacity(block->gdp, 333 - block->blocks << block->s2b_shift); 321 + if (!block) { 334 322 device->state = DASD_STATE_READY; 335 - rc = dasd_scan_partitions(block); 336 - if (rc) { 337 - device->state = DASD_STATE_BASIC; 338 - return rc; 339 - } 340 - } else { 341 - device->state = DASD_STATE_READY; 323 + goto out; 342 324 } 325 + 326 + if (block->base->discipline->do_analysis != NULL) 327 + rc = block->base->discipline->do_analysis(block); 328 + if (rc) { 329 + if (rc == -EAGAIN) 330 + return rc; 331 + device->state = DASD_STATE_UNFMT; 332 + kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, 333 + KOBJ_CHANGE); 334 + goto out; 335 + } 336 + 337 + lim = queue_limits_start_update(block->gdp->queue); 338 + 
lim.max_dev_sectors = device->discipline->max_sectors(block); 339 + lim.max_hw_sectors = lim.max_dev_sectors; 340 + lim.logical_block_size = block->bp_block; 341 + 342 + if (device->discipline->has_discard) { 343 + unsigned int max_bytes; 344 + 345 + lim.discard_granularity = block->bp_block; 346 + 347 + /* Calculate max_discard_sectors and make it PAGE aligned */ 348 + max_bytes = USHRT_MAX * block->bp_block; 349 + max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); 350 + 351 + lim.max_hw_discard_sectors = max_bytes / block->bp_block; 352 + lim.max_write_zeroes_sectors = lim.max_hw_discard_sectors; 353 + } 354 + rc = queue_limits_commit_update(block->gdp->queue, &lim); 355 + if (rc) 356 + return rc; 357 + 358 + set_capacity(block->gdp, block->blocks << block->s2b_shift); 359 + device->state = DASD_STATE_READY; 360 + 361 + rc = dasd_scan_partitions(block); 362 + if (rc) { 363 + device->state = DASD_STATE_BASIC; 364 + return rc; 365 + } 366 + 343 367 out: 344 368 if (device->discipline->basic_to_ready) 345 369 rc = device->discipline->basic_to_ready(device); ··· 1313 1301 { 1314 1302 struct dasd_device *device; 1315 1303 int retries, rc; 1316 - char errorstring[ERRORLENGTH]; 1317 1304 1318 1305 /* Check the cqr */ 1319 1306 rc = dasd_check_cqr(cqr); ··· 1351 1340 rc = 0; 1352 1341 break; 1353 1342 default: 1354 - /* internal error 10 - unknown rc*/ 1355 - snprintf(errorstring, ERRORLENGTH, "10 %d", rc); 1356 - dev_err(&device->cdev->dev, "An error occurred in the " 1357 - "DASD device driver, reason=%s\n", errorstring); 1343 + dev_err(&device->cdev->dev, 1344 + "Unexpected error during request termination %d\n", rc); 1358 1345 BUG(); 1359 1346 break; 1360 1347 } ··· 1371 1362 { 1372 1363 struct dasd_device *device; 1373 1364 int rc; 1374 - char errorstring[ERRORLENGTH]; 1375 1365 1376 1366 /* Check the cqr */ 1377 1367 rc = dasd_check_cqr(cqr); ··· 1390 1382 return -EPERM; 1391 1383 } 1392 1384 if (cqr->retries < 0) { 1393 - /* internal error 14 - start_IO run out of 
retries */ 1394 - sprintf(errorstring, "14 %p", cqr); 1395 - dev_err(&device->cdev->dev, "An error occurred in the DASD " 1396 - "device driver, reason=%s\n", errorstring); 1385 + dev_err(&device->cdev->dev, 1386 + "Start I/O ran out of retries\n"); 1397 1387 cqr->status = DASD_CQR_ERROR; 1398 1388 return -EIO; 1399 1389 } ··· 1469 1463 "not accessible"); 1470 1464 break; 1471 1465 default: 1472 - /* internal error 11 - unknown rc */ 1473 - snprintf(errorstring, ERRORLENGTH, "11 %d", rc); 1474 1466 dev_err(&device->cdev->dev, 1475 - "An error occurred in the DASD device driver, " 1476 - "reason=%s\n", errorstring); 1467 + "Unexpected error during request start %d", rc); 1477 1468 BUG(); 1478 1469 break; 1479 1470 } ··· 1907 1904 static void __dasd_process_cqr(struct dasd_device *device, 1908 1905 struct dasd_ccw_req *cqr) 1909 1906 { 1910 - char errorstring[ERRORLENGTH]; 1911 - 1912 1907 switch (cqr->status) { 1913 1908 case DASD_CQR_SUCCESS: 1914 1909 cqr->status = DASD_CQR_DONE; ··· 1918 1917 cqr->status = DASD_CQR_TERMINATED; 1919 1918 break; 1920 1919 default: 1921 - /* internal error 12 - wrong cqr status*/ 1922 - snprintf(errorstring, ERRORLENGTH, "12 %p %x02", cqr, cqr->status); 1923 1920 dev_err(&device->cdev->dev, 1924 - "An error occurred in the DASD device driver, " 1925 - "reason=%s\n", errorstring); 1921 + "Unexpected CQR status %02x", cqr->status); 1926 1922 BUG(); 1927 1923 } 1928 1924 if (cqr->callback) ··· 1984 1986 if (device->discipline->term_IO(cqr) != 0) { 1985 1987 /* Hmpf, try again in 5 sec */ 1986 1988 dev_err(&device->cdev->dev, 1987 - "cqr %p timed out (%lus) but cannot be " 1988 - "ended, retrying in 5 s\n", 1989 - cqr, (cqr->expires/HZ)); 1989 + "CQR timed out (%lus) but cannot be ended, retrying in 5s\n", 1990 + (cqr->expires / HZ)); 1990 1991 cqr->expires += 5*HZ; 1991 1992 dasd_device_set_timer(device, 5*HZ); 1992 1993 } else { 1993 1994 dev_err(&device->cdev->dev, 1994 - "cqr %p timed out (%lus), %i retries " 1995 - "remaining\n", 
cqr, (cqr->expires/HZ), 1996 - cqr->retries); 1995 + "CQR timed out (%lus), %i retries remaining\n", 1996 + (cqr->expires / HZ), cqr->retries); 1997 1997 } 1998 1998 __dasd_device_check_autoquiesce_timeout(device, cqr); 1999 1999 } ··· 2112 2116 if (rc) { 2113 2117 /* unable to terminate requeust */ 2114 2118 dev_err(&device->cdev->dev, 2115 - "Flushing the DASD request queue " 2116 - "failed for request %p\n", cqr); 2119 + "Flushing the DASD request queue failed\n"); 2117 2120 /* stop flush processing */ 2118 2121 goto finished; 2119 2122 } ··· 2628 2633 rc = device->discipline->term_IO(cqr); 2629 2634 if (rc) { 2630 2635 dev_err(&device->cdev->dev, 2631 - "Cancelling request %p failed with rc=%d\n", 2632 - cqr, rc); 2636 + "Cancelling request failed with rc=%d\n", rc); 2633 2637 } else { 2634 2638 cqr->stopclk = get_tod_clock(); 2635 2639 } ··· 3396 3402 3397 3403 ret = ccw_device_set_online(cdev); 3398 3404 if (ret) 3399 - pr_warn("%s: Setting the DASD online failed with rc=%d\n", 3400 - dev_name(&cdev->dev), ret); 3405 + dev_warn(&cdev->dev, "Setting the DASD online failed with rc=%d\n", ret); 3401 3406 } 3402 3407 3403 3408 /* ··· 3483 3490 { 3484 3491 struct dasd_discipline *discipline; 3485 3492 struct dasd_device *device; 3493 + struct device *dev; 3486 3494 int rc; 3495 + 3496 + dev = &cdev->dev; 3487 3497 3488 3498 /* first online clears initial online feature flag */ 3489 3499 dasd_set_feature(cdev, DASD_FEATURE_INITIAL_ONLINE, 0); ··· 3500 3504 /* Try to load the required module. 
*/ 3501 3505 rc = request_module(DASD_DIAG_MOD); 3502 3506 if (rc) { 3503 - pr_warn("%s Setting the DASD online failed " 3504 - "because the required module %s " 3505 - "could not be loaded (rc=%d)\n", 3506 - dev_name(&cdev->dev), DASD_DIAG_MOD, 3507 - rc); 3507 + dev_warn(dev, "Setting the DASD online failed " 3508 + "because the required module %s " 3509 + "could not be loaded (rc=%d)\n", 3510 + DASD_DIAG_MOD, rc); 3508 3511 dasd_delete_device(device); 3509 3512 return -ENODEV; 3510 3513 } ··· 3511 3516 /* Module init could have failed, so check again here after 3512 3517 * request_module(). */ 3513 3518 if (!dasd_diag_discipline_pointer) { 3514 - pr_warn("%s Setting the DASD online failed because of missing DIAG discipline\n", 3515 - dev_name(&cdev->dev)); 3519 + dev_warn(dev, "Setting the DASD online failed because of missing DIAG discipline\n"); 3516 3520 dasd_delete_device(device); 3517 3521 return -ENODEV; 3518 3522 } ··· 3521 3527 dasd_delete_device(device); 3522 3528 return -EINVAL; 3523 3529 } 3530 + device->base_discipline = base_discipline; 3524 3531 if (!try_module_get(discipline->owner)) { 3525 - module_put(base_discipline->owner); 3526 3532 dasd_delete_device(device); 3527 3533 return -EINVAL; 3528 3534 } 3529 - device->base_discipline = base_discipline; 3530 3535 device->discipline = discipline; 3531 3536 3532 3537 /* check_device will allocate block device if necessary */ 3533 3538 rc = discipline->check_device(device); 3534 3539 if (rc) { 3535 - pr_warn("%s Setting the DASD online with discipline %s failed with rc=%i\n", 3536 - dev_name(&cdev->dev), discipline->name, rc); 3537 - module_put(discipline->owner); 3538 - module_put(base_discipline->owner); 3540 + dev_warn(dev, "Setting the DASD online with discipline %s failed with rc=%i\n", 3541 + discipline->name, rc); 3539 3542 dasd_delete_device(device); 3540 3543 return rc; 3541 3544 } 3542 3545 3543 3546 dasd_set_target_state(device, DASD_STATE_ONLINE); 3544 3547 if (device->state <= 
DASD_STATE_KNOWN) { 3545 - pr_warn("%s Setting the DASD online failed because of a missing discipline\n", 3546 - dev_name(&cdev->dev)); 3548 + dev_warn(dev, "Setting the DASD online failed because of a missing discipline\n"); 3547 3549 rc = -ENODEV; 3548 3550 dasd_set_target_state(device, DASD_STATE_NEW); 3549 3551 if (device->block) 3550 3552 dasd_free_block(device->block); 3551 3553 dasd_delete_device(device); 3552 - } else 3553 - pr_debug("dasd_generic device %s found\n", 3554 - dev_name(&cdev->dev)); 3554 + } else { 3555 + dev_dbg(dev, "dasd_generic device found\n"); 3556 + } 3555 3557 3556 3558 wait_event(dasd_init_waitq, _wait_for_device(device)); 3557 3559 ··· 3558 3568 3559 3569 int dasd_generic_set_offline(struct ccw_device *cdev) 3560 3570 { 3571 + int max_count, open_count, rc; 3561 3572 struct dasd_device *device; 3562 3573 struct dasd_block *block; 3563 - int max_count, open_count, rc; 3564 3574 unsigned long flags; 3575 + struct device *dev; 3576 + 3577 + dev = &cdev->dev; 3565 3578 3566 3579 rc = 0; 3567 3580 spin_lock_irqsave(get_ccwdev_lock(cdev), flags); ··· 3585 3592 open_count = atomic_read(&device->block->open_count); 3586 3593 if (open_count > max_count) { 3587 3594 if (open_count > 0) 3588 - pr_warn("%s: The DASD cannot be set offline with open count %i\n", 3589 - dev_name(&cdev->dev), open_count); 3595 + dev_warn(dev, "The DASD cannot be set offline with open count %i\n", 3596 + open_count); 3590 3597 else 3591 - pr_warn("%s: The DASD cannot be set offline while it is in use\n", 3592 - dev_name(&cdev->dev)); 3598 + dev_warn(dev, "The DASD cannot be set offline while it is in use\n"); 3593 3599 rc = -EBUSY; 3594 3600 goto out_err; 3595 3601 } ··· 3948 3956 if (dasd_eer_enabled(device)) 3949 3957 dasd_eer_write(device, NULL, DASD_EER_AUTOQUIESCE); 3950 3958 3951 - pr_info("%s: The DASD has been put in the quiesce state\n", 3952 - dev_name(&device->cdev->dev)); 3959 + dev_info(&device->cdev->dev, 3960 + "The DASD has been put in the quiesce 
state\n"); 3953 3961 dasd_device_set_stop_bits(device, DASD_STOPPED_QUIESCE); 3954 3962 3955 3963 if (device->features & DASD_FEATURE_REQUEUEQUIESCE) ··· 3969 3977 NULL); 3970 3978 3971 3979 if (IS_ERR(cqr)) { 3972 - /* internal error 13 - Allocating the RDC request failed*/ 3973 - dev_err(&device->cdev->dev, 3974 - "An error occurred in the DASD device driver, " 3975 - "reason=%s\n", "13"); 3980 + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", 3981 + "Could not allocate RDC request"); 3976 3982 return cqr; 3977 3983 } 3978 3984
+21 -59
drivers/s390/block/dasd_3990_erp.c
··· 7 7 * 8 8 */ 9 9 10 - #define KMSG_COMPONENT "dasd-eckd" 11 - 12 10 #include <linux/timer.h> 13 11 #include <asm/idals.h> 14 - 15 - #define PRINTK_HEADER "dasd_erp(3990): " 16 12 17 13 #include "dasd_int.h" 18 14 #include "dasd_eckd.h" ··· 394 398 struct dasd_device *device = erp->startdev; 395 399 char msg_format = (sense[7] & 0xF0); 396 400 char msg_no = (sense[7] & 0x0F); 397 - char errorstring[ERRORLENGTH]; 398 401 399 402 switch (msg_format) { 400 403 case 0x00: /* Format 0 - Program or System Checks */ ··· 999 1004 } 1000 1005 break; 1001 1006 1002 - default: /* unknown message format - should not happen 1003 - internal error 03 - unknown message format */ 1004 - snprintf(errorstring, ERRORLENGTH, "03 %x02", msg_format); 1007 + default: 1005 1008 dev_err(&device->cdev->dev, 1006 - "An error occurred in the DASD device driver, " 1007 - "reason=%s\n", errorstring); 1009 + "Unknown message format %02x", msg_format); 1008 1010 break; 1009 1011 } /* end switch message format */ 1010 1012 ··· 1048 1056 set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags); 1049 1057 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 1050 1058 } else { 1051 - /* fatal error - set status to FAILED 1052 - internal error 09 - Command Reject */ 1053 1059 if (!test_bit(DASD_CQR_SUPPRESS_CR, &erp->flags)) 1054 1060 dev_err(&device->cdev->dev, 1055 - "An error occurred in the DASD device driver, reason=09\n"); 1061 + "An I/O command request was rejected\n"); 1056 1062 1057 1063 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 1058 1064 } ··· 1118 1128 erp->function = dasd_3990_erp_equip_check; 1119 1129 1120 1130 if (sense[1] & SNS1_WRITE_INHIBITED) { 1121 - dev_info(&device->cdev->dev, 1122 - "Write inhibited path encountered\n"); 1123 - 1124 - /* vary path offline 1125 - internal error 04 - Path should be varied off-line.*/ 1126 - dev_err(&device->cdev->dev, "An error occurred in the DASD " 1127 - "device driver, reason=%s\n", "04"); 1131 + dev_err(&device->cdev->dev, "Write inhibited 
path encountered\n"); 1128 1132 1129 1133 erp = dasd_3990_erp_action_1(erp); 1130 1134 ··· 1269 1285 erp = dasd_3990_erp_action_4(erp, sense); 1270 1286 1271 1287 } else { 1272 - /* internal error 06 - The track format is not valid*/ 1273 - dev_err(&device->cdev->dev, 1274 - "An error occurred in the DASD device driver, " 1275 - "reason=%s\n", "06"); 1276 - 1288 + dev_err(&device->cdev->dev, "Track format is not valid\n"); 1277 1289 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 1278 1290 } 1279 1291 ··· 1643 1663 sizeof(struct LO_eckd_data), device); 1644 1664 1645 1665 if (IS_ERR(erp)) { 1646 - /* internal error 01 - Unable to allocate ERP */ 1647 - dev_err(&device->cdev->dev, "An error occurred in the DASD " 1648 - "device driver, reason=%s\n", "01"); 1666 + DBF_DEV_EVENT(DBF_ERR, device, "%s", 1667 + "Unable to allocate ERP request (1B 32)"); 1649 1668 return dasd_3990_erp_cleanup(default_erp, DASD_CQR_FAILED); 1650 1669 } 1651 1670 ··· 1786 1807 cpa = previous_erp->irb.scsw.cmd.cpa; 1787 1808 1788 1809 if (cpa == 0) { 1789 - /* internal error 02 - 1790 - Unable to determine address of the CCW to be restarted */ 1791 - dev_err(&device->cdev->dev, "An error occurred in the DASD " 1792 - "device driver, reason=%s\n", "02"); 1810 + dev_err(&device->cdev->dev, 1811 + "Unable to determine address of to be restarted CCW\n"); 1793 1812 1794 1813 previous_erp->status = DASD_CQR_FAILED; 1795 1814 ··· 1986 2009 { 1987 2010 1988 2011 if ((sense[25] & DASD_SENSE_BIT_1) && (sense[26] & DASD_SENSE_BIT_2)) { 1989 - 1990 - /* set to suspended duplex state then restart 1991 - internal error 05 - Set device to suspended duplex state 1992 - should be done */ 1993 2012 struct dasd_device *device = erp->startdev; 1994 2013 dev_err(&device->cdev->dev, 1995 - "An error occurred in the DASD device driver, " 1996 - "reason=%s\n", "05"); 1997 - 2014 + "Compound configuration error occurred\n"); 1998 2015 } 1999 2016 2000 2017 erp->function = dasd_3990_erp_compound_config; ··· 2124 
2153 erp = dasd_3990_erp_int_req(erp); 2125 2154 break; 2126 2155 2127 - case 0x0F: /* length mismatch during update write command 2128 - internal error 08 - update write command error*/ 2129 - dev_err(&device->cdev->dev, "An error occurred in the " 2130 - "DASD device driver, reason=%s\n", "08"); 2156 + case 0x0F: 2157 + dev_err(&device->cdev->dev, 2158 + "Update write command error occurred\n"); 2131 2159 2132 2160 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 2133 2161 break; ··· 2135 2165 erp = dasd_3990_erp_action_10_32(erp, sense); 2136 2166 break; 2137 2167 2138 - case 0x15: /* next track outside defined extend 2139 - internal error 07 - The next track is not 2140 - within the defined storage extent */ 2168 + case 0x15: 2141 2169 dev_err(&device->cdev->dev, 2142 - "An error occurred in the DASD device driver, " 2143 - "reason=%s\n", "07"); 2170 + "Track outside defined extent error occurred\n"); 2144 2171 2145 2172 erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); 2146 2173 break; ··· 2630 2663 * necessary 2631 2664 */ 2632 2665 dev_err(&device->cdev->dev, 2633 - "ERP %p has run out of retries and failed\n", erp); 2666 + "ERP %px has run out of retries and failed\n", erp); 2634 2667 2635 2668 erp->status = DASD_CQR_FAILED; 2636 2669 } ··· 2671 2704 while (erp_done != erp) { 2672 2705 2673 2706 if (erp_done == NULL) /* end of chain reached */ 2674 - panic(PRINTK_HEADER "Programming error in ERP! The " 2675 - "original request was lost\n"); 2707 + panic("Programming error in ERP! 
The original request was lost\n"); 2676 2708 2677 2709 /* remove the request from the device queue */ 2678 2710 list_del(&erp_done->blocklist); ··· 2752 2786 "ERP chain at BEGINNING of ERP-ACTION\n"); 2753 2787 for (temp_erp = cqr; 2754 2788 temp_erp != NULL; temp_erp = temp_erp->refers) { 2755 - 2756 2789 dev_err(&device->cdev->dev, 2757 - "ERP %p (%02x) refers to %p\n", 2758 - temp_erp, temp_erp->status, 2759 - temp_erp->refers); 2790 + "ERP %px (%02x) refers to %px\n", 2791 + temp_erp, temp_erp->status, temp_erp->refers); 2760 2792 } 2761 2793 } 2762 2794 ··· 2801 2837 "ERP chain at END of ERP-ACTION\n"); 2802 2838 for (temp_erp = erp; 2803 2839 temp_erp != NULL; temp_erp = temp_erp->refers) { 2804 - 2805 2840 dev_err(&device->cdev->dev, 2806 - "ERP %p (%02x) refers to %p\n", 2807 - temp_erp, temp_erp->status, 2808 - temp_erp->refers); 2841 + "ERP %px (%02x) refers to %px\n", 2842 + temp_erp, temp_erp->status, temp_erp->refers); 2809 2843 } 2810 2844 } 2811 2845
-8
drivers/s390/block/dasd_alias.c
··· 6 6 * Author(s): Stefan Weinhuber <wein@de.ibm.com> 7 7 */ 8 8 9 - #define KMSG_COMPONENT "dasd-eckd" 10 - 11 9 #include <linux/list.h> 12 10 #include <linux/slab.h> 13 11 #include <asm/ebcdic.h> 14 12 #include "dasd_int.h" 15 13 #include "dasd_eckd.h" 16 - 17 - #ifdef PRINTK_HEADER 18 - #undef PRINTK_HEADER 19 - #endif /* PRINTK_HEADER */ 20 - #define PRINTK_HEADER "dasd(eckd):" 21 - 22 14 23 15 /* 24 16 * General concept of alias management:
+12 -22
drivers/s390/block/dasd_devmap.c
··· 13 13 * 14 14 */ 15 15 16 - #define KMSG_COMPONENT "dasd" 17 - 18 16 #include <linux/ctype.h> 19 17 #include <linux/init.h> 20 18 #include <linux/module.h> ··· 22 24 #include <linux/uaccess.h> 23 25 #include <asm/ipl.h> 24 26 25 - /* This is ugly... */ 26 - #define PRINTK_HEADER "dasd_devmap:" 27 27 #define DASD_MAX_PARAMS 256 28 28 29 29 #include "dasd_int.h" ··· 1110 1114 use_diag = (devmap->features & DASD_FEATURE_USEDIAG) != 0; 1111 1115 else 1112 1116 use_diag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USEDIAG) != 0; 1113 - return sprintf(buf, use_diag ? "1\n" : "0\n"); 1117 + return sysfs_emit(buf, use_diag ? "1\n" : "0\n"); 1114 1118 } 1115 1119 1116 1120 static ssize_t ··· 1159 1163 use_raw = (devmap->features & DASD_FEATURE_USERAW) != 0; 1160 1164 else 1161 1165 use_raw = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USERAW) != 0; 1162 - return sprintf(buf, use_raw ? "1\n" : "0\n"); 1166 + return sysfs_emit(buf, use_raw ? "1\n" : "0\n"); 1163 1167 } 1164 1168 1165 1169 static ssize_t ··· 1255 1259 if (count < 0) 1256 1260 return count; 1257 1261 1258 - return sprintf(buf, "%d\n", count); 1262 + return sysfs_emit(buf, "%d\n", count); 1259 1263 } 1260 1264 1261 1265 static DEVICE_ATTR(host_access_count, 0444, dasd_access_show, NULL); ··· 1334 1338 1335 1339 device = dasd_device_from_cdev(to_ccwdev(dev)); 1336 1340 if (IS_ERR(device)) 1337 - return sprintf(buf, "0\n"); 1341 + return sysfs_emit(buf, "0\n"); 1338 1342 1339 1343 if (device->discipline && device->discipline->get_uid && 1340 1344 !device->discipline->get_uid(device, &uid)) { 1341 1345 if (uid.type == UA_BASE_PAV_ALIAS || 1342 1346 uid.type == UA_HYPER_PAV_ALIAS) { 1343 1347 dasd_put_device(device); 1344 - return sprintf(buf, "1\n"); 1348 + return sysfs_emit(buf, "1\n"); 1345 1349 } 1346 1350 } 1347 1351 dasd_put_device(device); 1348 1352 1349 - return sprintf(buf, "0\n"); 1353 + return sysfs_emit(buf, "0\n"); 1350 1354 } 1351 1355 1352 1356 static DEVICE_ATTR(alias, 0444, dasd_alias_show, NULL); ··· 1408 
1412 break; 1409 1413 } 1410 1414 1411 - if (strlen(uid.vduit) > 0) 1412 - snprintf(uid_string, sizeof(uid_string), 1413 - "%s.%s.%04x.%s.%s", 1414 - uid.vendor, uid.serial, uid.ssid, ua_string, 1415 - uid.vduit); 1416 - else 1417 - snprintf(uid_string, sizeof(uid_string), 1418 - "%s.%s.%04x.%s", 1419 - uid.vendor, uid.serial, uid.ssid, ua_string); 1415 + snprintf(uid_string, sizeof(uid_string), "%s.%s.%04x.%s%s%s", 1416 + uid.vendor, uid.serial, uid.ssid, ua_string, 1417 + uid.vduit[0] ? "." : "", uid.vduit); 1420 1418 } 1421 1419 dasd_put_device(device); 1422 1420 ··· 1852 1862 1853 1863 device = dasd_device_from_cdev(to_ccwdev(dev)); 1854 1864 if (IS_ERR(device)) 1855 - return sprintf(buf, "0\n"); 1865 + return sysfs_emit(buf, "0\n"); 1856 1866 1857 1867 opm = dasd_path_get_opm(device); 1858 1868 nppm = dasd_path_get_nppm(device); ··· 1862 1872 ifccpm = dasd_path_get_ifccpm(device); 1863 1873 dasd_put_device(device); 1864 1874 1865 - return sprintf(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm, 1866 - cablepm, cuirpm, hpfpm, ifccpm); 1875 + return sysfs_emit(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm, 1876 + cablepm, cuirpm, hpfpm, ifccpm); 1867 1877 } 1868 1878 1869 1879 static DEVICE_ATTR(path_masks, 0444, dasd_pm_show, NULL);
+3 -23
drivers/s390/block/dasd_diag.c
··· 8 8 * 9 9 */ 10 10 11 - #define KMSG_COMPONENT "dasd" 12 - 13 11 #include <linux/kernel_stat.h> 14 12 #include <linux/stddef.h> 15 13 #include <linux/kernel.h> ··· 28 30 29 31 #include "dasd_int.h" 30 32 #include "dasd_diag.h" 31 - 32 - #define PRINTK_HEADER "dasd(diag):" 33 33 34 34 MODULE_LICENSE("GPL"); 35 35 ··· 617 621 "dump sense not available for DIAG data"); 618 622 } 619 623 620 - /* 621 - * Initialize block layer request queue. 622 - */ 623 - static void dasd_diag_setup_blk_queue(struct dasd_block *block) 624 + static unsigned int dasd_diag_max_sectors(struct dasd_block *block) 624 625 { 625 - unsigned int logical_block_size = block->bp_block; 626 - struct request_queue *q = block->gdp->queue; 627 - int max; 628 - 629 - max = DIAG_MAX_BLOCKS << block->s2b_shift; 630 - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 631 - q->limits.max_dev_sectors = max; 632 - blk_queue_logical_block_size(q, logical_block_size); 633 - blk_queue_max_hw_sectors(q, max); 634 - blk_queue_max_segments(q, USHRT_MAX); 635 - /* With page sized segments each segment can be translated into one idaw/tidaw */ 636 - blk_queue_max_segment_size(q, PAGE_SIZE); 637 - blk_queue_segment_boundary(q, PAGE_SIZE - 1); 638 - blk_queue_dma_alignment(q, PAGE_SIZE - 1); 626 + return DIAG_MAX_BLOCKS << block->s2b_shift; 639 627 } 640 628 641 629 static int dasd_diag_pe_handler(struct dasd_device *device, ··· 632 652 .owner = THIS_MODULE, 633 653 .name = "DIAG", 634 654 .ebcname = "DIAG", 655 + .max_sectors = dasd_diag_max_sectors, 635 656 .check_device = dasd_diag_check_device, 636 657 .pe_handler = dasd_diag_pe_handler, 637 658 .fill_geometry = dasd_diag_fill_geometry, 638 - .setup_blk_queue = dasd_diag_setup_blk_queue, 639 659 .start_IO = dasd_start_diag, 640 660 .term_IO = dasd_diag_term_IO, 641 661 .handle_terminated_request = dasd_diag_handle_terminated_request,
+65 -121
drivers/s390/block/dasd_eckd.c
··· 10 10 * Author.........: Nigel Hislop <hislop_nigel@emc.com> 11 11 */ 12 12 13 - #define KMSG_COMPONENT "dasd-eckd" 14 - 15 13 #include <linux/stddef.h> 16 14 #include <linux/kernel.h> 17 15 #include <linux/slab.h> ··· 34 36 35 37 #include "dasd_int.h" 36 38 #include "dasd_eckd.h" 37 - 38 - #ifdef PRINTK_HEADER 39 - #undef PRINTK_HEADER 40 - #endif /* PRINTK_HEADER */ 41 - #define PRINTK_HEADER "dasd(eckd):" 42 39 43 40 /* 44 41 * raw track access always map to 64k in memory ··· 1065 1072 } 1066 1073 } 1067 1074 1068 - static void dasd_eckd_get_uid_string(struct dasd_conf *conf, 1069 - char *print_uid) 1075 + static void dasd_eckd_get_uid_string(struct dasd_conf *conf, char *print_uid) 1070 1076 { 1071 1077 struct dasd_uid uid; 1072 1078 1073 1079 create_uid(conf, &uid); 1074 - if (strlen(uid.vduit) > 0) 1075 - snprintf(print_uid, DASD_UID_STRLEN, 1076 - "%s.%s.%04x.%02x.%s", 1077 - uid.vendor, uid.serial, uid.ssid, 1078 - uid.real_unit_addr, uid.vduit); 1079 - else 1080 - snprintf(print_uid, DASD_UID_STRLEN, 1081 - "%s.%s.%04x.%02x", 1082 - uid.vendor, uid.serial, uid.ssid, 1083 - uid.real_unit_addr); 1080 + snprintf(print_uid, DASD_UID_STRLEN, "%s.%s.%04x.%02x%s%s", 1081 + uid.vendor, uid.serial, uid.ssid, uid.real_unit_addr, 1082 + uid.vduit[0] ? "." : "", uid.vduit); 1084 1083 } 1085 1084 1086 1085 static int dasd_eckd_check_cabling(struct dasd_device *device, ··· 5514 5529 * and return number of printed chars. 
5515 5530 */ 5516 5531 static void 5517 - dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page) 5532 + dasd_eckd_dump_ccw_range(struct dasd_device *device, struct ccw1 *from, 5533 + struct ccw1 *to, char *page) 5518 5534 { 5519 5535 int len, count; 5520 5536 char *datap; 5521 5537 5522 5538 len = 0; 5523 5539 while (from <= to) { 5524 - len += sprintf(page + len, PRINTK_HEADER 5525 - " CCW %p: %08X %08X DAT:", 5540 + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", 5526 5541 from, ((int *) from)[0], ((int *) from)[1]); 5527 5542 5528 5543 /* get pointer to data (consider IDALs) */ ··· 5545 5560 from++; 5546 5561 } 5547 5562 if (len > 0) 5548 - printk(KERN_ERR "%s", page); 5563 + dev_err(&device->cdev->dev, "%s", page); 5549 5564 } 5550 5565 5551 5566 static void ··· 5576 5591 static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, 5577 5592 struct dasd_ccw_req *req, struct irb *irb) 5578 5593 { 5579 - char *page; 5580 5594 struct ccw1 *first, *last, *fail, *from, *to; 5595 + struct device *dev; 5581 5596 int len, sl, sct; 5597 + char *page; 5598 + 5599 + dev = &device->cdev->dev; 5582 5600 5583 5601 page = (char *) get_zeroed_page(GFP_ATOMIC); 5584 5602 if (page == NULL) { ··· 5590 5602 return; 5591 5603 } 5592 5604 /* dump the sense data */ 5593 - len = sprintf(page, PRINTK_HEADER 5594 - " I/O status report for device %s:\n", 5595 - dev_name(&device->cdev->dev)); 5596 - len += sprintf(page + len, PRINTK_HEADER 5597 - " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " 5598 - "CS:%02X RC:%d\n", 5605 + len = sprintf(page, "I/O status report:\n"); 5606 + len += sprintf(page + len, 5607 + "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X CS:%02X RC:%d\n", 5599 5608 req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw), 5600 5609 scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw), 5601 5610 scsw_dstat(&irb->scsw), scsw_cstat(&irb->scsw), 5602 5611 req ? 
req->intrc : 0); 5603 - len += sprintf(page + len, PRINTK_HEADER 5604 - " device %s: Failing CCW: %p\n", 5605 - dev_name(&device->cdev->dev), 5612 + len += sprintf(page + len, "Failing CCW: %px\n", 5606 5613 phys_to_virt(irb->scsw.cmd.cpa)); 5607 5614 if (irb->esw.esw0.erw.cons) { 5608 5615 for (sl = 0; sl < 4; sl++) { 5609 - len += sprintf(page + len, PRINTK_HEADER 5610 - " Sense(hex) %2d-%2d:", 5616 + len += sprintf(page + len, "Sense(hex) %2d-%2d:", 5611 5617 (8 * sl), ((8 * sl) + 7)); 5612 5618 5613 5619 for (sct = 0; sct < 8; sct++) { ··· 5613 5631 5614 5632 if (irb->ecw[27] & DASD_SENSE_BIT_0) { 5615 5633 /* 24 Byte Sense Data */ 5616 - sprintf(page + len, PRINTK_HEADER 5617 - " 24 Byte: %x MSG %x, " 5618 - "%s MSGb to SYSOP\n", 5634 + sprintf(page + len, 5635 + "24 Byte: %x MSG %x, %s MSGb to SYSOP\n", 5619 5636 irb->ecw[7] >> 4, irb->ecw[7] & 0x0f, 5620 5637 irb->ecw[1] & 0x10 ? "" : "no"); 5621 5638 } else { 5622 5639 /* 32 Byte Sense Data */ 5623 - sprintf(page + len, PRINTK_HEADER 5624 - " 32 Byte: Format: %x " 5625 - "Exception class %x\n", 5640 + sprintf(page + len, 5641 + "32 Byte: Format: %x Exception class %x\n", 5626 5642 irb->ecw[6] & 0x0f, irb->ecw[22] >> 4); 5627 5643 } 5628 5644 } else { 5629 - sprintf(page + len, PRINTK_HEADER 5630 - " SORRY - NO VALID SENSE AVAILABLE\n"); 5645 + sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n"); 5631 5646 } 5632 - printk(KERN_ERR "%s", page); 5647 + dev_err(dev, "%s", page); 5633 5648 5634 5649 if (req) { 5635 5650 /* req == NULL for unsolicited interrupts */ ··· 5635 5656 first = req->cpaddr; 5636 5657 for (last = first; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); 5637 5658 to = min(first + 6, last); 5638 - printk(KERN_ERR PRINTK_HEADER " Related CP in req: %p\n", req); 5639 - dasd_eckd_dump_ccw_range(first, to, page); 5659 + dev_err(dev, "Related CP in req: %px\n", req); 5660 + dasd_eckd_dump_ccw_range(device, first, to, page); 5640 5661 5641 5662 /* print failing CCW area (maximum 4) */ 
5642 5663 /* scsw->cda is either valid or zero */ ··· 5644 5665 fail = phys_to_virt(irb->scsw.cmd.cpa); /* failing CCW */ 5645 5666 if (from < fail - 2) { 5646 5667 from = fail - 2; /* there is a gap - print header */ 5647 - printk(KERN_ERR PRINTK_HEADER "......\n"); 5668 + dev_err(dev, "......\n"); 5648 5669 } 5649 5670 to = min(fail + 1, last); 5650 - dasd_eckd_dump_ccw_range(from, to, page + len); 5671 + dasd_eckd_dump_ccw_range(device, from, to, page + len); 5651 5672 5652 5673 /* print last CCWs (maximum 2) */ 5653 5674 len = 0; 5654 5675 from = max(from, ++to); 5655 5676 if (from < last - 1) { 5656 5677 from = last - 1; /* there is a gap - print header */ 5657 - printk(KERN_ERR PRINTK_HEADER "......\n"); 5678 + dev_err(dev, "......\n"); 5658 5679 } 5659 - dasd_eckd_dump_ccw_range(from, last, page + len); 5680 + dasd_eckd_dump_ccw_range(device, from, last, page + len); 5660 5681 } 5661 5682 free_page((unsigned long) page); 5662 5683 } ··· 5680 5701 return; 5681 5702 } 5682 5703 /* dump the sense data */ 5683 - len = sprintf(page, PRINTK_HEADER 5684 - " I/O status report for device %s:\n", 5685 - dev_name(&device->cdev->dev)); 5686 - len += sprintf(page + len, PRINTK_HEADER 5687 - " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " 5704 + len = sprintf(page, "I/O status report:\n"); 5705 + len += sprintf(page + len, 5706 + "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " 5688 5707 "CS:%02X fcxs:%02X schxs:%02X RC:%d\n", 5689 5708 req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw), 5690 5709 scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw), ··· 5690 5713 irb->scsw.tm.fcxs, 5691 5714 (irb->scsw.tm.ifob << 7) | irb->scsw.tm.sesq, 5692 5715 req ? 
req->intrc : 0); 5693 - len += sprintf(page + len, PRINTK_HEADER 5694 - " device %s: Failing TCW: %p\n", 5695 - dev_name(&device->cdev->dev), 5716 + len += sprintf(page + len, "Failing TCW: %px\n", 5696 5717 phys_to_virt(irb->scsw.tm.tcw)); 5697 5718 5698 5719 tsb = NULL; ··· 5699 5724 tsb = tcw_get_tsb(phys_to_virt(irb->scsw.tm.tcw)); 5700 5725 5701 5726 if (tsb) { 5702 - len += sprintf(page + len, PRINTK_HEADER 5703 - " tsb->length %d\n", tsb->length); 5704 - len += sprintf(page + len, PRINTK_HEADER 5705 - " tsb->flags %x\n", tsb->flags); 5706 - len += sprintf(page + len, PRINTK_HEADER 5707 - " tsb->dcw_offset %d\n", tsb->dcw_offset); 5708 - len += sprintf(page + len, PRINTK_HEADER 5709 - " tsb->count %d\n", tsb->count); 5727 + len += sprintf(page + len, "tsb->length %d\n", tsb->length); 5728 + len += sprintf(page + len, "tsb->flags %x\n", tsb->flags); 5729 + len += sprintf(page + len, "tsb->dcw_offset %d\n", tsb->dcw_offset); 5730 + len += sprintf(page + len, "tsb->count %d\n", tsb->count); 5710 5731 residual = tsb->count - 28; 5711 - len += sprintf(page + len, PRINTK_HEADER 5712 - " residual %d\n", residual); 5732 + len += sprintf(page + len, "residual %d\n", residual); 5713 5733 5714 5734 switch (tsb->flags & 0x07) { 5715 5735 case 1: /* tsa_iostat */ 5716 - len += sprintf(page + len, PRINTK_HEADER 5717 - " tsb->tsa.iostat.dev_time %d\n", 5736 + len += sprintf(page + len, "tsb->tsa.iostat.dev_time %d\n", 5718 5737 tsb->tsa.iostat.dev_time); 5719 - len += sprintf(page + len, PRINTK_HEADER 5720 - " tsb->tsa.iostat.def_time %d\n", 5738 + len += sprintf(page + len, "tsb->tsa.iostat.def_time %d\n", 5721 5739 tsb->tsa.iostat.def_time); 5722 - len += sprintf(page + len, PRINTK_HEADER 5723 - " tsb->tsa.iostat.queue_time %d\n", 5740 + len += sprintf(page + len, "tsb->tsa.iostat.queue_time %d\n", 5724 5741 tsb->tsa.iostat.queue_time); 5725 - len += sprintf(page + len, PRINTK_HEADER 5726 - " tsb->tsa.iostat.dev_busy_time %d\n", 5742 + len += sprintf(page + len, 
"tsb->tsa.iostat.dev_busy_time %d\n", 5727 5743 tsb->tsa.iostat.dev_busy_time); 5728 - len += sprintf(page + len, PRINTK_HEADER 5729 - " tsb->tsa.iostat.dev_act_time %d\n", 5744 + len += sprintf(page + len, "tsb->tsa.iostat.dev_act_time %d\n", 5730 5745 tsb->tsa.iostat.dev_act_time); 5731 5746 sense = tsb->tsa.iostat.sense; 5732 5747 break; 5733 5748 case 2: /* ts_ddpc */ 5734 - len += sprintf(page + len, PRINTK_HEADER 5735 - " tsb->tsa.ddpc.rc %d\n", tsb->tsa.ddpc.rc); 5749 + len += sprintf(page + len, "tsb->tsa.ddpc.rc %d\n", 5750 + tsb->tsa.ddpc.rc); 5736 5751 for (sl = 0; sl < 2; sl++) { 5737 - len += sprintf(page + len, PRINTK_HEADER 5738 - " tsb->tsa.ddpc.rcq %2d-%2d: ", 5752 + len += sprintf(page + len, 5753 + "tsb->tsa.ddpc.rcq %2d-%2d: ", 5739 5754 (8 * sl), ((8 * sl) + 7)); 5740 5755 rcq = tsb->tsa.ddpc.rcq; 5741 5756 for (sct = 0; sct < 8; sct++) { 5742 - len += sprintf(page + len, " %02x", 5757 + len += sprintf(page + len, "%02x", 5743 5758 rcq[8 * sl + sct]); 5744 5759 } 5745 5760 len += sprintf(page + len, "\n"); ··· 5737 5772 sense = tsb->tsa.ddpc.sense; 5738 5773 break; 5739 5774 case 3: /* tsa_intrg */ 5740 - len += sprintf(page + len, PRINTK_HEADER 5741 - " tsb->tsa.intrg.: not supported yet\n"); 5775 + len += sprintf(page + len, 5776 + "tsb->tsa.intrg.: not supported yet\n"); 5742 5777 break; 5743 5778 } 5744 5779 5745 5780 if (sense) { 5746 5781 for (sl = 0; sl < 4; sl++) { 5747 - len += sprintf(page + len, PRINTK_HEADER 5748 - " Sense(hex) %2d-%2d:", 5782 + len += sprintf(page + len, 5783 + "Sense(hex) %2d-%2d:", 5749 5784 (8 * sl), ((8 * sl) + 7)); 5750 5785 for (sct = 0; sct < 8; sct++) { 5751 5786 len += sprintf(page + len, " %02x", ··· 5756 5791 5757 5792 if (sense[27] & DASD_SENSE_BIT_0) { 5758 5793 /* 24 Byte Sense Data */ 5759 - sprintf(page + len, PRINTK_HEADER 5760 - " 24 Byte: %x MSG %x, " 5761 - "%s MSGb to SYSOP\n", 5794 + sprintf(page + len, 5795 + "24 Byte: %x MSG %x, %s MSGb to SYSOP\n", 5762 5796 sense[7] >> 4, sense[7] & 0x0f, 
5763 5797 sense[1] & 0x10 ? "" : "no"); 5764 5798 } else { 5765 5799 /* 32 Byte Sense Data */ 5766 - sprintf(page + len, PRINTK_HEADER 5767 - " 32 Byte: Format: %x " 5768 - "Exception class %x\n", 5800 + sprintf(page + len, 5801 + "32 Byte: Format: %x Exception class %x\n", 5769 5802 sense[6] & 0x0f, sense[22] >> 4); 5770 5803 } 5771 5804 } else { 5772 - sprintf(page + len, PRINTK_HEADER 5773 - " SORRY - NO VALID SENSE AVAILABLE\n"); 5805 + sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n"); 5774 5806 } 5775 5807 } else { 5776 - sprintf(page + len, PRINTK_HEADER 5777 - " SORRY - NO TSB DATA AVAILABLE\n"); 5808 + sprintf(page + len, "SORRY - NO TSB DATA AVAILABLE\n"); 5778 5809 } 5779 - printk(KERN_ERR "%s", page); 5810 + dev_err(&device->cdev->dev, "%s", page); 5780 5811 free_page((unsigned long) page); 5781 5812 } 5782 5813 ··· 6826 6865 dasd_schedule_requeue(device); 6827 6866 } 6828 6867 6829 - /* 6830 - * Initialize block layer request queue. 6831 - */ 6832 - static void dasd_eckd_setup_blk_queue(struct dasd_block *block) 6868 + static unsigned int dasd_eckd_max_sectors(struct dasd_block *block) 6833 6869 { 6834 - unsigned int logical_block_size = block->bp_block; 6835 - struct request_queue *q = block->gdp->queue; 6836 - struct dasd_device *device = block->base; 6837 - int max; 6838 - 6839 - if (device->features & DASD_FEATURE_USERAW) { 6870 + if (block->base->features & DASD_FEATURE_USERAW) { 6840 6871 /* 6841 6872 * the max_blocks value for raw_track access is 256 6842 6873 * it is higher than the native ECKD value because we ··· 6836 6883 * so the max_hw_sectors are 6837 6884 * 2048 x 512B = 1024kB = 16 tracks 6838 6885 */ 6839 - max = DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; 6840 - } else { 6841 - max = DASD_ECKD_MAX_BLOCKS << block->s2b_shift; 6886 + return DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; 6842 6887 } 6843 - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 6844 - q->limits.max_dev_sectors = max; 6845 - blk_queue_logical_block_size(q, 
logical_block_size); 6846 - blk_queue_max_hw_sectors(q, max); 6847 - blk_queue_max_segments(q, USHRT_MAX); 6848 - /* With page sized segments each segment can be translated into one idaw/tidaw */ 6849 - blk_queue_max_segment_size(q, PAGE_SIZE); 6850 - blk_queue_segment_boundary(q, PAGE_SIZE - 1); 6851 - blk_queue_dma_alignment(q, PAGE_SIZE - 1); 6888 + 6889 + return DASD_ECKD_MAX_BLOCKS << block->s2b_shift; 6852 6890 } 6853 6891 6854 6892 static struct ccw_driver dasd_eckd_driver = { ··· 6871 6927 .basic_to_ready = dasd_eckd_basic_to_ready, 6872 6928 .online_to_ready = dasd_eckd_online_to_ready, 6873 6929 .basic_to_known = dasd_eckd_basic_to_known, 6874 - .setup_blk_queue = dasd_eckd_setup_blk_queue, 6930 + .max_sectors = dasd_eckd_max_sectors, 6875 6931 .fill_geometry = dasd_eckd_fill_geometry, 6876 6932 .start_IO = dasd_start_IO, 6877 6933 .term_IO = dasd_term_IO,
-7
drivers/s390/block/dasd_eer.c
··· 7 7 * Author(s): Stefan Weinhuber <wein@de.ibm.com> 8 8 */ 9 9 10 - #define KMSG_COMPONENT "dasd-eckd" 11 - 12 10 #include <linux/init.h> 13 11 #include <linux/fs.h> 14 12 #include <linux/kernel.h> ··· 25 27 26 28 #include "dasd_int.h" 27 29 #include "dasd_eckd.h" 28 - 29 - #ifdef PRINTK_HEADER 30 - #undef PRINTK_HEADER 31 - #endif /* PRINTK_HEADER */ 32 - #define PRINTK_HEADER "dasd(eer):" 33 30 34 31 /* 35 32 * SECTION: the internal buffer
+2 -7
drivers/s390/block/dasd_erp.c
··· 9 9 * 10 10 */ 11 11 12 - #define KMSG_COMPONENT "dasd" 13 - 14 12 #include <linux/ctype.h> 15 13 #include <linux/init.h> 16 14 17 15 #include <asm/debug.h> 18 16 #include <asm/ebcdic.h> 19 17 #include <linux/uaccess.h> 20 - 21 - /* This is ugly... */ 22 - #define PRINTK_HEADER "dasd_erp:" 23 18 24 19 #include "dasd_int.h" 25 20 ··· 165 170 device = cqr->startdev; 166 171 if (cqr->intrc == -ETIMEDOUT) { 167 172 dev_err(&device->cdev->dev, 168 - "A timeout error occurred for cqr %p\n", cqr); 173 + "A timeout error occurred for cqr %px\n", cqr); 169 174 return; 170 175 } 171 176 if (cqr->intrc == -ENOLINK) { 172 177 dev_err(&device->cdev->dev, 173 - "A transport error occurred for cqr %p\n", cqr); 178 + "A transport error occurred for cqr %px\n", cqr); 174 179 return; 175 180 } 176 181 /* dump sense data */
+25 -63
drivers/s390/block/dasd_fba.c
··· 25 25 #include "dasd_int.h" 26 26 #include "dasd_fba.h" 27 27 28 - #ifdef PRINTK_HEADER 29 - #undef PRINTK_HEADER 30 - #endif /* PRINTK_HEADER */ 31 - #define PRINTK_HEADER "dasd(fba):" 32 - 33 28 #define FBA_DEFAULT_RETRIES 32 34 29 35 30 #define DASD_FBA_CCW_WRITE 0x41 ··· 655 660 dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, 656 661 struct irb *irb) 657 662 { 658 - char *page; 659 663 struct ccw1 *act, *end, *last; 660 664 int len, sl, sct, count; 665 + struct device *dev; 666 + char *page; 667 + 668 + dev = &device->cdev->dev; 661 669 662 670 page = (char *) get_zeroed_page(GFP_ATOMIC); 663 671 if (page == NULL) { 664 672 DBF_DEV_EVENT(DBF_WARNING, device, "%s", 665 - "No memory to dump sense data"); 673 + "No memory to dump sense data"); 666 674 return; 667 675 } 668 - len = sprintf(page, PRINTK_HEADER 669 - " I/O status report for device %s:\n", 670 - dev_name(&device->cdev->dev)); 671 - len += sprintf(page + len, PRINTK_HEADER 672 - " in req: %p CS: 0x%02X DS: 0x%02X\n", req, 673 - irb->scsw.cmd.cstat, irb->scsw.cmd.dstat); 674 - len += sprintf(page + len, PRINTK_HEADER 675 - " device %s: Failing CCW: %p\n", 676 - dev_name(&device->cdev->dev), 676 + len = sprintf(page, "I/O status report:\n"); 677 + len += sprintf(page + len, "in req: %px CS: 0x%02X DS: 0x%02X\n", 678 + req, irb->scsw.cmd.cstat, irb->scsw.cmd.dstat); 679 + len += sprintf(page + len, "Failing CCW: %px\n", 677 680 (void *) (addr_t) irb->scsw.cmd.cpa); 678 681 if (irb->esw.esw0.erw.cons) { 679 682 for (sl = 0; sl < 4; sl++) { 680 - len += sprintf(page + len, PRINTK_HEADER 681 - " Sense(hex) %2d-%2d:", 683 + len += sprintf(page + len, "Sense(hex) %2d-%2d:", 682 684 (8 * sl), ((8 * sl) + 7)); 683 685 684 686 for (sct = 0; sct < 8; sct++) { ··· 685 693 len += sprintf(page + len, "\n"); 686 694 } 687 695 } else { 688 - len += sprintf(page + len, PRINTK_HEADER 689 - " SORRY - NO VALID SENSE AVAILABLE\n"); 696 + len += sprintf(page + len, "SORRY - NO VALID SENSE 
AVAILABLE\n"); 690 697 } 691 - printk(KERN_ERR "%s", page); 698 + dev_err(dev, "%s", page); 692 699 693 700 /* dump the Channel Program */ 694 701 /* print first CCWs (maximum 8) */ 695 702 act = req->cpaddr; 696 - for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); 703 + for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); 697 704 end = min(act + 8, last); 698 - len = sprintf(page, PRINTK_HEADER " Related CP in req: %p\n", req); 705 + len = sprintf(page, "Related CP in req: %px\n", req); 699 706 while (act <= end) { 700 - len += sprintf(page + len, PRINTK_HEADER 701 - " CCW %p: %08X %08X DAT:", 707 + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", 702 708 act, ((int *) act)[0], ((int *) act)[1]); 703 709 for (count = 0; count < 32 && count < act->count; 704 710 count += sizeof(int)) ··· 706 716 len += sprintf(page + len, "\n"); 707 717 act++; 708 718 } 709 - printk(KERN_ERR "%s", page); 710 - 719 + dev_err(dev, "%s", page); 711 720 712 721 /* print failing CCW area */ 713 722 len = 0; 714 723 if (act < ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2) { 715 724 act = ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2; 716 - len += sprintf(page + len, PRINTK_HEADER "......\n"); 725 + len += sprintf(page + len, "......\n"); 717 726 } 718 727 end = min((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa + 2, last); 719 728 while (act <= end) { 720 - len += sprintf(page + len, PRINTK_HEADER 721 - " CCW %p: %08X %08X DAT:", 729 + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", 722 730 act, ((int *) act)[0], ((int *) act)[1]); 723 731 for (count = 0; count < 32 && count < act->count; 724 732 count += sizeof(int)) ··· 730 742 /* print last CCWs */ 731 743 if (act < last - 2) { 732 744 act = last - 2; 733 - len += sprintf(page + len, PRINTK_HEADER "......\n"); 745 + len += sprintf(page + len, "......\n"); 734 746 } 735 747 while (act <= last) { 736 - len += sprintf(page + len, PRINTK_HEADER 737 - " CCW %p: %08X %08X DAT:", 748 + len += 
sprintf(page + len, "CCW %px: %08X %08X DAT:", 738 749 act, ((int *) act)[0], ((int *) act)[1]); 739 750 for (count = 0; count < 32 && count < act->count; 740 751 count += sizeof(int)) ··· 744 757 act++; 745 758 } 746 759 if (len > 0) 747 - printk(KERN_ERR "%s", page); 760 + dev_err(dev, "%s", page); 748 761 free_page((unsigned long) page); 749 762 } 750 763 751 - /* 752 - * Initialize block layer request queue. 753 - */ 754 - static void dasd_fba_setup_blk_queue(struct dasd_block *block) 764 + static unsigned int dasd_fba_max_sectors(struct dasd_block *block) 755 765 { 756 - unsigned int logical_block_size = block->bp_block; 757 - struct request_queue *q = block->gdp->queue; 758 - unsigned int max_bytes, max_discard_sectors; 759 - int max; 760 - 761 - max = DASD_FBA_MAX_BLOCKS << block->s2b_shift; 762 - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 763 - q->limits.max_dev_sectors = max; 764 - blk_queue_logical_block_size(q, logical_block_size); 765 - blk_queue_max_hw_sectors(q, max); 766 - blk_queue_max_segments(q, USHRT_MAX); 767 - /* With page sized segments each segment can be translated into one idaw/tidaw */ 768 - blk_queue_max_segment_size(q, PAGE_SIZE); 769 - blk_queue_segment_boundary(q, PAGE_SIZE - 1); 770 - 771 - q->limits.discard_granularity = logical_block_size; 772 - 773 - /* Calculate max_discard_sectors and make it PAGE aligned */ 774 - max_bytes = USHRT_MAX * logical_block_size; 775 - max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); 776 - max_discard_sectors = max_bytes / logical_block_size; 777 - 778 - blk_queue_max_discard_sectors(q, max_discard_sectors); 779 - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); 766 + return DASD_FBA_MAX_BLOCKS << block->s2b_shift; 780 767 } 781 768 782 769 static int dasd_fba_pe_handler(struct dasd_device *device, ··· 763 802 .owner = THIS_MODULE, 764 803 .name = "FBA ", 765 804 .ebcname = "FBA ", 805 + .has_discard = true, 766 806 .check_device = dasd_fba_check_characteristics, 767 807 .do_analysis = 
dasd_fba_do_analysis, 768 808 .pe_handler = dasd_fba_pe_handler, 769 - .setup_blk_queue = dasd_fba_setup_blk_queue, 809 + .max_sectors = dasd_fba_max_sectors, 770 810 .fill_geometry = dasd_fba_fill_geometry, 771 811 .start_IO = dasd_start_IO, 772 812 .term_IO = dasd_term_IO,
+12 -6
drivers/s390/block/dasd_genhd.c
··· 11 11 * 12 12 */ 13 13 14 - #define KMSG_COMPONENT "dasd" 15 - 16 14 #include <linux/interrupt.h> 17 15 #include <linux/major.h> 18 16 #include <linux/fs.h> 19 17 #include <linux/blkpg.h> 20 18 21 19 #include <linux/uaccess.h> 22 - 23 - /* This is ugly... */ 24 - #define PRINTK_HEADER "dasd_gendisk:" 25 20 26 21 #include "dasd_int.h" 27 22 ··· 34 39 */ 35 40 int dasd_gendisk_alloc(struct dasd_block *block) 36 41 { 42 + struct queue_limits lim = { 43 + /* 44 + * With page sized segments, each segment can be translated into 45 + * one idaw/tidaw. 46 + */ 47 + .max_segment_size = PAGE_SIZE, 48 + .seg_boundary_mask = PAGE_SIZE - 1, 49 + .dma_alignment = PAGE_SIZE - 1, 50 + .max_segments = USHRT_MAX, 51 + }; 37 52 struct gendisk *gdp; 38 53 struct dasd_device *base; 39 54 int len, rc; ··· 63 58 if (rc) 64 59 return rc; 65 60 66 - gdp = blk_mq_alloc_disk(&block->tag_set, block); 61 + gdp = blk_mq_alloc_disk(&block->tag_set, &lim, block); 67 62 if (IS_ERR(gdp)) { 68 63 blk_mq_free_tag_set(&block->tag_set); 69 64 return PTR_ERR(gdp); 70 65 } 66 + blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue); 71 67 72 68 /* Initialize gendisk structure. */ 73 69 gdp->major = DASD_MAJOR;
+2 -33
drivers/s390/block/dasd_int.h
··· 113 113 __dev_id.ssid, __dev_id.devno, d_data); \ 114 114 } while (0) 115 115 116 - /* limit size for an errorstring */ 117 - #define ERRORLENGTH 30 118 - 119 116 /* definition of dbf debug levels */ 120 117 #define DBF_EMERG 0 /* system is unusable */ 121 118 #define DBF_ALERT 1 /* action must be taken immediately */ ··· 122 125 #define DBF_NOTICE 5 /* normal but significant condition */ 123 126 #define DBF_INFO 6 /* informational */ 124 127 #define DBF_DEBUG 6 /* debug-level messages */ 125 - 126 - /* messages to be written via klogd and dbf */ 127 - #define DEV_MESSAGE(d_loglevel,d_device,d_string,d_args...)\ 128 - do { \ 129 - printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \ 130 - dev_name(&d_device->cdev->dev), d_args); \ 131 - DBF_DEV_EVENT(DBF_ALERT, d_device, d_string, d_args); \ 132 - } while(0) 133 - 134 - #define MESSAGE(d_loglevel,d_string,d_args...)\ 135 - do { \ 136 - printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \ 137 - DBF_EVENT(DBF_ALERT, d_string, d_args); \ 138 - } while(0) 139 - 140 - /* messages to be written via klogd only */ 141 - #define DEV_MESSAGE_LOG(d_loglevel,d_device,d_string,d_args...)\ 142 - do { \ 143 - printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \ 144 - dev_name(&d_device->cdev->dev), d_args); \ 145 - } while(0) 146 - 147 - #define MESSAGE_LOG(d_loglevel,d_string,d_args...)\ 148 - do { \ 149 - printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \ 150 - } while(0) 151 128 152 129 /* Macro to calculate number of blocks per page */ 153 130 #define BLOCKS_PER_PAGE(blksize) (PAGE_SIZE / blksize) ··· 293 322 struct module *owner; 294 323 char ebcname[8]; /* a name used for tagging and printks */ 295 324 char name[8]; /* a name used for tagging and printks */ 325 + bool has_discard; 296 326 297 327 struct list_head list; /* used for list of disciplines */ 298 328 ··· 332 360 int (*online_to_ready) (struct dasd_device *); 333 361 int (*basic_to_known)(struct dasd_device *); 334 362 335 - /* 336 
- * Initialize block layer request queue. 337 - */ 338 - void (*setup_blk_queue)(struct dasd_block *); 363 + unsigned int (*max_sectors)(struct dasd_block *); 339 364 /* (struct dasd_device *); 340 365 * Device operation functions. build_cp creates a ccw chain for 341 366 * a block device request, start_io starts the request and
-6
drivers/s390/block/dasd_ioctl.c
··· 10 10 * i/o controls for the dasd driver. 11 11 */ 12 12 13 - #define KMSG_COMPONENT "dasd" 14 - 15 13 #include <linux/interrupt.h> 16 14 #include <linux/compat.h> 17 15 #include <linux/major.h> ··· 22 24 #include <linux/uaccess.h> 23 25 #include <linux/dasd_mod.h> 24 26 25 - /* This is ugly... */ 26 - #define PRINTK_HEADER "dasd_ioctl:" 27 - 28 27 #include "dasd_int.h" 29 - 30 28 31 29 static int 32 30 dasd_ioctl_api_version(void __user *argp)
-5
drivers/s390/block/dasd_proc.c
··· 11 11 * 12 12 */ 13 13 14 - #define KMSG_COMPONENT "dasd" 15 - 16 14 #include <linux/ctype.h> 17 15 #include <linux/slab.h> 18 16 #include <linux/string.h> ··· 20 22 21 23 #include <asm/debug.h> 22 24 #include <linux/uaccess.h> 23 - 24 - /* This is ugly... */ 25 - #define PRINTK_HEADER "dasd_proc:" 26 25 27 26 #include "dasd_int.h" 28 27
+6 -4
drivers/s390/block/dcssblk.c
··· 546 546 static ssize_t 547 547 dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) 548 548 { 549 + struct queue_limits lim = { 550 + .logical_block_size = 4096, 551 + }; 549 552 int rc, i, j, num_of_segments; 550 553 struct dcssblk_dev_info *dev_info; 551 554 struct segment_info *seg_info, *temp; ··· 632 629 dev_info->dev.release = dcssblk_release_segment; 633 630 dev_info->dev.groups = dcssblk_dev_attr_groups; 634 631 INIT_LIST_HEAD(&dev_info->lh); 635 - dev_info->gd = blk_alloc_disk(NUMA_NO_NODE); 636 - if (dev_info->gd == NULL) { 637 - rc = -ENOMEM; 632 + dev_info->gd = blk_alloc_disk(&lim, NUMA_NO_NODE); 633 + if (IS_ERR(dev_info->gd)) { 634 + rc = PTR_ERR(dev_info->gd); 638 635 goto seg_list_del; 639 636 } 640 637 dev_info->gd->major = dcssblk_major; ··· 642 639 dev_info->gd->fops = &dcssblk_devops; 643 640 dev_info->gd->private_data = dev_info; 644 641 dev_info->gd->flags |= GENHD_FL_NO_PART; 645 - blk_queue_logical_block_size(dev_info->gd->queue, 4096); 646 642 blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue); 647 643 648 644 seg_byte_size = (dev_info->end - dev_info->start + 1);
+9 -8
drivers/s390/block/scm_blk.c
··· 435 435 436 436 int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) 437 437 { 438 - unsigned int devindex, nr_max_blk; 438 + struct queue_limits lim = { 439 + .logical_block_size = 1 << 12, 440 + }; 441 + unsigned int devindex; 439 442 struct request_queue *rq; 440 443 int len, ret; 444 + 445 + lim.max_segments = min(scmdev->nr_max_block, 446 + (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); 447 + lim.max_hw_sectors = lim.max_segments << 3; /* 8 * 512 = blk_size */ 441 448 442 449 devindex = atomic_inc_return(&nr_devices) - 1; 443 450 /* scma..scmz + scmaa..scmzz */ ··· 469 462 if (ret) 470 463 goto out; 471 464 472 - bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev); 465 + bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, &lim, scmdev); 473 466 if (IS_ERR(bdev->gendisk)) { 474 467 ret = PTR_ERR(bdev->gendisk); 475 468 goto out_tag; 476 469 } 477 470 rq = bdev->rq = bdev->gendisk->queue; 478 - nr_max_blk = min(scmdev->nr_max_block, 479 - (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); 480 - 481 - blk_queue_logical_block_size(rq, 1 << 12); 482 - blk_queue_max_hw_sectors(rq, nr_max_blk << 3); /* 8 * 512 = blk_size */ 483 - blk_queue_max_segments(rq, nr_max_blk); 484 471 blk_queue_flag_set(QUEUE_FLAG_NONROT, rq); 485 472 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); 486 473
+1 -1
drivers/scsi/scsi_scan.c
··· 332 332 333 333 sdev->sg_reserved_size = INT_MAX; 334 334 335 - q = blk_mq_init_queue(&sdev->host->tag_set); 335 + q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL); 336 336 if (IS_ERR(q)) { 337 337 /* release fn is set up in scsi_sysfs_device_initialise, so 338 338 * have to free and put manually here */
+1 -1
drivers/ufs/core/ufshcd.c
··· 10593 10593 err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); 10594 10594 if (err < 0) 10595 10595 goto out_remove_scsi_host; 10596 - hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set); 10596 + hba->tmf_queue = blk_mq_alloc_queue(&hba->tmf_tag_set, NULL, NULL); 10597 10597 if (IS_ERR(hba->tmf_queue)) { 10598 10598 err = PTR_ERR(hba->tmf_queue); 10599 10599 goto free_tmf_tag_set;
+25 -10
fs/btrfs/zoned.c
··· 824 824 reset = &zones[1]; 825 825 826 826 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { 827 + unsigned int nofs_flags; 828 + 827 829 ASSERT(sb_zone_is_full(reset)); 828 830 831 + nofs_flags = memalloc_nofs_save(); 829 832 ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 830 - reset->start, reset->len, 831 - GFP_NOFS); 833 + reset->start, reset->len); 834 + memalloc_nofs_restore(nofs_flags); 832 835 if (ret) 833 836 return ret; 834 837 ··· 977 974 * explicit ZONE_FINISH is not necessary. 978 975 */ 979 976 if (zone->wp != zone->start + zone->capacity) { 977 + unsigned int nofs_flags; 980 978 int ret; 981 979 980 + nofs_flags = memalloc_nofs_save(); 982 981 ret = blkdev_zone_mgmt(device->bdev, 983 982 REQ_OP_ZONE_FINISH, zone->start, 984 - zone->len, GFP_NOFS); 983 + zone->len); 984 + memalloc_nofs_restore(nofs_flags); 985 985 if (ret) 986 986 return ret; 987 987 } ··· 1002 996 1003 997 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) 1004 998 { 999 + unsigned int nofs_flags; 1005 1000 sector_t zone_sectors; 1006 1001 sector_t nr_sectors; 1007 1002 u8 zone_sectors_shift; 1008 1003 u32 sb_zone; 1009 1004 u32 nr_zones; 1005 + int ret; 1010 1006 1011 1007 zone_sectors = bdev_zone_sectors(bdev); 1012 1008 zone_sectors_shift = ilog2(zone_sectors); ··· 1019 1011 if (sb_zone + 1 >= nr_zones) 1020 1012 return -ENOENT; 1021 1013 1022 - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 1023 - zone_start_sector(sb_zone, bdev), 1024 - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); 1014 + nofs_flags = memalloc_nofs_save(); 1015 + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 1016 + zone_start_sector(sb_zone, bdev), 1017 + zone_sectors * BTRFS_NR_SB_LOG_ZONES); 1018 + memalloc_nofs_restore(nofs_flags); 1019 + return ret; 1025 1020 } 1026 1021 1027 1022 /* ··· 1135 1124 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, 1136 1125 u64 length, u64 *bytes) 1137 1126 { 1127 + unsigned int nofs_flags; 1138 1128 int ret; 1139 1129 1140 
1130 *bytes = 0; 1131 + nofs_flags = memalloc_nofs_save(); 1141 1132 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, 1142 - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, 1143 - GFP_NOFS); 1133 + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); 1134 + memalloc_nofs_restore(nofs_flags); 1144 1135 if (ret) 1145 1136 return ret; 1146 1137 ··· 2257 2244 struct btrfs_device *device = map->stripes[i].dev; 2258 2245 const u64 physical = map->stripes[i].physical; 2259 2246 struct btrfs_zoned_device_info *zinfo = device->zone_info; 2247 + unsigned int nofs_flags; 2260 2248 2261 2249 if (zinfo->max_active_zones == 0) 2262 2250 continue; 2263 2251 2252 + nofs_flags = memalloc_nofs_save(); 2264 2253 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 2265 2254 physical >> SECTOR_SHIFT, 2266 - zinfo->zone_size >> SECTOR_SHIFT, 2267 - GFP_NOFS); 2255 + zinfo->zone_size >> SECTOR_SHIFT); 2256 + memalloc_nofs_restore(nofs_flags); 2268 2257 2269 2258 if (ret) 2270 2259 return ret;
+12 -3
fs/f2fs/segment.c
··· 1971 1971 } 1972 1972 1973 1973 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { 1974 + unsigned int nofs_flags; 1975 + int ret; 1976 + 1974 1977 trace_f2fs_issue_reset_zone(bdev, blkstart); 1975 - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 1976 - sector, nr_sects, GFP_NOFS); 1978 + nofs_flags = memalloc_nofs_save(); 1979 + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 1980 + sector, nr_sects); 1981 + memalloc_nofs_restore(nofs_flags); 1982 + return ret; 1977 1983 } 1978 1984 1979 1985 __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); ··· 4871 4865 block_t zone_block, valid_block_cnt; 4872 4866 unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; 4873 4867 int ret; 4868 + unsigned int nofs_flags; 4874 4869 4875 4870 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) 4876 4871 return 0; ··· 4919 4912 "pointer: valid block[0x%x,0x%x] cond[0x%x]", 4920 4913 zone_segno, valid_block_cnt, zone->cond); 4921 4914 4915 + nofs_flags = memalloc_nofs_save(); 4922 4916 ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, 4923 - zone->start, zone->len, GFP_NOFS); 4917 + zone->start, zone->len); 4918 + memalloc_nofs_restore(nofs_flags); 4924 4919 if (ret == -EOPNOTSUPP) { 4925 4920 ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, 4926 4921 zone->len - (zone->wp - zone->start),
+1 -1
fs/zonefs/super.c
··· 113 113 114 114 trace_zonefs_zone_mgmt(sb, z, op); 115 115 ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, 116 - z->z_size >> SECTOR_SHIFT, GFP_NOFS); 116 + z->z_size >> SECTOR_SHIFT); 117 117 if (ret) { 118 118 zonefs_err(sb, 119 119 "Zone management operation %s at %llu failed %d\n",
+1
include/linux/blk-integrity.h
··· 20 20 unsigned int data_size; 21 21 unsigned short interval; 22 22 unsigned char tuple_size; 23 + unsigned char pi_offset; 23 24 const char *disk_name; 24 25 }; 25 26
+6 -4
include/linux/blk-mq.h
··· 684 684 685 685 #define BLK_MQ_NO_HCTX_IDX (-1U) 686 686 687 - struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, 687 + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, 688 + struct queue_limits *lim, void *queuedata, 688 689 struct lock_class_key *lkclass); 689 - #define blk_mq_alloc_disk(set, queuedata) \ 690 + #define blk_mq_alloc_disk(set, lim, queuedata) \ 690 691 ({ \ 691 692 static struct lock_class_key __key; \ 692 693 \ 693 - __blk_mq_alloc_disk(set, queuedata, &__key); \ 694 + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ 694 695 }) 695 696 struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, 696 697 struct lock_class_key *lkclass); 697 - struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 698 + struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, 699 + struct queue_limits *lim, void *queuedata); 698 700 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 699 701 struct request_queue *q); 700 702 void blk_mq_destroy_queue(struct request_queue *);
-42
include/linux/blk_types.h
··· 207 207 return true; 208 208 } 209 209 210 - /* 211 - * From most significant bit: 212 - * 1 bit: reserved for other usage, see below 213 - * 12 bits: original size of bio 214 - * 51 bits: issue time of bio 215 - */ 216 - #define BIO_ISSUE_RES_BITS 1 217 - #define BIO_ISSUE_SIZE_BITS 12 218 - #define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) 219 - #define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) 220 - #define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) 221 - #define BIO_ISSUE_SIZE_MASK \ 222 - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) 223 - #define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) 224 - 225 - /* Reserved bit for blk-throtl */ 226 - #define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) 227 - 228 210 struct bio_issue { 229 211 u64 value; 230 212 }; 231 - 232 - static inline u64 __bio_issue_time(u64 time) 233 - { 234 - return time & BIO_ISSUE_TIME_MASK; 235 - } 236 - 237 - static inline u64 bio_issue_time(struct bio_issue *issue) 238 - { 239 - return __bio_issue_time(issue->value); 240 - } 241 - 242 - static inline sector_t bio_issue_size(struct bio_issue *issue) 243 - { 244 - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); 245 - } 246 - 247 - static inline void bio_issue_init(struct bio_issue *issue, 248 - sector_t size) 249 - { 250 - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; 251 - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | 252 - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | 253 - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); 254 - } 255 213 256 214 typedef __u32 __bitwise blk_opf_t; 257 215
+60 -13
include/linux/blkdev.h
··· 43 43 44 44 extern const struct device_type disk_type; 45 45 extern const struct device_type part_type; 46 - extern struct class block_class; 46 + extern const struct class block_class; 47 47 48 48 /* 49 49 * Maximum number of blkcg policies allowed to be registered concurrently. ··· 109 109 const struct blk_integrity_profile *profile; 110 110 unsigned char flags; 111 111 unsigned char tuple_size; 112 + unsigned char pi_offset; 112 113 unsigned char interval_exp; 113 114 unsigned char tag_size; 114 115 }; ··· 191 190 * blk_mq_unfreeze_queue(). 192 191 */ 193 192 unsigned int nr_zones; 194 - unsigned int max_open_zones; 195 - unsigned int max_active_zones; 196 193 unsigned long *conv_zones_bitmap; 197 194 unsigned long *seq_zones_wlock; 198 195 #endif /* CONFIG_BLK_DEV_ZONED */ ··· 292 293 unsigned int io_opt; 293 294 unsigned int max_discard_sectors; 294 295 unsigned int max_hw_discard_sectors; 296 + unsigned int max_user_discard_sectors; 295 297 unsigned int max_secure_erase_sectors; 296 298 unsigned int max_write_zeroes_sectors; 297 299 unsigned int max_zone_append_sectors; ··· 308 308 unsigned char discard_misaligned; 309 309 unsigned char raid_partial_stripes_expensive; 310 310 bool zoned; 311 + unsigned int max_open_zones; 312 + unsigned int max_active_zones; 311 313 312 314 /* 313 315 * Drivers that set dma_alignment to less than 511 must be prepared to ··· 328 326 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 329 327 unsigned int nr_zones, report_zones_cb cb, void *data); 330 328 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, 331 - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); 329 + sector_t sectors, sector_t nr_sectors); 332 330 int blk_revalidate_disk_zones(struct gendisk *disk, 333 331 void (*update_driver_data)(struct gendisk *disk)); 334 332 ··· 476 474 477 475 struct mutex sysfs_lock; 478 476 struct mutex sysfs_dir_lock; 477 + struct mutex limits_lock; 479 478 480 479 /* 481 480 * for reusing dead 
hctx instance in case of updating ··· 643 640 static inline void disk_set_max_open_zones(struct gendisk *disk, 644 641 unsigned int max_open_zones) 645 642 { 646 - disk->max_open_zones = max_open_zones; 643 + disk->queue->limits.max_open_zones = max_open_zones; 647 644 } 648 645 649 646 static inline void disk_set_max_active_zones(struct gendisk *disk, 650 647 unsigned int max_active_zones) 651 648 { 652 - disk->max_active_zones = max_active_zones; 649 + disk->queue->limits.max_active_zones = max_active_zones; 653 650 } 654 651 655 652 static inline unsigned int bdev_max_open_zones(struct block_device *bdev) 656 653 { 657 - return bdev->bd_disk->max_open_zones; 654 + return bdev->bd_disk->queue->limits.max_open_zones; 658 655 } 659 656 660 657 static inline unsigned int bdev_max_active_zones(struct block_device *bdev) 661 658 { 662 - return bdev->bd_disk->max_active_zones; 659 + return bdev->bd_disk->queue->limits.max_active_zones; 663 660 } 664 661 665 662 #else /* CONFIG_BLK_DEV_ZONED */ ··· 767 764 int bdev_disk_changed(struct gendisk *disk, bool invalidate); 768 765 769 766 void put_disk(struct gendisk *disk); 770 - struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); 767 + struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, 768 + struct lock_class_key *lkclass); 771 769 772 770 /** 773 771 * blk_alloc_disk - allocate a gendisk structure 772 + * @lim: queue limits to be used for this disk. 774 773 * @node_id: numa node to allocate on 775 774 * 776 775 * Allocate and pre-initialize a gendisk structure for use with BIO based 777 776 * drivers. 778 777 * 778 + * Returns an ERR_PTR on error, else the allocated disk. 
779 + * 779 780 * Context: can sleep 780 781 */ 781 - #define blk_alloc_disk(node_id) \ 782 + #define blk_alloc_disk(lim, node_id) \ 782 783 ({ \ 783 784 static struct lock_class_key __key; \ 784 785 \ 785 - __blk_alloc_disk(node_id, &__key); \ 786 + __blk_alloc_disk(lim, node_id, &__key); \ 786 787 }) 787 788 788 789 int __register_blkdev(unsigned int major, const char *name, ··· 869 862 return chunk_sectors - (offset & (chunk_sectors - 1)); 870 863 } 871 864 865 + /** 866 + * queue_limits_start_update - start an atomic update of queue limits 867 + * @q: queue to update 868 + * 869 + * This functions starts an atomic update of the queue limits. It takes a lock 870 + * to prevent other updates and returns a snapshot of the current limits that 871 + * the caller can modify. The caller must call queue_limits_commit_update() 872 + * to finish the update. 873 + * 874 + * Context: process context. The caller must have frozen the queue or ensured 875 + * that there is outstanding I/O by other means. 
876 + */ 877 + static inline struct queue_limits 878 + queue_limits_start_update(struct request_queue *q) 879 + __acquires(q->limits_lock) 880 + { 881 + mutex_lock(&q->limits_lock); 882 + return q->limits; 883 + } 884 + int queue_limits_commit_update(struct request_queue *q, 885 + struct queue_limits *lim); 886 + int queue_limits_set(struct request_queue *q, struct queue_limits *lim); 887 + 872 888 /* 873 889 * Access functions for manipulating queue properties 874 890 */ ··· 925 895 extern void blk_set_stacking_limits(struct queue_limits *lim); 926 896 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 927 897 sector_t offset); 928 - extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 929 - sector_t offset); 898 + void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, 899 + sector_t offset, const char *pfx); 930 900 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); 931 901 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 932 902 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); ··· 973 943 974 944 /* if ios_left is > 1, we can batch tag/rq allocations */ 975 945 struct request *cached_rq; 946 + u64 cur_ktime; 976 947 unsigned short nr_ios; 977 948 978 949 unsigned short rq_count; ··· 1004 973 __blk_flush_plug(plug, async); 1005 974 } 1006 975 976 + /* 977 + * tsk == current here 978 + */ 979 + static inline void blk_plug_invalidate_ts(struct task_struct *tsk) 980 + { 981 + struct blk_plug *plug = tsk->plug; 982 + 983 + if (plug) 984 + plug->cur_ktime = 0; 985 + current->flags &= ~PF_BLOCK_TS; 986 + } 987 + 1007 988 int blkdev_issue_flush(struct block_device *bdev); 1008 989 long nr_blockdev_pages(void); 1009 990 #else /* CONFIG_BLOCK */ ··· 1036 993 } 1037 994 1038 995 static inline void blk_flush_plug(struct blk_plug *plug, bool async) 996 + { 997 + } 998 + 999 + static inline void 
blk_plug_invalidate_ts(struct task_struct *tsk) 1039 1000 { 1040 1001 } 1041 1002
+5 -1
include/linux/nvme-rdma.h
··· 6 6 #ifndef _LINUX_NVME_RDMA_H 7 7 #define _LINUX_NVME_RDMA_H 8 8 9 - #define NVME_RDMA_MAX_QUEUE_SIZE 128 9 + #define NVME_RDMA_IP_PORT 4420 10 + 11 + #define NVME_RDMA_MAX_QUEUE_SIZE 256 12 + #define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 13 + #define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 10 14 11 15 enum nvme_rdma_cm_fmt { 12 16 NVME_RDMA_CM_FMT_1_0 = 0x0,
-2
include/linux/nvme.h
··· 23 23 24 24 #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" 25 25 26 - #define NVME_RDMA_IP_PORT 4420 27 - 28 26 #define NVME_NSID_ALL 0xffffffff 29 27 30 28 enum nvme_subsys_type {
+1 -1
include/linux/sched.h
··· 1642 1642 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1643 1643 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1644 1644 #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ 1645 - #define PF__HOLE__20000000 0x20000000 1645 + #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ 1646 1646 #define PF__HOLE__40000000 0x40000000 1647 1647 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1648 1648
+6
include/linux/sched/topology.h
··· 176 176 cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 177 177 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 178 178 179 + bool cpus_equal_capacity(int this_cpu, int that_cpu); 179 180 bool cpus_share_cache(int this_cpu, int that_cpu); 180 181 bool cpus_share_resources(int this_cpu, int that_cpu); 181 182 ··· 225 224 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 226 225 struct sched_domain_attr *dattr_new) 227 226 { 227 + } 228 + 229 + static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) 230 + { 231 + return true; 228 232 } 229 233 230 234 static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+2
include/uapi/linux/ublk_cmd.h
··· 49 49 _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) 50 50 #define UBLK_U_CMD_GET_FEATURES \ 51 51 _IOR('u', 0x13, struct ublksrv_ctrl_cmd) 52 + #define UBLK_U_CMD_DEL_DEV_ASYNC \ 53 + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) 52 54 53 55 /* 54 56 * 64bits are enough now, and it should be easy to extend in case of
+15 -2
kernel/sched/core.c
··· 3955 3955 } 3956 3956 } 3957 3957 3958 + bool cpus_equal_capacity(int this_cpu, int that_cpu) 3959 + { 3960 + if (!sched_asym_cpucap_active()) 3961 + return true; 3962 + 3963 + if (this_cpu == that_cpu) 3964 + return true; 3965 + 3966 + return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); 3967 + } 3968 + 3958 3969 bool cpus_share_cache(int this_cpu, int that_cpu) 3959 3970 { 3960 3971 if (this_cpu == that_cpu) ··· 6798 6787 6799 6788 static void sched_update_worker(struct task_struct *tsk) 6800 6789 { 6801 - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { 6790 + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { 6791 + if (tsk->flags & PF_BLOCK_TS) 6792 + blk_plug_invalidate_ts(tsk); 6802 6793 if (tsk->flags & PF_WQ_WORKER) 6803 6794 wq_worker_running(tsk); 6804 - else 6795 + else if (tsk->flags & PF_IO_WORKER) 6805 6796 io_wq_worker_running(tsk); 6806 6797 } 6807 6798 }