Merge branch 'for-3.20/core' of git://git.kernel.dk/linux-block

+9 -6

Documentation/filesystems/xip.txt

@@ -28,12 +28,15 @@
 Execute-in-place is implemented in three steps: block device operation,
 address space operation, and file operations.
 
-A block device operation named direct_access is used to retrieve a
-reference (pointer) to a block on-disk. The reference is supposed to be
-cpu-addressable, physical address and remain valid until the release operation
-is performed. A struct block_device reference is used to address the device,
-and a sector_t argument is used to identify the individual block. As an
-alternative, memory technology devices can be used for this.
+A block device operation named direct_access is used to translate the
+block device sector number to a page frame number (pfn) that identifies
+the physical page for the memory. It also returns a kernel virtual
+address that can be used to access the memory.
+
+The direct_access method takes a 'size' parameter that indicates the
+number of bytes being requested. The function should return the number
+of bytes that can be contiguously accessed at that offset. It may also
+return a negative errno if an error occurs.
 
 The block device operation is optional, these block devices support it as of
 today:

+4 -13

arch/powerpc/sysdev/axonram.c

··· 139 139 * axon_ram_direct_access - direct_access() method for block device 140 140 * @device, @sector, @data: see block_device_operations method 141 141 */ 142 - static int 142 + static long 143 143 axon_ram_direct_access(struct block_device *device, sector_t sector, 144 - void **kaddr, unsigned long *pfn) 144 + void **kaddr, unsigned long *pfn, long size) 145 145 { 146 146 struct axon_ram_bank *bank = device->bd_disk->private_data; 147 - loff_t offset; 148 - 149 - offset = sector; 150 - if (device->bd_part != NULL) 151 - offset += device->bd_part->start_sect; 152 - offset <<= AXON_RAM_SECTOR_SHIFT; 153 - if (offset >= bank->size) { 154 - dev_err(&bank->device->dev, "Access outside of address space\n"); 155 - return -ERANGE; 156 - } 147 + loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; 157 148 158 149 *kaddr = (void *)(bank->ph_addr + offset); 159 150 *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; 160 151 161 - return 0; 152 + return bank->size - offset; 162 153 } 163 154 164 155 static const struct block_device_operations axon_ram_devops = {

+198 -242

block/bio.c

··· 28 28 #include <linux/mempool.h> 29 29 #include <linux/workqueue.h> 30 30 #include <linux/cgroup.h> 31 - #include <scsi/sg.h> /* for struct sg_iovec */ 32 31 33 32 #include <trace/events/block.h> 34 33 ··· 1021 1022 EXPORT_SYMBOL(bio_copy_data); 1022 1023 1023 1024 struct bio_map_data { 1024 - int nr_sgvecs; 1025 1025 int is_our_pages; 1026 - struct sg_iovec sgvecs[]; 1026 + struct iov_iter iter; 1027 + struct iovec iov[]; 1027 1028 }; 1028 - 1029 - static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 1030 - const struct sg_iovec *iov, int iov_count, 1031 - int is_our_pages) 1032 - { 1033 - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 1034 - bmd->nr_sgvecs = iov_count; 1035 - bmd->is_our_pages = is_our_pages; 1036 - bio->bi_private = bmd; 1037 - } 1038 1029 1039 1030 static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, 1040 1031 gfp_t gfp_mask) ··· 1033 1044 return NULL; 1034 1045 1035 1046 return kmalloc(sizeof(struct bio_map_data) + 1036 - sizeof(struct sg_iovec) * iov_count, gfp_mask); 1047 + sizeof(struct iovec) * iov_count, gfp_mask); 1037 1048 } 1038 1049 1039 - static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, 1040 - int to_user, int from_user, int do_free_page) 1050 + /** 1051 + * bio_copy_from_iter - copy all pages from iov_iter to bio 1052 + * @bio: The &struct bio which describes the I/O as destination 1053 + * @iter: iov_iter as source 1054 + * 1055 + * Copy all pages from iov_iter to bio. 1056 + * Returns 0 on success, or error on failure. 
1057 + */ 1058 + static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter) 1041 1059 { 1042 - int ret = 0, i; 1060 + int i; 1043 1061 struct bio_vec *bvec; 1044 - int iov_idx = 0; 1045 - unsigned int iov_off = 0; 1046 1062 1047 1063 bio_for_each_segment_all(bvec, bio, i) { 1048 - char *bv_addr = page_address(bvec->bv_page); 1049 - unsigned int bv_len = bvec->bv_len; 1064 + ssize_t ret; 1050 1065 1051 - while (bv_len && iov_idx < iov_count) { 1052 - unsigned int bytes; 1053 - char __user *iov_addr; 1066 + ret = copy_page_from_iter(bvec->bv_page, 1067 + bvec->bv_offset, 1068 + bvec->bv_len, 1069 + &iter); 1054 1070 1055 - bytes = min_t(unsigned int, 1056 - iov[iov_idx].iov_len - iov_off, bv_len); 1057 - iov_addr = iov[iov_idx].iov_base + iov_off; 1071 + if (!iov_iter_count(&iter)) 1072 + break; 1058 1073 1059 - if (!ret) { 1060 - if (to_user) 1061 - ret = copy_to_user(iov_addr, bv_addr, 1062 - bytes); 1063 - 1064 - if (from_user) 1065 - ret = copy_from_user(bv_addr, iov_addr, 1066 - bytes); 1067 - 1068 - if (ret) 1069 - ret = -EFAULT; 1070 - } 1071 - 1072 - bv_len -= bytes; 1073 - bv_addr += bytes; 1074 - iov_addr += bytes; 1075 - iov_off += bytes; 1076 - 1077 - if (iov[iov_idx].iov_len == iov_off) { 1078 - iov_idx++; 1079 - iov_off = 0; 1080 - } 1081 - } 1082 - 1083 - if (do_free_page) 1084 - __free_page(bvec->bv_page); 1074 + if (ret < bvec->bv_len) 1075 + return -EFAULT; 1085 1076 } 1086 1077 1087 - return ret; 1078 + return 0; 1079 + } 1080 + 1081 + /** 1082 + * bio_copy_to_iter - copy all pages from bio to iov_iter 1083 + * @bio: The &struct bio which describes the I/O as source 1084 + * @iter: iov_iter as destination 1085 + * 1086 + * Copy all pages from bio to iov_iter. 1087 + * Returns 0 on success, or error on failure. 
1088 + */ 1089 + static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) 1090 + { 1091 + int i; 1092 + struct bio_vec *bvec; 1093 + 1094 + bio_for_each_segment_all(bvec, bio, i) { 1095 + ssize_t ret; 1096 + 1097 + ret = copy_page_to_iter(bvec->bv_page, 1098 + bvec->bv_offset, 1099 + bvec->bv_len, 1100 + &iter); 1101 + 1102 + if (!iov_iter_count(&iter)) 1103 + break; 1104 + 1105 + if (ret < bvec->bv_len) 1106 + return -EFAULT; 1107 + } 1108 + 1109 + return 0; 1110 + } 1111 + 1112 + static void bio_free_pages(struct bio *bio) 1113 + { 1114 + struct bio_vec *bvec; 1115 + int i; 1116 + 1117 + bio_for_each_segment_all(bvec, bio, i) 1118 + __free_page(bvec->bv_page); 1088 1119 } 1089 1120 1090 1121 /** 1091 1122 * bio_uncopy_user - finish previously mapped bio 1092 1123 * @bio: bio being terminated 1093 1124 * 1094 - * Free pages allocated from bio_copy_user() and write back data 1125 + * Free pages allocated from bio_copy_user_iov() and write back data 1095 1126 * to user space in case of a read. 1096 1127 */ 1097 1128 int bio_uncopy_user(struct bio *bio) 1098 1129 { 1099 1130 struct bio_map_data *bmd = bio->bi_private; 1100 - struct bio_vec *bvec; 1101 - int ret = 0, i; 1131 + int ret = 0; 1102 1132 1103 1133 if (!bio_flagged(bio, BIO_NULL_MAPPED)) { 1104 1134 /* 1105 1135 * if we're in a workqueue, the request is orphaned, so 1106 1136 * don't copy into a random user address space, just free. 
1107 1137 */ 1108 - if (current->mm) 1109 - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1110 - bio_data_dir(bio) == READ, 1111 - 0, bmd->is_our_pages); 1112 - else if (bmd->is_our_pages) 1113 - bio_for_each_segment_all(bvec, bio, i) 1114 - __free_page(bvec->bv_page); 1138 + if (current->mm && bio_data_dir(bio) == READ) 1139 + ret = bio_copy_to_iter(bio, bmd->iter); 1140 + if (bmd->is_our_pages) 1141 + bio_free_pages(bio); 1115 1142 } 1116 1143 kfree(bmd); 1117 1144 bio_put(bio); ··· 1137 1132 1138 1133 /** 1139 1134 * bio_copy_user_iov - copy user data to bio 1140 - * @q: destination block queue 1141 - * @map_data: pointer to the rq_map_data holding pages (if necessary) 1142 - * @iov: the iovec. 1143 - * @iov_count: number of elements in the iovec 1144 - * @write_to_vm: bool indicating writing to pages or not 1145 - * @gfp_mask: memory allocation flags 1135 + * @q: destination block queue 1136 + * @map_data: pointer to the rq_map_data holding pages (if necessary) 1137 + * @iter: iovec iterator 1138 + * @gfp_mask: memory allocation flags 1146 1139 * 1147 1140 * Prepares and returns a bio for indirect user io, bouncing data 1148 1141 * to/from kernel pages as necessary. Must be paired with ··· 1148 1145 */ 1149 1146 struct bio *bio_copy_user_iov(struct request_queue *q, 1150 1147 struct rq_map_data *map_data, 1151 - const struct sg_iovec *iov, int iov_count, 1152 - int write_to_vm, gfp_t gfp_mask) 1148 + const struct iov_iter *iter, 1149 + gfp_t gfp_mask) 1153 1150 { 1154 1151 struct bio_map_data *bmd; 1155 - struct bio_vec *bvec; 1156 1152 struct page *page; 1157 1153 struct bio *bio; 1158 1154 int i, ret; 1159 1155 int nr_pages = 0; 1160 - unsigned int len = 0; 1156 + unsigned int len = iter->count; 1161 1157 unsigned int offset = map_data ? 
map_data->offset & ~PAGE_MASK : 0; 1162 1158 1163 - for (i = 0; i < iov_count; i++) { 1159 + for (i = 0; i < iter->nr_segs; i++) { 1164 1160 unsigned long uaddr; 1165 1161 unsigned long end; 1166 1162 unsigned long start; 1167 1163 1168 - uaddr = (unsigned long)iov[i].iov_base; 1169 - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1164 + uaddr = (unsigned long) iter->iov[i].iov_base; 1165 + end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1) 1166 + >> PAGE_SHIFT; 1170 1167 start = uaddr >> PAGE_SHIFT; 1171 1168 1172 1169 /* ··· 1176 1173 return ERR_PTR(-EINVAL); 1177 1174 1178 1175 nr_pages += end - start; 1179 - len += iov[i].iov_len; 1180 1176 } 1181 1177 1182 1178 if (offset) 1183 1179 nr_pages++; 1184 1180 1185 - bmd = bio_alloc_map_data(iov_count, gfp_mask); 1181 + bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask); 1186 1182 if (!bmd) 1187 1183 return ERR_PTR(-ENOMEM); 1184 + 1185 + /* 1186 + * We need to do a deep copy of the iov_iter including the iovecs. 1187 + * The caller provided iov might point to an on-stack or otherwise 1188 + * shortlived one. 1189 + */ 1190 + bmd->is_our_pages = map_data ? 
0 : 1; 1191 + memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); 1192 + iov_iter_init(&bmd->iter, iter->type, bmd->iov, 1193 + iter->nr_segs, iter->count); 1188 1194 1189 1195 ret = -ENOMEM; 1190 1196 bio = bio_kmalloc(gfp_mask, nr_pages); 1191 1197 if (!bio) 1192 1198 goto out_bmd; 1193 1199 1194 - if (!write_to_vm) 1200 + if (iter->type & WRITE) 1195 1201 bio->bi_rw |= REQ_WRITE; 1196 1202 1197 1203 ret = 0; ··· 1248 1236 /* 1249 1237 * success 1250 1238 */ 1251 - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || 1239 + if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || 1252 1240 (map_data && map_data->from_user)) { 1253 - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); 1241 + ret = bio_copy_from_iter(bio, *iter); 1254 1242 if (ret) 1255 1243 goto cleanup; 1256 1244 } 1257 1245 1258 - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); 1246 + bio->bi_private = bmd; 1259 1247 return bio; 1260 1248 cleanup: 1261 1249 if (!map_data) 1262 - bio_for_each_segment_all(bvec, bio, i) 1263 - __free_page(bvec->bv_page); 1264 - 1250 + bio_free_pages(bio); 1265 1251 bio_put(bio); 1266 1252 out_bmd: 1267 1253 kfree(bmd); ··· 1267 1257 } 1268 1258 1269 1259 /** 1270 - * bio_copy_user - copy user data to bio 1271 - * @q: destination block queue 1272 - * @map_data: pointer to the rq_map_data holding pages (if necessary) 1273 - * @uaddr: start of user address 1274 - * @len: length in bytes 1275 - * @write_to_vm: bool indicating writing to pages or not 1276 - * @gfp_mask: memory allocation flags 1260 + * bio_map_user_iov - map user iovec into bio 1261 + * @q: the struct request_queue for the bio 1262 + * @iter: iovec iterator 1263 + * @gfp_mask: memory allocation flags 1277 1264 * 1278 - * Prepares and returns a bio for indirect user io, bouncing data 1279 - * to/from kernel pages as necessary. Must be paired with 1280 - * call bio_uncopy_user() on io completion. 
1265 + * Map the user space address into a bio suitable for io to a block 1266 + * device. Returns an error pointer in case of error. 1281 1267 */ 1282 - struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, 1283 - unsigned long uaddr, unsigned int len, 1284 - int write_to_vm, gfp_t gfp_mask) 1268 + struct bio *bio_map_user_iov(struct request_queue *q, 1269 + const struct iov_iter *iter, 1270 + gfp_t gfp_mask) 1285 1271 { 1286 - struct sg_iovec iov; 1287 - 1288 - iov.iov_base = (void __user *)uaddr; 1289 - iov.iov_len = len; 1290 - 1291 - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); 1292 - } 1293 - EXPORT_SYMBOL(bio_copy_user); 1294 - 1295 - static struct bio *__bio_map_user_iov(struct request_queue *q, 1296 - struct block_device *bdev, 1297 - const struct sg_iovec *iov, int iov_count, 1298 - int write_to_vm, gfp_t gfp_mask) 1299 - { 1300 - int i, j; 1272 + int j; 1301 1273 int nr_pages = 0; 1302 1274 struct page **pages; 1303 1275 struct bio *bio; 1304 1276 int cur_page = 0; 1305 1277 int ret, offset; 1278 + struct iov_iter i; 1279 + struct iovec iov; 1306 1280 1307 - for (i = 0; i < iov_count; i++) { 1308 - unsigned long uaddr = (unsigned long)iov[i].iov_base; 1309 - unsigned long len = iov[i].iov_len; 1281 + iov_for_each(iov, i, *iter) { 1282 + unsigned long uaddr = (unsigned long) iov.iov_base; 1283 + unsigned long len = iov.iov_len; 1310 1284 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1311 1285 unsigned long start = uaddr >> PAGE_SHIFT; 1312 1286 ··· 1320 1326 if (!pages) 1321 1327 goto out; 1322 1328 1323 - for (i = 0; i < iov_count; i++) { 1324 - unsigned long uaddr = (unsigned long)iov[i].iov_base; 1325 - unsigned long len = iov[i].iov_len; 1329 + iov_for_each(iov, i, *iter) { 1330 + unsigned long uaddr = (unsigned long) iov.iov_base; 1331 + unsigned long len = iov.iov_len; 1326 1332 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1327 1333 unsigned long start = 
uaddr >> PAGE_SHIFT; 1328 1334 const int local_nr_pages = end - start; 1329 1335 const int page_limit = cur_page + local_nr_pages; 1330 1336 1331 1337 ret = get_user_pages_fast(uaddr, local_nr_pages, 1332 - write_to_vm, &pages[cur_page]); 1338 + (iter->type & WRITE) != WRITE, 1339 + &pages[cur_page]); 1333 1340 if (ret < local_nr_pages) { 1334 1341 ret = -EFAULT; 1335 1342 goto out_unmap; ··· 1370 1375 /* 1371 1376 * set data direction, and check if mapped pages need bouncing 1372 1377 */ 1373 - if (!write_to_vm) 1378 + if (iter->type & WRITE) 1374 1379 bio->bi_rw |= REQ_WRITE; 1375 1380 1376 - bio->bi_bdev = bdev; 1377 1381 bio->bi_flags |= (1 << BIO_USER_MAPPED); 1378 - return bio; 1379 - 1380 - out_unmap: 1381 - for (i = 0; i < nr_pages; i++) { 1382 - if(!pages[i]) 1383 - break; 1384 - page_cache_release(pages[i]); 1385 - } 1386 - out: 1387 - kfree(pages); 1388 - bio_put(bio); 1389 - return ERR_PTR(ret); 1390 - } 1391 - 1392 - /** 1393 - * bio_map_user - map user address into bio 1394 - * @q: the struct request_queue for the bio 1395 - * @bdev: destination block device 1396 - * @uaddr: start of user address 1397 - * @len: length in bytes 1398 - * @write_to_vm: bool indicating writing to pages or not 1399 - * @gfp_mask: memory allocation flags 1400 - * 1401 - * Map the user space address into a bio suitable for io to a block 1402 - * device. Returns an error pointer in case of error. 
1403 - */ 1404 - struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, 1405 - unsigned long uaddr, unsigned int len, int write_to_vm, 1406 - gfp_t gfp_mask) 1407 - { 1408 - struct sg_iovec iov; 1409 - 1410 - iov.iov_base = (void __user *)uaddr; 1411 - iov.iov_len = len; 1412 - 1413 - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); 1414 - } 1415 - EXPORT_SYMBOL(bio_map_user); 1416 - 1417 - /** 1418 - * bio_map_user_iov - map user sg_iovec table into bio 1419 - * @q: the struct request_queue for the bio 1420 - * @bdev: destination block device 1421 - * @iov: the iovec. 1422 - * @iov_count: number of elements in the iovec 1423 - * @write_to_vm: bool indicating writing to pages or not 1424 - * @gfp_mask: memory allocation flags 1425 - * 1426 - * Map the user space address into a bio suitable for io to a block 1427 - * device. Returns an error pointer in case of error. 1428 - */ 1429 - struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 1430 - const struct sg_iovec *iov, int iov_count, 1431 - int write_to_vm, gfp_t gfp_mask) 1432 - { 1433 - struct bio *bio; 1434 - 1435 - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, 1436 - gfp_mask); 1437 - if (IS_ERR(bio)) 1438 - return bio; 1439 1382 1440 1383 /* 1441 1384 * subtle -- if __bio_map_user() ended up bouncing a bio, ··· 1382 1449 * reference to it 1383 1450 */ 1384 1451 bio_get(bio); 1385 - 1386 1452 return bio; 1453 + 1454 + out_unmap: 1455 + for (j = 0; j < nr_pages; j++) { 1456 + if (!pages[j]) 1457 + break; 1458 + page_cache_release(pages[j]); 1459 + } 1460 + out: 1461 + kfree(pages); 1462 + bio_put(bio); 1463 + return ERR_PTR(ret); 1387 1464 } 1388 1465 1389 1466 static void __bio_unmap_user(struct bio *bio) ··· 1435 1492 bio_put(bio); 1436 1493 } 1437 1494 1438 - static struct bio *__bio_map_kern(struct request_queue *q, void *data, 1439 - unsigned int len, gfp_t gfp_mask) 1495 + /** 1496 + * bio_map_kern - map kernel address into 
bio 1497 + * @q: the struct request_queue for the bio 1498 + * @data: pointer to buffer to map 1499 + * @len: length in bytes 1500 + * @gfp_mask: allocation flags for bio allocation 1501 + * 1502 + * Map the kernel address into a bio suitable for io to a block 1503 + * device. Returns an error pointer in case of error. 1504 + */ 1505 + struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, 1506 + gfp_t gfp_mask) 1440 1507 { 1441 1508 unsigned long kaddr = (unsigned long)data; 1442 1509 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; ··· 1470 1517 bytes = len; 1471 1518 1472 1519 if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, 1473 - offset) < bytes) 1474 - break; 1520 + offset) < bytes) { 1521 + /* we don't support partial mappings */ 1522 + bio_put(bio); 1523 + return ERR_PTR(-EINVAL); 1524 + } 1475 1525 1476 1526 data += bytes; 1477 1527 len -= bytes; ··· 1484 1528 bio->bi_end_io = bio_map_kern_endio; 1485 1529 return bio; 1486 1530 } 1487 - 1488 - /** 1489 - * bio_map_kern - map kernel address into bio 1490 - * @q: the struct request_queue for the bio 1491 - * @data: pointer to buffer to map 1492 - * @len: length in bytes 1493 - * @gfp_mask: allocation flags for bio allocation 1494 - * 1495 - * Map the kernel address into a bio suitable for io to a block 1496 - * device. Returns an error pointer in case of error. 1497 - */ 1498 - struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, 1499 - gfp_t gfp_mask) 1500 - { 1501 - struct bio *bio; 1502 - 1503 - bio = __bio_map_kern(q, data, len, gfp_mask); 1504 - if (IS_ERR(bio)) 1505 - return bio; 1506 - 1507 - if (bio->bi_iter.bi_size == len) 1508 - return bio; 1509 - 1510 - /* 1511 - * Don't support partial mappings. 
1512 - */ 1513 - bio_put(bio); 1514 - return ERR_PTR(-EINVAL); 1515 - } 1516 1531 EXPORT_SYMBOL(bio_map_kern); 1517 1532 1518 1533 static void bio_copy_kern_endio(struct bio *bio, int err) 1519 1534 { 1535 + bio_free_pages(bio); 1536 + bio_put(bio); 1537 + } 1538 + 1539 + static void bio_copy_kern_endio_read(struct bio *bio, int err) 1540 + { 1541 + char *p = bio->bi_private; 1520 1542 struct bio_vec *bvec; 1521 - const int read = bio_data_dir(bio) == READ; 1522 - struct bio_map_data *bmd = bio->bi_private; 1523 1543 int i; 1524 - char *p = bmd->sgvecs[0].iov_base; 1525 1544 1526 1545 bio_for_each_segment_all(bvec, bio, i) { 1527 - char *addr = page_address(bvec->bv_page); 1528 - 1529 - if (read) 1530 - memcpy(p, addr, bvec->bv_len); 1531 - 1532 - __free_page(bvec->bv_page); 1546 + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); 1533 1547 p += bvec->bv_len; 1534 1548 } 1535 1549 1536 - kfree(bmd); 1537 - bio_put(bio); 1550 + bio_copy_kern_endio(bio, err); 1538 1551 } 1539 1552 1540 1553 /** ··· 1520 1595 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, 1521 1596 gfp_t gfp_mask, int reading) 1522 1597 { 1598 + unsigned long kaddr = (unsigned long)data; 1599 + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1600 + unsigned long start = kaddr >> PAGE_SHIFT; 1523 1601 struct bio *bio; 1524 - struct bio_vec *bvec; 1525 - int i; 1602 + void *p = data; 1603 + int nr_pages = 0; 1526 1604 1527 - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); 1528 - if (IS_ERR(bio)) 1529 - return bio; 1605 + /* 1606 + * Overflow, abort 1607 + */ 1608 + if (end < start) 1609 + return ERR_PTR(-EINVAL); 1530 1610 1531 - if (!reading) { 1532 - void *p = data; 1611 + nr_pages = end - start; 1612 + bio = bio_kmalloc(gfp_mask, nr_pages); 1613 + if (!bio) 1614 + return ERR_PTR(-ENOMEM); 1533 1615 1534 - bio_for_each_segment_all(bvec, bio, i) { 1535 - char *addr = page_address(bvec->bv_page); 1616 + while (len) { 1617 + 
struct page *page; 1618 + unsigned int bytes = PAGE_SIZE; 1536 1619 1537 - memcpy(addr, p, bvec->bv_len); 1538 - p += bvec->bv_len; 1539 - } 1620 + if (bytes > len) 1621 + bytes = len; 1622 + 1623 + page = alloc_page(q->bounce_gfp | gfp_mask); 1624 + if (!page) 1625 + goto cleanup; 1626 + 1627 + if (!reading) 1628 + memcpy(page_address(page), p, bytes); 1629 + 1630 + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 1631 + break; 1632 + 1633 + len -= bytes; 1634 + p += bytes; 1540 1635 } 1541 1636 1542 - bio->bi_end_io = bio_copy_kern_endio; 1637 + if (reading) { 1638 + bio->bi_end_io = bio_copy_kern_endio_read; 1639 + bio->bi_private = data; 1640 + } else { 1641 + bio->bi_end_io = bio_copy_kern_endio; 1642 + bio->bi_rw |= REQ_WRITE; 1643 + } 1543 1644 1544 1645 return bio; 1646 + 1647 + cleanup: 1648 + bio_free_pages(bio); 1649 + bio_put(bio); 1650 + return ERR_PTR(-ENOMEM); 1545 1651 } 1546 1652 EXPORT_SYMBOL(bio_copy_kern); 1547 1653

+8 -3

block/blk-core.c

··· 2048 2048 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) 2049 2049 return -EIO; 2050 2050 2051 + if (q->mq_ops) { 2052 + if (blk_queue_io_stat(q)) 2053 + blk_account_io_start(rq, true); 2054 + blk_mq_insert_request(rq, false, true, true); 2055 + return 0; 2056 + } 2057 + 2051 2058 spin_lock_irqsave(q->queue_lock, flags); 2052 2059 if (unlikely(blk_queue_dying(q))) { 2053 2060 spin_unlock_irqrestore(q->queue_lock, flags); ··· 2914 2907 static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2915 2908 { 2916 2909 dst->cpu = src->cpu; 2917 - dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; 2910 + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; 2918 2911 dst->cmd_type = src->cmd_type; 2919 2912 dst->__sector = blk_rq_pos(src); 2920 2913 dst->__data_len = blk_rq_bytes(src); ··· 2951 2944 2952 2945 if (!bs) 2953 2946 bs = fs_bio_set; 2954 - 2955 - blk_rq_init(NULL, rq); 2956 2947 2957 2948 __rq_for_each_bio(bio_src, rq_src) { 2958 2949 bio = bio_clone_fast(bio_src, gfp_mask, bs);

+20 -10

block/blk-lib.c

··· 283 283 * @sector: start sector 284 284 * @nr_sects: number of sectors to write 285 285 * @gfp_mask: memory allocation flags (for bio_alloc) 286 + * @discard: whether to discard the block range 286 287 * 287 288 * Description: 288 - * Generate and issue number of bios with zerofiled pages. 289 + * Zero-fill a block range. If the discard flag is set and the block 290 + * device guarantees that subsequent READ operations to the block range 291 + * in question will return zeroes, the blocks will be discarded. Should 292 + * the discard request fail, if the discard flag is not set, or if 293 + * discard_zeroes_data is not supported, this function will resort to 294 + * zeroing the blocks manually, thus provisioning (allocating, 295 + * anchoring) them. If the block device supports the WRITE SAME command 296 + * blkdev_issue_zeroout() will use it to optimize the process of 297 + * clearing the block range. Otherwise the zeroing will be performed 298 + * using regular WRITE calls. 289 299 */ 290 300 291 301 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 292 - sector_t nr_sects, gfp_t gfp_mask) 302 + sector_t nr_sects, gfp_t gfp_mask, bool discard) 293 303 { 294 - if (bdev_write_same(bdev)) { 295 - unsigned char bdn[BDEVNAME_SIZE]; 304 + struct request_queue *q = bdev_get_queue(bdev); 296 305 297 - if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, 298 - ZERO_PAGE(0))) 299 - return 0; 306 + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && 307 + blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) 308 + return 0; 300 309 301 - bdevname(bdev, bdn); 302 - pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); 303 - } 310 + if (bdev_write_same(bdev) && 311 + blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, 312 + ZERO_PAGE(0)) == 0) 313 + return 0; 304 314 305 315 return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); 306 316 }

+32 -140

block/blk-map.c

··· 5 5 #include <linux/module.h> 6 6 #include <linux/bio.h> 7 7 #include <linux/blkdev.h> 8 - #include <scsi/sg.h> /* for struct sg_iovec */ 8 + #include <linux/uio.h> 9 9 10 10 #include "blk.h" 11 11 ··· 39 39 return ret; 40 40 } 41 41 42 - static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 43 - struct rq_map_data *map_data, void __user *ubuf, 44 - unsigned int len, gfp_t gfp_mask) 45 - { 46 - unsigned long uaddr; 47 - struct bio *bio, *orig_bio; 48 - int reading, ret; 49 - 50 - reading = rq_data_dir(rq) == READ; 51 - 52 - /* 53 - * if alignment requirement is satisfied, map in user pages for 54 - * direct dma. else, set up kernel bounce buffers 55 - */ 56 - uaddr = (unsigned long) ubuf; 57 - if (blk_rq_aligned(q, uaddr, len) && !map_data) 58 - bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 59 - else 60 - bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 61 - 62 - if (IS_ERR(bio)) 63 - return PTR_ERR(bio); 64 - 65 - if (map_data && map_data->null_mapped) 66 - bio->bi_flags |= (1 << BIO_NULL_MAPPED); 67 - 68 - orig_bio = bio; 69 - blk_queue_bounce(q, &bio); 70 - 71 - /* 72 - * We link the bounce buffer in and could have to traverse it 73 - * later so we have to get a ref to prevent it from being freed 74 - */ 75 - bio_get(bio); 76 - 77 - ret = blk_rq_append_bio(q, rq, bio); 78 - if (!ret) 79 - return bio->bi_iter.bi_size; 80 - 81 - /* if it was boucned we must call the end io function */ 82 - bio_endio(bio, 0); 83 - __blk_rq_unmap_user(orig_bio); 84 - bio_put(bio); 85 - return ret; 86 - } 87 - 88 - /** 89 - * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage 90 - * @q: request queue where request should be inserted 91 - * @rq: request structure to fill 92 - * @map_data: pointer to the rq_map_data holding pages (if necessary) 93 - * @ubuf: the user buffer 94 - * @len: length of user data 95 - * @gfp_mask: memory allocation flags 96 - * 97 - * Description: 98 - * Data will be mapped directly 
for zero copy I/O, if possible. Otherwise 99 - * a kernel bounce buffer is used. 100 - * 101 - * A matching blk_rq_unmap_user() must be issued at the end of I/O, while 102 - * still in process context. 103 - * 104 - * Note: The mapped bio may need to be bounced through blk_queue_bounce() 105 - * before being submitted to the device, as pages mapped may be out of 106 - * reach. It's the callers responsibility to make sure this happens. The 107 - * original bio must be passed back in to blk_rq_unmap_user() for proper 108 - * unmapping. 109 - */ 110 - int blk_rq_map_user(struct request_queue *q, struct request *rq, 111 - struct rq_map_data *map_data, void __user *ubuf, 112 - unsigned long len, gfp_t gfp_mask) 113 - { 114 - unsigned long bytes_read = 0; 115 - struct bio *bio = NULL; 116 - int ret; 117 - 118 - if (len > (queue_max_hw_sectors(q) << 9)) 119 - return -EINVAL; 120 - if (!len) 121 - return -EINVAL; 122 - 123 - if (!ubuf && (!map_data || !map_data->null_mapped)) 124 - return -EINVAL; 125 - 126 - while (bytes_read != len) { 127 - unsigned long map_len, end, start; 128 - 129 - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 130 - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 131 - >> PAGE_SHIFT; 132 - start = (unsigned long)ubuf >> PAGE_SHIFT; 133 - 134 - /* 135 - * A bad offset could cause us to require BIO_MAX_PAGES + 1 136 - * pages. 
If this happens we just lower the requested 137 - * mapping len by a page so that we can fit 138 - */ 139 - if (end - start > BIO_MAX_PAGES) 140 - map_len -= PAGE_SIZE; 141 - 142 - ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, 143 - gfp_mask); 144 - if (ret < 0) 145 - goto unmap_rq; 146 - if (!bio) 147 - bio = rq->bio; 148 - bytes_read += ret; 149 - ubuf += ret; 150 - 151 - if (map_data) 152 - map_data->offset += ret; 153 - } 154 - 155 - if (!bio_flagged(bio, BIO_USER_MAPPED)) 156 - rq->cmd_flags |= REQ_COPY_USER; 157 - 158 - return 0; 159 - unmap_rq: 160 - blk_rq_unmap_user(bio); 161 - rq->bio = NULL; 162 - return ret; 163 - } 164 - EXPORT_SYMBOL(blk_rq_map_user); 165 - 166 42 /** 167 43 * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage 168 44 * @q: request queue where request should be inserted 169 45 * @rq: request to map data to 170 46 * @map_data: pointer to the rq_map_data holding pages (if necessary) 171 - * @iov: pointer to the iovec 172 - * @iov_count: number of elements in the iovec 173 - * @len: I/O byte count 47 + * @iter: iovec iterator 174 48 * @gfp_mask: memory allocation flags 175 49 * 176 50 * Description: ··· 61 187 * unmapping. 
62 188 */ 63 189 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 64 - struct rq_map_data *map_data, const struct sg_iovec *iov, 65 - int iov_count, unsigned int len, gfp_t gfp_mask) 190 + struct rq_map_data *map_data, 191 + const struct iov_iter *iter, gfp_t gfp_mask) 66 192 { 67 193 struct bio *bio; 68 - int i, read = rq_data_dir(rq) == READ; 69 194 int unaligned = 0; 195 + struct iov_iter i; 196 + struct iovec iov; 70 197 71 - if (!iov || iov_count <= 0) 198 + if (!iter || !iter->count) 72 199 return -EINVAL; 73 200 74 - for (i = 0; i < iov_count; i++) { 75 - unsigned long uaddr = (unsigned long)iov[i].iov_base; 201 + iov_for_each(iov, i, *iter) { 202 + unsigned long uaddr = (unsigned long) iov.iov_base; 76 203 77 - if (!iov[i].iov_len) 204 + if (!iov.iov_len) 78 205 return -EINVAL; 79 206 80 207 /* ··· 85 210 unaligned = 1; 86 211 } 87 212 88 - if (unaligned || (q->dma_pad_mask & len) || map_data) 89 - bio = bio_copy_user_iov(q, map_data, iov, iov_count, read, 90 - gfp_mask); 213 + if (unaligned || (q->dma_pad_mask & iter->count) || map_data) 214 + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); 91 215 else 92 - bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask); 216 + bio = bio_map_user_iov(q, iter, gfp_mask); 93 217 94 218 if (IS_ERR(bio)) 95 219 return PTR_ERR(bio); 96 220 97 - if (bio->bi_iter.bi_size != len) { 221 + if (map_data && map_data->null_mapped) 222 + bio->bi_flags |= (1 << BIO_NULL_MAPPED); 223 + 224 + if (bio->bi_iter.bi_size != iter->count) { 98 225 /* 99 226 * Grab an extra reference to this bio, as bio_unmap_user() 100 227 * expects to be able to drop it twice as it happens on the ··· 117 240 return 0; 118 241 } 119 242 EXPORT_SYMBOL(blk_rq_map_user_iov); 243 + 244 + int blk_rq_map_user(struct request_queue *q, struct request *rq, 245 + struct rq_map_data *map_data, void __user *ubuf, 246 + unsigned long len, gfp_t gfp_mask) 247 + { 248 + struct iovec iov; 249 + struct iov_iter i; 250 + 251 + 
iov.iov_base = ubuf; 252 + iov.iov_len = len; 253 + iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len); 254 + 255 + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); 256 + } 257 + EXPORT_SYMBOL(blk_rq_map_user); 120 258 121 259 /** 122 260 * blk_rq_unmap_user - unmap a request with user data

+12 -29

block/blk-merge.c

··· 283 283 } 284 284 EXPORT_SYMBOL(blk_rq_map_sg); 285 285 286 - /** 287 - * blk_bio_map_sg - map a bio to a scatterlist 288 - * @q: request_queue in question 289 - * @bio: bio being mapped 290 - * @sglist: scatterlist being mapped 291 - * 292 - * Note: 293 - * Caller must make sure sg can hold bio->bi_phys_segments entries 294 - * 295 - * Will return the number of sg entries setup 296 - */ 297 - int blk_bio_map_sg(struct request_queue *q, struct bio *bio, 298 - struct scatterlist *sglist) 299 - { 300 - struct scatterlist *sg = NULL; 301 - int nsegs; 302 - struct bio *next = bio->bi_next; 303 - bio->bi_next = NULL; 304 - 305 - nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); 306 - bio->bi_next = next; 307 - if (sg) 308 - sg_mark_end(sg); 309 - 310 - BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); 311 - return nsegs; 312 - } 313 - EXPORT_SYMBOL(blk_bio_map_sg); 314 - 315 286 static inline int ll_new_hw_segment(struct request_queue *q, 316 287 struct request *req, 317 288 struct bio *bio) ··· 356 385 return !q->mq_ops && req->special; 357 386 } 358 387 388 + static int req_gap_to_prev(struct request *req, struct request *next) 389 + { 390 + struct bio *prev = req->biotail; 391 + 392 + return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], 393 + next->bio->bi_io_vec[0].bv_offset); 394 + } 395 + 359 396 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 360 397 struct request *next) 361 398 { ··· 376 397 * requests. Can't merge them if they are. 377 398 */ 378 399 if (req_no_special_merge(req) || req_no_special_merge(next)) 400 + return 0; 401 + 402 + if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && 403 + req_gap_to_prev(req, next)) 379 404 return 0; 380 405 381 406 /*

+50 -31

block/blk-mq-tag.c

··· 140 140 return atomic_read(&hctx->nr_active) < depth; 141 141 } 142 142 143 - static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) 143 + static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, 144 + bool nowrap) 144 145 { 145 - int tag, org_last_tag, end; 146 - bool wrap = last_tag != 0; 146 + int tag, org_last_tag = last_tag; 147 147 148 - org_last_tag = last_tag; 149 - end = bm->depth; 150 - do { 151 - restart: 152 - tag = find_next_zero_bit(&bm->word, end, last_tag); 153 - if (unlikely(tag >= end)) { 148 + while (1) { 149 + tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); 150 + if (unlikely(tag >= bm->depth)) { 154 151 /* 155 - * We started with an offset, start from 0 to 152 + * We started with an offset, and we didn't reset the 153 + * offset to 0 in a failure case, so start from 0 to 156 154 * exhaust the map. 157 155 */ 158 - if (wrap) { 159 - wrap = false; 160 - end = org_last_tag; 161 - last_tag = 0; 162 - goto restart; 156 + if (org_last_tag && last_tag && !nowrap) { 157 + last_tag = org_last_tag = 0; 158 + continue; 163 159 } 164 160 return -1; 165 161 } 162 + 163 + if (!test_and_set_bit(tag, &bm->word)) 164 + break; 165 + 166 166 last_tag = tag + 1; 167 - } while (test_and_set_bit(tag, &bm->word)); 167 + if (last_tag >= bm->depth - 1) 168 + last_tag = 0; 169 + } 168 170 169 171 return tag; 170 172 } 173 + 174 + #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) 171 175 172 176 /* 173 177 * Straight forward bitmap tag implementation, where each bit is a tag ··· 185 181 * until the map is exhausted. 
186 182 */ 187 183 static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, 188 - unsigned int *tag_cache) 184 + unsigned int *tag_cache, struct blk_mq_tags *tags) 189 185 { 190 186 unsigned int last_tag, org_last_tag; 191 187 int index, i, tag; ··· 197 193 index = TAG_TO_INDEX(bt, last_tag); 198 194 199 195 for (i = 0; i < bt->map_nr; i++) { 200 - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); 196 + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), 197 + BT_ALLOC_RR(tags)); 201 198 if (tag != -1) { 202 199 tag += (index << bt->bits_per_word); 203 200 goto done; 204 201 } 205 202 206 - last_tag = 0; 207 - if (++index >= bt->map_nr) 203 + /* 204 + * Jump to next index, and reset the last tag to be the 205 + * first tag of that index 206 + */ 207 + index++; 208 + last_tag = (index << bt->bits_per_word); 209 + 210 + if (index >= bt->map_nr) { 208 211 index = 0; 212 + last_tag = 0; 213 + } 209 214 } 210 215 211 216 *tag_cache = 0; ··· 225 212 * up using the specific cached tag. 
226 213 */ 227 214 done: 228 - if (tag == org_last_tag) { 215 + if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { 229 216 last_tag = tag + 1; 230 217 if (last_tag >= bt->depth - 1) 231 218 last_tag = 0; ··· 254 241 static int bt_get(struct blk_mq_alloc_data *data, 255 242 struct blk_mq_bitmap_tags *bt, 256 243 struct blk_mq_hw_ctx *hctx, 257 - unsigned int *last_tag) 244 + unsigned int *last_tag, struct blk_mq_tags *tags) 258 245 { 259 246 struct bt_wait_state *bs; 260 247 DEFINE_WAIT(wait); 261 248 int tag; 262 249 263 - tag = __bt_get(hctx, bt, last_tag); 250 + tag = __bt_get(hctx, bt, last_tag, tags); 264 251 if (tag != -1) 265 252 return tag; 266 253 ··· 271 258 do { 272 259 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); 273 260 274 - tag = __bt_get(hctx, bt, last_tag); 261 + tag = __bt_get(hctx, bt, last_tag, tags); 275 262 if (tag != -1) 276 263 break; 277 264 ··· 286 273 * Retry tag allocation after running the hardware queue, 287 274 * as running the queue may also have found completions. 
288 275 */ 289 - tag = __bt_get(hctx, bt, last_tag); 276 + tag = __bt_get(hctx, bt, last_tag, tags); 290 277 if (tag != -1) 291 278 break; 292 279 ··· 317 304 int tag; 318 305 319 306 tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, 320 - &data->ctx->last_tag); 307 + &data->ctx->last_tag, data->hctx->tags); 321 308 if (tag >= 0) 322 309 return tag + data->hctx->tags->nr_reserved_tags; 323 310 ··· 333 320 return BLK_MQ_TAG_FAIL; 334 321 } 335 322 336 - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); 323 + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, 324 + data->hctx->tags); 337 325 if (tag < 0) 338 326 return BLK_MQ_TAG_FAIL; 339 327 ··· 406 392 407 393 BUG_ON(real_tag >= tags->nr_tags); 408 394 bt_clear_tag(&tags->bitmap_tags, real_tag); 409 - *last_tag = real_tag; 395 + if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) 396 + *last_tag = real_tag; 410 397 } else { 411 398 BUG_ON(tag >= tags->nr_reserved_tags); 412 399 bt_clear_tag(&tags->breserved_tags, tag); ··· 524 509 bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); 525 510 if (!bt->bs) { 526 511 kfree(bt->map); 512 + bt->map = NULL; 527 513 return -ENOMEM; 528 514 } 529 515 ··· 545 529 } 546 530 547 531 static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, 548 - int node) 532 + int node, int alloc_policy) 549 533 { 550 534 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; 535 + 536 + tags->alloc_policy = alloc_policy; 551 537 552 538 if (bt_alloc(&tags->bitmap_tags, depth, node, false)) 553 539 goto enomem; ··· 564 546 } 565 547 566 548 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 567 - unsigned int reserved_tags, int node) 549 + unsigned int reserved_tags, 550 + int node, int alloc_policy) 568 551 { 569 552 struct blk_mq_tags *tags; 570 553 ··· 581 562 tags->nr_tags = total_tags; 582 563 tags->nr_reserved_tags = reserved_tags; 583 564 584 - return blk_mq_init_bitmap_tags(tags, node); 
565 + return blk_mq_init_bitmap_tags(tags, node, alloc_policy); 585 566 } 586 567 587 568 void blk_mq_free_tags(struct blk_mq_tags *tags)

+3 -1

block/blk-mq-tag.h

··· 42 42 43 43 struct request **rqs; 44 44 struct list_head page_list; 45 + 46 + int alloc_policy; 45 47 }; 46 48 47 49 48 - extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 50 + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); 49 51 extern void blk_mq_free_tags(struct blk_mq_tags *tags); 50 52 51 53 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);

+7 -6

block/blk-mq.c

··· 33 33 static LIST_HEAD(all_q_list); 34 34 35 35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 36 + static void blk_mq_run_queues(struct request_queue *q); 36 37 37 38 /* 38 39 * Check if any of the ctx's have pending work in this hardware queue ··· 118 117 119 118 if (freeze) { 120 119 percpu_ref_kill(&q->mq_usage_counter); 121 - blk_mq_run_queues(q, false); 120 + blk_mq_run_queues(q); 122 121 } 123 122 } 124 123 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); ··· 137 136 blk_mq_freeze_queue_start(q); 138 137 blk_mq_freeze_queue_wait(q); 139 138 } 139 + EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 140 140 141 141 void blk_mq_unfreeze_queue(struct request_queue *q) 142 142 { ··· 904 902 &hctx->run_work, 0); 905 903 } 906 904 907 - void blk_mq_run_queues(struct request_queue *q, bool async) 905 + static void blk_mq_run_queues(struct request_queue *q) 908 906 { 909 907 struct blk_mq_hw_ctx *hctx; 910 908 int i; ··· 915 913 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 916 914 continue; 917 915 918 - blk_mq_run_hw_queue(hctx, async); 916 + blk_mq_run_hw_queue(hctx, false); 919 917 } 920 918 } 921 - EXPORT_SYMBOL(blk_mq_run_queues); 922 919 923 920 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 924 921 { ··· 954 953 blk_mq_start_hw_queue(hctx); 955 954 } 956 955 EXPORT_SYMBOL(blk_mq_start_hw_queues); 957 - 958 956 959 957 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 960 958 { ··· 1423 1423 size_t rq_size, left; 1424 1424 1425 1425 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1426 - set->numa_node); 1426 + set->numa_node, 1427 + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1427 1428 if (!tags) 1428 1429 return NULL; 1429 1430

+25 -8

block/blk-tag.c

··· 119 119 } 120 120 121 121 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 122 - int depth) 122 + int depth, int alloc_policy) 123 123 { 124 124 struct blk_queue_tag *tags; 125 125 ··· 131 131 goto fail; 132 132 133 133 atomic_set(&tags->refcnt, 1); 134 + tags->alloc_policy = alloc_policy; 135 + tags->next_tag = 0; 134 136 return tags; 135 137 fail: 136 138 kfree(tags); ··· 142 140 /** 143 141 * blk_init_tags - initialize the tag info for an external tag map 144 142 * @depth: the maximum queue depth supported 143 + * @alloc_policy: tag allocation policy 145 144 **/ 146 - struct blk_queue_tag *blk_init_tags(int depth) 145 + struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) 147 146 { 148 - return __blk_queue_init_tags(NULL, depth); 147 + return __blk_queue_init_tags(NULL, depth, alloc_policy); 149 148 } 150 149 EXPORT_SYMBOL(blk_init_tags); 151 150 ··· 155 152 * @q: the request queue for the device 156 153 * @depth: the maximum queue depth supported 157 154 * @tags: the tag to use 155 + * @alloc_policy: tag allocation policy 158 156 * 159 157 * Queue lock must be held here if the function is called to resize an 160 158 * existing map. 
161 159 **/ 162 160 int blk_queue_init_tags(struct request_queue *q, int depth, 163 - struct blk_queue_tag *tags) 161 + struct blk_queue_tag *tags, int alloc_policy) 164 162 { 165 163 int rc; 166 164 167 165 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 168 166 169 167 if (!tags && !q->queue_tags) { 170 - tags = __blk_queue_init_tags(q, depth); 168 + tags = __blk_queue_init_tags(q, depth, alloc_policy); 171 169 172 170 if (!tags) 173 171 return -ENOMEM; ··· 348 344 } 349 345 350 346 do { 351 - tag = find_first_zero_bit(bqt->tag_map, max_depth); 352 - if (tag >= max_depth) 353 - return 1; 347 + if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { 348 + tag = find_first_zero_bit(bqt->tag_map, max_depth); 349 + if (tag >= max_depth) 350 + return 1; 351 + } else { 352 + int start = bqt->next_tag; 353 + int size = min_t(int, bqt->max_depth, max_depth + start); 354 + tag = find_next_zero_bit(bqt->tag_map, size, start); 355 + if (tag >= size && start + size > bqt->max_depth) { 356 + size = start + size - bqt->max_depth; 357 + tag = find_first_zero_bit(bqt->tag_map, size); 358 + } 359 + if (tag >= size) 360 + return 1; 361 + } 354 362 355 363 } while (test_and_set_bit_lock(tag, bqt->tag_map)); 356 364 /* ··· 370 354 * See blk_queue_end_tag for details. 371 355 */ 372 356 357 + bqt->next_tag = (tag + 1) % bqt->max_depth; 373 358 rq->cmd_flags |= REQ_QUEUED; 374 359 rq->tag = tag; 375 360 bqt->tag_index[tag] = rq;

+13 -3

block/cfq-iosched.c

··· 3590 3590 3591 3591 blkcg = bio_blkcg(bio); 3592 3592 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); 3593 + if (!cfqg) { 3594 + cfqq = &cfqd->oom_cfqq; 3595 + goto out; 3596 + } 3597 + 3593 3598 cfqq = cic_to_cfqq(cic, is_sync); 3594 3599 3595 3600 /* ··· 3631 3626 } else 3632 3627 cfqq = &cfqd->oom_cfqq; 3633 3628 } 3634 - 3629 + out: 3635 3630 if (new_cfqq) 3636 3631 kmem_cache_free(cfq_pool, new_cfqq); 3637 3632 ··· 3661 3656 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3662 3657 struct bio *bio, gfp_t gfp_mask) 3663 3658 { 3664 - const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3665 - const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3659 + int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3660 + int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3666 3661 struct cfq_queue **async_cfqq = NULL; 3667 3662 struct cfq_queue *cfqq = NULL; 3668 3663 3669 3664 if (!is_sync) { 3665 + if (!ioprio_valid(cic->ioprio)) { 3666 + struct task_struct *tsk = current; 3667 + ioprio = task_nice_ioprio(tsk); 3668 + ioprio_class = task_nice_ioclass(tsk); 3669 + } 3670 3670 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 3671 3671 cfqq = *async_cfqq; 3672 3672 }

+1 -1

block/ioctl.c

··· 198 198 if (start + len > (i_size_read(bdev->bd_inode) >> 9)) 199 199 return -EINVAL; 200 200 201 - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); 201 + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); 202 202 } 203 203 204 204 static int put_ushort(unsigned long arg, unsigned short val)

+6 -6

block/partitions/check.c

··· 184 184 if (err) 185 185 /* The partition is unrecognized. So report I/O errors if there were any */ 186 186 res = err; 187 - if (!res) 188 - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); 189 - else if (warn_no_part) 190 - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); 191 - 192 - printk(KERN_INFO "%s", state->pp_buf); 187 + if (res) { 188 + if (warn_no_part) 189 + strlcat(state->pp_buf, 190 + " unable to read partition table\n", PAGE_SIZE); 191 + printk(KERN_INFO "%s", state->pp_buf); 192 + } 193 193 194 194 free_page((unsigned long)state->pp_buf); 195 195 free_partitions(state);

+4 -13

block/scsi_ioctl.c

··· 332 332 333 333 ret = 0; 334 334 if (hdr->iovec_count) { 335 - size_t iov_data_len; 335 + struct iov_iter i; 336 336 struct iovec *iov = NULL; 337 337 338 338 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, ··· 342 342 goto out_free_cdb; 343 343 } 344 344 345 - iov_data_len = ret; 346 - ret = 0; 347 - 348 345 /* SG_IO howto says that the shorter of the two wins */ 349 - if (hdr->dxfer_len < iov_data_len) { 350 - hdr->iovec_count = iov_shorten(iov, 351 - hdr->iovec_count, 352 - hdr->dxfer_len); 353 - iov_data_len = hdr->dxfer_len; 354 - } 346 + iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count, 347 + min_t(unsigned, ret, hdr->dxfer_len)); 355 348 356 - ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, 357 - hdr->iovec_count, 358 - iov_data_len, GFP_KERNEL); 349 + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); 359 350 kfree(iov); 360 351 } else if (hdr->dxfer_len) 361 352 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,

+7 -7

drivers/block/brd.c

··· 370 370 } 371 371 372 372 #ifdef CONFIG_BLK_DEV_XIP 373 - static int brd_direct_access(struct block_device *bdev, sector_t sector, 374 - void **kaddr, unsigned long *pfn) 373 + static long brd_direct_access(struct block_device *bdev, sector_t sector, 374 + void **kaddr, unsigned long *pfn, long size) 375 375 { 376 376 struct brd_device *brd = bdev->bd_disk->private_data; 377 377 struct page *page; 378 378 379 379 if (!brd) 380 380 return -ENODEV; 381 - if (sector & (PAGE_SECTORS-1)) 382 - return -EINVAL; 383 - if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) 384 - return -ERANGE; 385 381 page = brd_insert_page(brd, sector); 386 382 if (!page) 387 383 return -ENOSPC; 388 384 *kaddr = page_address(page); 389 385 *pfn = page_to_pfn(page); 390 386 391 - return 0; 387 + /* 388 + * TODO: If size > PAGE_SIZE, we could look to see if the next page in 389 + * the file happens to be mapped to the next page of physical RAM. 390 + */ 391 + return PAGE_SIZE; 392 392 } 393 393 #endif 394 394

+1 -1

drivers/block/drbd/drbd_receiver.c

··· 1388 1388 list_add_tail(&peer_req->w.list, &device->active_ee); 1389 1389 spin_unlock_irq(&device->resource->req_lock); 1390 1390 if (blkdev_issue_zeroout(device->ldev->backing_bdev, 1391 - sector, data_size >> 9, GFP_NOIO)) 1391 + sector, data_size >> 9, GFP_NOIO, false)) 1392 1392 peer_req->flags |= EE_WAS_ERROR; 1393 1393 drbd_endio_write_sec_final(peer_req); 1394 1394 return 0;

+1 -1

drivers/block/osdblk.c

··· 423 423 } 424 424 425 425 /* switch queue to TCQ mode; allocate tag map */ 426 - rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); 426 + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); 427 427 if (rc) { 428 428 blk_cleanup_queue(q); 429 429 put_disk(disk);

+1

drivers/md/dm.c

··· 1722 1722 { 1723 1723 int r; 1724 1724 1725 + blk_rq_init(NULL, clone); 1725 1726 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1726 1727 dm_rq_bio_constructor, tio); 1727 1728 if (r)

+9 -12

drivers/s390/block/dcssblk.c

··· 28 28 static int dcssblk_open(struct block_device *bdev, fmode_t mode); 29 29 static void dcssblk_release(struct gendisk *disk, fmode_t mode); 30 30 static void dcssblk_make_request(struct request_queue *q, struct bio *bio); 31 - static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 32 - void **kaddr, unsigned long *pfn); 31 + static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 32 + void **kaddr, unsigned long *pfn, long size); 33 33 34 34 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; 35 35 ··· 877 877 bio_io_error(bio); 878 878 } 879 879 880 - static int 880 + static long 881 881 dcssblk_direct_access (struct block_device *bdev, sector_t secnum, 882 - void **kaddr, unsigned long *pfn) 882 + void **kaddr, unsigned long *pfn, long size) 883 883 { 884 884 struct dcssblk_dev_info *dev_info; 885 - unsigned long pgoff; 885 + unsigned long offset, dev_sz; 886 886 887 887 dev_info = bdev->bd_disk->private_data; 888 888 if (!dev_info) 889 889 return -ENODEV; 890 - if (secnum % (PAGE_SIZE/512)) 891 - return -EINVAL; 892 - pgoff = secnum / (PAGE_SIZE / 512); 893 - if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start) 894 - return -ERANGE; 895 - *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE); 890 + dev_sz = dev_info->end - dev_info->start; 891 + offset = secnum * 512; 892 + *kaddr = (void *) (dev_info->start + offset); 896 893 *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; 897 894 898 - return 0; 895 + return dev_sz - offset; 899 896 } 900 897 901 898 static void

+2

drivers/scsi/scsi_lib.c

··· 2197 2197 shost->tag_set.cmd_size = cmd_size; 2198 2198 shost->tag_set.numa_node = NUMA_NO_NODE; 2199 2199 shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 2200 + shost->tag_set.flags |= 2201 + BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); 2200 2202 shost->tag_set.driver_data = shost; 2201 2203 2202 2204 return blk_mq_alloc_tag_set(&shost->tag_set);

+2 -1

drivers/scsi/scsi_scan.c

··· 277 277 if (!shost_use_blk_mq(sdev->host) && 278 278 (shost->bqt || shost->hostt->use_blk_tags)) { 279 279 blk_queue_init_tags(sdev->request_queue, 280 - sdev->host->cmd_per_lun, shost->bqt); 280 + sdev->host->cmd_per_lun, shost->bqt, 281 + shost->hostt->tag_alloc_policy); 281 282 } 282 283 scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun); 283 284

+6 -9

drivers/scsi/sg.c

··· 1719 1719 } 1720 1720 1721 1721 if (iov_count) { 1722 - int len, size = sizeof(struct sg_iovec) * iov_count; 1722 + int size = sizeof(struct iovec) * iov_count; 1723 1723 struct iovec *iov; 1724 + struct iov_iter i; 1724 1725 1725 1726 iov = memdup_user(hp->dxferp, size); 1726 1727 if (IS_ERR(iov)) 1727 1728 return PTR_ERR(iov); 1728 1729 1729 - len = iov_length(iov, iov_count); 1730 - if (hp->dxfer_len < len) { 1731 - iov_count = iov_shorten(iov, iov_count, hp->dxfer_len); 1732 - len = hp->dxfer_len; 1733 - } 1730 + iov_iter_init(&i, rw, iov, iov_count, 1731 + min_t(size_t, hp->dxfer_len, 1732 + iov_length(iov, iov_count))); 1734 1733 1735 - res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov, 1736 - iov_count, 1737 - len, GFP_ATOMIC); 1734 + res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); 1738 1735 kfree(iov); 1739 1736 } else 1740 1737 res = blk_rq_map_user(q, rq, md, hp->dxferp,

+40

fs/block_dev.c

··· 421 421 } 422 422 EXPORT_SYMBOL_GPL(bdev_write_page); 423 423 424 + /** 425 + * bdev_direct_access() - Get the address for directly-accessible memory 426 + * @bdev: The device containing the memory 427 + * @sector: The offset within the device 428 + * @addr: Where to put the address of the memory 429 + * @pfn: The Page Frame Number for the memory 430 + * @size: The number of bytes requested 431 + * 432 + * If a block device is made up of directly addressable memory, this function 433 + * will tell the caller the PFN and the address of the memory. The address 434 + * may be directly dereferenced within the kernel without the need to call 435 + * ioremap(), kmap() or similar. The PFN is suitable for inserting into 436 + * page tables. 437 + * 438 + * Return: negative errno if an error occurs, otherwise the number of bytes 439 + * accessible at this address. 440 + */ 441 + long bdev_direct_access(struct block_device *bdev, sector_t sector, 442 + void **addr, unsigned long *pfn, long size) 443 + { 444 + long avail; 445 + const struct block_device_operations *ops = bdev->bd_disk->fops; 446 + 447 + if (size < 0) 448 + return size; 449 + if (!ops->direct_access) 450 + return -EOPNOTSUPP; 451 + if ((sector + DIV_ROUND_UP(size, 512)) > 452 + part_nr_sects_read(bdev->bd_part)) 453 + return -ERANGE; 454 + sector += get_start_sect(bdev); 455 + if (sector % (PAGE_SIZE / 512)) 456 + return -EINVAL; 457 + avail = ops->direct_access(bdev, sector, addr, pfn, size); 458 + if (!avail) 459 + return -ERANGE; 460 + return min(avail, size); 461 + } 462 + EXPORT_SYMBOL_GPL(bdev_direct_access); 463 + 424 464 /* 425 465 * pseudo-fs 426 466 */

+13 -18

fs/ext2/xip.c

··· 13 13 #include "ext2.h" 14 14 #include "xip.h" 15 15 16 - static inline int 17 - __inode_direct_access(struct inode *inode, sector_t block, 18 - void **kaddr, unsigned long *pfn) 16 + static inline long __inode_direct_access(struct inode *inode, sector_t block, 17 + void **kaddr, unsigned long *pfn, long size) 19 18 { 20 19 struct block_device *bdev = inode->i_sb->s_bdev; 21 - const struct block_device_operations *ops = bdev->bd_disk->fops; 22 - sector_t sector; 23 - 24 - sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ 25 - 26 - BUG_ON(!ops->direct_access); 27 - return ops->direct_access(bdev, sector, kaddr, pfn); 20 + sector_t sector = block * (PAGE_SIZE / 512); 21 + return bdev_direct_access(bdev, sector, kaddr, pfn, size); 28 22 } 29 23 30 24 static inline int ··· 47 53 { 48 54 void *kaddr; 49 55 unsigned long pfn; 50 - int rc; 56 + long size; 51 57 52 - rc = __inode_direct_access(inode, block, &kaddr, &pfn); 53 - if (!rc) 54 - clear_page(kaddr); 55 - return rc; 58 + size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); 59 + if (size < 0) 60 + return size; 61 + clear_page(kaddr); 62 + return 0; 56 63 } 57 64 58 65 void ext2_xip_verify_sb(struct super_block *sb) ··· 72 77 int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, 73 78 void **kmem, unsigned long *pfn) 74 79 { 75 - int rc; 80 + long rc; 76 81 sector_t block; 77 82 78 83 /* first, retrieve the sector number */ ··· 81 86 return rc; 82 87 83 88 /* retrieve address of the target data */ 84 - rc = __inode_direct_access(mapping->host, block, kmem, pfn); 85 - return rc; 89 + rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); 90 + return (rc < 0) ? rc : 0; 86 91 }

+3 -9

include/linux/bio.h

··· 428 428 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 429 429 unsigned int, unsigned int); 430 430 extern int bio_get_nr_vecs(struct block_device *); 431 - extern struct bio *bio_map_user(struct request_queue *, struct block_device *, 432 - unsigned long, unsigned int, int, gfp_t); 433 - struct sg_iovec; 434 431 struct rq_map_data; 435 432 extern struct bio *bio_map_user_iov(struct request_queue *, 436 - struct block_device *, 437 - const struct sg_iovec *, int, int, gfp_t); 433 + const struct iov_iter *, gfp_t); 438 434 extern void bio_unmap_user(struct bio *); 439 435 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, 440 436 gfp_t); ··· 458 462 extern void bio_copy_data(struct bio *dst, struct bio *src); 459 463 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); 460 464 461 - extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 462 - unsigned long, unsigned int, int, gfp_t); 463 465 extern struct bio *bio_copy_user_iov(struct request_queue *, 464 466 struct rq_map_data *, 465 - const struct sg_iovec *, 466 - int, int, gfp_t); 467 + const struct iov_iter *, 468 + gfp_t); 467 469 extern int bio_uncopy_user(struct bio *); 468 470 void zero_fill_bio(struct bio *bio); 469 471 extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);

+9 -1

include/linux/blk-mq.h

··· 146 146 BLK_MQ_F_SG_MERGE = 1 << 2, 147 147 BLK_MQ_F_SYSFS_UP = 1 << 3, 148 148 BLK_MQ_F_DEFER_ISSUE = 1 << 4, 149 + BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, 150 + BLK_MQ_F_ALLOC_POLICY_BITS = 1, 149 151 150 152 BLK_MQ_S_STOPPED = 0, 151 153 BLK_MQ_S_TAG_ACTIVE = 1, ··· 156 154 157 155 BLK_MQ_CPU_WORK_BATCH = 8, 158 156 }; 157 + #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ 158 + ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ 159 + ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) 160 + #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ 161 + ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ 162 + << BLK_MQ_F_ALLOC_POLICY_START_BIT) 159 163 160 164 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 161 165 void blk_mq_finish_init(struct request_queue *q); ··· 174 166 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 175 167 176 168 void blk_mq_insert_request(struct request *, bool, bool, bool); 177 - void blk_mq_run_queues(struct request_queue *q, bool async); 178 169 void blk_mq_free_request(struct request *rq); 179 170 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); 180 171 bool blk_mq_can_queue(struct blk_mq_hw_ctx *); ··· 221 214 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 222 215 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, 223 216 void *priv); 217 + void blk_mq_freeze_queue(struct request_queue *q); 224 218 void blk_mq_unfreeze_queue(struct request_queue *q); 225 219 void blk_mq_freeze_queue_start(struct request_queue *q); 226 220

+15 -10

include/linux/blkdev.h

··· 272 272 int max_depth; /* what we will send to device */ 273 273 int real_max_depth; /* what the array can hold */ 274 274 atomic_t refcnt; /* map can be shared */ 275 + int alloc_policy; /* tag allocation policy */ 276 + int next_tag; /* next tag */ 275 277 }; 278 + #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ 279 + #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ 276 280 277 281 #define BLK_SCSI_MAX_CMDS (256) 278 282 #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) ··· 520 516 (1 << QUEUE_FLAG_ADD_RANDOM)) 521 517 522 518 #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 519 + (1 << QUEUE_FLAG_STACKABLE) | \ 523 520 (1 << QUEUE_FLAG_SAME_COMP)) 524 521 525 522 static inline void queue_lockdep_assert_held(struct request_queue *q) ··· 855 850 extern int blk_rq_unmap_user(struct bio *); 856 851 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); 857 852 extern int blk_rq_map_user_iov(struct request_queue *, struct request *, 858 - struct rq_map_data *, const struct sg_iovec *, 859 - int, unsigned int, gfp_t); 853 + struct rq_map_data *, const struct iov_iter *, 854 + gfp_t); 860 855 extern int blk_execute_rq(struct request_queue *, struct gendisk *, 861 856 struct request *, int); 862 857 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, ··· 1049 1044 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 1050 1045 1051 1046 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 1052 - extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, 1053 - struct scatterlist *sglist); 1054 1047 extern void blk_dump_rq_flags(struct request *, char *); 1055 1048 extern long nr_blockdev_pages(void); 1056 1049 ··· 1142 1139 extern int blk_queue_start_tag(struct request_queue *, struct request *); 1143 1140 extern struct request *blk_queue_find_tag(struct 
request_queue *, int); 1144 1141 extern void blk_queue_end_tag(struct request_queue *, struct request *); 1145 - extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 1142 + extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); 1146 1143 extern void blk_queue_free_tags(struct request_queue *); 1147 1144 extern int blk_queue_resize_tags(struct request_queue *, int); 1148 1145 extern void blk_queue_invalidate_tags(struct request_queue *); 1149 - extern struct blk_queue_tag *blk_init_tags(int); 1146 + extern struct blk_queue_tag *blk_init_tags(int, int); 1150 1147 extern void blk_free_tags(struct blk_queue_tag *); 1151 1148 1152 1149 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, ··· 1165 1162 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, 1166 1163 sector_t nr_sects, gfp_t gfp_mask, struct page *page); 1167 1164 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 1168 - sector_t nr_sects, gfp_t gfp_mask); 1165 + sector_t nr_sects, gfp_t gfp_mask, bool discard); 1169 1166 static inline int sb_issue_discard(struct super_block *sb, sector_t block, 1170 1167 sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) 1171 1168 { ··· 1179 1176 return blkdev_issue_zeroout(sb->s_bdev, 1180 1177 block << (sb->s_blocksize_bits - 9), 1181 1178 nr_blocks << (sb->s_blocksize_bits - 9), 1182 - gfp_mask); 1179 + gfp_mask, true); 1183 1180 } 1184 1181 1185 1182 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); ··· 1604 1601 int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); 1605 1602 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1606 1603 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1607 - int (*direct_access) (struct block_device *, sector_t, 1608 - void **, unsigned long *); 1604 + long (*direct_access)(struct block_device *, 
sector_t, 1605 + void **, unsigned long *pfn, long size); 1609 1606 unsigned int (*check_events) (struct gendisk *disk, 1610 1607 unsigned int clearing); 1611 1608 /* ->media_changed() is DEPRECATED, use ->check_events() instead */ ··· 1623 1620 extern int bdev_read_page(struct block_device *, sector_t, struct page *); 1624 1621 extern int bdev_write_page(struct block_device *, sector_t, struct page *, 1625 1622 struct writeback_control *); 1623 + extern long bdev_direct_access(struct block_device *, sector_t, void **addr, 1624 + unsigned long *pfn, long size); 1626 1625 #else /* CONFIG_BLOCK */ 1627 1626 1628 1627 struct block_device;

+3

include/scsi/scsi_host.h

··· 402 402 */ 403 403 unsigned char present; 404 404 405 + /* If the block layer is used to manage tags, this is the tag allocation policy */ 406 + int tag_alloc_policy; 407 + 405 408 /* 406 409 * Let the block layer assign tags to all commands. 407 410 */

+2 -1

include/scsi/scsi_tcq.h

··· 66 66 * devices on the shared host (for libata) 67 67 */ 68 68 if (!shost->bqt) { 69 - shost->bqt = blk_init_tags(depth); 69 + shost->bqt = blk_init_tags(depth, 70 + shost->hostt->tag_alloc_policy); 70 71 if (!shost->bqt) 71 72 return -ENOMEM; 72 73 }