Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'dm-3.12-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device-mapper updates from Mike Snitzer:
"Add the ability to collect I/O statistics on user-defined regions of a
device-mapper device. This dm-stats code required the reintroduction
of a div64_u64_rem() helper, but as a separate method that doesn't
slow down div64_u64() -- especially on 32-bit systems.

Allow the error target to replace request-based DM devices (e.g.
multipath) in addition to bio-based DM devices.

Various other small code fixes and improvements to thin-provisioning,
DM cache and the DM ioctl interface"
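The div64_u64_rem() helper mentioned above returns both the 64-bit quotient and the remainder from a single call. As a minimal userspace sketch of its semantics only (not the kernel implementation, which is structured so 32-bit systems don't pay for a second 64-bit division):

```c
#include <stdint.h>

/* Userspace sketch of the semantics of div64_u64_rem(): one call yields
 * both the quotient and the remainder of a 64-bit division.  The kernel
 * helper is a separate method so that plain div64_u64() is not slowed
 * down, which matters on 32-bit systems where 64-bit division is a
 * libgcc call. */
static uint64_t div64_u64_rem_sketch(uint64_t dividend, uint64_t divisor,
				     uint64_t *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
```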

* tag 'dm-3.12-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
dm stripe: silence a couple sparse warnings
dm: add statistics support
dm thin: always return -ENOSPC if no_free_space is set
dm ioctl: cleanup error handling in table_load
dm ioctl: increase granularity of type_lock when loading table
dm ioctl: prevent rename to empty name or uuid
dm thin: set pool read-only if breaking_sharing fails block allocation
dm thin: prefix pool error messages with pool device name
dm: allow error target to replace bio-based and request-based targets
math64: New separate div64_u64_rem helper
dm space map: optimise sm_ll_dec and sm_ll_inc
dm btree: prefetch child nodes when walking tree for a dm_btree_del
dm btree: use pop_frame in dm_btree_del to cleanup code
dm cache: eliminate holes in cache structure
dm cache: fix stacking of geometry limits
dm thin: fix stacking of geometry limits
dm thin: add data block size limits to Documentation
dm cache: add data block size limits to code and Documentation
dm cache: document metadata device is exclussive to a cache
dm: stop using WQ_NON_REENTRANT

+1623 -164
+4 -2
Documentation/device-mapper/cache.txt
···
 which are dirty, and extra hints for use by the policy object.
 This information could be put on the cache device, but having it
 separate allows the volume manager to configure it differently,
-e.g. as a mirror for extra robustness.
+e.g. as a mirror for extra robustness.  This metadata device may only
+be used by a single cache device.
 
 Fixed block size
 ----------------
 
 The origin is divided up into blocks of a fixed size.  This block size
 is configurable when you first create the cache.  Typically we've been
-using block sizes of 256k - 1024k.
+using block sizes of 256KB - 1024KB.  The block size must be between 64
+(32KB) and 2097152 (1GB) and a multiple of 64 (32KB).
 
 Having a fixed block size simplifies the target a lot.  But it is
 something of a compromise.  For instance, a small part of a block may be
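The block-size limits added to the documentation above match the check in dm-cache's parse_block_size(). A standalone sketch of that validation (a hypothetical userspace helper, not the kernel code):

```c
#include <stdbool.h>
#include <stdint.h>

#define CACHE_BLOCK_MIN_SECTORS 64u       /* 32KB in 512-byte sectors */
#define CACHE_BLOCK_MAX_SECTORS 2097152u  /* 1GB in 512-byte sectors */

/* Mirrors the dm-cache constraint: the block size must lie within
 * [min, max] and be a multiple of the minimum.  Because 64 is a power
 * of two, the multiple test can be a mask, as the kernel does with
 * (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1). */
static bool cache_block_size_valid(uint32_t sectors)
{
	return sectors >= CACHE_BLOCK_MIN_SECTORS &&
	       sectors <= CACHE_BLOCK_MAX_SECTORS &&
	       !(sectors & (CACHE_BLOCK_MIN_SECTORS - 1));
}
```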
+186
Documentation/device-mapper/statistics.txt
···
DM statistics
=============

Device Mapper supports the collection of I/O statistics on user-defined
regions of a DM device.  If no regions are defined no statistics are
collected so there isn't any performance impact.  Only bio-based DM
devices are currently supported.

Each user-defined region specifies a starting sector, length and step.
Individual statistics will be collected for each step-sized area within
the range specified.

The I/O statistics counters for each step-sized area of a region are
in the same format as /sys/block/*/stat or /proc/diskstats (see:
Documentation/iostats.txt).  But two extra counters (12 and 13) are
provided: total time spent reading and writing in milliseconds.  All
these counters may be accessed by sending the @stats_print message to
the appropriate DM device via dmsetup.

Each region has a corresponding unique identifier, which we call a
region_id, that is assigned when the region is created.  The region_id
must be supplied when querying statistics about the region, deleting the
region, etc.  Unique region_ids enable multiple userspace programs to
request and process statistics for the same DM device without stepping
on each other's data.

The creation of DM statistics will allocate memory via kmalloc or
fallback to using vmalloc space.  At most, 1/4 of the overall system
memory may be allocated by DM statistics.  The admin can see how much
memory is used by reading
/sys/module/dm_mod/parameters/stats_current_allocated_bytes

Messages
========

@stats_create <range> <step> [<program_id> [<aux_data>]]

	Create a new region and return the region_id.

	<range>
	  "-" - whole device
	  "<start_sector>+<length>" - a range of <length> 512-byte sectors
				      starting with <start_sector>.

	<step>
	  "<area_size>" - the range is subdivided into areas each containing
			  <area_size> sectors.
	  "/<number_of_areas>" - the range is subdivided into the specified
				 number of areas.

	<program_id>
	  An optional parameter.  A name that uniquely identifies
	  the userspace owner of the range.  This groups ranges together
	  so that userspace programs can identify the ranges they
	  created and ignore those created by others.
	  The kernel returns this string back in the output of
	  @stats_list message, but it doesn't use it for anything else.

	<aux_data>
	  An optional parameter.  A word that provides auxiliary data
	  that is useful to the client program that created the range.
	  The kernel returns this string back in the output of
	  @stats_list message, but it doesn't use this value for anything.

@stats_delete <region_id>

	Delete the region with the specified id.

	<region_id>
	  region_id returned from @stats_create

@stats_clear <region_id>

	Clear all the counters except the in-flight i/o counters.

	<region_id>
	  region_id returned from @stats_create

@stats_list [<program_id>]

	List all regions registered with @stats_create.

	<program_id>
	  An optional parameter.
	  If this parameter is specified, only matching regions
	  are returned.
	  If it is not specified, all regions are returned.

	Output format:
	  <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>

@stats_print <region_id> [<starting_line> <number_of_lines>]

	Print counters for each step-sized area of a region.

	<region_id>
	  region_id returned from @stats_create

	<starting_line>
	  The index of the starting line in the output.
	  If omitted, all lines are returned.

	<number_of_lines>
	  The number of lines to include in the output.
	  If omitted, all lines are returned.

	Output format for each step-sized area of a region:

	  <start_sector>+<length> counters

	The first 11 counters have the same meaning as
	/sys/block/*/stat or /proc/diskstats.

	Please refer to Documentation/iostats.txt for details.

	1. the number of reads completed
	2. the number of reads merged
	3. the number of sectors read
	4. the number of milliseconds spent reading
	5. the number of writes completed
	6. the number of writes merged
	7. the number of sectors written
	8. the number of milliseconds spent writing
	9. the number of I/Os currently in progress
	10. the number of milliseconds spent doing I/Os
	11. the weighted number of milliseconds spent doing I/Os

	Additional counters:
	12. the total time spent reading in milliseconds
	13. the total time spent writing in milliseconds

@stats_print_clear <region_id> [<starting_line> <number_of_lines>]

	Atomically print and then clear all the counters except the
	in-flight i/o counters.  Useful when the client consuming the
	statistics does not want to lose any statistics (those updated
	between printing and clearing).

	<region_id>
	  region_id returned from @stats_create

	<starting_line>
	  The index of the starting line in the output.
	  If omitted, all lines are printed and then cleared.

	<number_of_lines>
	  The number of lines to process.
	  If omitted, all lines are printed and then cleared.

@stats_set_aux <region_id> <aux_data>

	Store auxiliary data aux_data for the specified region.

	<region_id>
	  region_id returned from @stats_create

	<aux_data>
	  The string that identifies data which is useful to the client
	  program that created the range.  The kernel returns this
	  string back in the output of @stats_list message, but it
	  doesn't use this value for anything.

Examples
========

Subdivide the DM device 'vol' into 100 pieces and start collecting
statistics on them:

  dmsetup message vol 0 @stats_create - /100

Set the auxiliary data string to "foo bar baz" (the escape for each
space must also be escaped, otherwise the shell will consume them):

  dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz

List the statistics:

  dmsetup message vol 0 @stats_list

Print the statistics:

  dmsetup message vol 0 @stats_print 0

Delete the statistics:

  dmsetup message vol 0 @stats_delete 0
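A userspace consumer of the @stats_list output format documented above could split each line with sscanf. A hedged sketch (the function name is illustrative, and it assumes program_id and aux_data contain no whitespace, which escaped aux_data strings may violate):

```c
#include <stdbool.h>
#include <stdio.h>

/* Parse one @stats_list line of the form
 *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
 * Illustrative only: program_id and aux_data buffers must be at least
 * 64 bytes, and embedded (escaped) spaces are not handled. */
static bool parse_stats_list_line(const char *line, int *region_id,
				  unsigned long long *start,
				  unsigned long long *len,
				  unsigned long long *step,
				  char *program_id, char *aux_data)
{
	return sscanf(line, "%d: %llu+%llu %llu %63s %63s",
		      region_id, start, len, step, program_id, aux_data) == 6;
}
```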
+8 -7
Documentation/device-mapper/thin-provisioning.txt
···
 	 $data_block_size $low_water_mark"
 
 $data_block_size gives the smallest unit of disk space that can be
-allocated at a time expressed in units of 512-byte sectors.  People
-primarily interested in thin provisioning may want to use a value such
-as 1024 (512KB).  People doing lots of snapshotting may want a smaller value
-such as 128 (64KB).  If you are not zeroing newly-allocated data,
-a larger $data_block_size in the region of 256000 (128MB) is suggested.
-$data_block_size must be the same for the lifetime of the
-metadata device.
+allocated at a time expressed in units of 512-byte sectors.
+$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a
+multiple of 128 (64KB).  $data_block_size cannot be changed after the
+thin-pool is created.  People primarily interested in thin provisioning
+may want to use a value such as 1024 (512KB).  People doing lots of
+snapshotting may want a smaller value such as 128 (64KB).  If you are
+not zeroing newly-allocated data, a larger $data_block_size in the
+region of 256000 (128MB) is suggested.
 
 $low_water_mark is expressed in blocks of size $data_block_size.  If
 free space on the data device drops below this level then a dm event
+1 -1
drivers/md/Makefile
···
 #
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		   dm-snap-persistent.o
+35 -24
drivers/md/dm-cache-target.c
···
 #define MIGRATION_COUNT_WINDOW 10
 
 /*
- * The block size of the device holding cache data must be >= 32KB
+ * The block size of the device holding cache data must be
+ * between 32KB and 1GB.
  */
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
 /*
  * FIXME: the cache is read/write for the time being.
···
 	struct dm_target *ti;
 	struct dm_target_callbacks callbacks;
 
+	struct dm_cache_metadata *cmd;
+
 	/*
 	 * Metadata is written to this device.
 	 */
···
 	 * The faster of the two data devices.  Typically an SSD.
 	 */
 	struct dm_dev *cache_dev;
-
-	/*
-	 * Cache features such as write-through.
-	 */
-	struct cache_features features;
 
 	/*
 	 * Size of the origin device in _complete_ blocks and native sectors.
···
 	uint32_t sectors_per_block;
 	int sectors_per_block_shift;
 
-	struct dm_cache_metadata *cmd;
-
 	spinlock_t lock;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
···
 	struct list_head completed_migrations;
 	struct list_head need_commit_migrations;
 	sector_t migration_threshold;
-	atomic_t nr_migrations;
 	wait_queue_head_t migration_wait;
+	atomic_t nr_migrations;
 
 	/*
 	 * cache_size entries, dirty if set
···
 	/*
 	 * origin_blocks entries, discarded if set.
 	 */
-	uint32_t discard_block_size; /* a power of 2 times sectors per block */
 	dm_dblock_t discard_nr_blocks;
 	unsigned long *discard_bitset;
+	uint32_t discard_block_size; /* a power of 2 times sectors per block */
+
+	/*
+	 * Rather than reconstructing the table line for the status we just
+	 * save it and regurgitate.
+	 */
+	unsigned nr_ctr_args;
+	const char **ctr_args;
 
 	struct dm_kcopyd_client *copier;
 	struct workqueue_struct *wq;
···
 	bool loaded_mappings:1;
 	bool loaded_discards:1;
 
-	struct cache_stats stats;
-
 	/*
-	 * Rather than reconstructing the table line for the status we just
-	 * save it and regurgitate.
+	 * Cache features such as write-through.
 	 */
-	unsigned nr_ctr_args;
-	const char **ctr_args;
+	struct cache_features features;
+
+	struct cache_stats stats;
 };
 
 struct per_bio_data {
···
 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
 			    char **error)
 {
-	unsigned long tmp;
+	unsigned long block_size;
 
 	if (!at_least_one_arg(as, error))
 		return -EINVAL;
 
-	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
-	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
-	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
+	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
 		*error = "Invalid data block size";
 		return -EINVAL;
 	}
 
-	if (tmp > ca->cache_sectors) {
+	if (block_size > ca->cache_sectors) {
 		*error = "Data block size is larger than the cache device";
 		return -EINVAL;
 	}
 
-	ca->block_size = tmp;
+	ca->block_size = block_size;
 
 	return 0;
 }
···
 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct cache *cache = ti->private;
+	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
 
-	blk_limits_io_min(limits, 0);
-	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+	/*
+	 * If the system-determined stacked limits are compatible with the
+	 * cache's blocksize (io_opt is a factor) do not override them.
+	 */
+	if (io_opt_sectors < cache->sectors_per_block ||
+	    do_div(io_opt_sectors, cache->sectors_per_block)) {
+		blk_limits_io_min(limits, 0);
+		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+	}
 	set_discard_limits(cache, limits);
 }
 
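The new cache_io_hints() logic above only overrides the stacked hints when io_opt is not a whole multiple of the cache block size. The decision can be sketched in plain C (a hypothetical helper, with the kernel's do_div() replaced by the % operator):

```c
#include <stdbool.h>
#include <stdint.h>

/* Returns true when the stacked io_opt hint must be overridden because
 * it is smaller than, or not a whole multiple of, the cache's block
 * size.  The kernel uses do_div() for the modulo on the 64-bit sector
 * count; plain % serves the same purpose in userspace. */
static bool must_override_io_opt(uint64_t io_opt_sectors,
				 uint64_t sectors_per_block)
{
	return io_opt_sectors < sectors_per_block ||
	       io_opt_sectors % sectors_per_block != 0;
}
```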
+2 -8
drivers/md/dm-crypt.c
···
 	}
 
 	ret = -ENOMEM;
-	cc->io_queue = alloc_workqueue("kcryptd_io",
-				       WQ_NON_REENTRANT|
-				       WQ_MEM_RECLAIM,
-				       1);
+	cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
 	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_NON_REENTRANT|
-					  WQ_CPU_INTENSIVE|
-					  WQ_MEM_RECLAIM,
-					  1);
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
+33 -27
drivers/md/dm-ioctl.c
···
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
 	if (new_data < param->data ||
-	    invalid_str(new_data, (void *) param + param_size) ||
+	    invalid_str(new_data, (void *) param + param_size) || !*new_data ||
 	    strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
 		DMWARN("Invalid new mapped device name or uuid string supplied.");
 		return -EINVAL;
···
 	r = dm_table_create(&t, get_mode(param), param->target_count, md);
 	if (r)
-		goto out;
+		goto err;
 
+	/* Protect md->type and md->queue against concurrent table loads. */
+	dm_lock_md_type(md);
 	r = populate_table(t, param, param_size);
-	if (r) {
-		dm_table_destroy(t);
-		goto out;
-	}
+	if (r)
+		goto err_unlock_md_type;
 
 	immutable_target_type = dm_get_immutable_target_type(md);
 	if (immutable_target_type &&
 	    (immutable_target_type != dm_table_get_immutable_target_type(t))) {
 		DMWARN("can't replace immutable target type %s",
 		       immutable_target_type->name);
-		dm_table_destroy(t);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
-	/* Protect md->type and md->queue against concurrent table loads. */
-	dm_lock_md_type(md);
 	if (dm_get_md_type(md) == DM_TYPE_NONE)
 		/* Initial table load: acquire type of table. */
 		dm_set_md_type(md, dm_table_get_type(t));
 	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
 		DMWARN("can't change device type after initial table load.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
 	/* setup md->queue to reflect md's type (may block) */
 	r = dm_setup_md_queue(md);
 	if (r) {
 		DMWARN("unable to set up device queue for new table.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
-		goto out;
+		goto err_unlock_md_type;
 	}
 	dm_unlock_md_type(md);
···
 	if (!hc || hc->md != md) {
 		DMWARN("device has been removed from the dev hash table.");
 		up_write(&_hash_lock);
-		dm_table_destroy(t);
 		r = -ENXIO;
-		goto out;
+		goto err_destroy_table;
 	}
 
 	if (hc->new_map)
···
 	param->flags |= DM_INACTIVE_PRESENT_FLAG;
 	__dev_status(md, param);
 
-out:
 	if (old_map) {
 		dm_sync_table(md);
 		dm_table_destroy(old_map);
 	}
 
+	dm_put(md);
+
+	return 0;
+
+err_unlock_md_type:
+	dm_unlock_md_type(md);
+err_destroy_table:
+	dm_table_destroy(t);
+err:
 	dm_put(md);
 
 	return r;
···
 	return 0;
 }
 
-static bool buffer_test_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
 /*
- * Process device-mapper dependent messages.
+ * Process device-mapper dependent messages.  Messages prefixed with '@'
+ * are processed by the DM core.  All others are delivered to the target.
  * Returns a number <= 1 if message was processed by device mapper.
  * Returns 2 if message should be delivered to the target.
  */
 static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
 			  char *result, unsigned maxlen)
 {
-	return 2;
+	int r;
+
+	if (**argv != '@')
+		return 2; /* no '@' prefix, deliver to target */
+
+	r = dm_stats_message(md, argc, argv, result, maxlen);
+	if (r < 2)
+		return r;
+
+	DMERR("Unsupported message sent to DM core: %s", argv[0]);
+	return -EINVAL;
 }
···
 	if (r == 1) {
 		param->flags |= DM_DATA_OUT_FLAG;
-		if (buffer_test_overflow(result, maxlen))
+		if (dm_message_test_buffer_overflow(result, maxlen))
 			param->flags |= DM_BUFFER_FULL_FLAG;
 		else
 			param->data_size = param->data_start + strlen(result) + 1;
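The message_for_md() return convention above (2 means "forward to the target", anything <= 1 means the DM core consumed or rejected the message) can be sketched standalone. In this sketch, dm_stats_message() is replaced by a hypothetical always-unsupported stub, and -EINVAL is written as its numeric value -22:

```c
/* Stand-in for dm_stats_message(): 2 means "I did not recognize this
 * message either". */
static int stats_message_stub(const char *cmd)
{
	(void)cmd;
	return 2;
}

/* Mirrors the dispatch convention of message_for_md(): messages without
 * an '@' prefix go to the target; '@' messages are offered to the core
 * stats handler; unrecognized '@' messages are an error. */
static int message_for_core(const char *cmd)
{
	int r;

	if (cmd[0] != '@')
		return 2; /* no '@' prefix, deliver to target */

	r = stats_message_stub(cmd);
	if (r < 2)
		return r;

	return -22; /* -EINVAL: '@' message nobody recognized */
}
```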
+1 -2
drivers/md/dm-kcopyd.c
···
 		goto bad_slab;
 
 	INIT_WORK(&kc->kcopyd_work, do_work);
-	kc->kcopyd_wq = alloc_workqueue("kcopyd",
-					WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
 	if (!kc->kcopyd_wq)
 		goto bad_workqueue;
 
+1 -2
drivers/md/dm-raid1.c
···
 	ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
 	ti->discard_zeroes_data_unsupported = true;
 
-	ms->kmirrord_wq = alloc_workqueue("kmirrord",
-					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
 		DMERR("couldn't start kmirrord");
 		r = -ENOMEM;
+969
drivers/md/dm-stats.c
···
#include <linux/errno.h>
#include <linux/numa.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/threads.h>
#include <linux/preempt.h>
#include <linux/irqflags.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/device-mapper.h>

#include "dm.h"
#include "dm-stats.h"

#define DM_MSG_PREFIX "stats"

static int dm_stat_need_rcu_barrier;

/*
 * Using 64-bit values to avoid overflow (which is a
 * problem that block/genhd.c's IO accounting has).
 */
struct dm_stat_percpu {
	unsigned long long sectors[2];
	unsigned long long ios[2];
	unsigned long long merges[2];
	unsigned long long ticks[2];
	unsigned long long io_ticks[2];
	unsigned long long io_ticks_total;
	unsigned long long time_in_queue;
};

struct dm_stat_shared {
	atomic_t in_flight[2];
	unsigned long stamp;
	struct dm_stat_percpu tmp;
};

struct dm_stat {
	struct list_head list_entry;
	int id;
	size_t n_entries;
	sector_t start;
	sector_t end;
	sector_t step;
	const char *program_id;
	const char *aux_data;
	struct rcu_head rcu_head;
	size_t shared_alloc_size;
	size_t percpu_alloc_size;
	struct dm_stat_percpu *stat_percpu[NR_CPUS];
	struct dm_stat_shared stat_shared[0];
};

struct dm_stats_last_position {
	sector_t last_sector;
	unsigned last_rw;
};

/*
 * A typo on the command line could possibly make the kernel run out of memory
 * and crash.  To prevent the crash we account all used memory.  We fail if we
 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
 */
#define DM_STATS_MEMORY_FACTOR		4
#define DM_STATS_VMALLOC_FACTOR		2

static DEFINE_SPINLOCK(shared_memory_lock);

static unsigned long shared_memory_amount;

static bool __check_shared_memory(size_t alloc_size)
{
	size_t a;

	a = shared_memory_amount + alloc_size;
	if (a < shared_memory_amount)
		return false;
	if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
		return false;
#ifdef CONFIG_MMU
	if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
		return false;
#endif
	return true;
}

static bool check_shared_memory(size_t alloc_size)
{
	bool ret;

	spin_lock_irq(&shared_memory_lock);

	ret = __check_shared_memory(alloc_size);

	spin_unlock_irq(&shared_memory_lock);

	return ret;
}

static bool claim_shared_memory(size_t alloc_size)
{
	spin_lock_irq(&shared_memory_lock);

	if (!__check_shared_memory(alloc_size)) {
		spin_unlock_irq(&shared_memory_lock);
		return false;
	}

	shared_memory_amount += alloc_size;

	spin_unlock_irq(&shared_memory_lock);

	return true;
}

static void free_shared_memory(size_t alloc_size)
{
	unsigned long flags;

	spin_lock_irqsave(&shared_memory_lock, flags);

	if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
		spin_unlock_irqrestore(&shared_memory_lock, flags);
		DMCRIT("Memory usage accounting bug.");
		return;
	}

	shared_memory_amount -= alloc_size;

	spin_unlock_irqrestore(&shared_memory_lock, flags);
}

static void *dm_kvzalloc(size_t alloc_size, int node)
{
	void *p;

	if (!claim_shared_memory(alloc_size))
		return NULL;

	if (alloc_size <= KMALLOC_MAX_SIZE) {
		p = kzalloc_node(alloc_size,
				 GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
				 node);
		if (p)
			return p;
	}
	p = vzalloc_node(alloc_size, node);
	if (p)
		return p;

	free_shared_memory(alloc_size);

	return NULL;
}

static void dm_kvfree(void *ptr, size_t alloc_size)
{
	if (!ptr)
		return;

	free_shared_memory(alloc_size);

	if (is_vmalloc_addr(ptr))
		vfree(ptr);
	else
		kfree(ptr);
}

static void dm_stat_free(struct rcu_head *head)
{
	int cpu;
	struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);

	kfree(s->program_id);
	kfree(s->aux_data);
	for_each_possible_cpu(cpu)
		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
	dm_kvfree(s, s->shared_alloc_size);
}

static int dm_stat_in_flight(struct dm_stat_shared *shared)
{
	return atomic_read(&shared->in_flight[READ]) +
	       atomic_read(&shared->in_flight[WRITE]);
}

void dm_stats_init(struct dm_stats *stats)
{
	int cpu;
	struct dm_stats_last_position *last;

	mutex_init(&stats->mutex);
	INIT_LIST_HEAD(&stats->list);
	stats->last = alloc_percpu(struct dm_stats_last_position);
	for_each_possible_cpu(cpu) {
		last = per_cpu_ptr(stats->last, cpu);
		last->last_sector = (sector_t)ULLONG_MAX;
		last->last_rw = UINT_MAX;
	}
}

void dm_stats_cleanup(struct dm_stats *stats)
{
	size_t ni;
	struct dm_stat *s;
	struct dm_stat_shared *shared;

	while (!list_empty(&stats->list)) {
		s = container_of(stats->list.next, struct dm_stat, list_entry);
		list_del(&s->list_entry);
		for (ni = 0; ni < s->n_entries; ni++) {
			shared = &s->stat_shared[ni];
			if (WARN_ON(dm_stat_in_flight(shared))) {
				DMCRIT("leaked in-flight counter at index %lu "
				       "(start %llu, end %llu, step %llu): reads %d, writes %d",
				       (unsigned long)ni,
				       (unsigned long long)s->start,
				       (unsigned long long)s->end,
				       (unsigned long long)s->step,
				       atomic_read(&shared->in_flight[READ]),
				       atomic_read(&shared->in_flight[WRITE]));
			}
		}
		dm_stat_free(&s->rcu_head);
	}
	free_percpu(stats->last);
}

static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
			   sector_t step, const char *program_id, const char *aux_data,
			   void (*suspend_callback)(struct mapped_device *),
			   void (*resume_callback)(struct mapped_device *),
			   struct mapped_device *md)
{
	struct list_head *l;
	struct dm_stat *s, *tmp_s;
	sector_t n_entries;
	size_t ni;
	size_t shared_alloc_size;
	size_t percpu_alloc_size;
	struct dm_stat_percpu *p;
	int cpu;
	int ret_id;
	int r;

	if (end < start || !step)
		return -EINVAL;

	n_entries = end - start;
	if (dm_sector_div64(n_entries, step))
		n_entries++;

	if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
		return -EOVERFLOW;

	shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
	if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
		return -EOVERFLOW;

	percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
		return -EOVERFLOW;

	if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
		return -ENOMEM;

	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
	if (!s)
		return -ENOMEM;

	s->n_entries = n_entries;
	s->start = start;
	s->end = end;
	s->step = step;
	s->shared_alloc_size = shared_alloc_size;
	s->percpu_alloc_size = percpu_alloc_size;

	s->program_id = kstrdup(program_id, GFP_KERNEL);
	if (!s->program_id) {
		r = -ENOMEM;
		goto out;
	}
	s->aux_data = kstrdup(aux_data, GFP_KERNEL);
	if (!s->aux_data) {
		r = -ENOMEM;
		goto out;
	}

	for (ni = 0; ni < n_entries; ni++) {
		atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
	}

	for_each_possible_cpu(cpu) {
		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
		if (!p) {
			r = -ENOMEM;
			goto out;
		}
		s->stat_percpu[cpu] = p;
	}

	/*
	 * Suspend/resume to make sure there is no i/o in flight,
	 * so that newly created statistics will be exact.
	 *
	 * (note: we couldn't suspend earlier because we must not
	 * allocate memory while suspended)
	 */
	suspend_callback(md);

	mutex_lock(&stats->mutex);
	s->id = 0;
	list_for_each(l, &stats->list) {
		tmp_s = container_of(l, struct dm_stat, list_entry);
		if (WARN_ON(tmp_s->id < s->id)) {
			r = -EINVAL;
			goto out_unlock_resume;
		}
		if (tmp_s->id > s->id)
			break;
		if (unlikely(s->id == INT_MAX)) {
			r = -ENFILE;
			goto out_unlock_resume;
		}
		s->id++;
	}
	ret_id = s->id;
	list_add_tail_rcu(&s->list_entry, l);
	mutex_unlock(&stats->mutex);

	resume_callback(md);

	return ret_id;

out_unlock_resume:
	mutex_unlock(&stats->mutex);
	resume_callback(md);
out:
	dm_stat_free(&s->rcu_head);
	return r;
}

static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
{
	struct dm_stat *s;

	list_for_each_entry(s, &stats->list, list_entry) {
		if (s->id > id)
			break;
		if (s->id == id)
			return s;
	}

	return NULL;
}

static int dm_stats_delete(struct dm_stats *stats, int id)
{
	struct dm_stat *s;
	int cpu;

	mutex_lock(&stats->mutex);

	s = __dm_stats_find(stats, id);
	if (!s) {
		mutex_unlock(&stats->mutex);
		return -ENOENT;
	}

	list_del_rcu(&s->list_entry);
	mutex_unlock(&stats->mutex);

	/*
	 * vfree can't be called from RCU callback
	 */
	for_each_possible_cpu(cpu)
		if (is_vmalloc_addr(s->stat_percpu))
			goto do_sync_free;
	if (is_vmalloc_addr(s)) {
do_sync_free:
		synchronize_rcu_expedited();
		dm_stat_free(&s->rcu_head);
	} else {
		ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
		call_rcu(&s->rcu_head, dm_stat_free);
	}
	return 0;
}

static int dm_stats_list(struct dm_stats *stats, const char *program,
			 char *result, unsigned maxlen)
{
	struct dm_stat *s;
	sector_t len;
	unsigned sz = 0;

	/*
	 * Output format:
	 *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
	 */

	mutex_lock(&stats->mutex);
	list_for_each_entry(s, &stats->list, list_entry) {
		if (!program || !strcmp(program, s->program_id)) {
			len = s->end - s->start;
			DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
			       (unsigned long long)s->start,
			       (unsigned long long)len,
			       (unsigned long long)s->step,
			       s->program_id,
			       s->aux_data);
		}
	}
	mutex_unlock(&stats->mutex);

	return 1;
}

static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
{
	/*
	 * This is racy, but so is part_round_stats_single.
	 */
	unsigned long now = jiffies;
	unsigned in_flight_read;
	unsigned in_flight_write;
	unsigned long difference = now - shared->stamp;

	if (!difference)
		return;
	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
	if (in_flight_read)
		p->io_ticks[READ] += difference;
	if (in_flight_write)
		p->io_ticks[WRITE] += difference;
	if (in_flight_read + in_flight_write) {
		p->io_ticks_total += difference;
		p->time_in_queue += (in_flight_read + in_flight_write) * difference;
	}
	shared->stamp = now;
}

static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
			      unsigned long bi_rw, sector_t len, bool merged,
			      bool end, unsigned long duration)
{
	unsigned long idx = bi_rw & REQ_WRITE;
	struct dm_stat_shared *shared = &s->stat_shared[entry];
	struct dm_stat_percpu *p;

	/*
	 * For strict correctness we should use local_irq_disable/enable
	 * instead of preempt_disable/enable.
	 *
	 * This is racy if the driver finishes bios from non-interrupt
	 * context as well as from interrupt context or from more different
	 * interrupts.
	 *
	 * However, the race only results in not counting some events,
	 * so it is acceptable.
	 *
	 * part_stat_lock()/part_stat_unlock() have this race too.
	 */
	preempt_disable();
	p = &s->stat_percpu[smp_processor_id()][entry];

	if (!end) {
		dm_stat_round(shared, p);
		atomic_inc(&shared->in_flight[idx]);
	} else {
		dm_stat_round(shared, p);
		atomic_dec(&shared->in_flight[idx]);
		p->sectors[idx] += len;
		p->ios[idx] += 1;
		p->merges[idx] += merged;
		p->ticks[idx] += duration;
	}

	preempt_enable();
}

static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
			  sector_t bi_sector, sector_t end_sector,
			  bool end, unsigned long duration,
			  struct dm_stats_aux *stats_aux)
{
	sector_t rel_sector, offset, todo, fragment_len;
	size_t entry;

	if (end_sector <= s->start || bi_sector >= s->end)
		return;
	if (unlikely(bi_sector < s->start)) {
		rel_sector = 0;
		todo = end_sector - s->start;
	} else {
		rel_sector = bi_sector - s->start;
		todo = end_sector - bi_sector;
	}
	if (unlikely(end_sector > s->end))
		todo -= (end_sector - s->end);

	offset = dm_sector_div64(rel_sector, s->step);
	entry = rel_sector;
	do {
		if (WARN_ON_ONCE(entry >= s->n_entries)) {
			DMCRIT("Invalid area access in region id %d", s->id);
			return;
		}
		fragment_len = todo;
		if (fragment_len > s->step - offset)
			fragment_len = s->step - offset;
		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
				  stats_aux->merged, end, duration);
		todo -= fragment_len;
		entry++;
		offset = 0;
	} while (unlikely(todo != 0));
}

void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
			 sector_t bi_sector, unsigned bi_sectors, bool end,
			 unsigned long duration, struct dm_stats_aux *stats_aux)
{
	struct dm_stat *s;
	sector_t end_sector;
	struct dm_stats_last_position *last;

	if (unlikely(!bi_sectors))
		return;

end_sector = bi_sector + bi_sectors; 534 + 535 + if (!end) { 536 + /* 537 + * A race condition can at worst result in the merged flag being 538 + * misrepresented, so we don't have to disable preemption here. 539 + */ 540 + last = __this_cpu_ptr(stats->last); 541 + stats_aux->merged = 542 + (bi_sector == ACCESS_ONCE(last->last_sector)) && 543 + ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == 544 + (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)) 545 + ); 546 + ACCESS_ONCE(last->last_sector) = end_sector; 547 + ACCESS_ONCE(last->last_rw) = bi_rw; 548 + } 549 + 550 + rcu_read_lock(); 551 + 552 + list_for_each_entry_rcu(s, &stats->list, list_entry) 553 + __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux); 554 + 555 + rcu_read_unlock(); 556 + } 557 + 558 + static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, 559 + struct dm_stat *s, size_t x) 560 + { 561 + int cpu; 562 + struct dm_stat_percpu *p; 563 + 564 + local_irq_disable(); 565 + p = &s->stat_percpu[smp_processor_id()][x]; 566 + dm_stat_round(shared, p); 567 + local_irq_enable(); 568 + 569 + memset(&shared->tmp, 0, sizeof(shared->tmp)); 570 + for_each_possible_cpu(cpu) { 571 + p = &s->stat_percpu[cpu][x]; 572 + shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); 573 + shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); 574 + shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); 575 + shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); 576 + shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); 577 + shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); 578 + shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); 579 + shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); 580 + shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); 581 + shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 582 + shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 583 + shared->tmp.time_in_queue +=
ACCESS_ONCE(p->time_in_queue); 584 + } 585 + } 586 + 587 + static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, 588 + bool init_tmp_percpu_totals) 589 + { 590 + size_t x; 591 + struct dm_stat_shared *shared; 592 + struct dm_stat_percpu *p; 593 + 594 + for (x = idx_start; x < idx_end; x++) { 595 + shared = &s->stat_shared[x]; 596 + if (init_tmp_percpu_totals) 597 + __dm_stat_init_temporary_percpu_totals(shared, s, x); 598 + local_irq_disable(); 599 + p = &s->stat_percpu[smp_processor_id()][x]; 600 + p->sectors[READ] -= shared->tmp.sectors[READ]; 601 + p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; 602 + p->ios[READ] -= shared->tmp.ios[READ]; 603 + p->ios[WRITE] -= shared->tmp.ios[WRITE]; 604 + p->merges[READ] -= shared->tmp.merges[READ]; 605 + p->merges[WRITE] -= shared->tmp.merges[WRITE]; 606 + p->ticks[READ] -= shared->tmp.ticks[READ]; 607 + p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; 608 + p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; 609 + p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; 610 + p->io_ticks_total -= shared->tmp.io_ticks_total; 611 + p->time_in_queue -= shared->tmp.time_in_queue; 612 + local_irq_enable(); 613 + } 614 + } 615 + 616 + static int dm_stats_clear(struct dm_stats *stats, int id) 617 + { 618 + struct dm_stat *s; 619 + 620 + mutex_lock(&stats->mutex); 621 + 622 + s = __dm_stats_find(stats, id); 623 + if (!s) { 624 + mutex_unlock(&stats->mutex); 625 + return -ENOENT; 626 + } 627 + 628 + __dm_stat_clear(s, 0, s->n_entries, true); 629 + 630 + mutex_unlock(&stats->mutex); 631 + 632 + return 1; 633 + } 634 + 635 + /* 636 + * This is like jiffies_to_msec, but works for 64-bit values. 
637 + */ 638 + static unsigned long long dm_jiffies_to_msec64(unsigned long long j) 639 + { 640 + unsigned long long result = 0; 641 + unsigned mult; 642 + 643 + if (j) 644 + result = jiffies_to_msecs(j & 0x3fffff); 645 + if (j >= 1 << 22) { 646 + mult = jiffies_to_msecs(1 << 22); 647 + result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); 648 + } 649 + if (j >= 1ULL << 44) 650 + result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); 651 + 652 + return result; 653 + } 654 + 655 + static int dm_stats_print(struct dm_stats *stats, int id, 656 + size_t idx_start, size_t idx_len, 657 + bool clear, char *result, unsigned maxlen) 658 + { 659 + unsigned sz = 0; 660 + struct dm_stat *s; 661 + size_t x; 662 + sector_t start, end, step; 663 + size_t idx_end; 664 + struct dm_stat_shared *shared; 665 + 666 + /* 667 + * Output format: 668 + * <start_sector>+<length> counters 669 + */ 670 + 671 + mutex_lock(&stats->mutex); 672 + 673 + s = __dm_stats_find(stats, id); 674 + if (!s) { 675 + mutex_unlock(&stats->mutex); 676 + return -ENOENT; 677 + } 678 + 679 + idx_end = idx_start + idx_len; 680 + if (idx_end < idx_start || 681 + idx_end > s->n_entries) 682 + idx_end = s->n_entries; 683 + 684 + if (idx_start > idx_end) 685 + idx_start = idx_end; 686 + 687 + step = s->step; 688 + start = s->start + (step * idx_start); 689 + 690 + for (x = idx_start; x < idx_end; x++, start = end) { 691 + shared = &s->stat_shared[x]; 692 + end = start + step; 693 + if (unlikely(end > s->end)) 694 + end = s->end; 695 + 696 + __dm_stat_init_temporary_percpu_totals(shared, s, x); 697 + 698 + DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n", 699 + (unsigned long long)start, 700 + (unsigned long long)step, 701 + shared->tmp.ios[READ], 702 + shared->tmp.merges[READ], 703 + shared->tmp.sectors[READ], 704 + dm_jiffies_to_msec64(shared->tmp.ticks[READ]), 705 + 
shared->tmp.ios[WRITE], 706 + shared->tmp.merges[WRITE], 707 + shared->tmp.sectors[WRITE], 708 + dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]), 709 + dm_stat_in_flight(shared), 710 + dm_jiffies_to_msec64(shared->tmp.io_ticks_total), 711 + dm_jiffies_to_msec64(shared->tmp.time_in_queue), 712 + dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]), 713 + dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE])); 714 + 715 + if (unlikely(sz + 1 >= maxlen)) 716 + goto buffer_overflow; 717 + } 718 + 719 + if (clear) 720 + __dm_stat_clear(s, idx_start, idx_end, false); 721 + 722 + buffer_overflow: 723 + mutex_unlock(&stats->mutex); 724 + 725 + return 1; 726 + } 727 + 728 + static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) 729 + { 730 + struct dm_stat *s; 731 + const char *new_aux_data; 732 + 733 + mutex_lock(&stats->mutex); 734 + 735 + s = __dm_stats_find(stats, id); 736 + if (!s) { 737 + mutex_unlock(&stats->mutex); 738 + return -ENOENT; 739 + } 740 + 741 + new_aux_data = kstrdup(aux_data, GFP_KERNEL); 742 + if (!new_aux_data) { 743 + mutex_unlock(&stats->mutex); 744 + return -ENOMEM; 745 + } 746 + 747 + kfree(s->aux_data); 748 + s->aux_data = new_aux_data; 749 + 750 + mutex_unlock(&stats->mutex); 751 + 752 + return 0; 753 + } 754 + 755 + static int message_stats_create(struct mapped_device *md, 756 + unsigned argc, char **argv, 757 + char *result, unsigned maxlen) 758 + { 759 + int id; 760 + char dummy; 761 + unsigned long long start, end, len, step; 762 + unsigned divisor; 763 + const char *program_id, *aux_data; 764 + 765 + /* 766 + * Input format: 767 + * <range> <step> [<program_id> [<aux_data>]] 768 + */ 769 + 770 + if (argc < 3 || argc > 5) 771 + return -EINVAL; 772 + 773 + if (!strcmp(argv[1], "-")) { 774 + start = 0; 775 + len = dm_get_size(md); 776 + if (!len) 777 + len = 1; 778 + } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 || 779 + start != (sector_t)start || len != (sector_t)len) 780 + return -EINVAL; 781 + 782 + 
end = start + len; 783 + if (start >= end) 784 + return -EINVAL; 785 + 786 + if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) { 787 + step = end - start; 788 + if (do_div(step, divisor)) 789 + step++; 790 + if (!step) 791 + step = 1; 792 + } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 || 793 + step != (sector_t)step || !step) 794 + return -EINVAL; 795 + 796 + program_id = "-"; 797 + aux_data = "-"; 798 + 799 + if (argc > 3) 800 + program_id = argv[3]; 801 + 802 + if (argc > 4) 803 + aux_data = argv[4]; 804 + 805 + /* 806 + * If a buffer overflow happens after we created the region, 807 + * it's too late (the userspace would retry with a larger 808 + * buffer, but the region id that caused the overflow is already 809 + * leaked). So we must detect buffer overflow in advance. 810 + */ 811 + snprintf(result, maxlen, "%d", INT_MAX); 812 + if (dm_message_test_buffer_overflow(result, maxlen)) 813 + return 1; 814 + 815 + id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, 816 + dm_internal_suspend, dm_internal_resume, md); 817 + if (id < 0) 818 + return id; 819 + 820 + snprintf(result, maxlen, "%d", id); 821 + 822 + return 1; 823 + } 824 + 825 + static int message_stats_delete(struct mapped_device *md, 826 + unsigned argc, char **argv) 827 + { 828 + int id; 829 + char dummy; 830 + 831 + if (argc != 2) 832 + return -EINVAL; 833 + 834 + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 835 + return -EINVAL; 836 + 837 + return dm_stats_delete(dm_get_stats(md), id); 838 + } 839 + 840 + static int message_stats_clear(struct mapped_device *md, 841 + unsigned argc, char **argv) 842 + { 843 + int id; 844 + char dummy; 845 + 846 + if (argc != 2) 847 + return -EINVAL; 848 + 849 + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 850 + return -EINVAL; 851 + 852 + return dm_stats_clear(dm_get_stats(md), id); 853 + } 854 + 855 + static int message_stats_list(struct mapped_device *md, 856 + unsigned argc, char **argv, 857 + char 
*result, unsigned maxlen) 858 + { 859 + int r; 860 + const char *program = NULL; 861 + 862 + if (argc < 1 || argc > 2) 863 + return -EINVAL; 864 + 865 + if (argc > 1) { 866 + program = kstrdup(argv[1], GFP_KERNEL); 867 + if (!program) 868 + return -ENOMEM; 869 + } 870 + 871 + r = dm_stats_list(dm_get_stats(md), program, result, maxlen); 872 + 873 + kfree(program); 874 + 875 + return r; 876 + } 877 + 878 + static int message_stats_print(struct mapped_device *md, 879 + unsigned argc, char **argv, bool clear, 880 + char *result, unsigned maxlen) 881 + { 882 + int id; 883 + char dummy; 884 + unsigned long idx_start = 0, idx_len = ULONG_MAX; 885 + 886 + if (argc != 2 && argc != 4) 887 + return -EINVAL; 888 + 889 + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 890 + return -EINVAL; 891 + 892 + if (argc > 3) { 893 + if (strcmp(argv[2], "-") && 894 + sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) 895 + return -EINVAL; 896 + if (strcmp(argv[3], "-") && 897 + sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) 898 + return -EINVAL; 899 + } 900 + 901 + return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, 902 + result, maxlen); 903 + } 904 + 905 + static int message_stats_set_aux(struct mapped_device *md, 906 + unsigned argc, char **argv) 907 + { 908 + int id; 909 + char dummy; 910 + 911 + if (argc != 3) 912 + return -EINVAL; 913 + 914 + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) 915 + return -EINVAL; 916 + 917 + return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); 918 + } 919 + 920 + int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 921 + char *result, unsigned maxlen) 922 + { 923 + int r; 924 + 925 + if (dm_request_based(md)) { 926 + DMWARN("Statistics are only supported for bio-based devices"); 927 + return -EOPNOTSUPP; 928 + } 929 + 930 + /* All messages here must start with '@' */ 931 + if (!strcasecmp(argv[0], "@stats_create")) 932 + r = message_stats_create(md, argc, argv, result, maxlen); 933 + 
else if (!strcasecmp(argv[0], "@stats_delete")) 934 + r = message_stats_delete(md, argc, argv); 935 + else if (!strcasecmp(argv[0], "@stats_clear")) 936 + r = message_stats_clear(md, argc, argv); 937 + else if (!strcasecmp(argv[0], "@stats_list")) 938 + r = message_stats_list(md, argc, argv, result, maxlen); 939 + else if (!strcasecmp(argv[0], "@stats_print")) 940 + r = message_stats_print(md, argc, argv, false, result, maxlen); 941 + else if (!strcasecmp(argv[0], "@stats_print_clear")) 942 + r = message_stats_print(md, argc, argv, true, result, maxlen); 943 + else if (!strcasecmp(argv[0], "@stats_set_aux")) 944 + r = message_stats_set_aux(md, argc, argv); 945 + else 946 + return 2; /* this wasn't a stats message */ 947 + 948 + if (r == -EINVAL) 949 + DMWARN("Invalid parameters for message %s", argv[0]); 950 + 951 + return r; 952 + } 953 + 954 + int __init dm_statistics_init(void) 955 + { 956 + dm_stat_need_rcu_barrier = 0; 957 + return 0; 958 + } 959 + 960 + void dm_statistics_exit(void) 961 + { 962 + if (dm_stat_need_rcu_barrier) 963 + rcu_barrier(); 964 + if (WARN_ON(shared_memory_amount)) 965 + DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); 966 + } 967 + 968 + module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); 969 + MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
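The core of the dm-stats accounting above is `__dm_stat_bio()`, which clips a bio to a region `[start, end)` and walks it in `step`-sized fragments, charging one `dm_stat_for_entry()` per area touched. Below is a minimal userspace sketch of that same arithmetic (a model, not the kernel code — the function name and counting of entries are illustrative; the kernel uses `dm_sector_div64()` where the modulo appears here):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* Model of the fragment walk in __dm_stat_bio(): a region [start, end)
 * is divided into areas of `step` sectors, and a bio covering
 * [bi_sector, bi_sector + len) is clipped to the region and split at
 * area boundaries.  Returns how many area entries the bio touches. */
static size_t count_touched_areas(uint64_t start, uint64_t end, uint64_t step,
                                  uint64_t bi_sector, uint64_t len)
{
    uint64_t end_sector = bi_sector + len;
    uint64_t rel_sector, todo, offset, fragment_len;
    size_t entries = 0;

    if (end_sector <= start || bi_sector >= end)
        return 0;                       /* no overlap with the region */

    if (bi_sector < start) {
        rel_sector = 0;
        todo = end_sector - start;      /* clip head to the region */
    } else {
        rel_sector = bi_sector - start;
        todo = end_sector - bi_sector;
    }
    if (end_sector > end)
        todo -= end_sector - end;       /* clip tail to the region */

    offset = rel_sector % step;         /* remainder from dm_sector_div64() */
    do {
        fragment_len = todo;
        if (fragment_len > step - offset)
            fragment_len = step - offset;
        entries++;                      /* one dm_stat_for_entry() call */
        todo -= fragment_len;
        offset = 0;
    } while (todo != 0);

    return entries;
}
```

A bio aligned to an area boundary touches exactly `ceil(len / step)` areas; an unaligned one can touch one more.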
+40
drivers/md/dm-stats.h
··· 1 + #ifndef DM_STATS_H 2 + #define DM_STATS_H 3 + 4 + #include <linux/types.h> 5 + #include <linux/mutex.h> 6 + #include <linux/list.h> 7 + 8 + int dm_statistics_init(void); 9 + void dm_statistics_exit(void); 10 + 11 + struct dm_stats { 12 + struct mutex mutex; 13 + struct list_head list; /* list of struct dm_stat */ 14 + struct dm_stats_last_position __percpu *last; 15 + sector_t last_sector; 16 + unsigned last_rw; 17 + }; 18 + 19 + struct dm_stats_aux { 20 + bool merged; 21 + }; 22 + 23 + void dm_stats_init(struct dm_stats *st); 24 + void dm_stats_cleanup(struct dm_stats *st); 25 + 26 + struct mapped_device; 27 + 28 + int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, 29 + char *result, unsigned maxlen); 30 + 31 + void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 32 + sector_t bi_sector, unsigned bi_sectors, bool end, 33 + unsigned long duration, struct dm_stats_aux *aux); 34 + 35 + static inline bool dm_stats_used(struct dm_stats *st) 36 + { 37 + return !list_empty(&st->list); 38 + } 39 + 40 + #endif
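The only per-bio state in this header is `struct dm_stats_aux`, whose `merged` flag marks bios that begin exactly where the previous bio ended with the same write/discard classification. A userspace model of that heuristic (types and `MODEL_REQ_*` constants are stand-ins, not the kernel's `REQ_*` flags):

```c
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>

/* Stand-in request flags for the model. */
#define MODEL_REQ_WRITE   (1u << 0)
#define MODEL_REQ_DISCARD (1u << 1)

struct last_position {
    uint64_t last_sector;   /* end sector of the previous bio */
    unsigned last_rw;
};

/* A bio counts as "merged" when it starts at the sector where the
 * previous bio ended and has the same write/discard classification,
 * mirroring the per-cpu last-position tracking in dm_stats_account_io(). */
static bool bio_is_merged(struct last_position *last,
                          uint64_t bi_sector, uint64_t bi_sectors,
                          unsigned bi_rw)
{
    unsigned mask = MODEL_REQ_WRITE | MODEL_REQ_DISCARD;
    bool merged = bi_sector == last->last_sector &&
                  (bi_rw & mask) == (last->last_rw & mask);

    /* record this bio's end for the next call */
    last->last_sector = bi_sector + bi_sectors;
    last->last_rw = bi_rw;
    return merged;
}
```

In the kernel this tracking is per-cpu and intentionally racy: a misdetected merge only skews the `merges` counter, so no locking is taken.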
+1
drivers/md/dm-stripe.c
··· 4 4 * This file is released under the GPL. 5 5 */ 6 6 7 + #include "dm.h" 7 8 #include <linux/device-mapper.h> 8 9 9 10 #include <linux/module.h>
+18 -2
drivers/md/dm-table.c
··· 860 860 static int dm_table_set_type(struct dm_table *t) 861 861 { 862 862 unsigned i; 863 - unsigned bio_based = 0, request_based = 0; 863 + unsigned bio_based = 0, request_based = 0, hybrid = 0; 864 864 struct dm_target *tgt; 865 865 struct dm_dev_internal *dd; 866 866 struct list_head *devices; 867 + unsigned live_md_type; 867 868 868 869 for (i = 0; i < t->num_targets; i++) { 869 870 tgt = t->targets + i; 870 - if (dm_target_request_based(tgt)) 871 + if (dm_target_hybrid(tgt)) 872 + hybrid = 1; 873 + else if (dm_target_request_based(tgt)) 871 874 request_based = 1; 872 875 else 873 876 bio_based = 1; ··· 880 877 " can't be mixed up"); 881 878 return -EINVAL; 882 879 } 880 + } 881 + 882 + if (hybrid && !bio_based && !request_based) { 883 + /* 884 + * The targets can work either way. 885 + * Determine the type from the live device. 886 + * Default to bio-based if device is new. 887 + */ 888 + live_md_type = dm_get_md_type(t->md); 889 + if (live_md_type == DM_TYPE_REQUEST_BASED) 890 + request_based = 1; 891 + else 892 + bio_based = 1; 883 893 } 884 894 885 895 if (bio_based) {
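The dm-table.c change above introduces "hybrid" targets: those providing both `.map` and `.map_rq` (like the reworked error target), which can replace either flavor of device. The resolution rule can be sketched as a simplified standalone model (enum names and the helper are illustrative, not the kernel API):

```c
#include <assert.h>
#include <stdbool.h>

/* Illustrative stand-ins for DM_TYPE_* */
enum model_type { MODEL_TYPE_NONE, MODEL_TYPE_BIO_BASED, MODEL_TYPE_REQUEST_BASED };

/* Model of the dm_table_set_type() decision for a single target:
 * a pure bio-based or request-based target fixes the table type,
 * while a hybrid target inherits the live device's type (defaulting
 * to bio-based for a brand-new device). */
static enum model_type resolve_table_type(bool target_has_map,
                                          bool target_has_map_rq,
                                          enum model_type live_md_type)
{
    bool bio_based = target_has_map && !target_has_map_rq;
    bool request_based = target_has_map_rq && !target_has_map;
    bool hybrid = target_has_map && target_has_map_rq;

    if (hybrid && !bio_based && !request_based) {
        if (live_md_type == MODEL_TYPE_REQUEST_BASED)
            request_based = true;
        else
            bio_based = true;       /* default for a new device */
    }
    return bio_based ? MODEL_TYPE_BIO_BASED : MODEL_TYPE_REQUEST_BASED;
}
```

This is why `error` can now be swapped in for a multipath (request-based) table as well as an ordinary bio-based one: the table type follows whatever was live before the swap.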
+8 -1
drivers/md/dm-target.c
··· 131 131 return -EIO; 132 132 } 133 133 134 + static int io_err_map_rq(struct dm_target *ti, struct request *clone, 135 + union map_info *map_context) 136 + { 137 + return -EIO; 138 + } 139 + 134 140 static struct target_type error_target = { 135 141 .name = "error", 136 - .version = {1, 1, 0}, 142 + .version = {1, 2, 0}, 137 143 .ctr = io_err_ctr, 138 144 .dtr = io_err_dtr, 139 145 .map = io_err_map, 146 + .map_rq = io_err_map_rq, 140 147 }; 141 148 142 149 int __init dm_target_init(void)
+78 -46
drivers/md/dm-thin.c
··· 887 887 888 888 r = dm_pool_commit_metadata(pool->pmd); 889 889 if (r) 890 - DMERR_LIMIT("commit failed: error = %d", r); 890 + DMERR_LIMIT("%s: commit failed: error = %d", 891 + dm_device_name(pool->pool_md), r); 891 892 892 893 return r; 893 894 } ··· 918 917 unsigned long flags; 919 918 struct pool *pool = tc->pool; 920 919 920 + /* 921 + * Once no_free_space is set we must not allow allocation to succeed. 922 + * Otherwise it is difficult to explain, debug, test and support. 923 + */ 924 + if (pool->no_free_space) 925 + return -ENOSPC; 926 + 921 927 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 922 928 if (r) 923 929 return r; ··· 939 931 } 940 932 941 933 if (!free_blocks) { 942 - if (pool->no_free_space) 934 + /* 935 + * Try to commit to see if that will free up some 936 + * more space. 937 + */ 938 + (void) commit_or_fallback(pool); 939 + 940 + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 941 + if (r) 942 + return r; 943 + 944 + /* 945 + * If we still have no space we set a flag to avoid 946 + * doing all this checking and return -ENOSPC. This 947 + * flag serves as a latch that disallows allocations from 948 + * this pool until the admin takes action (e.g. resize or 949 + * table reload). 950 + */ 951 + if (!free_blocks) { 952 + DMWARN("%s: no free space available.", 953 + dm_device_name(pool->pool_md)); 954 + spin_lock_irqsave(&pool->lock, flags); 955 + pool->no_free_space = 1; 956 + spin_unlock_irqrestore(&pool->lock, flags); 943 957 return -ENOSPC; 944 - else { 945 - /* 946 - * Try to commit to see if that will free up some 947 - * more space. 948 - */ 949 - (void) commit_or_fallback(pool); 950 - 951 - r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 952 - if (r) 953 - return r; 954 - 955 - /* 956 - * If we still have no space we set a flag to avoid 957 - * doing all this checking and return -ENOSPC. 
958 - */ 959 - if (!free_blocks) { 960 - DMWARN("%s: no free space available.", 961 - dm_device_name(pool->pool_md)); 962 - spin_lock_irqsave(&pool->lock, flags); 963 - pool->no_free_space = 1; 964 - spin_unlock_irqrestore(&pool->lock, flags); 965 - return -ENOSPC; 966 - } 967 958 } 968 959 } 969 960 ··· 1092 1085 { 1093 1086 int r; 1094 1087 dm_block_t data_block; 1088 + struct pool *pool = tc->pool; 1095 1089 1096 1090 r = alloc_data_block(tc, &data_block); 1097 1091 switch (r) { ··· 1102 1094 break; 1103 1095 1104 1096 case -ENOSPC: 1105 - no_space(tc->pool, cell); 1097 + no_space(pool, cell); 1106 1098 break; 1107 1099 1108 1100 default: 1109 1101 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1110 1102 __func__, r); 1111 - cell_error(tc->pool, cell); 1103 + set_pool_mode(pool, PM_READ_ONLY); 1104 + cell_error(pool, cell); 1112 1105 break; 1113 1106 } 1114 1107 } ··· 1395 1386 1396 1387 switch (mode) { 1397 1388 case PM_FAIL: 1398 - DMERR("switching pool to failure mode"); 1389 + DMERR("%s: switching pool to failure mode", 1390 + dm_device_name(pool->pool_md)); 1399 1391 pool->process_bio = process_bio_fail; 1400 1392 pool->process_discard = process_bio_fail; 1401 1393 pool->process_prepared_mapping = process_prepared_mapping_fail; ··· 1404 1394 break; 1405 1395 1406 1396 case PM_READ_ONLY: 1407 - DMERR("switching pool to read-only mode"); 1397 + DMERR("%s: switching pool to read-only mode", 1398 + dm_device_name(pool->pool_md)); 1408 1399 r = dm_pool_abort_metadata(pool->pmd); 1409 1400 if (r) { 1410 - DMERR("aborting transaction failed"); 1401 + DMERR("%s: aborting transaction failed", 1402 + dm_device_name(pool->pool_md)); 1411 1403 set_pool_mode(pool, PM_FAIL); 1412 1404 } else { 1413 1405 dm_pool_metadata_read_only(pool->pmd); ··· 2168 2156 2169 2157 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2170 2158 if (r) { 2171 - DMERR("failed to retrieve data device size"); 2159 + DMERR("%s: failed to retrieve data device size", 2160 + 
dm_device_name(pool->pool_md)); 2172 2161 return r; 2173 2162 } 2174 2163 2175 2164 if (data_size < sb_data_size) { 2176 - DMERR("pool target (%llu blocks) too small: expected %llu", 2165 + DMERR("%s: pool target (%llu blocks) too small: expected %llu", 2166 + dm_device_name(pool->pool_md), 2177 2167 (unsigned long long)data_size, sb_data_size); 2178 2168 return -EINVAL; 2179 2169 2180 2170 } else if (data_size > sb_data_size) { 2181 2171 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2182 2172 if (r) { 2183 - DMERR("failed to resize data device"); 2173 + DMERR("%s: failed to resize data device", 2174 + dm_device_name(pool->pool_md)); 2184 2175 set_pool_mode(pool, PM_READ_ONLY); 2185 2176 return r; 2186 2177 } ··· 2207 2192 2208 2193 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); 2209 2194 if (r) { 2210 - DMERR("failed to retrieve data device size"); 2195 + DMERR("%s: failed to retrieve metadata device size", 2196 + dm_device_name(pool->pool_md)); 2211 2197 return r; 2212 2198 } 2213 2199 2214 2200 if (metadata_dev_size < sb_metadata_dev_size) { 2215 - DMERR("metadata device (%llu blocks) too small: expected %llu", 2201 + DMERR("%s: metadata device (%llu blocks) too small: expected %llu", 2202 + dm_device_name(pool->pool_md), 2216 2203 metadata_dev_size, sb_metadata_dev_size); 2217 2204 return -EINVAL; 2218 2205 2219 2206 } else if (metadata_dev_size > sb_metadata_dev_size) { 2220 2207 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2221 2208 if (r) { 2222 - DMERR("failed to resize metadata device"); 2209 + DMERR("%s: failed to resize metadata device", 2210 + dm_device_name(pool->pool_md)); 2223 2211 return r; 2224 2212 } 2225 2213 ··· 2548 2530 2549 2531 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); 2550 2532 if (r) { 2551 - DMERR("dm_pool_get_metadata_transaction_id returned %d", r); 2533 + DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", 2534 + dm_device_name(pool->pool_md), r); 2552 
2535 goto err; 2553 2536 } 2554 2537 2555 2538 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); 2556 2539 if (r) { 2557 - DMERR("dm_pool_get_free_metadata_block_count returned %d", r); 2540 + DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", 2541 + dm_device_name(pool->pool_md), r); 2558 2542 goto err; 2559 2543 } 2560 2544 2561 2545 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2562 2546 if (r) { 2563 - DMERR("dm_pool_get_metadata_dev_size returned %d", r); 2547 + DMERR("%s: dm_pool_get_metadata_dev_size returned %d", 2548 + dm_device_name(pool->pool_md), r); 2564 2549 goto err; 2565 2550 } 2566 2551 2567 2552 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); 2568 2553 if (r) { 2569 - DMERR("dm_pool_get_free_block_count returned %d", r); 2554 + DMERR("%s: dm_pool_get_free_block_count returned %d", 2555 + dm_device_name(pool->pool_md), r); 2570 2556 goto err; 2571 2557 } 2572 2558 2573 2559 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2574 2560 if (r) { 2575 - DMERR("dm_pool_get_data_dev_size returned %d", r); 2561 + DMERR("%s: dm_pool_get_data_dev_size returned %d", 2562 + dm_device_name(pool->pool_md), r); 2576 2563 goto err; 2577 2564 } 2578 2565 2579 2566 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2580 2567 if (r) { 2581 - DMERR("dm_pool_get_metadata_snap returned %d", r); 2568 + DMERR("%s: dm_pool_get_metadata_snap returned %d", 2569 + dm_device_name(pool->pool_md), r); 2582 2570 goto err; 2583 2571 } 2584 2572 ··· 2672 2648 { 2673 2649 struct pool_c *pt = ti->private; 2674 2650 struct pool *pool = pt->pool; 2651 + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 2675 2652 2676 - blk_limits_io_min(limits, 0); 2677 - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2653 + /* 2654 + * If the system-determined stacked limits are compatible with the 2655 + * pool's blocksize (io_opt is a factor) do not override them. 
2656 + */ 2657 + if (io_opt_sectors < pool->sectors_per_block || 2658 + do_div(io_opt_sectors, pool->sectors_per_block)) { 2659 + blk_limits_io_min(limits, 0); 2660 + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2661 + } 2678 2662 2679 2663 /* 2680 2664 * pt->adjusted_pf is a staging area for the actual features to use. ··· 2701 2669 .name = "thin-pool", 2702 2670 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2703 2671 DM_TARGET_IMMUTABLE, 2704 - .version = {1, 8, 0}, 2672 + .version = {1, 9, 0}, 2705 2673 .module = THIS_MODULE, 2706 2674 .ctr = pool_ctr, 2707 2675 .dtr = pool_dtr, ··· 2988 2956 2989 2957 static struct target_type thin_target = { 2990 2958 .name = "thin", 2991 - .version = {1, 8, 0}, 2959 + .version = {1, 9, 0}, 2992 2960 .module = THIS_MODULE, 2993 2961 .ctr = thin_ctr, 2994 2962 .dtr = thin_dtr,
+65 -5
drivers/md/dm.c
··· 60 60 struct bio *bio; 61 61 unsigned long start_time; 62 62 spinlock_t endio_lock; 63 + struct dm_stats_aux stats_aux; 63 64 }; 64 65 65 66 /* ··· 199 198 200 199 /* zero-length flush that will be cloned and submitted to targets */ 201 200 struct bio flush_bio; 201 + 202 + struct dm_stats stats; 202 203 }; 203 204 204 205 /* ··· 272 269 dm_io_init, 273 270 dm_kcopyd_init, 274 271 dm_interface_init, 272 + dm_statistics_init, 275 273 }; 276 274 277 275 static void (*_exits[])(void) = { ··· 283 279 dm_io_exit, 284 280 dm_kcopyd_exit, 285 281 dm_interface_exit, 282 + dm_statistics_exit, 286 283 }; 287 284 288 285 static int __init dm_init(void) ··· 389 384 return r; 390 385 } 391 386 387 + sector_t dm_get_size(struct mapped_device *md) 388 + { 389 + return get_capacity(md->disk); 390 + } 391 + 392 + struct dm_stats *dm_get_stats(struct mapped_device *md) 393 + { 394 + return &md->stats; 395 + } 396 + 392 397 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 393 398 { 394 399 struct mapped_device *md = bdev->bd_disk->private_data; ··· 481 466 static void start_io_acct(struct dm_io *io) 482 467 { 483 468 struct mapped_device *md = io->md; 469 + struct bio *bio = io->bio; 484 470 int cpu; 485 - int rw = bio_data_dir(io->bio); 471 + int rw = bio_data_dir(bio); 486 472 487 473 io->start_time = jiffies; 488 474 ··· 492 476 part_stat_unlock(); 493 477 atomic_set(&dm_disk(md)->part0.in_flight[rw], 494 478 atomic_inc_return(&md->pending[rw])); 479 + 480 + if (unlikely(dm_stats_used(&md->stats))) 481 + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 482 + bio_sectors(bio), false, 0, &io->stats_aux); 495 483 } 496 484 497 485 static void end_io_acct(struct dm_io *io) ··· 510 490 part_round_stats(cpu, &dm_disk(md)->part0); 511 491 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 512 492 part_stat_unlock(); 493 + 494 + if (unlikely(dm_stats_used(&md->stats))) 495 + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 
496 + bio_sectors(bio), true, duration, &io->stats_aux); 513 497 514 498 /* 515 499 * After this is decremented the bio must not be touched if it is ··· 1543 1519 return; 1544 1520 } 1545 1521 1546 - static int dm_request_based(struct mapped_device *md) 1522 + int dm_request_based(struct mapped_device *md) 1547 1523 { 1548 1524 return blk_queue_stackable(md->queue); 1549 1525 } ··· 1970 1946 add_disk(md->disk); 1971 1947 format_dev_t(md->name, MKDEV(_major, minor)); 1972 1948 1973 - md->wq = alloc_workqueue("kdmflush", 1974 - WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1949 + md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 1975 1950 if (!md->wq) 1976 1951 goto bad_thread; 1977 1952 ··· 1981 1958 bio_init(&md->flush_bio); 1982 1959 md->flush_bio.bi_bdev = md->bdev; 1983 1960 md->flush_bio.bi_rw = WRITE_FLUSH; 1961 + 1962 + dm_stats_init(&md->stats); 1984 1963 1985 1964 /* Populate the mapping, nobody knows we exist yet */ 1986 1965 spin_lock(&_minor_lock); ··· 2035 2010 2036 2011 put_disk(md->disk); 2037 2012 blk_cleanup_queue(md->queue); 2013 + dm_stats_cleanup(&md->stats); 2038 2014 module_put(THIS_MODULE); 2039 2015 kfree(md); 2040 2016 } ··· 2177 2151 /* 2178 2152 * Wipe any geometry if the size of the table changed. 2179 2153 */ 2180 - if (size != get_capacity(md->disk)) 2154 + if (size != dm_get_size(md)) 2181 2155 memset(&md->geometry, 0, sizeof(md->geometry)); 2182 2156 2183 2157 __set_size(md, size); ··· 2262 2236 2263 2237 void dm_set_md_type(struct mapped_device *md, unsigned type) 2264 2238 { 2239 + BUG_ON(!mutex_is_locked(&md->type_lock)); 2265 2240 md->type = type; 2266 2241 } 2267 2242 2268 2243 unsigned dm_get_md_type(struct mapped_device *md) 2269 2244 { 2245 + BUG_ON(!mutex_is_locked(&md->type_lock)); 2270 2246 return md->type; 2271 2247 } 2272 2248 ··· 2721 2693 mutex_unlock(&md->suspend_lock); 2722 2694 2723 2695 return r; 2696 + } 2697 + 2698 + /* 2699 + * Internal suspend/resume works like userspace-driven suspend. 
It waits 2700 + * until all bios finish and prevents issuing new bios to the target drivers. 2701 + * It may be used only from the kernel. 2702 + * 2703 + * Internal suspend holds md->suspend_lock, which prevents interaction with 2704 + * userspace-driven suspend. 2705 + */ 2706 + 2707 + void dm_internal_suspend(struct mapped_device *md) 2708 + { 2709 + mutex_lock(&md->suspend_lock); 2710 + if (dm_suspended_md(md)) 2711 + return; 2712 + 2713 + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2714 + synchronize_srcu(&md->io_barrier); 2715 + flush_workqueue(md->wq); 2716 + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2717 + } 2718 + 2719 + void dm_internal_resume(struct mapped_device *md) 2720 + { 2721 + if (dm_suspended_md(md)) 2722 + goto done; 2723 + 2724 + dm_queue_flush(md); 2725 + 2726 + done: 2727 + mutex_unlock(&md->suspend_lock); 2724 2728 } 2725 2729 2726 2730 /*-----------------------------------------------------------------
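The internal suspend/resume pair added above holds `md->suspend_lock` across the whole suspend..resume window, gates new bios with `DMF_BLOCK_IO_FOR_SUSPEND`, and waits for in-flight I/O to drain. The contract can be sketched as a single-threaded userspace toy; `toy_md`, `toy_submit_io`, and the other names here are hypothetical stand-ins, with no real locking or SRCU:

```c
#include <assert.h>
#include <stdbool.h>

/* Toy model of the internal suspend/resume contract added to dm.c:
 * suspend takes the lock, blocks new I/O and waits for in-flight I/O;
 * resume re-opens the gate and drops the lock.  Single-threaded
 * stand-in -- the kernel uses a mutex, SRCU and a wait queue. */
struct toy_md {
    bool suspend_lock_held;   /* stands in for md->suspend_lock */
    bool block_io;            /* stands in for DMF_BLOCK_IO_FOR_SUSPEND */
    int in_flight;
};

static bool toy_submit_io(struct toy_md *md)
{
    if (md->block_io)
        return false;         /* new bios are refused while suspended */
    md->in_flight++;
    return true;
}

static void toy_complete_io(struct toy_md *md)
{
    md->in_flight--;
}

static void toy_internal_suspend(struct toy_md *md)
{
    md->suspend_lock_held = true;   /* held until resume */
    md->block_io = true;
    /* real code: synchronize_srcu() + flush_workqueue() +
     * dm_wait_for_completion(); here outstanding I/O just drains */
    while (md->in_flight)
        toy_complete_io(md);
}

static void toy_internal_resume(struct toy_md *md)
{
    md->block_io = false;           /* dm_queue_flush() in the real code */
    md->suspend_lock_held = false;  /* mutex_unlock(&md->suspend_lock) */
}
```

Because the lock is held from suspend to resume, a concurrent userspace-driven suspend cannot interleave with the internal one, which is the property the comment block in the diff calls out.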
+27
drivers/md/dm.h
··· 16 16 #include <linux/blkdev.h> 17 17 #include <linux/hdreg.h> 18 18 19 + #include "dm-stats.h" 20 + 19 21 /* 20 22 * Suspend feature flags 21 23 */ ··· 91 89 #define dm_target_is_valid(t) ((t)->table) 92 90 93 91 /* 92 + * To check whether the target type is bio-based or not (request-based). 93 + */ 94 + #define dm_target_bio_based(t) ((t)->type->map != NULL) 95 + 96 + /* 94 97 * To check whether the target type is request-based or not (bio-based). 95 98 */ 96 99 #define dm_target_request_based(t) ((t)->type->map_rq != NULL) 100 + 101 + /* 102 + * To check whether the target type is a hybrid (capable of being 103 + * either request-based or bio-based). 104 + */ 105 + #define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t)) 97 106 98 107 /*----------------------------------------------------------------- 99 108 * A registry of target types. ··· 159 146 void dm_destroy_immediate(struct mapped_device *md); 160 147 int dm_open_count(struct mapped_device *md); 161 148 int dm_lock_for_deletion(struct mapped_device *md); 149 + int dm_request_based(struct mapped_device *md); 150 + sector_t dm_get_size(struct mapped_device *md); 151 + struct dm_stats *dm_get_stats(struct mapped_device *md); 162 152 163 153 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 164 154 unsigned cookie); 155 + 156 + void dm_internal_suspend(struct mapped_device *md); 157 + void dm_internal_resume(struct mapped_device *md); 165 158 166 159 int dm_io_init(void); 167 160 void dm_io_exit(void); ··· 180 161 */ 181 162 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); 182 163 void dm_free_md_mempools(struct dm_md_mempools *pools); 164 + 165 + /* 166 + * Helpers that are used by DM core 167 + */ 168 + static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) 169 + { 170 + return !maxlen || strlen(result) + 1 >= maxlen; 171 + } 183 172 184 173 #endif
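The new `dm_message_test_buffer_overflow()` helper in dm.h reports whether a status/message buffer has run out of room: it returns true when `maxlen` is zero or when the string plus its terminating NUL no longer fits. The function below is the helper verbatim (it compiles standalone), followed by nothing kernel-specific:

```c
#include <assert.h>
#include <stdbool.h>
#include <string.h>

/* Verbatim logic of the new dm.h helper: a status/message buffer has
 * overflowed if there is no room left for even a terminating NUL. */
static inline bool dm_message_test_buffer_overflow(char *result,
                                                   unsigned maxlen)
{
    return !maxlen || strlen(result) + 1 >= maxlen;
}
```

Callers in DM core build up `result` with bounded formatting (the `DMEMIT()` style) and then use this check to decide whether the output was truncated.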
+5
drivers/md/persistent-data/dm-block-manager.c
··· 615 615 } 616 616 EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); 617 617 618 + void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) 619 + { 620 + dm_bufio_prefetch(bm->bufio, b, 1); 621 + } 622 + 618 623 void dm_bm_set_read_only(struct dm_block_manager *bm) 619 624 { 620 625 bm->read_only = true;
+5
drivers/md/persistent-data/dm-block-manager.h
··· 108 108 int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 109 109 struct dm_block *superblock); 110 110 111 + /* 112 + * Request data be prefetched into the cache. 113 + */ 114 + void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); 115 + 111 116 /* 112 117 * Switches the bm to a read only mode. Once read-only mode 113 118 * has been entered the following functions will return -EPERM.
+22 -6
drivers/md/persistent-data/dm-btree.c
··· 161 161 }; 162 162 163 163 struct del_stack { 164 + struct dm_btree_info *info; 164 165 struct dm_transaction_manager *tm; 165 166 int top; 166 167 struct frame spine[MAX_SPINE_DEPTH]; ··· 182 181 static int unprocessed_frames(struct del_stack *s) 183 182 { 184 183 return s->top >= 0; 184 + } 185 + 186 + static void prefetch_children(struct del_stack *s, struct frame *f) 187 + { 188 + unsigned i; 189 + struct dm_block_manager *bm = dm_tm_get_bm(s->tm); 190 + 191 + for (i = 0; i < f->nr_children; i++) 192 + dm_bm_prefetch(bm, value64(f->n, i)); 193 + } 194 + 195 + static bool is_internal_level(struct dm_btree_info *info, struct frame *f) 196 + { 197 + return f->level < (info->levels - 1); 185 198 } 186 199 187 200 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) ··· 220 205 dm_tm_dec(s->tm, b); 221 206 222 207 else { 208 + uint32_t flags; 223 209 struct frame *f = s->spine + ++s->top; 224 210 225 211 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); ··· 233 217 f->level = level; 234 218 f->nr_children = le32_to_cpu(f->n->header.nr_entries); 235 219 f->current_child = 0; 220 + 221 + flags = le32_to_cpu(f->n->header.flags); 222 + if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) 223 + prefetch_children(s, f); 236 224 } 237 225 238 226 return 0; ··· 250 230 dm_tm_unlock(s->tm, f->b); 251 231 } 252 232 253 - static bool is_internal_level(struct dm_btree_info *info, struct frame *f) 254 - { 255 - return f->level < (info->levels - 1); 256 - } 257 - 258 233 int dm_btree_del(struct dm_btree_info *info, dm_block_t root) 259 234 { 260 235 int r; ··· 258 243 s = kmalloc(sizeof(*s), GFP_KERNEL); 259 244 if (!s) 260 245 return -ENOMEM; 246 + s->info = info; 261 247 s->tm = info->tm; 262 248 s->top = -1; 263 249 ··· 303 287 info->value_type.dec(info->value_type.context, 304 288 value_ptr(f->n, i)); 305 289 } 306 - f->current_child = f->nr_children; 290 + pop_frame(s); 307 291 } 308 292 } 309 293
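The dm-btree changes keep `dm_btree_del()`'s explicit frame stack, but now prefetch a frame's children as soon as an internal node is pushed and pop exhausted frames directly via `pop_frame()` instead of faking completion with `f->current_child = f->nr_children`. A minimal userspace analogue of that walk is sketched below; `struct node`, `del_walk`, and the counters are invented for illustration, and "prefetch" is modelled as a counter rather than a `dm_bm_prefetch()` call:

```c
#include <assert.h>
#include <stddef.h>

/* Toy analogue of dm_btree_del's explicit frame stack: walk a small
 * tree iteratively, "prefetching" (here: counting) children when a
 * frame is pushed and popping frames as soon as they are exhausted. */
struct node {
    int nr_children;
    struct node *child[4];      /* all NULL for a leaf */
};

struct frame {
    struct node *n;
    int current_child;
};

static int visited, prefetched;

static void push_frame(struct frame *spine, int *top, struct node *n)
{
    spine[++*top] = (struct frame){ .n = n, .current_child = 0 };
    visited++;
    /* hint that the children will be needed soon */
    for (int i = 0; i < n->nr_children; i++)
        if (n->child[i])
            prefetched++;
}

static void del_walk(struct node *root)
{
    struct frame spine[16];
    int top = -1;

    push_frame(spine, &top, root);
    while (top >= 0) {
        struct frame *f = &spine[top];

        if (f->current_child >= f->n->nr_children)
            top--;              /* pop_frame: nothing left in this frame */
        else
            push_frame(spine, &top, f->n->child[f->current_child++]);
    }
}
```

In the real code the prefetch issues asynchronous reads through the block manager, so child nodes are likely already in the bufio cache by the time the walk descends into them.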
+50 -29
drivers/md/persistent-data/dm-space-map-common.c
··· 292 292 return dm_tm_unlock(ll->tm, blk); 293 293 } 294 294 295 - int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) 295 + static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, 296 + uint32_t *result) 296 297 { 297 298 __le32 le_rc; 298 - int r = sm_ll_lookup_bitmap(ll, b, result); 299 - 300 - if (r) 301 - return r; 302 - 303 - if (*result != 3) 304 - return r; 299 + int r; 305 300 306 301 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); 307 302 if (r < 0) ··· 305 310 *result = le32_to_cpu(le_rc); 306 311 307 312 return r; 313 + } 314 + 315 + int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) 316 + { 317 + int r = sm_ll_lookup_bitmap(ll, b, result); 318 + 319 + if (r) 320 + return r; 321 + 322 + if (*result != 3) 323 + return r; 324 + 325 + return sm_ll_lookup_big_ref_count(ll, b, result); 308 326 } 309 327 310 328 int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, ··· 380 372 return -ENOSPC; 381 373 } 382 374 383 - int sm_ll_insert(struct ll_disk *ll, dm_block_t b, 384 - uint32_t ref_count, enum allocation_event *ev) 375 + static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, 376 + uint32_t (*mutator)(void *context, uint32_t old), 377 + void *context, enum allocation_event *ev) 385 378 { 386 379 int r; 387 - uint32_t bit, old; 380 + uint32_t bit, old, ref_count; 388 381 struct dm_block *nb; 389 382 dm_block_t index = b; 390 383 struct disk_index_entry ie_disk; ··· 407 398 408 399 bm_le = dm_bitmap_data(nb); 409 400 old = sm_lookup_bitmap(bm_le, bit); 401 + 402 + if (old > 2) { 403 + r = sm_ll_lookup_big_ref_count(ll, b, &old); 404 + if (r < 0) 405 + return r; 406 + } 407 + 408 + ref_count = mutator(context, old); 410 409 411 410 if (ref_count <= 2) { 412 411 sm_set_bitmap(bm_le, bit, ref_count); ··· 465 448 return ll->save_ie(ll, index, &ie_disk); 466 449 } 467 450 451 + static uint32_t set_ref_count(void *context, uint32_t old) 452 + { 453 + return *((uint32_t *) 
context); 454 + } 455 + 456 + int sm_ll_insert(struct ll_disk *ll, dm_block_t b, 457 + uint32_t ref_count, enum allocation_event *ev) 458 + { 459 + return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); 460 + } 461 + 462 + static uint32_t inc_ref_count(void *context, uint32_t old) 463 + { 464 + return old + 1; 465 + } 466 + 468 467 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) 469 468 { 470 - int r; 471 - uint32_t rc; 469 + return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); 470 + } 472 471 473 - r = sm_ll_lookup(ll, b, &rc); 474 - if (r) 475 - return r; 476 - 477 - return sm_ll_insert(ll, b, rc + 1, ev); 472 + static uint32_t dec_ref_count(void *context, uint32_t old) 473 + { 474 + return old - 1; 478 475 } 479 476 480 477 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) 481 478 { 482 - int r; 483 - uint32_t rc; 484 - 485 - r = sm_ll_lookup(ll, b, &rc); 486 - if (r) 487 - return r; 488 - 489 - if (!rc) 490 - return -EINVAL; 491 - 492 - return sm_ll_insert(ll, b, rc - 1, ev); 479 + return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); 493 480 } 494 481 495 482 int sm_ll_commit(struct ll_disk *ll)
+9
include/linux/device-mapper.h
··· 10 10 11 11 #include <linux/bio.h> 12 12 #include <linux/blkdev.h> 13 + #include <linux/math64.h> 13 14 #include <linux/ratelimit.h> 14 15 15 16 struct dm_dev; ··· 550 549 #define DM_MAPIO_SUBMITTED 0 551 550 #define DM_MAPIO_REMAPPED 1 552 551 #define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE 552 + 553 + #define dm_sector_div64(x, y)( \ 554 + { \ 555 + u64 _res; \ 556 + (x) = div64_u64_rem(x, y, &_res); \ 557 + _res; \ 558 + } \ 559 + ) 553 560 554 561 /* 555 562 * Ceiling(n / sz)
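The new `dm_sector_div64()` macro mirrors `sector_div()`: it replaces `x` with the quotient in place and the macro expression itself evaluates to the remainder. The snippet below reproduces the macro from the diff and makes it compile in userspace by supplying the generic `div64_u64_rem` fallback from math64.h; note the macro relies on GCC statement expressions (`({ ... })`), as kernel code routinely does:

```c
#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;

/* Userspace stand-in for the generic div64_u64_rem fallback, so the
 * macro below compiles outside the kernel. */
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
    *remainder = dividend % divisor;
    return dividend / divisor;
}

/* Same shape as the new device-mapper.h macro: x becomes the
 * quotient, the expression evaluates to the remainder.
 * (GCC statement-expression extension.) */
#define dm_sector_div64(x, y)( \
{ \
    u64 _res; \
    (x) = div64_u64_rem(x, y, &_res); \
    _res; \
} \
)
```

Typical use in a target: `remainder = dm_sector_div64(sector, region_size);` leaves the region index in `sector` and the offset within the region in `remainder`.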
+13
include/linux/math64.h
··· 31 31 } 32 32 33 33 /** 34 + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder 35 + */ 36 + static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) 37 + { 38 + *remainder = dividend % divisor; 39 + return dividend / divisor; 40 + } 41 + 42 + /** 34 43 * div64_u64 - unsigned 64bit divide with 64bit divisor 35 44 */ 36 45 static inline u64 div64_u64(u64 dividend, u64 divisor) ··· 70 61 71 62 #ifndef div_s64_rem 72 63 extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); 64 + #endif 65 + 66 + #ifndef div64_u64_rem 67 + extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); 73 68 #endif 74 69 75 70 #ifndef div64_u64
+2 -2
include/uapi/linux/dm-ioctl.h
··· 267 267 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 268 268 269 269 #define DM_VERSION_MAJOR 4 270 - #define DM_VERSION_MINOR 25 270 + #define DM_VERSION_MINOR 26 271 271 #define DM_VERSION_PATCHLEVEL 0 272 - #define DM_VERSION_EXTRA "-ioctl (2013-06-26)" 272 + #define DM_VERSION_EXTRA "-ioctl (2013-08-15)" 273 273 274 274 /* Status bits */ 275 275 #define DM_READONLY_FLAG (1 << 0) /* In/Out */
+40
lib/div64.c
··· 79 79 #endif 80 80 81 81 /** 82 + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder 83 + * @dividend: 64bit dividend 84 + * @divisor: 64bit divisor 85 + * @remainder: 64bit remainder 86 + * 87 + * This implementation is comparable to the algorithm used by div64_u64. 88 + * But this operation, which includes math for calculating the remainder, 89 + * is kept distinct to avoid slowing down the div64_u64 operation on 32bit 90 + * systems. 91 + */ 92 + #ifndef div64_u64_rem 93 + u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) 94 + { 95 + u32 high = divisor >> 32; 96 + u64 quot; 97 + 98 + if (high == 0) { 99 + u32 rem32; 100 + quot = div_u64_rem(dividend, divisor, &rem32); 101 + *remainder = rem32; 102 + } else { 103 + int n = 1 + fls(high); 104 + quot = div_u64(dividend >> n, divisor >> n); 105 + 106 + if (quot != 0) 107 + quot--; 108 + 109 + *remainder = dividend - quot * divisor; 110 + if (*remainder >= divisor) { 111 + quot++; 112 + *remainder -= divisor; 113 + } 114 + } 115 + 116 + return quot; 117 + } 118 + EXPORT_SYMBOL(div64_u64_rem); 119 + #endif 120 + 121 + /** 82 122 * div64_u64 - unsigned 64bit divide with 64bit divisor 83 123 * @dividend: 64bit dividend 84 124 * @divisor: 64bit divisor
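The fallback above narrows the divisor to 32 bits by shifting both operands right by `1 + fls(high)`, divides, and then corrects the quotient, which can be low by at most one. The same shift-and-correct algorithm can be exercised in userspace; `my_div64_u64_rem` and `fls32` are illustrative names standing in for the kernel's `div64_u64_rem`, `fls()`, `div_u64()` and `div_u64_rem()`:

```c
#include <assert.h>
#include <stdint.h>

/* find-last-set: number of bits needed to represent x (fls(1) == 1) */
static int fls32(uint32_t x)
{
    int r = 0;

    while (x) {
        x >>= 1;
        r++;
    }
    return r;
}

/* Userspace sketch of the div64_u64_rem algorithm: shift both operands
 * right until the divisor fits in 32 bits, divide, then fix up the
 * quotient -- the estimate is off by at most one. */
static uint64_t my_div64_u64_rem(uint64_t dividend, uint64_t divisor,
                                 uint64_t *remainder)
{
    uint32_t high = divisor >> 32;
    uint64_t quot;

    if (high == 0) {
        quot = dividend / divisor;     /* kernel: div_u64_rem() */
        *remainder = dividend % divisor;
    } else {
        int n = 1 + fls32(high);

        quot = (dividend >> n) / (divisor >> n);

        if (quot != 0)
            quot--;

        *remainder = dividend - quot * divisor;
        if (*remainder >= divisor) {
            quot++;
            *remainder -= divisor;
        }
    }

    return quot;
}
```

Keeping this as a separate helper (rather than extending `div64_u64`) means callers that only need the quotient pay nothing for the remainder bookkeeping on 32-bit systems, which is what dm-stats needed from the reintroduced helper.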