Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

dm thin: fix a race condition between discarding and provisioning a block

The discard passdown was being issued after the block was unmapped,
which meant the block could be reprovisioned whilst the passdown discard
was still in flight.

We can only identify unshared blocks (safe to pass a discard down to)
once they're unmapped and their ref count hits zero.  Block ref counts
are now used to guard against concurrent allocation of blocks that are
in the process of being discarded.  So now we unmap the block, issue
the passdown discard, and then immediately increment the ref counts for
the regions that have just been discarded via passdown (this is safe
because allocation occurs within the same thread).  We then decrement
the ref counts once the passdown discard IO completes -- signalling
that these blocks may now be allocated.
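
To make the ordering concrete, here is a minimal single-threaded
userspace model of the scheme described above (a hedged sketch: the toy
allocator and the names NBLOCKS, alloc_block, discard_pt1 and
discard_pt2 are hypothetical stand-ins, not driver code):

#include <assert.h>
#include <stdio.h>

#define NBLOCKS 8

static int refcount[NBLOCKS];          /* models the data space-map */
static int discard_in_flight[NBLOCKS]; /* models the passdown bio */

/* The allocator only hands out blocks whose ref count is zero. */
static int alloc_block(void)
{
        for (int b = 0; b < NBLOCKS; b++) {
                if (refcount[b] == 0) {
                        refcount[b] = 1;
                        return b;
                }
        }
        return -1;
}

/* Part 1: unmap, submit the passdown discard, then bump the ref count. */
static void discard_pt1(int b)
{
        refcount[b] = 0;          /* unmap: ref count hits zero */
        discard_in_flight[b] = 1; /* passdown discard now in flight */
        refcount[b]++;            /* guard against reallocation */
}

/* Part 2: runs once the discard completes; drop the guard. */
static void discard_pt2(int b)
{
        discard_in_flight[b] = 0;
        refcount[b]--;            /* block may be allocated again */
}

int main(void)
{
        int b = alloc_block();    /* provision a block */

        discard_pt1(b);

        /*
         * Without the pt1 increment, the allocator would see a zero
         * ref count here and could hand b back out while the discard
         * was still in flight -- the reported corruption.
         */
        int c = alloc_block();
        assert(c != b);

        discard_pt2(b);
        int d = alloc_block();    /* only now is b reusable */
        assert(d == b);

        printf("block %d re-allocated only after passdown completed\n", b);
        return 0;
}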

This fixes the potential for corruption that was reported here:
https://www.redhat.com/archives/dm-devel/2016-June/msg00311.html

Reported-by: Dennis Yang <dennisyang@qnap.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Joe Thornber and committed by Mike Snitzer
2a0fbffb e7e0f730

3 changed files: +130 -17

drivers/md/dm-thin-metadata.c (+30)
···
         return r;
 }
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+        int r = 0;
+
+        down_write(&pmd->root_lock);
+        for (; b != e; b++) {
+                r = dm_sm_inc_block(pmd->data_sm, b);
+                if (r)
+                        break;
+        }
+        up_write(&pmd->root_lock);
+
+        return r;
+}
+
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+        int r = 0;
+
+        down_write(&pmd->root_lock);
+        for (; b != e; b++) {
+                r = dm_sm_dec_block(pmd->data_sm, b);
+                if (r)
+                        break;
+        }
+        up_write(&pmd->root_lock);
+
+        return r;
+}
+
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
         int r;
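
For orientation, the intended calling pattern for these new helpers is
the one wired up in the dm-thin.c hunks below: bracket the in-flight
passdown discard by incrementing the range's ref counts up front and
decrementing them from the completion path. A schematic sketch (error
handling elided; pmd, data_begin and data_end are placeholders):

        /* pt1: guard [data_begin, data_end) while the discard is in flight */
        r = dm_pool_inc_data_range(pmd, data_begin, data_end);

        /* ... passdown discard bio is submitted and later completes ... */

        /* pt2: drop the guard; the blocks may be provisioned again */
        r = dm_pool_dec_data_range(pmd, data_begin, data_end);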

drivers/md/dm-thin-metadata.h (+3)
···
 
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+
 /*
  * Returns -ENOSPC if the new size is too small and already allocated
  * blocks would be lost.

drivers/md/dm-thin.c (+97 -17)
···
         struct bio_list deferred_flush_bios;
         struct list_head prepared_mappings;
         struct list_head prepared_discards;
+        struct list_head prepared_discards_pt2;
         struct list_head active_thins;
 
         struct dm_deferred_set *shared_read_ds;
···
 
         process_mapping_fn process_prepared_mapping;
         process_mapping_fn process_prepared_discard;
+        process_mapping_fn process_prepared_discard_pt2;
 
         struct dm_bio_prison_cell **cell_sort_array;
 };
···
 
 /*----------------------------------------------------------------*/
 
-static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
+static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
+                                                   struct bio *discard_parent)
 {
         /*
          * We've already unmapped this range of blocks, but before we
···
         dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
         struct discard_op op;
 
-        begin_discard(&op, tc, m->bio);
+        begin_discard(&op, tc, discard_parent);
         while (b != end) {
                 /* find start of unmapped run */
                 for (; b < end; b++) {
···
         end_discard(&op, r);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+        unsigned long flags;
+        struct pool *pool = m->tc->pool;
+
+        spin_lock_irqsave(&pool->lock, flags);
+        list_add_tail(&m->list, &pool->prepared_discards_pt2);
+        spin_unlock_irqrestore(&pool->lock, flags);
+        wake_worker(pool);
+}
+
+static void passdown_endio(struct bio *bio)
+{
+        /*
+         * It doesn't matter if the passdown discard failed, we still want
+         * to unmap (we ignore err).
+         */
+        queue_passdown_pt2(bio->bi_private);
+}
+
+static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
+{
+        int r;
+        struct thin_c *tc = m->tc;
+        struct pool *pool = tc->pool;
+        struct bio *discard_parent;
+        dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
+
+        /*
+         * Only this thread allocates blocks, so we can be sure that the
+         * newly unmapped blocks will not be allocated before the end of
+         * the function.
+         */
+        r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
+        if (r) {
+                metadata_operation_failed(pool, "dm_thin_remove_range", r);
+                bio_io_error(m->bio);
+                cell_defer_no_holder(tc, m->cell);
+                mempool_free(m, pool->mapping_pool);
+                return;
+        }
+
+        discard_parent = bio_alloc(GFP_NOIO, 1);
+        if (!discard_parent) {
+                DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
+                       dm_device_name(tc->pool->pool_md));
+                queue_passdown_pt2(m);
+
+        } else {
+                discard_parent->bi_end_io = passdown_endio;
+                discard_parent->bi_private = m;
+
+                if (m->maybe_shared)
+                        passdown_double_checking_shared_status(m, discard_parent);
+                else {
+                        struct discard_op op;
+
+                        begin_discard(&op, tc, discard_parent);
+                        r = issue_discard(&op, m->data_block, data_end);
+                        end_discard(&op, r);
+                }
+        }
+
+        /*
+         * Increment the unmapped blocks.  This prevents a race between the
+         * passdown io and reallocation of freed blocks.
+         */
+        r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+        if (r) {
+                metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+                bio_io_error(m->bio);
+                cell_defer_no_holder(tc, m->cell);
+                mempool_free(m, pool->mapping_pool);
+                return;
+        }
+}
+
+static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
 {
         int r;
         struct thin_c *tc = m->tc;
         struct pool *pool = tc->pool;
 
-        r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
+        /*
+         * The passdown has completed, so now we can decrement all those
+         * unmapped blocks.
+         */
+        r = dm_pool_dec_data_range(pool->pmd, m->data_block,
+                                   m->data_block + (m->virt_end - m->virt_begin));
         if (r) {
-                metadata_operation_failed(pool, "dm_thin_remove_range", r);
+                metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
                 bio_io_error(m->bio);
-
-        } else if (m->maybe_shared) {
-                passdown_double_checking_shared_status(m);
-
-        } else {
-                struct discard_op op;
-                begin_discard(&op, tc, m->bio);
-                r = issue_discard(&op, m->data_block,
-                                  m->data_block + (m->virt_end - m->virt_begin));
-                end_discard(&op, r);
-        }
+        } else
+                bio_endio(m->bio);
 
         cell_defer_no_holder(tc, m->cell);
         mempool_free(m, pool->mapping_pool);
···
         throttle_work_update(&pool->throttle);
         process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
         throttle_work_update(&pool->throttle);
+        process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
+        throttle_work_update(&pool->throttle);
         process_deferred_bios(pool);
         throttle_work_complete(&pool->throttle);
 }
···
 
         if (passdown_enabled(pt)) {
                 pool->process_discard_cell = process_discard_cell_passdown;
-                pool->process_prepared_discard = process_prepared_discard_passdown;
+                pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
+                pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
         } else {
                 pool->process_discard_cell = process_discard_cell_no_passdown;
                 pool->process_prepared_discard = process_prepared_discard_no_passdown;
···
         bio_list_init(&pool->deferred_flush_bios);
         INIT_LIST_HEAD(&pool->prepared_mappings);
         INIT_LIST_HEAD(&pool->prepared_discards);
+        INIT_LIST_HEAD(&pool->prepared_discards_pt2);
         INIT_LIST_HEAD(&pool->active_thins);
         pool->low_water_triggered = false;
         pool->suspended = true;