dm table: rework reference counting

Rework table reference counting.

The existing code uses a reference counter. When the last reference is
dropped and the counter reaches zero, the table destructor is called.
Table references are acquired and released in upcalls from other
kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all).
If the reference counter reaches zero in one of the upcalls, the table
destructor is called from almost random kernel code.
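For illustration, the old put path in sketch form, condensed from the
code this patch removes (the _old suffix is added here for clarity;
the real function is dm_table_put):

	static void dm_table_put_old(struct dm_table *t)
	{
		if (!t)
			return;

		/* the last put runs the destructor, whatever the context */
		if (atomic_dec_and_test(&t->holders))
			table_destroy(t);	/* may sleep */
	}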

This leads to various problems:
* dm_any_congested being called under a spinlock and, by dropping the
last reference, invoking the destructor, which then calls a sleeping
function.
* the destructor attempting to take a lock that is already held by the
same process.
* a stale reference from some other kernel code keeping the table
constructed, which keeps some devices open, even after a successful
return from "dmsetup remove". This can confuse lvm and prevent the
closing of underlying devices or the reuse of device minor numbers.

The patch changes reference counting so that the table destructor can be
called only at predetermined places.

The table always has exactly one reference, from either mapped_device->map
or hash_cell->new_map. After this patch, this reference is not counted in
table->holders. A pair of functions, dm_table_create() and
dm_table_destroy(), is used for table creation/destruction.

Temporary references from other code increase table->holders. A pair of
functions, dm_table_get() and dm_table_put(), is used to manipulate it.
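In sketch form (dm_table_put is taken from the patch below; dm_table_get
is unchanged by this patch and its one-line body is shown here as an
assumption):

	void dm_table_get(struct dm_table *t)
	{
		atomic_inc(&t->holders);	/* temporary reference */
	}

	void dm_table_put(struct dm_table *t)
	{
		if (!t)
			return;

		/* dropping a temporary reference never destroys the table */
		smp_mb__before_atomic_dec();
		atomic_dec(&t->holders);
	}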

When the table is about to be destroyed, we wait for table->holders to
reach 0 and only then call the table destructor. We use active waiting
with msleep(1) because the situation is rare (it has affected one user
in five years) and removing a device isn't a performance-critical task:
the user doesn't care whether it takes one tick more or not.
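The wait itself, excerpted from dm_table_destroy() in the patch below;
the smp_mb() pairs with the smp_mb__before_atomic_dec() in
dm_table_put(), so a holder's last accesses to the table are ordered
before the destructor starts freeing it:

	while (atomic_read(&t->holders))
		msleep(1);
	smp_mb();	/* pairs with dm_table_put() */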

This way, the destructor is called only at specific points (the
dm_table_destroy function), and the problems associated with lazy
destruction listed above can't happen.

Finally remove the temporary protection added to dm_any_congested().

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Authored by Mikulas Patocka, committed by Alasdair G Kergon.
d5816876 ab4c1424

Total: +34 -21

drivers/md/dm-ioctl.c (+4 -6)
···
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	dm_put(hc->md);
 	free_cell(hc);
 }
···
 		r = dm_swap_table(md, new_map);
 		if (r) {
+			dm_table_destroy(new_map);
 			dm_put(md);
-			dm_table_put(new_map);
 			return r;
 		}
···
 			set_disk_ro(dm_disk(md), 0);
 		else
 			set_disk_ro(dm_disk(md), 1);
-
-		dm_table_put(new_map);
 	}
 
 	if (dm_suspended(md))
···
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	hc->new_map = t;
 	up_write(&_hash_lock);
···
 	}
 
 	if (hc->new_map) {
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 		hc->new_map = NULL;
 	}
···
drivers/md/dm-table.c (+23 -5)
···
 /*
  * Copyright (C) 2001 Sistina Software (UK) Limited.
- * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
···
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/mutex.h>
+#include <linux/delay.h>
 #include <asm/atomic.h>
 
 #define DM_MSG_PREFIX "table"
···
 #define NODE_SIZE L1_CACHE_BYTES
 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
+
+/*
+ * The table always has exactly one reference, from either
+ * mapped_device->map or hash_cell->new_map. This reference is not
+ * counted in table->holders. A pair of functions, dm_table_create()
+ * and dm_table_destroy(), is used for table creation/destruction.
+ *
+ * Temporary references from other code increase table->holders. A pair
+ * of functions, dm_table_get() and dm_table_put(), is used to
+ * manipulate it.
+ *
+ * When the table is about to be destroyed, we wait for table->holders
+ * to drop to zero.
+ */
 
 struct dm_table {
 	struct mapped_device *md;
···
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&t->devices);
-	atomic_set(&t->holders, 1);
+	atomic_set(&t->holders, 0);
 	t->barriers_supported = 1;
 
 	if (!num_targets)
···
 	}
 }
 
-static void table_destroy(struct dm_table *t)
+void dm_table_destroy(struct dm_table *t)
 {
 	unsigned int i;
 
+	while (atomic_read(&t->holders))
+		msleep(1);
+	smp_mb();
+
 	/* free the indexes (see dm_table_complete) */
 	if (t->depth >= 2)
···
 	if (!t)
 		return;
 
-	if (atomic_dec_and_test(&t->holders))
-		table_destroy(t);
+	smp_mb__before_atomic_dec();
+	atomic_dec(&t->holders);
 }
 
 /*
···
drivers/md/dm.c (+6 -10)
···
 	struct mapped_device *md = congested_data;
 	struct dm_table *map;
 
-	atomic_inc(&md->pending);
-
 	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
 		map = dm_get_table(md);
 		if (map) {
···
 			dm_table_put(map);
 		}
 	}
-
-	if (!atomic_dec_return(&md->pending))
-		/* nudge anyone waiting on suspend queue */
-		wake_up(&md->wait);
 
 	return r;
 }
···
 	if (md->suspended_bdev)
 		__set_size(md, size);
-	if (size == 0)
-		return 0;
 
-	dm_table_get(t);
+	if (!size) {
+		dm_table_destroy(t);
+		return 0;
+	}
+
 	dm_table_event_callback(t, event_callback, md);
···
 	write_lock(&md->map_lock);
 	md->map = NULL;
 	write_unlock(&md->map_lock);
-	dm_table_put(map);
+	dm_table_destroy(map);
 }
 
 /*
···
drivers/md/dm.h (+1 -0)
···
 /*-----------------------------------------------------------------
  * Internal table functions.
  *---------------------------------------------------------------*/
+void dm_table_destroy(struct dm_table *t);
 void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
···