Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

aoe: avoid races between device destruction and discovery

This change avoids a race that could result in a NULL pointer dereference
following a WARNing from kobject_add_internal, "don't try to register
things with the same name in the same directory."

The problem was found with a test that forgets and discovers an
aoe device in a loop:

while test ! -r /tmp/stop; do
aoe-flush -a
aoe-discover
done

The race was between aoedev_flush taking aoedevs out of the devlist,
allowing a new discovery of the same AoE target to take place before the
driver gets around to calling sysfs_remove_group. Fixing that one
revealed another race between do_open and add_disk, and this patch avoids
that, too.

The fix required some care, because for flushing (forgetting) an aoedev,
some of the steps must be performed under lock and some must be able to
sleep. Also, for discovering a new aoedev, some steps might sleep.

The check for a bad aoedev pointer remains from a time when about half of
this patch was done, and it was possible for the
bdev->bd_disk->private_data to become corrupted. The check should be
removed eventually, but it is not expected to add significant overhead,
occurring in the aoeblk_open routine.

Signed-off-by: Ed Cashin <ecashin@coraid.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Ed Cashin and committed by Linus Torvalds.
e52a2932 bbb44e30

+146 -63
+5 -2
drivers/block/aoe/aoe.h
··· 74 74 DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ 75 75 DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ 76 76 DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */ 77 - DEVFL_KICKME = (1<<4), /* slow polling network card catch */ 78 - DEVFL_NEWSIZE = (1<<5), /* need to update dev size in block layer */ 77 + DEVFL_GD_NOW = (1<<4), /* allocating gendisk */ 78 + DEVFL_KICKME = (1<<5), /* slow polling network card catch */ 79 + DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ 80 + DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */ 81 + DEVFL_FREED = (1<<8), /* device has been cleaned up */ 79 82 }; 80 83 81 84 enum {
+34 -2
drivers/block/aoe/aoeblk.c
··· 147 147 struct aoedev *d = bdev->bd_disk->private_data; 148 148 ulong flags; 149 149 150 + if (!virt_addr_valid(d)) { 151 + pr_crit("aoe: invalid device pointer in %s\n", 152 + __func__); 153 + WARN_ON(1); 154 + return -ENODEV; 155 + } 156 + if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL) 157 + return -ENODEV; 158 + 150 159 mutex_lock(&aoeblk_mutex); 151 160 spin_lock_irqsave(&d->lock, flags); 152 - if (d->flags & DEVFL_UP) { 161 + if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) { 153 162 d->nopen++; 154 163 spin_unlock_irqrestore(&d->lock, flags); 155 164 mutex_unlock(&aoeblk_mutex); ··· 268 259 struct request_queue *q; 269 260 enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; 270 261 ulong flags; 262 + int late = 0; 263 + 264 + spin_lock_irqsave(&d->lock, flags); 265 + if (d->flags & DEVFL_GDALLOC 266 + && !(d->flags & DEVFL_TKILL) 267 + && !(d->flags & DEVFL_GD_NOW)) 268 + d->flags |= DEVFL_GD_NOW; 269 + else 270 + late = 1; 271 + spin_unlock_irqrestore(&d->lock, flags); 272 + if (late) 273 + return; 271 274 272 275 gd = alloc_disk(AOE_PARTITIONS); 273 276 if (gd == NULL) { ··· 303 282 } 304 283 305 284 spin_lock_irqsave(&d->lock, flags); 285 + WARN_ON(!(d->flags & DEVFL_GD_NOW)); 286 + WARN_ON(!(d->flags & DEVFL_GDALLOC)); 287 + WARN_ON(d->flags & DEVFL_TKILL); 288 + WARN_ON(d->gd); 289 + WARN_ON(d->flags & DEVFL_UP); 306 290 blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); 307 291 q->backing_dev_info.name = "aoe"; 308 292 q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; ··· 332 306 333 307 add_disk(gd); 334 308 aoedisk_add_sysfs(d); 309 + 310 + spin_lock_irqsave(&d->lock, flags); 311 + WARN_ON(!(d->flags & DEVFL_GD_NOW)); 312 + d->flags &= ~DEVFL_GD_NOW; 313 + spin_unlock_irqrestore(&d->lock, flags); 335 314 return; 336 315 337 316 err_mempool: ··· 345 314 put_disk(gd); 346 315 err: 347 316 spin_lock_irqsave(&d->lock, flags); 348 - d->flags &= ~DEVFL_GDALLOC; 317 + d->flags &= ~DEVFL_GD_NOW; 318 + schedule_work(&d->work); 349 
319 spin_unlock_irqrestore(&d->lock, flags); 350 320 } 351 321
+107 -59
drivers/block/aoe/aoedev.c
··· 15 15 #include "aoe.h" 16 16 17 17 static void dummy_timer(ulong); 18 - static void aoedev_freedev(struct aoedev *); 19 18 static void freetgt(struct aoedev *d, struct aoetgt *t); 20 19 static void skbpoolfree(struct aoedev *d); 21 20 ··· 235 236 set_capacity(d->gd, 0); 236 237 } 237 238 238 - static void 239 - aoedev_freedev(struct aoedev *d) 240 - { 241 - struct aoetgt **t, **e; 242 - 243 - cancel_work_sync(&d->work); 244 - if (d->gd) { 245 - aoedisk_rm_sysfs(d); 246 - del_gendisk(d->gd); 247 - put_disk(d->gd); 248 - blk_cleanup_queue(d->blkq); 249 - } 250 - t = d->targets; 251 - e = t + NTARGETS; 252 - for (; t < e && *t; t++) 253 - freetgt(d, *t); 254 - if (d->bufpool) 255 - mempool_destroy(d->bufpool); 256 - skbpoolfree(d); 257 - minor_free(d->sysminor); 258 - kfree(d); 259 - } 260 - 261 239 /* return whether the user asked for this particular 262 240 * device to be flushed 263 241 */ ··· 259 283 return !strncmp(s, p, lim); 260 284 } 261 285 262 - int 263 - aoedev_flush(const char __user *str, size_t cnt) 286 + static void 287 + freedev(struct aoedev *d) 288 + { 289 + struct aoetgt **t, **e; 290 + int freeing = 0; 291 + unsigned long flags; 292 + 293 + spin_lock_irqsave(&d->lock, flags); 294 + if (d->flags & DEVFL_TKILL 295 + && !(d->flags & DEVFL_FREEING)) { 296 + d->flags |= DEVFL_FREEING; 297 + freeing = 1; 298 + } 299 + spin_unlock_irqrestore(&d->lock, flags); 300 + if (!freeing) 301 + return; 302 + 303 + del_timer_sync(&d->timer); 304 + if (d->gd) { 305 + aoedisk_rm_sysfs(d); 306 + del_gendisk(d->gd); 307 + put_disk(d->gd); 308 + blk_cleanup_queue(d->blkq); 309 + } 310 + t = d->targets; 311 + e = t + NTARGETS; 312 + for (; t < e && *t; t++) 313 + freetgt(d, *t); 314 + if (d->bufpool) 315 + mempool_destroy(d->bufpool); 316 + skbpoolfree(d); 317 + minor_free(d->sysminor); 318 + 319 + spin_lock_irqsave(&d->lock, flags); 320 + d->flags |= DEVFL_FREED; 321 + spin_unlock_irqrestore(&d->lock, flags); 322 + } 323 + 324 + enum flush_parms { 325 + NOT_EXITING = 
0, 326 + EXITING = 1, 327 + }; 328 + 329 + static int 330 + flush(const char __user *str, size_t cnt, int exiting) 264 331 { 265 332 ulong flags; 266 333 struct aoedev *d, **dd; 267 - struct aoedev *rmd = NULL; 268 334 char buf[16]; 269 335 int all = 0; 270 336 int specified = 0; /* flush a specific device */ 337 + unsigned int skipflags; 271 338 272 - if (cnt >= 3) { 339 + skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; 340 + 341 + if (!exiting && cnt >= 3) { 273 342 if (cnt > sizeof buf) 274 343 cnt = sizeof buf; 275 344 if (copy_from_user(buf, str, cnt)) ··· 324 303 specified = 1; 325 304 } 326 305 306 + flush_scheduled_work(); 307 + /* pass one: without sleeping, do aoedev_downdev */ 327 308 spin_lock_irqsave(&devlist_lock, flags); 328 - dd = &devlist; 329 - while ((d = *dd)) { 309 + for (d = devlist; d; d = d->next) { 330 310 spin_lock(&d->lock); 331 - if (specified) { 311 + if (exiting) { 312 + /* unconditionally take each device down */ 313 + } else if (specified) { 332 314 if (!user_req(buf, cnt, d)) 333 - goto skip; 315 + goto cont; 334 316 } else if ((!all && (d->flags & DEVFL_UP)) 335 - || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) 317 + || d->flags & skipflags 336 318 || d->nopen 337 319 || d->ref) 338 - goto skip; 320 + goto cont; 339 321 340 - *dd = d->next; 341 322 aoedev_downdev(d); 342 323 d->flags |= DEVFL_TKILL; 324 + cont: 343 325 spin_unlock(&d->lock); 344 - d->next = rmd; 345 - rmd = d; 346 - continue; 347 - skip: 348 - spin_unlock(&d->lock); 349 - dd = &d->next; 350 326 } 351 327 spin_unlock_irqrestore(&devlist_lock, flags); 352 - while ((d = rmd)) { 353 - rmd = d->next; 354 - del_timer_sync(&d->timer); 355 - aoedev_freedev(d); /* must be able to sleep */ 328 + 329 + /* pass two: call freedev, which might sleep, 330 + * for aoedevs marked with DEVFL_TKILL 331 + */ 332 + restart: 333 + spin_lock_irqsave(&devlist_lock, flags); 334 + for (d = devlist; d; d = d->next) { 335 + spin_lock(&d->lock); 336 + if (d->flags & DEVFL_TKILL 337 + && 
!(d->flags & DEVFL_FREEING)) { 338 + spin_unlock(&d->lock); 339 + spin_unlock_irqrestore(&devlist_lock, flags); 340 + freedev(d); 341 + goto restart; 342 + } 343 + spin_unlock(&d->lock); 356 344 } 345 + 346 + /* pass three: remove aoedevs marked with DEVFL_FREED */ 347 + for (dd = &devlist, d = *dd; d; d = *dd) { 348 + struct aoedev *doomed = NULL; 349 + 350 + spin_lock(&d->lock); 351 + if (d->flags & DEVFL_FREED) { 352 + *dd = d->next; 353 + doomed = d; 354 + } else { 355 + dd = &d->next; 356 + } 357 + spin_unlock(&d->lock); 358 + kfree(doomed); 359 + } 360 + spin_unlock_irqrestore(&devlist_lock, flags); 361 + 357 362 return 0; 363 + } 364 + 365 + int 366 + aoedev_flush(const char __user *str, size_t cnt) 367 + { 368 + return flush(str, cnt, NOT_EXITING); 358 369 } 359 370 360 371 /* This has been confirmed to occur once with Tms=3*1000 due to the ··· 441 388 442 389 for (d=devlist; d; d=d->next) 443 390 if (d->aoemajor == maj && d->aoeminor == min) { 391 + spin_lock(&d->lock); 392 + if (d->flags & DEVFL_TKILL) { 393 + spin_unlock(&d->lock); 394 + d = NULL; 395 + goto out; 396 + } 444 397 d->ref++; 398 + spin_unlock(&d->lock); 445 399 break; 446 400 } 447 401 if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) ··· 508 448 void 509 449 aoedev_exit(void) 510 450 { 511 - struct aoedev *d; 512 - ulong flags; 513 - 451 + flush_scheduled_work(); 514 452 aoe_flush_iocq(); 515 - while ((d = devlist)) { 516 - devlist = d->next; 517 - 518 - spin_lock_irqsave(&d->lock, flags); 519 - aoedev_downdev(d); 520 - d->flags |= DEVFL_TKILL; 521 - spin_unlock_irqrestore(&d->lock, flags); 522 - 523 - del_timer_sync(&d->timer); 524 - aoedev_freedev(d); 525 - } 453 + flush(NULL, 0, EXITING); 526 454 } 527 455 528 456 int __init