Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm mpath: eliminate use of spinlock in IO fast-paths

The primary motivation of this commit is to improve the scalability of
DM multipath on large NUMA systems where m->lock spinlock contention has
been proven to be a serious bottleneck on really fast storage.

The ability to atomically read a pointer, using lockless_dereference(),
is leveraged in this commit. But all pointer writes are still protected
by the m->lock spinlock (which is fine since these all now occur in the
slow-path).

The following functions no longer require the m->lock spinlock in their
fast-path: multipath_busy(), __multipath_map(), and do_end_io()

And choose_pgpath() is modified to _not_ update m->current_pgpath unless
it also switches the path-group. This is done to avoid needing to take
the m->lock every time __multipath_map() calls choose_pgpath().
But m->current_pgpath will be reset if it is failed via fail_path().

Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

+93 -77
+93 -77
drivers/md/dm-mpath.c
··· 305 305 return atomic_read(&m->pg_init_in_progress); 306 306 } 307 307 308 - static void __switch_pg(struct multipath *m, struct pgpath *pgpath) 308 + static int pg_init_all_paths(struct multipath *m) 309 309 { 310 - m->current_pg = pgpath->pg; 310 + int r; 311 + unsigned long flags; 312 + 313 + spin_lock_irqsave(&m->lock, flags); 314 + r = __pg_init_all_paths(m); 315 + spin_unlock_irqrestore(&m->lock, flags); 316 + 317 + return r; 318 + } 319 + 320 + static void __switch_pg(struct multipath *m, struct priority_group *pg) 321 + { 322 + m->current_pg = pg; 311 323 312 324 /* Must we initialise the PG first, and queue I/O till it's ready? */ 313 325 if (m->hw_handler_name) { ··· 333 321 atomic_set(&m->pg_init_count, 0); 334 322 } 335 323 336 - static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, 337 - size_t nr_bytes) 324 + static struct pgpath *choose_path_in_pg(struct multipath *m, 325 + struct priority_group *pg, 326 + size_t nr_bytes) 338 327 { 328 + unsigned long flags; 339 329 struct dm_path *path; 330 + struct pgpath *pgpath; 340 331 341 332 path = pg->ps.type->select_path(&pg->ps, nr_bytes); 342 333 if (!path) 343 - return -ENXIO; 334 + return ERR_PTR(-ENXIO); 344 335 345 - m->current_pgpath = path_to_pgpath(path); 336 + pgpath = path_to_pgpath(path); 346 337 347 - if (m->current_pg != pg) 348 - __switch_pg(m, m->current_pgpath); 338 + if (unlikely(lockless_dereference(m->current_pg) != pg)) { 339 + /* Only update current_pgpath if pg changed */ 340 + spin_lock_irqsave(&m->lock, flags); 341 + m->current_pgpath = pgpath; 342 + __switch_pg(m, pg); 343 + spin_unlock_irqrestore(&m->lock, flags); 344 + } 349 345 350 - return 0; 346 + return pgpath; 351 347 } 352 348 353 - static void __choose_pgpath(struct multipath *m, size_t nr_bytes) 349 + static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) 354 350 { 351 + unsigned long flags; 355 352 struct priority_group *pg; 353 + struct pgpath *pgpath; 356 354 bool 
bypassed = true; 357 355 358 356 if (!atomic_read(&m->nr_valid_paths)) { ··· 371 349 } 372 350 373 351 /* Were we instructed to switch PG? */ 374 - if (m->next_pg) { 352 + if (lockless_dereference(m->next_pg)) { 353 + spin_lock_irqsave(&m->lock, flags); 375 354 pg = m->next_pg; 355 + if (!pg) { 356 + spin_unlock_irqrestore(&m->lock, flags); 357 + goto check_current_pg; 358 + } 376 359 m->next_pg = NULL; 377 - if (!__choose_path_in_pg(m, pg, nr_bytes)) 378 - return; 360 + spin_unlock_irqrestore(&m->lock, flags); 361 + pgpath = choose_path_in_pg(m, pg, nr_bytes); 362 + if (!IS_ERR_OR_NULL(pgpath)) 363 + return pgpath; 379 364 } 380 365 381 366 /* Don't change PG until it has no remaining paths */ 382 - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) 383 - return; 367 + check_current_pg: 368 + pg = lockless_dereference(m->current_pg); 369 + if (pg) { 370 + pgpath = choose_path_in_pg(m, pg, nr_bytes); 371 + if (!IS_ERR_OR_NULL(pgpath)) 372 + return pgpath; 373 + } 384 374 385 375 /* 386 376 * Loop through priority groups until we find a valid path. ··· 404 370 list_for_each_entry(pg, &m->priority_groups, list) { 405 371 if (pg->bypassed == bypassed) 406 372 continue; 407 - if (!__choose_path_in_pg(m, pg, nr_bytes)) { 373 + pgpath = choose_path_in_pg(m, pg, nr_bytes); 374 + if (!IS_ERR_OR_NULL(pgpath)) { 408 375 if (!bypassed) 409 376 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); 410 - return; 377 + return pgpath; 411 378 } 412 379 } 413 380 } while (bypassed--); 414 381 415 382 failed: 383 + spin_lock_irqsave(&m->lock, flags); 416 384 m->current_pgpath = NULL; 417 385 m->current_pg = NULL; 386 + spin_unlock_irqrestore(&m->lock, flags); 387 + 388 + return NULL; 418 389 } 419 390 420 391 /* 421 392 * Check whether bios must be queued in the device-mapper core rather 422 393 * than here in the target. 423 394 * 424 - * m->lock must be held on entry. 
425 - * 426 395 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the 427 396 * same value then we are not between multipath_presuspend() 428 397 * and multipath_resume() calls and we have no need to check 429 398 * for the DMF_NOFLUSH_SUSPENDING flag. 430 399 */ 431 - static int __must_push_back(struct multipath *m) 400 + static int must_push_back(struct multipath *m) 432 401 { 433 402 return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || 434 403 ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != ··· 453 416 struct block_device *bdev; 454 417 struct dm_mpath_io *mpio; 455 418 456 - spin_lock_irq(&m->lock); 457 - 458 419 /* Do we need to select a new pgpath? */ 459 - if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) 460 - __choose_pgpath(m, nr_bytes); 461 - 462 - pgpath = m->current_pgpath; 420 + pgpath = lockless_dereference(m->current_pgpath); 421 + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) 422 + pgpath = choose_pgpath(m, nr_bytes); 463 423 464 424 if (!pgpath) { 465 - if (!__must_push_back(m)) 425 + if (!must_push_back(m)) 466 426 r = -EIO; /* Failed */ 467 - goto out_unlock; 427 + return r; 468 428 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || 469 429 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 470 - __pg_init_all_paths(m); 471 - goto out_unlock; 430 + pg_init_all_paths(m); 431 + return r; 472 432 } 473 433 474 434 mpio = set_mpio(m, map_context); 475 435 if (!mpio) 476 436 /* ENOMEM, requeue */ 477 - goto out_unlock; 437 + return r; 478 438 479 439 mpio->pgpath = pgpath; 480 440 mpio->nr_bytes = nr_bytes; 481 441 482 442 bdev = pgpath->path.dev->bdev; 483 - 484 - spin_unlock_irq(&m->lock); 485 443 486 444 if (clone) { 487 445 /* ··· 509 477 &pgpath->path, 510 478 nr_bytes); 511 479 return DM_MAPIO_REMAPPED; 512 - 513 - out_unlock: 514 - spin_unlock_irq(&m->lock); 515 - 516 - return r; 517 480 } 518 481 519 482 static int multipath_map(struct dm_target *ti, struct request *clone, ··· 1335 1308 * clone bios for it 
and resubmit it later. 1336 1309 */ 1337 1310 int r = DM_ENDIO_REQUEUE; 1338 - unsigned long flags; 1339 1311 1340 1312 if (!error && !clone->errors) 1341 1313 return 0; /* I/O complete */ ··· 1345 1319 if (mpio->pgpath) 1346 1320 fail_path(mpio->pgpath); 1347 1321 1348 - spin_lock_irqsave(&m->lock, flags); 1349 1322 if (!atomic_read(&m->nr_valid_paths)) { 1350 1323 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1351 - if (!__must_push_back(m)) 1324 + if (!must_push_back(m)) 1352 1325 r = -EIO; 1353 1326 } else { 1354 1327 if (error == -EBADE) 1355 1328 r = error; 1356 1329 } 1357 1330 } 1358 - spin_unlock_irqrestore(&m->lock, flags); 1359 1331 1360 1332 return r; 1361 1333 } ··· 1610 1586 struct block_device **bdev, fmode_t *mode) 1611 1587 { 1612 1588 struct multipath *m = ti->private; 1613 - unsigned long flags; 1589 + struct pgpath *current_pgpath; 1614 1590 int r; 1615 1591 1616 - spin_lock_irqsave(&m->lock, flags); 1592 + current_pgpath = lockless_dereference(m->current_pgpath); 1593 + if (!current_pgpath) 1594 + current_pgpath = choose_pgpath(m, 0); 1617 1595 1618 - if (!m->current_pgpath) 1619 - __choose_pgpath(m, 0); 1620 - 1621 - if (m->current_pgpath) { 1596 + if (current_pgpath) { 1622 1597 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { 1623 - *bdev = m->current_pgpath->path.dev->bdev; 1624 - *mode = m->current_pgpath->path.dev->mode; 1598 + *bdev = current_pgpath->path.dev->bdev; 1599 + *mode = current_pgpath->path.dev->mode; 1625 1600 r = 0; 1626 1601 } else { 1627 1602 /* pg_init has not started or completed */ ··· 1634 1611 r = -EIO; 1635 1612 } 1636 1613 1637 - spin_unlock_irqrestore(&m->lock, flags); 1638 - 1639 1614 if (r == -ENOTCONN) { 1640 - spin_lock_irqsave(&m->lock, flags); 1641 - if (!m->current_pg) { 1615 + if (!lockless_dereference(m->current_pg)) { 1642 1616 /* Path status changed, redo selection */ 1643 - __choose_pgpath(m, 0); 1617 + (void) choose_pgpath(m, 0); 1644 1618 } 1645 1619 if (test_bit(MPATHF_PG_INIT_REQUIRED, 
&m->flags)) 1646 - __pg_init_all_paths(m); 1647 - spin_unlock_irqrestore(&m->lock, flags); 1620 + pg_init_all_paths(m); 1648 1621 dm_table_run_md_queue_async(m->ti->table); 1649 1622 } 1650 1623 ··· 1691 1672 { 1692 1673 bool busy = false, has_active = false; 1693 1674 struct multipath *m = ti->private; 1694 - struct priority_group *pg; 1675 + struct priority_group *pg, *next_pg; 1695 1676 struct pgpath *pgpath; 1696 - unsigned long flags; 1697 - 1698 - spin_lock_irqsave(&m->lock, flags); 1699 1677 1700 1678 /* pg_init in progress or no paths available */ 1701 1679 if (atomic_read(&m->pg_init_in_progress) || 1702 - (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { 1703 - busy = true; 1704 - goto out; 1705 - } 1680 + (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) 1681 + return true; 1682 + 1706 1683 /* Guess which priority_group will be used at next mapping time */ 1707 - if (unlikely(!m->current_pgpath && m->next_pg)) 1708 - pg = m->next_pg; 1709 - else if (likely(m->current_pg)) 1710 - pg = m->current_pg; 1711 - else 1684 + pg = lockless_dereference(m->current_pg); 1685 + next_pg = lockless_dereference(m->next_pg); 1686 + if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) 1687 + pg = next_pg; 1688 + 1689 + if (!pg) { 1712 1690 /* 1713 1691 * We don't know which pg will be used at next mapping time. 1714 - * We don't call __choose_pgpath() here to avoid to trigger 1692 + * We don't call choose_pgpath() here to avoid to trigger 1715 1693 * pg_init just by busy checking. 1716 1694 * So we don't know whether underlying devices we will be using 1717 1695 * at next mapping time are busy or not. Just try mapping. 1718 1696 */ 1719 - goto out; 1697 + return busy; 1698 + } 1720 1699 1721 1700 /* 1722 1701 * If there is one non-busy active path at least, the path selector 1723 1702 * will be able to select it. So we consider such a pg as not busy. 
1724 1703 */ 1725 1704 busy = true; 1726 - list_for_each_entry(pgpath, &pg->pgpaths, list) 1705 + list_for_each_entry(pgpath, &pg->pgpaths, list) { 1727 1706 if (pgpath->is_active) { 1728 1707 has_active = true; 1729 1708 if (!pgpath_busy(pgpath)) { ··· 1729 1712 break; 1730 1713 } 1731 1714 } 1715 + } 1732 1716 1733 - if (!has_active) 1717 + if (!has_active) { 1734 1718 /* 1735 1719 * No active path in this pg, so this pg won't be used and 1736 1720 * the current_pg will be changed at next mapping time. 1737 1721 * We need to try mapping to determine it. 1738 1722 */ 1739 1723 busy = false; 1740 - 1741 - out: 1742 - spin_unlock_irqrestore(&m->lock, flags); 1724 + } 1743 1725 1744 1726 return busy; 1745 1727 }