Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm-bufio: fix no-sleep mode

dm-bufio has a no-sleep mode. When activated (with the
DM_BUFIO_CLIENT_NO_SLEEP flag), the bufio client is read-only and we
could call dm_bufio_get from tasklets. This is used by dm-verity.

Unfortunately, commit 450e8dee51aa ("dm bufio: improve concurrent IO
performance") broke this and the kernel would warn that cache_get()
was calling down_read() from non-sleeping context. The bug can be
reproduced by using "veritysetup open" with the "--use-tasklets"
flag.

This commit fixes dm-bufio, so that the tasklet mode works again, by
expanding use of the 'no_sleep_enabled' static_key to conditionally
use either a rw_semaphore or rwlock_t (which are colocated in the
buffer_tree structure using a union).

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org # v6.4
Fixes: 450e8dee51aa ("dm bufio: improve concurrent IO performance")
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

Authored by Mikulas Patocka and committed by Mike Snitzer
2a695062 ccadc8a2

+62 -25
drivers/md/dm-bufio.c
··· 254 254 255 255 typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); 256 256 257 - static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) 257 + static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep) 258 258 { 259 259 unsigned long tested = 0; 260 260 struct list_head *h = lru->cursor; ··· 295 295 296 296 h = h->next; 297 297 298 - cond_resched(); 298 + if (!no_sleep) 299 + cond_resched(); 299 300 } 300 301 301 302 return NULL; ··· 383 382 */ 384 383 385 384 struct buffer_tree { 386 - struct rw_semaphore lock; 385 + union { 386 + struct rw_semaphore lock; 387 + rwlock_t spinlock; 388 + } u; 387 389 struct rb_root root; 388 390 } ____cacheline_aligned_in_smp; 389 391 ··· 397 393 * on the locks. 398 394 */ 399 395 unsigned int num_locks; 396 + bool no_sleep; 400 397 struct buffer_tree trees[]; 401 398 }; 399 + 400 + static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); 402 401 403 402 static inline unsigned int cache_index(sector_t block, unsigned int num_locks) 404 403 { ··· 410 403 411 404 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) 412 405 { 413 - down_read(&bc->trees[cache_index(block, bc->num_locks)].lock); 406 + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) 407 + read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); 408 + else 409 + down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); 414 410 } 415 411 416 412 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) 417 413 { 418 - up_read(&bc->trees[cache_index(block, bc->num_locks)].lock); 414 + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) 415 + read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); 416 + else 417 + up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); 419 418 } 420 419 421 420 static inline void cache_write_lock(struct dm_buffer_cache *bc, 
sector_t block) 422 421 { 423 - down_write(&bc->trees[cache_index(block, bc->num_locks)].lock); 422 + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) 423 + write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); 424 + else 425 + down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); 424 426 } 425 427 426 428 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) 427 429 { 428 - up_write(&bc->trees[cache_index(block, bc->num_locks)].lock); 430 + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) 431 + write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); 432 + else 433 + up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); 429 434 } 430 435 431 436 /* ··· 461 442 462 443 static void __lh_lock(struct lock_history *lh, unsigned int index) 463 444 { 464 - if (lh->write) 465 - down_write(&lh->cache->trees[index].lock); 466 - else 467 - down_read(&lh->cache->trees[index].lock); 445 + if (lh->write) { 446 + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) 447 + write_lock_bh(&lh->cache->trees[index].u.spinlock); 448 + else 449 + down_write(&lh->cache->trees[index].u.lock); 450 + } else { 451 + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) 452 + read_lock_bh(&lh->cache->trees[index].u.spinlock); 453 + else 454 + down_read(&lh->cache->trees[index].u.lock); 455 + } 468 456 } 469 457 470 458 static void __lh_unlock(struct lock_history *lh, unsigned int index) 471 459 { 472 - if (lh->write) 473 - up_write(&lh->cache->trees[index].lock); 474 - else 475 - up_read(&lh->cache->trees[index].lock); 460 + if (lh->write) { 461 + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) 462 + write_unlock_bh(&lh->cache->trees[index].u.spinlock); 463 + else 464 + up_write(&lh->cache->trees[index].u.lock); 465 + } else { 466 + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) 467 + 
read_unlock_bh(&lh->cache->trees[index].u.spinlock); 468 + else 469 + up_read(&lh->cache->trees[index].u.lock); 470 + } 476 471 } 477 472 478 473 /* ··· 535 502 return le_to_buffer(le); 536 503 } 537 504 538 - static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) 505 + static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep) 539 506 { 540 507 unsigned int i; 541 508 542 509 bc->num_locks = num_locks; 510 + bc->no_sleep = no_sleep; 543 511 544 512 for (i = 0; i < bc->num_locks; i++) { 545 - init_rwsem(&bc->trees[i].lock); 513 + if (no_sleep) 514 + rwlock_init(&bc->trees[i].u.spinlock); 515 + else 516 + init_rwsem(&bc->trees[i].u.lock); 546 517 bc->trees[i].root = RB_ROOT; 547 518 } 548 519 ··· 685 648 struct lru_entry *le; 686 649 struct dm_buffer *b; 687 650 688 - le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); 651 + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep); 689 652 if (!le) 690 653 return NULL; 691 654 ··· 739 702 struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; 740 703 741 704 while (true) { 742 - le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); 705 + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep); 743 706 if (!le) 744 707 break; 745 708 ··· 952 915 { 953 916 unsigned int i; 954 917 918 + BUG_ON(bc->no_sleep); 955 919 for (i = 0; i < bc->num_locks; i++) { 956 - down_write(&bc->trees[i].lock); 920 + down_write(&bc->trees[i].u.lock); 957 921 __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); 958 - up_write(&bc->trees[i].lock); 922 + up_write(&bc->trees[i].u.lock); 959 923 } 960 924 } 961 925 ··· 1016 978 1017 979 struct dm_buffer_cache cache; /* must be last member */ 1018 980 }; 1019 - 1020 - static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); 1021 981 1022 982 /*----------------------------------------------------------------*/ 1023 983 ··· 1907 1871 if (need_submit) 1908 1872 submit_io(b, REQ_OP_READ, read_endio); 
1909 1873 1910 - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); 1874 + if (nf != NF_GET) /* we already tested this condition above */ 1875 + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); 1911 1876 1912 1877 if (b->read_error) { 1913 1878 int error = blk_status_to_errno(b->read_error); ··· 2458 2421 r = -ENOMEM; 2459 2422 goto bad_client; 2460 2423 } 2461 - cache_init(&c->cache, num_locks); 2424 + cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0); 2462 2425 2463 2426 c->bdev = bdev; 2464 2427 c->block_size = block_size;