Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS

- Some tweaks to greatly reduce locking overhead for the list of btree
transactions, so that it can always be enabled: leave btree_trans
objects on the list when they're on the percpu single item freelist,
and only check for duplicates in the same process when
CONFIG_BCACHEFS_DEBUG is enabled

- don't zero out the full btree_trans() unless we allocated it from
the mempool

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

+100 -106
-8
fs/bcachefs/Kconfig
··· 50 50 depends on BCACHEFS_FS 51 51 select FS_POSIX_ACL 52 52 53 - config BCACHEFS_DEBUG_TRANSACTIONS 54 - bool "bcachefs runtime info" 55 - depends on BCACHEFS_FS 56 - help 57 - This makes the list of running btree transactions available in debugfs. 58 - 59 - This is a highly useful debugging feature but does add a small amount of overhead. 60 - 61 53 config BCACHEFS_DEBUG 62 54 bool "bcachefs debugging" 63 55 depends on BCACHEFS_FS
+82 -84
fs/bcachefs/btree_iter.c
··· 2714 2714 2715 2715 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) 2716 2716 { 2717 + struct bch_fs *c = trans->c; 2717 2718 unsigned new_top = trans->mem_top + size; 2718 2719 unsigned old_bytes = trans->mem_bytes; 2719 2720 unsigned new_bytes = roundup_pow_of_two(new_top); ··· 2722 2721 void *new_mem; 2723 2722 void *p; 2724 2723 2725 - trans->mem_max = max(trans->mem_max, new_top); 2726 - 2727 2724 WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); 2725 + 2726 + struct btree_transaction_stats *s = btree_trans_stats(trans); 2727 + if (s) 2728 + s->max_mem = max(s->max_mem, new_bytes); 2728 2729 2729 2730 new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); 2730 2731 if (unlikely(!new_mem)) { ··· 2734 2731 2735 2732 new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); 2736 2733 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { 2737 - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); 2734 + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); 2738 2735 new_bytes = BTREE_TRANS_MEM_MAX; 2739 2736 kfree(trans->mem); 2740 2737 } ··· 2754 2751 trans->mem_bytes = new_bytes; 2755 2752 2756 2753 if (old_bytes) { 2757 - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); 2754 + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); 2758 2755 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); 2759 2756 } 2760 2757 ··· 2863 2860 return trans->restart_count; 2864 2861 } 2865 2862 2866 - static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) 2867 - { 2868 - struct btree_trans *trans; 2869 - 2870 - if (IS_ENABLED(__KERNEL__)) { 2871 - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); 2872 - if (trans) 2873 - return trans; 2874 - } 2875 - 2876 - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); 2877 - /* 2878 - * paths need to be zeroed, bch2_check_for_deadlock looks at 2879 - * paths in other threads 
2880 - */ 2881 - memset(&trans->paths, 0, sizeof(trans->paths)); 2882 - return trans; 2883 - } 2884 - 2885 2863 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; 2886 2864 2887 2865 unsigned bch2_trans_get_fn_idx(const char *fn) ··· 2884 2900 __acquires(&c->btree_trans_barrier) 2885 2901 { 2886 2902 struct btree_trans *trans; 2887 - struct btree_transaction_stats *s; 2888 2903 2889 - trans = bch2_trans_alloc(c); 2904 + if (IS_ENABLED(__KERNEL__)) { 2905 + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); 2906 + if (trans) { 2907 + memset(trans, 0, offsetof(struct btree_trans, updates)); 2908 + goto got_trans; 2909 + } 2910 + } 2890 2911 2912 + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); 2891 2913 memset(trans, 0, sizeof(*trans)); 2914 + closure_init_stack(&trans->ref); 2915 + 2916 + seqmutex_lock(&c->btree_trans_lock); 2917 + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { 2918 + struct btree_trans *pos; 2919 + pid_t pid = current->pid; 2920 + 2921 + trans->locking_wait.task = current; 2922 + 2923 + list_for_each_entry(pos, &c->btree_trans_list, list) { 2924 + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); 2925 + /* 2926 + * We'd much prefer to be stricter here and completely 2927 + * disallow multiple btree_trans in the same thread - 2928 + * but the data move path calls bch2_write when we 2929 + * already have a btree_trans initialized. 2930 + */ 2931 + BUG_ON(pos_task && 2932 + pid == pos_task->pid && 2933 + bch2_trans_locked(pos)); 2934 + 2935 + if (pos_task && pid < pos_task->pid) { 2936 + list_add_tail(&trans->list, &pos->list); 2937 + goto list_add_done; 2938 + } 2939 + } 2940 + } 2941 + list_add_tail(&trans->list, &c->btree_trans_list); 2942 + list_add_done: 2943 + seqmutex_unlock(&c->btree_trans_lock); 2944 + got_trans: 2892 2945 trans->c = c; 2893 - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) 2894 - ? 
bch2_btree_transaction_fns[fn_idx] : NULL; 2895 2946 trans->last_begin_time = local_clock(); 2896 2947 trans->fn_idx = fn_idx; 2897 2948 trans->locking_wait.task = current; 2898 2949 trans->journal_replay_not_finished = 2899 2950 unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && 2900 2951 atomic_inc_not_zero(&c->journal_keys.ref); 2901 - closure_init_stack(&trans->ref); 2902 - 2903 2952 trans->paths_allocated = trans->_paths_allocated; 2904 2953 trans->sorted = trans->_sorted; 2905 2954 trans->paths = trans->_paths; ··· 2941 2924 2942 2925 trans->paths_allocated[0] = 1; 2943 2926 2944 - s = btree_trans_stats(trans); 2945 - if (s && s->max_mem) { 2946 - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); 2927 + if (fn_idx < BCH_TRANSACTIONS_NR) { 2928 + trans->fn = bch2_btree_transaction_fns[fn_idx]; 2947 2929 2948 - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); 2930 + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; 2949 2931 2950 - if (!unlikely(trans->mem)) { 2951 - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); 2952 - trans->mem_bytes = BTREE_TRANS_MEM_MAX; 2953 - } else { 2954 - trans->mem_bytes = expected_mem_bytes; 2932 + if (s->max_mem) { 2933 + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); 2934 + 2935 + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); 2936 + if (likely(trans->mem)) 2937 + trans->mem_bytes = expected_mem_bytes; 2955 2938 } 2956 - } 2957 2939 2958 - if (s) { 2959 2940 trans->nr_paths_max = s->nr_max_paths; 2960 2941 trans->journal_entries_size = s->journal_entries_size; 2961 2942 } ··· 2961 2946 trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 2962 2947 trans->srcu_lock_time = jiffies; 2963 2948 trans->srcu_held = true; 2964 - 2965 - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { 2966 - struct btree_trans *pos; 2967 - 2968 - seqmutex_lock(&c->btree_trans_lock); 2969 - list_for_each_entry(pos, &c->btree_trans_list, list) { 2970 - /* 
2971 - * We'd much prefer to be stricter here and completely 2972 - * disallow multiple btree_trans in the same thread - 2973 - * but the data move path calls bch2_write when we 2974 - * already have a btree_trans initialized. 2975 - */ 2976 - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && 2977 - bch2_trans_locked(pos)); 2978 - 2979 - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { 2980 - list_add_tail(&trans->list, &pos->list); 2981 - goto list_add_done; 2982 - } 2983 - } 2984 - list_add_tail(&trans->list, &c->btree_trans_list); 2985 - list_add_done: 2986 - seqmutex_unlock(&c->btree_trans_lock); 2987 - } 2988 - 2989 2949 return trans; 2990 2950 } 2991 2951 ··· 2991 3001 __releases(&c->btree_trans_barrier) 2992 3002 { 2993 3003 struct bch_fs *c = trans->c; 2994 - struct btree_transaction_stats *s = btree_trans_stats(trans); 2995 3004 2996 3005 bch2_trans_unlock(trans); 2997 3006 2998 - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { 2999 - seqmutex_lock(&c->btree_trans_lock); 3000 - list_del(&trans->list); 3001 - seqmutex_unlock(&c->btree_trans_lock); 3002 - } 3003 - 3004 - closure_sync(&trans->ref); 3005 - 3006 - if (s) 3007 - s->max_mem = max(s->max_mem, trans->mem_max); 3008 - 3009 3007 trans_for_each_update(trans, i) 3010 3008 __btree_path_put(trans->paths + i->path, true); 3011 - trans->nr_updates = 0; 3009 + trans->nr_updates = 0; 3010 + trans->locking_wait.task = NULL; 3012 3011 3013 3012 check_btree_paths_leaked(trans); 3014 3013 ··· 3026 3047 /* Userspace doesn't have a real percpu implementation: */ 3027 3048 if (IS_ENABLED(__KERNEL__)) 3028 3049 trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); 3029 - if (trans) 3050 + 3051 + if (trans) { 3052 + closure_sync(&trans->ref); 3053 + 3054 + seqmutex_lock(&c->btree_trans_lock); 3055 + list_del(&trans->list); 3056 + seqmutex_unlock(&c->btree_trans_lock); 3057 + 3030 3058 mempool_free(trans, &c->btree_trans_pool); 3059 + } 3031 3060 } 3032 3061 3033 3062 
static void __maybe_unused ··· 3133 3146 struct btree_trans *trans; 3134 3147 int cpu; 3135 3148 3149 + if (c->btree_trans_bufs) 3150 + for_each_possible_cpu(cpu) { 3151 + struct btree_trans *trans = 3152 + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; 3153 + 3154 + if (trans) { 3155 + closure_sync(&trans->ref); 3156 + 3157 + seqmutex_lock(&c->btree_trans_lock); 3158 + list_del(&trans->list); 3159 + seqmutex_unlock(&c->btree_trans_lock); 3160 + } 3161 + kfree(trans); 3162 + } 3163 + free_percpu(c->btree_trans_bufs); 3164 + 3136 3165 trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); 3137 3166 if (trans) 3138 3167 panic("%s leaked btree_trans\n", trans->fn); 3139 - 3140 - if (c->btree_trans_bufs) 3141 - for_each_possible_cpu(cpu) 3142 - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); 3143 - free_percpu(c->btree_trans_bufs); 3144 3168 3145 3169 for (s = c->btree_transaction_stats; 3146 3170 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+2 -1
fs/bcachefs/btree_locking.c
··· 95 95 struct trans_waiting_for_lock *i; 96 96 97 97 for (i = g->g; i != g->g + g->nr; i++) { 98 + struct task_struct *task = i->trans->locking_wait.task; 98 99 if (i != g->g) 99 100 prt_str(out, "<- "); 100 - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); 101 + prt_printf(out, "%u ", task ? task->pid : 0); 101 102 } 102 103 prt_newline(out); 103 104 }
+7 -4
fs/bcachefs/btree_types.h
··· 386 386 387 387 void *mem; 388 388 unsigned mem_top; 389 - unsigned mem_max; 390 389 unsigned mem_bytes; 391 390 392 391 btree_path_idx_t nr_sorted; ··· 412 413 unsigned long srcu_lock_time; 413 414 414 415 const char *fn; 415 - struct closure ref; 416 - struct list_head list; 417 416 struct btree_bkey_cached_common *locking; 418 417 struct six_lock_waiter locking_wait; 419 418 int srcu_idx; ··· 421 424 u16 journal_entries_size; 422 425 struct jset_entry *journal_entries; 423 426 424 - struct btree_insert_entry updates[BTREE_ITER_MAX]; 425 427 struct btree_trans_commit_hook *hooks; 426 428 struct journal_entry_pin *journal_pin; 427 429 ··· 430 434 unsigned journal_u64s; 431 435 unsigned extra_disk_res; /* XXX kill */ 432 436 struct replicas_delta_list *fs_usage_deltas; 437 + 438 + /* Entries before this are zeroed out on every bch2_trans_get() call */ 439 + 440 + struct btree_insert_entry updates[BTREE_ITER_MAX]; 441 + 442 + struct list_head list; 443 + struct closure ref; 433 444 434 445 unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; 435 446 struct btree_trans_paths trans_paths;
+9 -9
fs/bcachefs/debug.c
··· 592 592 .read = bch2_cached_btree_nodes_read, 593 593 }; 594 594 595 - #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS 596 595 static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, 597 596 size_t size, loff_t *ppos) 598 597 { ··· 607 608 restart: 608 609 seqmutex_lock(&c->btree_trans_lock); 609 610 list_for_each_entry(trans, &c->btree_trans_list, list) { 610 - if (trans->locking_wait.task->pid <= i->iter) 611 + struct task_struct *task = READ_ONCE(trans->locking_wait.task); 612 + 613 + if (!task || task->pid <= i->iter) 611 614 continue; 612 615 613 616 closure_get(&trans->ref); ··· 627 626 prt_printf(&i->buf, "backtrace:"); 628 627 prt_newline(&i->buf); 629 628 printbuf_indent_add(&i->buf, 2); 630 - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); 629 + bch2_prt_task_backtrace(&i->buf, task); 631 630 printbuf_indent_sub(&i->buf, 2); 632 631 prt_newline(&i->buf); 633 632 634 - i->iter = trans->locking_wait.task->pid; 633 + i->iter = task->pid; 635 634 636 635 closure_put(&trans->ref); 637 636 ··· 655 654 .release = bch2_dump_release, 656 655 .read = bch2_btree_transactions_read, 657 656 }; 658 - #endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ 659 657 660 658 static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, 661 659 size_t size, loff_t *ppos) ··· 811 811 restart: 812 812 seqmutex_lock(&c->btree_trans_lock); 813 813 list_for_each_entry(trans, &c->btree_trans_list, list) { 814 - if (trans->locking_wait.task->pid <= i->iter) 814 + struct task_struct *task = READ_ONCE(trans->locking_wait.task); 815 + 816 + if (!task || task->pid <= i->iter) 815 817 continue; 816 818 817 819 closure_get(&trans->ref); ··· 828 826 829 827 bch2_check_for_deadlock(trans, &i->buf); 830 828 831 - i->iter = trans->locking_wait.task->pid; 829 + i->iter = task->pid; 832 830 833 831 closure_put(&trans->ref); 834 832 ··· 875 873 debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, 876 874 c->btree_debug, 
&cached_btree_nodes_ops); 877 875 878 - #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS 879 876 debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, 880 877 c->btree_debug, &btree_transactions_ops); 881 - #endif 882 878 883 879 debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, 884 880 c->btree_debug, &journal_pins_ops);