Revert "bcache: remove heap-related macros and switch to generic min_heap"

This reverts commit 866898efbb25bb44fd42848318e46db9e785973a.

The generic bottom-up min_heap implementation causes performance
regression in invalidate_buckets_lru(), a hot path in bcache. Before the
cache is fully populated, new_bucket_prio() often returns zero, leading to
many equal comparisons. In such cases, bottom-up sift_down performs up to
2 * log2(n) comparisons, while the original top-down approach completes
with just O(1) comparisons, resulting in a measurable performance gap.
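
For illustration, the following user-space sketch (not the kernel implementation) contrasts the two sift_down strategies on an array of all-equal keys, counting key comparisons: the top-down variant stops as soon as the heap property already holds, while the bottom-up variant must walk all the way to a leaf before sifting the element back up.

#include <stdio.h>
#include <stddef.h>

static unsigned long ncmp;			/* key comparison counter */

static int less(int a, int b) { ncmp++; return a < b; }

/* Top-down: stop as soon as the heap property already holds. */
static void sift_down_top_down(int *h, size_t n, size_t i)
{
	while (2 * i + 1 < n) {
		size_t child = 2 * i + 1;

		if (child + 1 < n && less(h[child + 1], h[child]))
			child++;
		if (!less(h[child], h[i]))	/* equal keys stop here */
			break;
		int t = h[i]; h[i] = h[child]; h[child] = t;
		i = child;
	}
}

/* Bottom-up: descend to a leaf first, then sift the element back up. */
static void sift_down_bottom_up(int *h, size_t n, size_t i)
{
	int v = h[i];
	size_t j = i;

	/* Walk down to a leaf: ~log2(n) comparisons even with equal keys. */
	while (2 * j + 1 < n) {
		size_t child = 2 * j + 1;

		if (child + 1 < n && less(h[child + 1], h[child]))
			child++;
		h[j] = h[child];
		j = child;
	}
	/* Sift v back up toward i: up to ~log2(n) more comparisons. */
	while (j > i && less(v, h[(j - 1) / 2])) {
		h[j] = h[(j - 1) / 2];
		j = (j - 1) / 2;
	}
	h[j] = v;
}

int main(void)
{
	int a[127] = { 0 }, b[127] = { 0 };	/* all-equal keys, like prio == 0 */

	ncmp = 0;
	sift_down_top_down(a, 127, 0);
	printf("top-down:  %lu comparisons\n", ncmp);	/* 2: stops immediately */

	ncmp = 0;
	sift_down_bottom_up(b, 127, 0);
	printf("bottom-up: %lu comparisons\n", ncmp);	/* ~log2(n): must reach a leaf */
	return 0;
}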

The performance degradation is further worsened by the non-inlined
min_heap API functions introduced in commit 92a8b224b833 ("lib/min_heap:
introduce non-inline versions of min heap API functions"), adding function
call overhead to this critical path.
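
As a rough illustration of that overhead (a sketch, not the kernel code), compare a comparison reached through a min_heap_callbacks-style function pointer in an out-of-line helper with one expanded inline by a macro, which is the pattern the revert restores:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct elem { unsigned int prio; };

/* Generic-API style: the comparator is reached through a function pointer. */
struct heap_callbacks {
	bool (*less)(const void *l, const void *r, void *args);
};

static bool elem_less(const void *l, const void *r, void *args)
{
	(void)args;
	return ((const struct elem *)l)->prio < ((const struct elem *)r)->prio;
}

/* Imagine this living in a separate .c file: no inlining across the call. */
size_t count_less_indirect(const struct elem *v, size_t n,
			   const struct heap_callbacks *cb)
{
	size_t i, hits = 0;

	for (i = 1; i < n; i++)
		hits += cb->less(&v[i], &v[0], NULL);	/* indirect call each time */
	return hits;
}

/* Macro style, as restored by the revert: comparison expanded at the site. */
#define elem_cmp(l, r)	((l).prio < (r).prio)

static size_t count_less_macro(const struct elem *v, size_t n)
{
	size_t i, hits = 0;

	for (i = 1; i < n; i++)
		hits += elem_cmp(v[i], v[0]);		/* plain inline compare */
	return hits;
}

int main(void)
{
	struct elem v[4] = { { 3 }, { 1 }, { 5 }, { 2 } };
	struct heap_callbacks cb = { .less = elem_less };

	printf("%zu %zu\n", count_less_indirect(v, 4, &cb),
	       count_less_macro(v, 4));			/* both print 2 */
	return 0;
}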

As reported by Robert, bcache now suffers from latency spikes, with P100
(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes.
These regressions degrade bcache's effectiveness as a low-latency cache
layer and lead to frequent timeouts and application stalls in production
environments.

This revert aims to restore bcache's original low-latency behavior.

Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com
Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap")
Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions")
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reported-by: Robert Pang <robertpang@google.com>
Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
Acked-by: Coly Li <colyli@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Kuan-Wei Chiu, committed by Andrew Morton (48fd7ebe 845f1f2d)

11 files changed, +216 -262
+17 -47
drivers/md/bcache/alloc.c
··· 164 164 * prio is worth 1/8th of what INITIAL_PRIO is worth. 165 165 */ 166 166 167 - static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) 168 - { 169 - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; 167 + #define bucket_prio(b) \ 168 + ({ \ 169 + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ 170 + \ 171 + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ 172 + }) 170 173 171 - return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); 172 - } 173 - 174 - static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) 175 - { 176 - struct bucket **lhs = (struct bucket **)l; 177 - struct bucket **rhs = (struct bucket **)r; 178 - struct cache *ca = args; 179 - 180 - return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); 181 - } 182 - 183 - static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) 184 - { 185 - struct bucket **lhs = (struct bucket **)l; 186 - struct bucket **rhs = (struct bucket **)r; 187 - struct cache *ca = args; 188 - 189 - return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); 190 - } 191 - 192 - static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) 193 - { 194 - struct bucket **lhs = l, **rhs = r; 195 - 196 - swap(*lhs, *rhs); 197 - } 174 + #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 175 + #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 198 176 199 177 static void invalidate_buckets_lru(struct cache *ca) 200 178 { 201 179 struct bucket *b; 202 - const struct min_heap_callbacks bucket_max_cmp_callback = { 203 - .less = new_bucket_max_cmp, 204 - .swp = new_bucket_swap, 205 - }; 206 - const struct min_heap_callbacks bucket_min_cmp_callback = { 207 - .less = new_bucket_min_cmp, 208 - .swp = new_bucket_swap, 209 - }; 180 + ssize_t i; 210 181 211 - ca->heap.nr = 0; 182 + ca->heap.used = 0; 212 183 213 184 for_each_bucket(b, ca) { 214 185 if (!bch_can_invalidate_bucket(ca, b)) 215 186 continue; 216 187 217 - if (!min_heap_full(&ca->heap)) 218 - min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); 219 - else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { 188 + if (!heap_full(&ca->heap)) 189 + heap_add(&ca->heap, b, bucket_max_cmp); 190 + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { 220 191 ca->heap.data[0] = b; 221 - min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); 192 + heap_sift(&ca->heap, 0, bucket_max_cmp); 222 193 } 223 194 } 224 195 225 - min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); 196 + for (i = ca->heap.used / 2 - 1; i >= 0; --i) 197 + heap_sift(&ca->heap, i, bucket_min_cmp); 226 198 227 199 while (!fifo_full(&ca->free_inc)) { 228 - if (!ca->heap.nr) { 200 + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { 229 201 /* 230 202 * We don't want to be calling invalidate_buckets() 231 203 * multiple times when it can't do anything ··· 206 234 wake_up_gc(ca->set); 207 235 return; 208 236 } 209 - b = min_heap_peek(&ca->heap)[0]; 210 - min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); 211 237 212 238 bch_invalidate_one_bucket(ca, b); 213 239 }
+1 -1
drivers/md/bcache/bcache.h
··· 458 458 /* Allocation stuff: */ 459 459 struct bucket *buckets; 460 460 461 - DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; 461 + DECLARE_HEAP(struct bucket *, heap); 462 462 463 463 /* 464 464 * If nonzero, we know we aren't going to find any buckets to invalidate
+45 -79
drivers/md/bcache/bset.c
··· 54 54 int __bch_count_data(struct btree_keys *b) 55 55 { 56 56 unsigned int ret = 0; 57 - struct btree_iter iter; 57 + struct btree_iter_stack iter; 58 58 struct bkey *k; 59 - 60 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 61 59 62 60 if (b->ops->is_extents) 63 61 for_each_key(b, k, &iter) ··· 67 69 { 68 70 va_list args; 69 71 struct bkey *k, *p = NULL; 70 - struct btree_iter iter; 72 + struct btree_iter_stack iter; 71 73 const char *err; 72 - 73 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 74 74 75 75 for_each_key(b, k, &iter) { 76 76 if (b->ops->is_extents) { ··· 110 114 111 115 static void bch_btree_iter_next_check(struct btree_iter *iter) 112 116 { 113 - struct bkey *k = iter->heap.data->k, *next = bkey_next(k); 117 + struct bkey *k = iter->data->k, *next = bkey_next(k); 114 118 115 - if (next < iter->heap.data->end && 119 + if (next < iter->data->end && 116 120 bkey_cmp(k, iter->b->ops->is_extents ? 117 121 &START_KEY(next) : next) > 0) { 118 122 bch_dump_bucket(iter->b); ··· 879 883 unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; 880 884 struct bset *i = bset_tree_last(b)->data; 881 885 struct bkey *m, *prev = NULL; 882 - struct btree_iter iter; 886 + struct btree_iter_stack iter; 883 887 struct bkey preceding_key_on_stack = ZERO_KEY; 884 888 struct bkey *preceding_key_p = &preceding_key_on_stack; 885 889 886 890 BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); 887 - 888 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 889 891 890 892 /* 891 893 * If k has preceding key, preceding_key_p will be set to address ··· 895 901 else 896 902 preceding_key(k, &preceding_key_p); 897 903 898 - m = bch_btree_iter_init(b, &iter, preceding_key_p); 904 + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); 899 905 900 - if (b->ops->insert_fixup(b, k, &iter, replace_key)) 906 + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) 901 907 return status; 902 908 903 909 status = BTREE_INSERT_STATUS_INSERT; ··· 1077 1083 1078 1084 /* Btree iterator */ 1079 1085 1080 - typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); 1086 + typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, 1087 + struct btree_iter_set); 1081 1088 1082 - static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) 1089 + static inline bool btree_iter_cmp(struct btree_iter_set l, 1090 + struct btree_iter_set r) 1083 1091 { 1084 - const struct btree_iter_set *_l = l; 1085 - const struct btree_iter_set *_r = r; 1086 - 1087 - return bkey_cmp(_l->k, _r->k) <= 0; 1088 - } 1089 - 1090 - static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) 1091 - { 1092 - struct btree_iter_set *_iter1 = iter1; 1093 - struct btree_iter_set *_iter2 = iter2; 1094 - 1095 - swap(*_iter1, *_iter2); 1092 + return bkey_cmp(l.k, r.k) > 0; 1096 1093 } 1097 1094 1098 1095 static inline bool btree_iter_end(struct btree_iter *iter) 1099 1096 { 1100 - return !iter->heap.nr; 1097 + return !iter->used; 1101 1098 } 1102 1099 1103 1100 void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, 1104 1101 struct bkey *end) 1105 1102 { 1106 - const struct min_heap_callbacks callbacks = { 1107 - .less = new_btree_iter_cmp, 1108 - .swp = new_btree_iter_swap, 1109 - }; 1110 - 1111 1103 if (k != end) 1112 - BUG_ON(!min_heap_push(&iter->heap, 1113 - &((struct btree_iter_set) { k, end }), 1114 - &callbacks, 1115 - NULL)); 1104 + BUG_ON(!heap_add(iter, 1105 + ((struct btree_iter_set) { k, end }), 1106 + btree_iter_cmp)); 1116 1107 } 1117 1108 1118 - static struct 
bkey *__bch_btree_iter_init(struct btree_keys *b, 1119 - struct btree_iter *iter, 1120 - struct bkey *search, 1121 - struct bset_tree *start) 1109 + static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, 1110 + struct btree_iter_stack *iter, 1111 + struct bkey *search, 1112 + struct bset_tree *start) 1122 1113 { 1123 1114 struct bkey *ret = NULL; 1124 1115 1125 - iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); 1126 - iter->heap.nr = 0; 1116 + iter->iter.size = ARRAY_SIZE(iter->stack_data); 1117 + iter->iter.used = 0; 1127 1118 1128 1119 #ifdef CONFIG_BCACHE_DEBUG 1129 - iter->b = b; 1120 + iter->iter.b = b; 1130 1121 #endif 1131 1122 1132 1123 for (; start <= bset_tree_last(b); start++) { 1133 1124 ret = bch_bset_search(b, start, search); 1134 - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); 1125 + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); 1135 1126 } 1136 1127 1137 1128 return ret; 1138 1129 } 1139 1130 1140 - struct bkey *bch_btree_iter_init(struct btree_keys *b, 1141 - struct btree_iter *iter, 1131 + struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, 1132 + struct btree_iter_stack *iter, 1142 1133 struct bkey *search) 1143 1134 { 1144 - return __bch_btree_iter_init(b, iter, search, b->set); 1135 + return __bch_btree_iter_stack_init(b, iter, search, b->set); 1145 1136 } 1146 1137 1147 1138 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, 1148 - new_btree_iter_cmp_fn *cmp) 1139 + btree_iter_cmp_fn *cmp) 1149 1140 { 1150 1141 struct btree_iter_set b __maybe_unused; 1151 1142 struct bkey *ret = NULL; 1152 - const struct min_heap_callbacks callbacks = { 1153 - .less = cmp, 1154 - .swp = new_btree_iter_swap, 1155 - }; 1156 1143 1157 1144 if (!btree_iter_end(iter)) { 1158 1145 bch_btree_iter_next_check(iter); 1159 1146 1160 - ret = iter->heap.data->k; 1161 - iter->heap.data->k = bkey_next(iter->heap.data->k); 1147 + ret = iter->data->k; 1148 + iter->data->k = bkey_next(iter->data->k); 1162 1149 1163 - if (iter->heap.data->k > iter->heap.data->end) { 1150 + if (iter->data->k > iter->data->end) { 1164 1151 WARN_ONCE(1, "bset was corrupt!\n"); 1165 - iter->heap.data->k = iter->heap.data->end; 1152 + iter->data->k = iter->data->end; 1166 1153 } 1167 1154 1168 - if (iter->heap.data->k == iter->heap.data->end) { 1169 - if (iter->heap.nr) { 1170 - b = min_heap_peek(&iter->heap)[0]; 1171 - min_heap_pop(&iter->heap, &callbacks, NULL); 1172 - } 1173 - } 1155 + if (iter->data->k == iter->data->end) 1156 + heap_pop(iter, b, cmp); 1174 1157 else 1175 - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); 1158 + heap_sift(iter, 0, cmp); 1176 1159 } 1177 1160 1178 1161 return ret; ··· 1157 1186 1158 1187 struct bkey *bch_btree_iter_next(struct btree_iter *iter) 1159 1188 { 1160 - return __bch_btree_iter_next(iter, new_btree_iter_cmp); 1189 + return __bch_btree_iter_next(iter, btree_iter_cmp); 1161 1190 1162 1191 } 1163 1192 ··· 1195 1224 struct btree_iter *iter, 1196 1225 bool fixup, bool remove_stale) 1197 1226 { 1227 + int i; 1198 1228 struct bkey *k, *last = NULL; 1199 1229 BKEY_PADDED(k) tmp; 1200 1230 bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale 1201 1231 ? 
bch_ptr_bad 1202 1232 : bch_ptr_invalid; 1203 - const struct min_heap_callbacks callbacks = { 1204 - .less = b->ops->sort_cmp, 1205 - .swp = new_btree_iter_swap, 1206 - }; 1207 1233 1208 1234 /* Heapify the iterator, using our comparison function */ 1209 - min_heapify_all(&iter->heap, &callbacks, NULL); 1235 + for (i = iter->used / 2 - 1; i >= 0; --i) 1236 + heap_sift(iter, i, b->ops->sort_cmp); 1210 1237 1211 1238 while (!btree_iter_end(iter)) { 1212 1239 if (b->ops->sort_fixup && fixup) ··· 1293 1324 struct bset_sort_state *state) 1294 1325 { 1295 1326 size_t order = b->page_order, keys = 0; 1296 - struct btree_iter iter; 1327 + struct btree_iter_stack iter; 1297 1328 int oldsize = bch_count_data(b); 1298 1329 1299 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1300 - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); 1330 + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); 1301 1331 1302 1332 if (start) { 1303 1333 unsigned int i; ··· 1307 1339 order = get_order(__set_bytes(b->set->data, keys)); 1308 1340 } 1309 1341 1310 - __btree_sort(b, &iter, start, order, false, state); 1342 + __btree_sort(b, &iter.iter, start, order, false, state); 1311 1343 1312 1344 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); 1313 1345 } ··· 1323 1355 struct bset_sort_state *state) 1324 1356 { 1325 1357 uint64_t start_time = local_clock(); 1326 - struct btree_iter iter; 1358 + struct btree_iter_stack iter; 1327 1359 1328 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1360 + bch_btree_iter_stack_init(b, &iter, NULL); 1329 1361 1330 - bch_btree_iter_init(b, &iter, NULL); 1331 - 1332 - btree_mergesort(b, new->set->data, &iter, false, true); 1362 + btree_mergesort(b, new->set->data, &iter.iter, false, true); 1333 1363 1334 1364 bch_time_stats_update(&state->time, start_time); 1335 1365
+23 -17
drivers/md/bcache/bset.h
··· 187 187 }; 188 188 189 189 struct btree_keys_ops { 190 - bool (*sort_cmp)(const void *l, 191 - const void *r, 192 - void *args); 190 + bool (*sort_cmp)(struct btree_iter_set l, 191 + struct btree_iter_set r); 193 192 struct bkey *(*sort_fixup)(struct btree_iter *iter, 194 193 struct bkey *tmp); 195 194 bool (*insert_fixup)(struct btree_keys *b, ··· 312 313 BTREE_INSERT_STATUS_FRONT_MERGE, 313 314 }; 314 315 315 - struct btree_iter_set { 316 - struct bkey *k, *end; 317 - }; 318 - 319 316 /* Btree key iteration */ 320 317 321 318 struct btree_iter { 319 + size_t size, used; 322 320 #ifdef CONFIG_BCACHE_DEBUG 323 321 struct btree_keys *b; 324 322 #endif 325 - MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; 323 + struct btree_iter_set { 324 + struct bkey *k, *end; 325 + } data[]; 326 + }; 327 + 328 + /* Fixed-size btree_iter that can be allocated on the stack */ 329 + 330 + struct btree_iter_stack { 331 + struct btree_iter iter; 332 + struct btree_iter_set stack_data[MAX_BSETS]; 326 333 }; 327 334 328 335 typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); ··· 340 335 341 336 void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, 342 337 struct bkey *end); 343 - struct bkey *bch_btree_iter_init(struct btree_keys *b, 344 - struct btree_iter *iter, 345 - struct bkey *search); 338 + struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, 339 + struct btree_iter_stack *iter, 340 + struct bkey *search); 346 341 347 342 struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, 348 343 const struct bkey *search); ··· 357 352 return search ? __bch_bset_search(b, t, search) : t->data->start; 358 353 } 359 354 360 - #define for_each_key_filter(b, k, iter, filter) \ 361 - for (bch_btree_iter_init((b), (iter), NULL); \ 362 - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) 355 + #define for_each_key_filter(b, k, stack_iter, filter) \ 356 + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ 357 + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ 358 + filter));) 363 359 364 - #define for_each_key(b, k, iter) \ 365 - for (bch_btree_iter_init((b), (iter), NULL); \ 366 - ((k) = bch_btree_iter_next(iter));) 360 + #define for_each_key(b, k, stack_iter) \ 361 + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ 362 + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) 367 363 368 364 /* Sorting */ 369 365
+29 -40
drivers/md/bcache/btree.c
··· 148 148 { 149 149 const char *err = "bad btree header"; 150 150 struct bset *i = btree_bset_first(b); 151 - struct btree_iter iter; 151 + struct btree_iter *iter; 152 152 153 153 /* 154 154 * c->fill_iter can allocate an iterator with more memory space 155 155 * than static MAX_BSETS. 156 156 * See the comment arount cache_set->fill_iter. 157 157 */ 158 - iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); 159 - iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; 160 - iter.heap.nr = 0; 158 + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); 159 + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; 160 + iter->used = 0; 161 161 162 162 #ifdef CONFIG_BCACHE_DEBUG 163 - iter.b = &b->keys; 163 + iter->b = &b->keys; 164 164 #endif 165 165 166 166 if (!i->seq) ··· 198 198 if (i != b->keys.set[0].data && !i->keys) 199 199 goto err; 200 200 201 - bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); 201 + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); 202 202 203 203 b->written += set_blocks(i, block_bytes(b->c->cache)); 204 204 } ··· 210 210 if (i->seq == b->keys.set[0].data->seq) 211 211 goto err; 212 212 213 - bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); 213 + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); 214 214 215 215 i = b->keys.set[0].data; 216 216 err = "short btree key"; ··· 222 222 bch_bset_init_next(&b->keys, write_block(b), 223 223 bset_magic(&b->c->cache->sb)); 224 224 out: 225 - mempool_free(iter.heap.data, &b->c->fill_iter); 225 + mempool_free(iter, &b->c->fill_iter); 226 226 return; 227 227 err: 228 228 set_btree_node_io_error(b); ··· 1306 1306 uint8_t stale = 0; 1307 1307 unsigned int keys = 0, good_keys = 0; 1308 1308 struct bkey *k; 1309 - struct btree_iter iter; 1309 + struct btree_iter_stack iter; 1310 1310 struct bset_tree *t; 1311 - 1312 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1313 1311 1314 1312 gc->nodes++; 1315 1313 ··· 1567 1569 static unsigned int btree_gc_count_keys(struct btree *b) 1568 1570 { 1569 1571 struct bkey *k; 1570 - struct btree_iter iter; 1572 + struct btree_iter_stack iter; 1571 1573 unsigned int ret = 0; 1572 - 1573 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1574 1574 1575 1575 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 1576 1576 ret += bkey_u64s(k); ··· 1608 1612 int ret = 0; 1609 1613 bool should_rewrite; 1610 1614 struct bkey *k; 1611 - struct btree_iter iter; 1615 + struct btree_iter_stack iter; 1612 1616 struct gc_merge_info r[GC_MERGE_NODES]; 1613 1617 struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; 1614 1618 1615 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1616 - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1619 + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); 1617 1620 1618 1621 for (i = r; i < r + ARRAY_SIZE(r); i++) 1619 1622 i->b = ERR_PTR(-EINTR); 1620 1623 1621 1624 while (1) { 1622 - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1625 + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 1626 + bch_ptr_bad); 1623 1627 if (k) { 1624 1628 r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 1625 1629 true, b); ··· 1914 1918 { 1915 1919 int ret = 0; 1916 1920 struct bkey *k, *p = NULL; 1917 - struct btree_iter iter; 1918 - 1919 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1921 + struct btree_iter_stack iter; 1920 1922 1921 1923 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) 1922 1924 bch_initial_mark_key(b->c, b->level, k); ··· 1922 1928 bch_initial_mark_key(b->c, 
b->level + 1, &b->key); 1923 1929 1924 1930 if (b->level) { 1925 - bch_btree_iter_init(&b->keys, &iter, NULL); 1931 + bch_btree_iter_stack_init(&b->keys, &iter, NULL); 1926 1932 1927 1933 do { 1928 - k = bch_btree_iter_next_filter(&iter, &b->keys, 1934 + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 1929 1935 bch_ptr_bad); 1930 1936 if (k) { 1931 1937 btree_node_prefetch(b, k); ··· 1953 1959 struct btree_check_info *info = arg; 1954 1960 struct btree_check_state *check_state = info->state; 1955 1961 struct cache_set *c = check_state->c; 1956 - struct btree_iter iter; 1962 + struct btree_iter_stack iter; 1957 1963 struct bkey *k, *p; 1958 1964 int cur_idx, prev_idx, skip_nr; 1959 1965 ··· 1961 1967 cur_idx = prev_idx = 0; 1962 1968 ret = 0; 1963 1969 1964 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 1965 - 1966 1970 /* root node keys are checked before thread created */ 1967 - bch_btree_iter_init(&c->root->keys, &iter, NULL); 1968 - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); 1971 + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); 1972 + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); 1969 1973 BUG_ON(!k); 1970 1974 1971 1975 p = k; ··· 1981 1989 skip_nr = cur_idx - prev_idx; 1982 1990 1983 1991 while (skip_nr) { 1984 - k = bch_btree_iter_next_filter(&iter, 1992 + k = bch_btree_iter_next_filter(&iter.iter, 1985 1993 &c->root->keys, 1986 1994 bch_ptr_bad); 1987 1995 if (k) ··· 2054 2062 int ret = 0; 2055 2063 int i; 2056 2064 struct bkey *k = NULL; 2057 - struct btree_iter iter; 2065 + struct btree_iter_stack iter; 2058 2066 struct btree_check_state check_state; 2059 - 2060 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 2061 2067 2062 2068 /* check and mark root node keys */ 2063 2069 for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) ··· 2550 2560 2551 2561 if (b->level) { 2552 2562 struct bkey *k; 2553 - struct btree_iter iter; 2563 + struct btree_iter_stack iter; 2554 2564 2555 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 2556 - bch_btree_iter_init(&b->keys, &iter, from); 2565 + bch_btree_iter_stack_init(&b->keys, &iter, from); 2557 2566 2558 - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, 2567 + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 2559 2568 bch_ptr_bad))) { 2560 2569 ret = bcache_btree(map_nodes_recurse, k, b, 2561 2570 op, from, fn, flags); ··· 2583 2594 { 2584 2595 int ret = MAP_CONTINUE; 2585 2596 struct bkey *k; 2586 - struct btree_iter iter; 2597 + struct btree_iter_stack iter; 2587 2598 2588 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 2589 - bch_btree_iter_init(&b->keys, &iter, from); 2599 + bch_btree_iter_stack_init(&b->keys, &iter, from); 2590 2600 2591 - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { 2601 + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 2602 + bch_ptr_bad))) { 2592 2603 ret = !b->level 2593 2604 ? fn(op, b, k) 2594 2605 : bcache_btree(map_keys_recurse, k,
+18 -33
drivers/md/bcache/extents.c
··· 33 33 i->k = bkey_next(i->k); 34 34 35 35 if (i->k == i->end) 36 - *i = iter->heap.data[--iter->heap.nr]; 36 + *i = iter->data[--iter->used]; 37 37 } 38 38 39 - static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) 39 + static bool bch_key_sort_cmp(struct btree_iter_set l, 40 + struct btree_iter_set r) 40 41 { 41 - struct btree_iter_set *_l = (struct btree_iter_set *)l; 42 - struct btree_iter_set *_r = (struct btree_iter_set *)r; 43 - int64_t c = bkey_cmp(_l->k, _r->k); 42 + int64_t c = bkey_cmp(l.k, r.k); 44 43 45 - return !(c ? c > 0 : _l->k < _r->k); 44 + return c ? c > 0 : l.k < r.k; 46 45 } 47 46 48 47 static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) ··· 238 239 } 239 240 240 241 const struct btree_keys_ops bch_btree_keys_ops = { 241 - .sort_cmp = new_bch_key_sort_cmp, 242 + .sort_cmp = bch_key_sort_cmp, 242 243 .insert_fixup = bch_btree_ptr_insert_fixup, 243 244 .key_invalid = bch_btree_ptr_invalid, 244 245 .key_bad = bch_btree_ptr_bad, ··· 255 256 * Necessary for btree_sort_fixup() - if there are multiple keys that compare 256 257 * equal in different sets, we have to process them newest to oldest. 257 258 */ 258 - 259 - static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) 259 + static bool bch_extent_sort_cmp(struct btree_iter_set l, 260 + struct btree_iter_set r) 260 261 { 261 - struct btree_iter_set *_l = (struct btree_iter_set *)l; 262 - struct btree_iter_set *_r = (struct btree_iter_set *)r; 263 - int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); 262 + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 264 263 265 - return !(c ? c > 0 : _l->k < _r->k); 266 - } 267 - 268 - static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) 269 - { 270 - struct btree_iter_set *_iter1 = iter1; 271 - struct btree_iter_set *_iter2 = iter2; 272 - 273 - swap(*_iter1, *_iter2); 264 + return c ? 
c > 0 : l.k < r.k; 274 265 } 275 266 276 267 static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, 277 268 struct bkey *tmp) 278 269 { 279 - const struct min_heap_callbacks callbacks = { 280 - .less = new_bch_extent_sort_cmp, 281 - .swp = new_btree_iter_swap, 282 - }; 283 - while (iter->heap.nr > 1) { 284 - struct btree_iter_set *top = iter->heap.data, *i = top + 1; 270 + while (iter->used > 1) { 271 + struct btree_iter_set *top = iter->data, *i = top + 1; 285 272 286 - if (iter->heap.nr > 2 && 287 - !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) 273 + if (iter->used > 2 && 274 + bch_extent_sort_cmp(i[0], i[1])) 288 275 i++; 289 276 290 277 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) ··· 278 293 279 294 if (!KEY_SIZE(i->k)) { 280 295 sort_key_next(iter, i); 281 - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); 296 + heap_sift(iter, i - top, bch_extent_sort_cmp); 282 297 continue; 283 298 } 284 299 ··· 288 303 else 289 304 bch_cut_front(top->k, i->k); 290 305 291 - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); 306 + heap_sift(iter, i - top, bch_extent_sort_cmp); 292 307 } else { 293 308 /* can't happen because of comparison func */ 294 309 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); ··· 298 313 299 314 bch_cut_back(&START_KEY(i->k), tmp); 300 315 bch_cut_front(i->k, top->k); 301 - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); 316 + heap_sift(iter, 0, bch_extent_sort_cmp); 302 317 303 318 return tmp; 304 319 } else { ··· 618 633 } 619 634 620 635 const struct btree_keys_ops bch_extent_keys_ops = { 621 - .sort_cmp = new_bch_extent_sort_cmp, 636 + .sort_cmp = bch_extent_sort_cmp, 622 637 .sort_fixup = bch_extent_sort_fixup, 623 638 .insert_fixup = bch_extent_insert_fixup, 624 639 .key_invalid = bch_extent_invalid,
+10 -31
drivers/md/bcache/movinggc.c
··· 182 182 closure_sync(&cl); 183 183 } 184 184 185 - static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) 185 + static bool bucket_cmp(struct bucket *l, struct bucket *r) 186 186 { 187 - struct bucket **_l = (struct bucket **)l; 188 - struct bucket **_r = (struct bucket **)r; 189 - 190 - return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); 191 - } 192 - 193 - static void new_bucket_swap(void *l, void *r, void __always_unused *args) 194 - { 195 - struct bucket **_l = l; 196 - struct bucket **_r = r; 197 - 198 - swap(*_l, *_r); 187 + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); 199 188 } 200 189 201 190 static unsigned int bucket_heap_top(struct cache *ca) 202 191 { 203 192 struct bucket *b; 204 193 205 - return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0; 194 + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; 206 195 } 207 196 208 197 void bch_moving_gc(struct cache_set *c) ··· 199 210 struct cache *ca = c->cache; 200 211 struct bucket *b; 201 212 unsigned long sectors_to_move, reserve_sectors; 202 - const struct min_heap_callbacks callbacks = { 203 - .less = new_bucket_cmp, 204 - .swp = new_bucket_swap, 205 - }; 206 213 207 214 if (!c->copy_gc_enabled) 208 215 return; ··· 209 224 reserve_sectors = ca->sb.bucket_size * 210 225 fifo_used(&ca->free[RESERVE_MOVINGGC]); 211 226 212 - ca->heap.nr = 0; 227 + ca->heap.used = 0; 213 228 214 229 for_each_bucket(b, ca) { 215 230 if (GC_MARK(b) == GC_MARK_METADATA || ··· 218 233 atomic_read(&b->pin)) 219 234 continue; 220 235 221 - if (!min_heap_full(&ca->heap)) { 236 + if (!heap_full(&ca->heap)) { 222 237 sectors_to_move += GC_SECTORS_USED(b); 223 - min_heap_push(&ca->heap, &b, &callbacks, NULL); 224 - } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { 238 + heap_add(&ca->heap, b, bucket_cmp); 239 + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { 225 240 sectors_to_move -= bucket_heap_top(ca); 226 241 sectors_to_move += GC_SECTORS_USED(b); 227 242 228 243 ca->heap.data[0] = b; 229 - min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); 244 + heap_sift(&ca->heap, 0, bucket_cmp); 230 245 } 231 246 } 232 247 233 248 while (sectors_to_move > reserve_sectors) { 234 - if (ca->heap.nr) { 235 - b = min_heap_peek(&ca->heap)[0]; 236 - min_heap_pop(&ca->heap, &callbacks, NULL); 237 - } 249 + heap_pop(&ca->heap, b, bucket_cmp); 238 250 sectors_to_move -= GC_SECTORS_USED(b); 239 251 } 240 252 241 - while (ca->heap.nr) { 242 - b = min_heap_peek(&ca->heap)[0]; 243 - min_heap_pop(&ca->heap, &callbacks, NULL); 253 + while (heap_pop(&ca->heap, b, bucket_cmp)) 244 254 SET_GC_MOVE(b, 1); 245 - } 246 255 247 256 mutex_unlock(&c->bucket_lock); 248 257
+2 -1
drivers/md/bcache/super.c
··· 1912 1912 INIT_LIST_HEAD(&c->btree_cache_freed); 1913 1913 INIT_LIST_HEAD(&c->data_buckets); 1914 1914 1915 - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * 1915 + iter_size = sizeof(struct btree_iter) + 1916 + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * 1916 1917 sizeof(struct btree_iter_set); 1917 1918 1918 1919 c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
+1 -3
drivers/md/bcache/sysfs.c
··· 660 660 unsigned int bytes = 0; 661 661 struct bkey *k; 662 662 struct btree *b; 663 - struct btree_iter iter; 664 - 665 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 663 + struct btree_iter_stack iter; 666 664 667 665 goto lock_root; 668 666
+65 -2
drivers/md/bcache/util.h
··· 9 9 #include <linux/kernel.h> 10 10 #include <linux/sched/clock.h> 11 11 #include <linux/llist.h> 12 - #include <linux/min_heap.h> 13 12 #include <linux/ratelimit.h> 14 13 #include <linux/vmalloc.h> 15 14 #include <linux/workqueue.h> ··· 30 31 31 32 #endif 32 33 34 + #define DECLARE_HEAP(type, name) \ 35 + struct { \ 36 + size_t size, used; \ 37 + type *data; \ 38 + } name 39 + 33 40 #define init_heap(heap, _size, gfp) \ 34 41 ({ \ 35 42 size_t _bytes; \ 36 - (heap)->nr = 0; \ 43 + (heap)->used = 0; \ 37 44 (heap)->size = (_size); \ 38 45 _bytes = (heap)->size * sizeof(*(heap)->data); \ 39 46 (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ ··· 51 46 kvfree((heap)->data); \ 52 47 (heap)->data = NULL; \ 53 48 } while (0) 49 + 50 + #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) 51 + 52 + #define heap_sift(h, i, cmp) \ 53 + do { \ 54 + size_t _r, _j = i; \ 55 + \ 56 + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ 57 + _r = _j * 2 + 1; \ 58 + if (_r + 1 < (h)->used && \ 59 + cmp((h)->data[_r], (h)->data[_r + 1])) \ 60 + _r++; \ 61 + \ 62 + if (cmp((h)->data[_r], (h)->data[_j])) \ 63 + break; \ 64 + heap_swap(h, _r, _j); \ 65 + } \ 66 + } while (0) 67 + 68 + #define heap_sift_down(h, i, cmp) \ 69 + do { \ 70 + while (i) { \ 71 + size_t p = (i - 1) / 2; \ 72 + if (cmp((h)->data[i], (h)->data[p])) \ 73 + break; \ 74 + heap_swap(h, i, p); \ 75 + i = p; \ 76 + } \ 77 + } while (0) 78 + 79 + #define heap_add(h, d, cmp) \ 80 + ({ \ 81 + bool _r = !heap_full(h); \ 82 + if (_r) { \ 83 + size_t _i = (h)->used++; \ 84 + (h)->data[_i] = d; \ 85 + \ 86 + heap_sift_down(h, _i, cmp); \ 87 + heap_sift(h, _i, cmp); \ 88 + } \ 89 + _r; \ 90 + }) 91 + 92 + #define heap_pop(h, d, cmp) \ 93 + ({ \ 94 + bool _r = (h)->used; \ 95 + if (_r) { \ 96 + (d) = (h)->data[0]; \ 97 + (h)->used--; \ 98 + heap_swap(h, 0, (h)->used); \ 99 + heap_sift(h, 0, cmp); \ 100 + } \ 101 + _r; \ 102 + }) 103 + 104 + #define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) 105 + 106 + #define heap_full(h) ((h)->used == (h)->size) 54 107 55 108 #define DECLARE_FIFO(type, name) \ 56 109 struct { \
+5 -8
drivers/md/bcache/writeback.c
··· 908 908 struct dirty_init_thrd_info *info = arg; 909 909 struct bch_dirty_init_state *state = info->state; 910 910 struct cache_set *c = state->c; 911 - struct btree_iter iter; 911 + struct btree_iter_stack iter; 912 912 struct bkey *k, *p; 913 913 int cur_idx, prev_idx, skip_nr; 914 914 915 915 k = p = NULL; 916 916 prev_idx = 0; 917 917 918 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 919 - bch_btree_iter_init(&c->root->keys, &iter, NULL); 920 - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); 918 + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); 919 + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); 921 920 BUG_ON(!k); 922 921 923 922 p = k; ··· 930 931 skip_nr = cur_idx - prev_idx; 931 932 932 933 while (skip_nr) { 933 - k = bch_btree_iter_next_filter(&iter, 934 + k = bch_btree_iter_next_filter(&iter.iter, 934 935 &c->root->keys, 935 936 bch_ptr_bad); 936 937 if (k) ··· 979 980 int i; 980 981 struct btree *b = NULL; 981 982 struct bkey *k = NULL; 982 - struct btree_iter iter; 983 + struct btree_iter_stack iter; 983 984 struct sectors_dirty_init op; 984 985 struct cache_set *c = d->c; 985 986 struct bch_dirty_init_state state; 986 - 987 - min_heap_init(&iter.heap, NULL, MAX_BSETS); 988 987 989 988 retry_lock: 990 989 b = c->root;