Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crush: remove mutable part of CRUSH map

Remove the mutable permutation state from the CRUSH map and add it to
the working state instead. It would be very nice if we didn't have to
take a lock to calculate a crush placement. By moving the permutation
array into the working data, we can treat the CRUSH map as immutable.

Reflects ceph.git commit cbcd039651c0569551cb90d26ce27e1432671f2a.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

+218 -86
+1
include/linux/ceph/osdmap.h
··· 175 175 176 176 struct mutex crush_scratch_mutex; 177 177 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 178 + void *crush_workspace; 178 179 }; 179 180 180 181 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
+34 -7
include/linux/crush/crush.h
··· 135 135 __u32 size; /* num items */ 136 136 __s32 *items; 137 137 138 - /* 139 - * cached random permutation: used for uniform bucket and for 140 - * the linear search fallback for the other bucket types. 141 - */ 142 - __u32 perm_x; /* @x for which *perm is defined */ 143 - __u32 perm_n; /* num elements of *perm that are permuted/defined */ 144 - __u32 *perm; 145 138 }; 146 139 147 140 struct crush_bucket_uniform { ··· 204 211 * device fails. */ 205 212 __u8 chooseleaf_stable; 206 213 214 + /* 215 + * This value is calculated after decode or construction by 216 + * the builder. It is exposed here (rather than having a 217 + * 'build CRUSH working space' function) so that callers can 218 + * reserve a static buffer, allocate space on the stack, or 219 + * otherwise avoid calling into the heap allocator if they 220 + * want to. The size of the working space depends on the map, 221 + * while the size of the scratch vector passed to the mapper 222 + * depends on the size of the desired result set. 223 + * 224 + * Nothing stops the caller from allocating both in one swell 225 + * foop and passing in two points, though. 226 + */ 227 + size_t working_size; 228 + 207 229 #ifndef __KERNEL__ 208 230 /* 209 231 * version 0 (original) of straw_calc has various flaws. version 1 ··· 255 247 { 256 248 return ((i+1) << 1)-1; 257 249 } 250 + 251 + /* 252 + * These data structures are private to the CRUSH implementation. They 253 + * are exposed in this header file because builder needs their 254 + * definitions to calculate the total working size. 255 + * 256 + * Moving this out of the crush map allow us to treat the CRUSH map as 257 + * immutable within the mapper and removes the requirement for a CRUSH 258 + * map lock. 
259 + */ 260 + struct crush_work_bucket { 261 + __u32 perm_x; /* @x for which *perm is defined */ 262 + __u32 perm_n; /* num elements of *perm that are permuted/defined */ 263 + __u32 *perm; /* Permutation of the bucket's items */ 264 + }; 265 + 266 + struct crush_work { 267 + struct crush_work_bucket **work; /* Per-bucket working store */ 268 + }; 258 269 259 270 #endif
+3 -1
include/linux/crush/mapper.h
··· 15 15 int ruleno, 16 16 int x, int *result, int result_max, 17 17 const __u32 *weights, int weight_max, 18 - int *scratch); 18 + void *cwin, int *scratch); 19 + 20 + void crush_init_workspace(const struct crush_map *map, void *v); 19 21 20 22 #endif
-5
net/ceph/crush/crush.c
··· 45 45 46 46 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) 47 47 { 48 - kfree(b->h.perm); 49 48 kfree(b->h.items); 50 49 kfree(b); 51 50 } ··· 53 54 { 54 55 kfree(b->item_weights); 55 56 kfree(b->sum_weights); 56 - kfree(b->h.perm); 57 57 kfree(b->h.items); 58 58 kfree(b); 59 59 } 60 60 61 61 void crush_destroy_bucket_tree(struct crush_bucket_tree *b) 62 62 { 63 - kfree(b->h.perm); 64 63 kfree(b->h.items); 65 64 kfree(b->node_weights); 66 65 kfree(b); ··· 68 71 { 69 72 kfree(b->straws); 70 73 kfree(b->item_weights); 71 - kfree(b->h.perm); 72 74 kfree(b->h.items); 73 75 kfree(b); 74 76 } ··· 75 79 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) 76 80 { 77 81 kfree(b->item_weights); 78 - kfree(b->h.perm); 79 82 kfree(b->h.items); 80 83 kfree(b); 81 84 }
+138 -68
net/ceph/crush/mapper.c
··· 54 54 return -1; 55 55 } 56 56 57 - 58 57 /* 59 58 * bucket choose methods 60 59 * ··· 71 72 * Since this is expensive, we optimize for the r=0 case, which 72 73 * captures the vast majority of calls. 73 74 */ 74 - static int bucket_perm_choose(struct crush_bucket *bucket, 75 + static int bucket_perm_choose(const struct crush_bucket *bucket, 76 + struct crush_work_bucket *work, 75 77 int x, int r) 76 78 { 77 79 unsigned int pr = r % bucket->size; 78 80 unsigned int i, s; 79 81 80 82 /* start a new permutation if @x has changed */ 81 - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { 83 + if (work->perm_x != (__u32)x || work->perm_n == 0) { 82 84 dprintk("bucket %d new x=%d\n", bucket->id, x); 83 - bucket->perm_x = x; 85 + work->perm_x = x; 84 86 85 87 /* optimize common r=0 case */ 86 88 if (pr == 0) { 87 89 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 88 90 bucket->size; 89 - bucket->perm[0] = s; 90 - bucket->perm_n = 0xffff; /* magic value, see below */ 91 + work->perm[0] = s; 92 + work->perm_n = 0xffff; /* magic value, see below */ 91 93 goto out; 92 94 } 93 95 94 96 for (i = 0; i < bucket->size; i++) 95 - bucket->perm[i] = i; 96 - bucket->perm_n = 0; 97 - } else if (bucket->perm_n == 0xffff) { 97 + work->perm[i] = i; 98 + work->perm_n = 0; 99 + } else if (work->perm_n == 0xffff) { 98 100 /* clean up after the r=0 case above */ 99 101 for (i = 1; i < bucket->size; i++) 100 - bucket->perm[i] = i; 101 - bucket->perm[bucket->perm[0]] = 0; 102 - bucket->perm_n = 1; 102 + work->perm[i] = i; 103 + work->perm[work->perm[0]] = 0; 104 + work->perm_n = 1; 103 105 } 104 106 105 107 /* calculate permutation up to pr */ 106 - for (i = 0; i < bucket->perm_n; i++) 108 + for (i = 0; i < work->perm_n; i++) 107 109 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 108 - while (bucket->perm_n <= pr) { 109 - unsigned int p = bucket->perm_n; 110 + while (work->perm_n <= pr) { 111 + unsigned int p = work->perm_n; 110 112 /* no point in swapping the final 
entry */ 111 113 if (p < bucket->size - 1) { 112 114 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 113 115 (bucket->size - p); 114 116 if (i) { 115 - unsigned int t = bucket->perm[p + i]; 116 - bucket->perm[p + i] = bucket->perm[p]; 117 - bucket->perm[p] = t; 117 + unsigned int t = work->perm[p + i]; 118 + work->perm[p + i] = work->perm[p]; 119 + work->perm[p] = t; 118 120 } 119 121 dprintk(" perm_choose swap %d with %d\n", p, p+i); 120 122 } 121 - bucket->perm_n++; 123 + work->perm_n++; 122 124 } 123 125 for (i = 0; i < bucket->size; i++) 124 126 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 125 127 126 - s = bucket->perm[pr]; 128 + s = work->perm[pr]; 127 129 out: 128 130 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 129 131 bucket->size, x, r, pr, s); ··· 132 132 } 133 133 134 134 /* uniform */ 135 - static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, 136 - int x, int r) 135 + static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, 136 + struct crush_work_bucket *work, int x, int r) 137 137 { 138 - return bucket_perm_choose(&bucket->h, x, r); 138 + return bucket_perm_choose(&bucket->h, work, x, r); 139 139 } 140 140 141 141 /* list */ 142 - static int bucket_list_choose(struct crush_bucket_list *bucket, 142 + static int bucket_list_choose(const struct crush_bucket_list *bucket, 143 143 int x, int r) 144 144 { 145 145 int i; ··· 155 155 w *= bucket->sum_weights[i]; 156 156 w = w >> 16; 157 157 /*dprintk(" scaled %llx\n", w);*/ 158 - if (w < bucket->item_weights[i]) 158 + if (w < bucket->item_weights[i]) { 159 159 return bucket->h.items[i]; 160 + } 160 161 } 161 162 162 163 dprintk("bad list sums for bucket %d\n", bucket->h.id); ··· 193 192 return x & 1; 194 193 } 195 194 196 - static int bucket_tree_choose(struct crush_bucket_tree *bucket, 195 + static int bucket_tree_choose(const struct crush_bucket_tree *bucket, 197 196 int x, int r) 198 197 { 199 198 int n; ··· 225 224 226 225 /* straw 
*/ 227 226 228 - static int bucket_straw_choose(struct crush_bucket_straw *bucket, 227 + static int bucket_straw_choose(const struct crush_bucket_straw *bucket, 229 228 int x, int r) 230 229 { 231 230 __u32 i; ··· 302 301 * 303 302 */ 304 303 305 - static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, 304 + static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, 306 305 int x, int r) 307 306 { 308 307 unsigned int i, high = 0; ··· 345 344 high_draw = draw; 346 345 } 347 346 } 347 + 348 348 return bucket->h.items[high]; 349 349 } 350 350 351 351 352 - static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 352 + static int crush_bucket_choose(const struct crush_bucket *in, 353 + struct crush_work_bucket *work, 354 + int x, int r) 353 355 { 354 356 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 355 357 BUG_ON(in->size == 0); 356 358 switch (in->alg) { 357 359 case CRUSH_BUCKET_UNIFORM: 358 - return bucket_uniform_choose((struct crush_bucket_uniform *)in, 359 - x, r); 360 + return bucket_uniform_choose( 361 + (const struct crush_bucket_uniform *)in, 362 + work, x, r); 360 363 case CRUSH_BUCKET_LIST: 361 - return bucket_list_choose((struct crush_bucket_list *)in, 364 + return bucket_list_choose((const struct crush_bucket_list *)in, 362 365 x, r); 363 366 case CRUSH_BUCKET_TREE: 364 - return bucket_tree_choose((struct crush_bucket_tree *)in, 367 + return bucket_tree_choose((const struct crush_bucket_tree *)in, 365 368 x, r); 366 369 case CRUSH_BUCKET_STRAW: 367 - return bucket_straw_choose((struct crush_bucket_straw *)in, 368 - x, r); 370 + return bucket_straw_choose( 371 + (const struct crush_bucket_straw *)in, 372 + x, r); 369 373 case CRUSH_BUCKET_STRAW2: 370 - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, 371 - x, r); 374 + return bucket_straw2_choose( 375 + (const struct crush_bucket_straw2 *)in, 376 + x, r); 372 377 default: 373 378 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 
374 379 return in->items[0]; 375 380 } 376 381 } 377 - 378 382 379 383 /* 380 384 * true if device is marked "out" (failed, fully offloaded) ··· 422 416 * @parent_r: r value passed from the parent 423 417 */ 424 418 static int crush_choose_firstn(const struct crush_map *map, 425 - struct crush_bucket *bucket, 419 + struct crush_work *work, 420 + const struct crush_bucket *bucket, 426 421 const __u32 *weight, int weight_max, 427 422 int x, int numrep, int type, 428 423 int *out, int outpos, ··· 441 434 int rep; 442 435 unsigned int ftotal, flocal; 443 436 int retry_descent, retry_bucket, skip_rep; 444 - struct crush_bucket *in = bucket; 437 + const struct crush_bucket *in = bucket; 445 438 int r; 446 439 int i; 447 440 int item = 0; ··· 480 473 if (local_fallback_retries > 0 && 481 474 flocal >= (in->size>>1) && 482 475 flocal > local_fallback_retries) 483 - item = bucket_perm_choose(in, x, r); 476 + item = bucket_perm_choose( 477 + in, work->work[-1-in->id], 478 + x, r); 484 479 else 485 - item = crush_bucket_choose(in, x, r); 480 + item = crush_bucket_choose( 481 + in, work->work[-1-in->id], 482 + x, r); 486 483 if (item >= map->max_devices) { 487 484 dprintk(" bad item %d\n", item); 488 485 skip_rep = 1; ··· 529 518 sub_r = r >> (vary_r-1); 530 519 else 531 520 sub_r = 0; 532 - if (crush_choose_firstn(map, 533 - map->buckets[-1-item], 534 - weight, weight_max, 535 - x, stable ? 1 : outpos+1, 0, 536 - out2, outpos, count, 537 - recurse_tries, 0, 538 - local_retries, 539 - local_fallback_retries, 540 - 0, 541 - vary_r, 542 - stable, 543 - NULL, 544 - sub_r) <= outpos) 521 + if (crush_choose_firstn( 522 + map, 523 + work, 524 + map->buckets[-1-item], 525 + weight, weight_max, 526 + x, stable ? 
1 : outpos+1, 0, 527 + out2, outpos, count, 528 + recurse_tries, 0, 529 + local_retries, 530 + local_fallback_retries, 531 + 0, 532 + vary_r, 533 + stable, 534 + NULL, 535 + sub_r) <= outpos) 545 536 /* didn't get leaf */ 546 537 reject = 1; 547 538 } else { ··· 613 600 * 614 601 */ 615 602 static void crush_choose_indep(const struct crush_map *map, 616 - struct crush_bucket *bucket, 603 + struct crush_work *work, 604 + const struct crush_bucket *bucket, 617 605 const __u32 *weight, int weight_max, 618 606 int x, int left, int numrep, int type, 619 607 int *out, int outpos, ··· 624 610 int *out2, 625 611 int parent_r) 626 612 { 627 - struct crush_bucket *in = bucket; 613 + const struct crush_bucket *in = bucket; 628 614 int endpos = outpos + left; 629 615 int rep; 630 616 unsigned int ftotal; ··· 692 678 break; 693 679 } 694 680 695 - item = crush_bucket_choose(in, x, r); 681 + item = crush_bucket_choose( 682 + in, work->work[-1-in->id], 683 + x, r); 696 684 if (item >= map->max_devices) { 697 685 dprintk(" bad item %d\n", item); 698 686 out[rep] = CRUSH_ITEM_NONE; ··· 740 724 741 725 if (recurse_to_leaf) { 742 726 if (item < 0) { 743 - crush_choose_indep(map, 744 - map->buckets[-1-item], 745 - weight, weight_max, 746 - x, 1, numrep, 0, 747 - out2, rep, 748 - recurse_tries, 0, 749 - 0, NULL, r); 727 + crush_choose_indep( 728 + map, 729 + work, 730 + map->buckets[-1-item], 731 + weight, weight_max, 732 + x, 1, numrep, 0, 733 + out2, rep, 734 + recurse_tries, 0, 735 + 0, NULL, r); 750 736 if (out2[rep] == CRUSH_ITEM_NONE) { 751 737 /* placed nothing; no leaf */ 752 738 break; ··· 799 781 #endif 800 782 } 801 783 784 + 785 + /* 786 + * This takes a chunk of memory and sets it up to be a shiny new 787 + * working area for a CRUSH placement computation. It must be called 788 + * on any newly allocated memory before passing it in to 789 + * crush_do_rule. It may be used repeatedly after that, so long as the 790 + * map has not changed. 
If the map /has/ changed, you must make sure 791 + * the working size is no smaller than what was allocated and re-run 792 + * crush_init_workspace. 793 + * 794 + * If you do retain the working space between calls to crush, make it 795 + * thread-local. 796 + */ 797 + void crush_init_workspace(const struct crush_map *map, void *v) 798 + { 799 + struct crush_work *w = v; 800 + __s32 b; 801 + 802 + /* 803 + * We work by moving through the available space and setting 804 + * values and pointers as we go. 805 + * 806 + * It's a bit like Forth's use of the 'allot' word since we 807 + * set the pointer first and then reserve the space for it to 808 + * point to by incrementing the point. 809 + */ 810 + v += sizeof(struct crush_work *); 811 + w->work = v; 812 + v += map->max_buckets * sizeof(struct crush_work_bucket *); 813 + for (b = 0; b < map->max_buckets; ++b) { 814 + if (!map->buckets[b]) 815 + continue; 816 + 817 + w->work[b] = v; 818 + switch (map->buckets[b]->alg) { 819 + default: 820 + v += sizeof(struct crush_work_bucket); 821 + break; 822 + } 823 + w->work[b]->perm_x = 0; 824 + w->work[b]->perm_n = 0; 825 + w->work[b]->perm = v; 826 + v += map->buckets[b]->size * sizeof(__u32); 827 + } 828 + BUG_ON(v - (void *)w != map->working_size); 829 + } 830 + 802 831 /** 803 832 * crush_do_rule - calculate a mapping with the given input and rule 804 833 * @map: the crush_map ··· 855 790 * @result_max: maximum result size 856 791 * @weight: weight vector (for map leaves) 857 792 * @weight_max: size of weight vector 793 + * @cwin: pointer to at least map->working_size bytes of memory 858 794 * @scratch: scratch vector for private use; must be >= 3 * result_max 859 795 */ 860 796 int crush_do_rule(const struct crush_map *map, 861 797 int ruleno, int x, int *result, int result_max, 862 798 const __u32 *weight, int weight_max, 863 - int *scratch) 799 + void *cwin, int *scratch) 864 800 { 865 801 int result_len; 802 + struct crush_work *cw = cwin; 866 803 int *a = scratch; 867 
804 int *b = scratch + result_max; 868 805 int *c = scratch + result_max*2; ··· 874 807 int *o; 875 808 int osize; 876 809 int *tmp; 877 - struct crush_rule *rule; 810 + const struct crush_rule *rule; 878 811 __u32 step; 879 812 int i, j; 880 813 int numrep; ··· 907 840 908 841 for (step = 0; step < rule->len; step++) { 909 842 int firstn = 0; 910 - struct crush_rule_step *curstep = &rule->steps[step]; 843 + const struct crush_rule_step *curstep = &rule->steps[step]; 911 844 912 845 switch (curstep->op) { 913 846 case CRUSH_RULE_TAKE: ··· 1003 936 recurse_tries = choose_tries; 1004 937 osize += crush_choose_firstn( 1005 938 map, 939 + cw, 1006 940 map->buckets[bno], 1007 941 weight, weight_max, 1008 942 x, numrep, ··· 1024 956 numrep : (result_max-osize)); 1025 957 crush_choose_indep( 1026 958 map, 959 + cw, 1027 960 map->buckets[bno], 1028 961 weight, weight_max, 1029 962 x, out_size, numrep, ··· 1066 997 break; 1067 998 } 1068 999 } 1000 + 1069 1001 return result_len; 1070 1002 }
+42 -5
net/ceph/osdmap.c
··· 153 153 return -EINVAL; 154 154 } 155 155 156 + static void crush_finalize(struct crush_map *c) 157 + { 158 + __s32 b; 159 + 160 + /* Space for the array of pointers to per-bucket workspace */ 161 + c->working_size = sizeof(struct crush_work) + 162 + c->max_buckets * sizeof(struct crush_work_bucket *); 163 + 164 + for (b = 0; b < c->max_buckets; b++) { 165 + if (!c->buckets[b]) 166 + continue; 167 + 168 + switch (c->buckets[b]->alg) { 169 + default: 170 + /* 171 + * The base case, permutation variables and 172 + * the pointer to the permutation array. 173 + */ 174 + c->working_size += sizeof(struct crush_work_bucket); 175 + break; 176 + } 177 + /* Every bucket has a permutation array. */ 178 + c->working_size += c->buckets[b]->size * sizeof(__u32); 179 + } 180 + } 181 + 156 182 static struct crush_map *crush_decode(void *pbyval, void *end) 157 183 { 158 184 struct crush_map *c; ··· 272 246 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); 273 247 if (b->items == NULL) 274 248 goto badmem; 275 - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); 276 - if (b->perm == NULL) 277 - goto badmem; 278 - b->perm_n = 0; 279 249 280 250 ceph_decode_need(p, end, b->size*sizeof(u32), bad); 281 251 for (j = 0; j < b->size; j++) ··· 389 367 c->chooseleaf_stable = ceph_decode_8(p); 390 368 dout("crush decode tunable chooseleaf_stable = %d\n", 391 369 c->chooseleaf_stable); 370 + 371 + crush_finalize(c); 392 372 393 373 done: 394 374 dout("crush_decode success\n"); ··· 777 753 kfree(map->osd_weight); 778 754 kfree(map->osd_addr); 779 755 kfree(map->osd_primary_affinity); 756 + kfree(map->crush_workspace); 780 757 kfree(map); 781 758 } 782 759 ··· 835 810 836 811 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) 837 812 { 813 + void *workspace; 814 + 838 815 if (IS_ERR(crush)) 839 816 return PTR_ERR(crush); 840 817 818 + workspace = kmalloc(crush->working_size, GFP_NOIO); 819 + if (!workspace) { 820 + crush_destroy(crush); 821 + return -ENOMEM; 
822 + } 823 + crush_init_workspace(crush, workspace); 824 + 841 825 if (map->crush) 842 826 crush_destroy(map->crush); 827 + kfree(map->crush_workspace); 843 828 map->crush = crush; 829 + map->crush_workspace = workspace; 844 830 return 0; 845 831 } 846 832 ··· 1976 1940 1977 1941 mutex_lock(&map->crush_scratch_mutex); 1978 1942 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 1979 - weight, weight_max, map->crush_scratch_ary); 1943 + weight, weight_max, map->crush_workspace, 1944 + map->crush_scratch_ary); 1980 1945 mutex_unlock(&map->crush_scratch_mutex); 1981 1946 1982 1947 return r;