Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crush: add chooseleaf_vary_r tunable

The current crush_choose_firstn code will re-use the same 'r' value for
the recursive call. That means that if we are hitting a collision or
rejection for some reason (say, an OSD that is marked out) and need to
retry, we will keep making the same (bad) choice in that recursive
selection.

Introduce a tunable that fixes that behavior by incorporating the parent
'r' value into the recursive starting point, so that a different path
will be taken in subsequent placement attempts.

Note that this was done from the get-go for the new crush_choose_indep
algorithm.

This was exposed by a user who was seeing PGs stuck in active+remapped
after reweight-by-utilization because the up set mapped to a single OSD.

Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

Authored by Ilya Dryomov and committed by Sage Weil
e2b149cc 6ed1002f

+30 -6
+6
include/linux/crush/crush.h
··· 173 173 * apply to a collision: in that case we will retry as we used 174 174 * to. */ 175 175 __u32 chooseleaf_descend_once; 176 + 177 + /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) 178 + * bits. a value of 1 is best for new clusters. for legacy clusters 179 + * that want to limit reshuffling, a value of 3 or 4 will make the 180 + * mappings line up a bit better with previous mappings. */ 181 + __u8 chooseleaf_vary_r; 176 182 }; 177 183 178 184
+24 -6
net/ceph/crush/mapper.c
··· 295 295 * @local_retries: localized retries 296 296 * @local_fallback_retries: localized fallback retries 297 297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 298 + * @vary_r: pass r to recursive calls 298 299 * @out2: second output vector for leaf items (if @recurse_to_leaf) 300 + * @parent_r: r value passed from the parent 299 301 */ 300 302 static int crush_choose_firstn(const struct crush_map *map, 301 303 struct crush_bucket *bucket, ··· 309 307 unsigned int local_retries, 310 308 unsigned int local_fallback_retries, 311 309 int recurse_to_leaf, 312 - int *out2) 310 + unsigned int vary_r, 311 + int *out2, 312 + int parent_r) 313 313 { 314 314 int rep; 315 315 unsigned int ftotal, flocal; ··· 323 319 int itemtype; 324 320 int collide, reject; 325 321 326 - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 327 - bucket->id, x, outpos, numrep); 322 + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 323 + recurse_to_leaf ? 
"_LEAF" : "", 324 + bucket->id, x, outpos, numrep, 325 + tries, recurse_tries, local_retries, local_fallback_retries, 326 + parent_r); 328 327 329 328 for (rep = outpos; rep < numrep; rep++) { 330 329 /* keep trying until we get a non-out, non-colliding item */ ··· 342 335 do { 343 336 collide = 0; 344 337 retry_bucket = 0; 345 - r = rep; 338 + r = rep + parent_r; 346 339 /* r' = r + f_total */ 347 340 r += ftotal; 348 341 ··· 394 387 reject = 0; 395 388 if (!collide && recurse_to_leaf) { 396 389 if (item < 0) { 390 + int sub_r; 391 + if (vary_r) 392 + sub_r = r >> (vary_r-1); 393 + else 394 + sub_r = 0; 397 395 if (crush_choose_firstn(map, 398 396 map->buckets[-1-item], 399 397 weight, weight_max, ··· 408 396 local_retries, 409 397 local_fallback_retries, 410 398 0, 411 - NULL) <= outpos) 399 + vary_r, 400 + NULL, 401 + sub_r) <= outpos) 412 402 /* didn't get leaf */ 413 403 reject = 1; 414 404 } else { ··· 667 653 int choose_local_retries = map->choose_local_tries; 668 654 int choose_local_fallback_retries = map->choose_local_fallback_tries; 669 655 656 + int vary_r = map->chooseleaf_vary_r; 657 + 670 658 if ((__u32)ruleno >= map->max_rules) { 671 659 dprintk(" bad ruleno %d\n", ruleno); 672 660 return 0; ··· 761 745 choose_local_retries, 762 746 choose_local_fallback_retries, 763 747 recurse_to_leaf, 764 - c+osize); 748 + vary_r, 749 + c+osize, 750 + 0); 765 751 } else { 766 752 crush_choose_indep( 767 753 map,