Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: crush_location infrastructure

Allow expressing client's location in terms of CRUSH hierarchy as
a set of (bucket type name, bucket name) pairs. The userspace syntax
"crush_location = key1=value1 key2=value2" is incompatible with mount
options and needed adaptation. Key-value pairs are separated by '|'
and we use ':' instead of '=' to separate keys from values. So for:

crush_location = host=foo rack=bar

one would write:

crush_location=host:foo|rack:bar

As in userspace, "multipath" locations are supported, so indicating
locality for parallel hierarchies is possible:

crush_location=rack:foo1|rack:foo2|datacenter:bar

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>

+168 -1
+1
include/linux/ceph/libceph.h
··· 64 64 int num_mon; 65 65 char *name; 66 66 struct ceph_crypto_key *key; 67 + struct rb_root crush_locs; 67 68 }; 68 69 69 70 /*
+15 -1
include/linux/ceph/osdmap.h
··· 302 302 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, 303 303 const struct ceph_pg *raw_pgid); 304 304 305 + struct crush_loc { 306 + char *cl_type_name; 307 + char *cl_name; 308 + }; 309 + 310 + struct crush_loc_node { 311 + struct rb_node cl_node; 312 + struct crush_loc cl_loc; /* pointers into cl_data */ 313 + char cl_data[]; 314 + }; 315 + 316 + int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); 317 + int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); 318 + void ceph_clear_crush_locs(struct rb_root *locs); 319 + 305 320 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 306 321 u64 id); 307 - 308 322 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); 309 323 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); 310 324 u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
+36
net/ceph/ceph_common.c
··· 176 176 } 177 177 } 178 178 179 + ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); 180 + if (ret) 181 + return ret; 182 + 179 183 /* any matching mon ip implies a match */ 180 184 for (i = 0; i < opt1->num_mon; i++) { 181 185 if (ceph_monmap_contains(client->monc.monmap, ··· 264 260 Opt_secret, 265 261 Opt_key, 266 262 Opt_ip, 263 + Opt_crush_location, 267 264 /* string args above */ 268 265 Opt_share, 269 266 Opt_crc, ··· 279 274 fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), 280 275 fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), 281 276 fsparam_flag_no ("crc", Opt_crc), 277 + fsparam_string ("crush_location", Opt_crush_location), 282 278 fsparam_string ("fsid", Opt_fsid), 283 279 fsparam_string ("ip", Opt_ip), 284 280 fsparam_string ("key", Opt_key), ··· 304 298 if (!opt) 305 299 return NULL; 306 300 301 + opt->crush_locs = RB_ROOT; 307 302 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 308 303 GFP_KERNEL); 309 304 if (!opt->mon_addr) { ··· 327 320 if (!opt) 328 321 return; 329 322 323 + ceph_clear_crush_locs(&opt->crush_locs); 330 324 kfree(opt->name); 331 325 if (opt->key) { 332 326 ceph_crypto_key_destroy(opt->key); ··· 462 454 if (!opt->key) 463 455 return -ENOMEM; 464 456 return get_secret(opt->key, param->string, &log); 457 + case Opt_crush_location: 458 + ceph_clear_crush_locs(&opt->crush_locs); 459 + err = ceph_parse_crush_location(param->string, 460 + &opt->crush_locs); 461 + if (err) { 462 + error_plog(&log, "Failed to parse CRUSH location: %d", 463 + err); 464 + return err; 465 + } 466 + break; 465 467 466 468 case Opt_osdtimeout: 467 469 warn_plog(&log, "Ignoring osdtimeout"); ··· 554 536 { 555 537 struct ceph_options *opt = client->options; 556 538 size_t pos = m->count; 539 + struct rb_node *n; 557 540 558 541 if (opt->name) { 559 542 seq_puts(m, "name="); ··· 563 544 } 564 545 if (opt->key) 565 546 seq_puts(m, "secret=<hidden>,"); 547 + 548 + if (!RB_EMPTY_ROOT(&opt->crush_locs)) { 549 + seq_puts(m, "crush_location="); 550 + for (n = rb_first(&opt->crush_locs); ; ) { 551 + struct crush_loc_node *loc = 552 + rb_entry(n, struct crush_loc_node, cl_node); 553 + 554 + seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name, 555 + loc->cl_loc.cl_name); 556 + n = rb_next(n); 557 + if (!n) 558 + break; 559 + 560 + seq_putc(m, '|'); 561 + } 562 + seq_putc(m, ','); 563 + } 566 564 567 565 if (opt->flags & CEPH_OPT_FSID) 568 566 seq_printf(m, "fsid=%pU,", &opt->fsid);
+116
net/ceph/osdmap.c
··· 2715 2715 return acting.primary; 2716 2716 } 2717 2717 EXPORT_SYMBOL(ceph_pg_to_acting_primary); 2718 + 2719 + static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, 2720 + size_t name_len) 2721 + { 2722 + struct crush_loc_node *loc; 2723 + 2724 + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); 2725 + if (!loc) 2726 + return NULL; 2727 + 2728 + RB_CLEAR_NODE(&loc->cl_node); 2729 + return loc; 2730 + } 2731 + 2732 + static void free_crush_loc(struct crush_loc_node *loc) 2733 + { 2734 + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); 2735 + 2736 + kfree(loc); 2737 + } 2738 + 2739 + static int crush_loc_compare(const struct crush_loc *loc1, 2740 + const struct crush_loc *loc2) 2741 + { 2742 + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: 2743 + strcmp(loc1->cl_name, loc2->cl_name); 2744 + } 2745 + 2746 + DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, 2747 + RB_BYPTR, const struct crush_loc *, cl_node) 2748 + 2749 + /* 2750 + * Parses a set of <bucket type name>':'<bucket name> pairs separated 2751 + * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". 2752 + * 2753 + * Note that @crush_location is modified by strsep(). 2754 + */ 2755 + int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) 2756 + { 2757 + struct crush_loc_node *loc; 2758 + const char *type_name, *name, *colon; 2759 + size_t type_name_len, name_len; 2760 + 2761 + dout("%s '%s'\n", __func__, crush_location); 2762 + while ((type_name = strsep(&crush_location, "|"))) { 2763 + colon = strchr(type_name, ':'); 2764 + if (!colon) 2765 + return -EINVAL; 2766 + 2767 + type_name_len = colon - type_name; 2768 + if (type_name_len == 0) 2769 + return -EINVAL; 2770 + 2771 + name = colon + 1; 2772 + name_len = strlen(name); 2773 + if (name_len == 0) 2774 + return -EINVAL; 2775 + 2776 + loc = alloc_crush_loc(type_name_len, name_len); 2777 + if (!loc) 2778 + return -ENOMEM; 2779 + 2780 + loc->cl_loc.cl_type_name = loc->cl_data; 2781 + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); 2782 + loc->cl_loc.cl_type_name[type_name_len] = '\0'; 2783 + 2784 + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; 2785 + memcpy(loc->cl_loc.cl_name, name, name_len); 2786 + loc->cl_loc.cl_name[name_len] = '\0'; 2787 + 2788 + if (!__insert_crush_loc(locs, loc)) { 2789 + free_crush_loc(loc); 2790 + return -EEXIST; 2791 + } 2792 + 2793 + dout("%s type_name '%s' name '%s'\n", __func__, 2794 + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); 2795 + } 2796 + 2797 + return 0; 2798 + } 2799 + 2800 + int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) 2801 + { 2802 + struct rb_node *n1 = rb_first(locs1); 2803 + struct rb_node *n2 = rb_first(locs2); 2804 + int ret; 2805 + 2806 + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { 2807 + struct crush_loc_node *loc1 = 2808 + rb_entry(n1, struct crush_loc_node, cl_node); 2809 + struct crush_loc_node *loc2 = 2810 + rb_entry(n2, struct crush_loc_node, cl_node); 2811 + 2812 + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); 2813 + if (ret) 2814 + return ret; 2815 + } 2816 + 2817 + if (!n1 && n2) 2818 + return -1; 2819 + if (n1 && !n2) 2820 + return 1; 2821 + return 0; 2822 + } 2823 + 2824 + void ceph_clear_crush_locs(struct rb_root *locs) 2825 + { 2826 + while (!RB_EMPTY_ROOT(locs)) { 2827 + struct crush_loc_node *loc = 2828 + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); 2829 + 2830 + erase_crush_loc(locs, loc); 2831 + free_crush_loc(loc); 2832 + } 2833 + }