Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: update support for PGID64, PGPOOL3, OSDENC protocol features

Support (and require) the PGID64, PGPOOL3, and OSDENC protocol features.
These have been present in ceph.git since v0.42, Feb 2012. Require these
features to simplify support; nobody is running older userspace.

Note that the new request and reply encodings are not yet in place, so the
new code is not yet functional.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>

Sage Weil 4f6a7e5e ec73a754

+124 -119
+8 -4
fs/ceph/mdsmap.c
··· 59 59 return ERR_PTR(-ENOMEM); 60 60 61 61 ceph_decode_16_safe(p, end, version, bad); 62 + if (version > 3) { 63 + pr_warning("got mdsmap version %d > 3, failing", version); 64 + goto bad; 65 + } 62 66 63 67 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 64 68 m->m_epoch = ceph_decode_32(p); ··· 148 144 /* pg_pools */ 149 145 ceph_decode_32_safe(p, end, n, bad); 150 146 m->m_num_data_pg_pools = n; 151 - m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); 147 + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); 152 148 if (!m->m_data_pg_pools) 153 149 goto badmem; 154 - ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); 150 + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); 155 151 for (i = 0; i < n; i++) 156 - m->m_data_pg_pools[i] = ceph_decode_32(p); 157 - m->m_cas_pg_pool = ceph_decode_32(p); 152 + m->m_data_pg_pools[i] = ceph_decode_64(p); 153 + m->m_cas_pg_pool = ceph_decode_64(p); 158 154 159 155 /* ok, we don't care about the rest. */ 160 156 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+10 -4
include/linux/ceph/ceph_features.h
··· 39 39 * Features supported. 40 40 */ 41 41 #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 42 - (CEPH_FEATURE_NOSRCADDR | \ 43 - CEPH_FEATURE_CRUSH_TUNABLES | \ 44 - CEPH_FEATURE_CRUSH_TUNABLES2 | \ 42 + (CEPH_FEATURE_NOSRCADDR | \ 43 + CEPH_FEATURE_PGID64 | \ 44 + CEPH_FEATURE_PGPOOL3 | \ 45 + CEPH_FEATURE_OSDENC | \ 46 + CEPH_FEATURE_CRUSH_TUNABLES | \ 47 + CEPH_FEATURE_CRUSH_TUNABLES2 | \ 45 48 CEPH_FEATURE_REPLY_CREATE_INODE) 46 49 47 50 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 48 - (CEPH_FEATURE_NOSRCADDR) 51 + (CEPH_FEATURE_NOSRCADDR | \ 52 + CEPH_FEATURE_PGID64 | \ 53 + CEPH_FEATURE_PGPOOL3 | \ 54 + CEPH_FEATURE_OSDENC) 49 55 #endif
+2 -2
include/linux/ceph/mdsmap.h
··· 29 29 30 30 /* which object pools file data can be stored in */ 31 31 int m_num_data_pg_pools; 32 - u32 *m_data_pg_pools; 33 - u32 m_cas_pg_pool; 32 + u64 *m_data_pg_pools; 33 + u64 m_cas_pg_pool; 34 34 }; 35 35 36 36 static inline struct ceph_entity_addr *
+13 -3
include/linux/ceph/osdmap.h
··· 25 25 26 26 struct ceph_pg_pool_info { 27 27 struct rb_node node; 28 - int id; 29 - struct ceph_pg_pool v; 30 - int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 28 + s64 id; 29 + u8 type; 30 + u8 size; 31 + u8 crush_ruleset; 32 + u8 object_hash; 33 + u32 pg_num, pgp_num; 34 + int pg_num_mask, pgp_num_mask; 35 + u64 flags; 31 36 char *name; 37 + }; 38 + 39 + struct ceph_object_locator { 40 + uint64_t pool; 41 + char *key; 32 42 }; 33 43 34 44 struct ceph_pg_mapping {
-23
include/linux/ceph/rados.h
··· 9 9 #include <linux/ceph/msgr.h> 10 10 11 11 /* 12 - * osdmap encoding versions 13 - */ 14 - #define CEPH_OSDMAP_INC_VERSION 5 15 - #define CEPH_OSDMAP_INC_VERSION_EXT 6 16 - #define CEPH_OSDMAP_VERSION 5 17 - #define CEPH_OSDMAP_VERSION_EXT 6 18 - 19 - /* 20 12 * fs id 21 13 */ 22 14 struct ceph_fsid { ··· 83 91 84 92 #define CEPH_PG_TYPE_REP 1 85 93 #define CEPH_PG_TYPE_RAID4 2 86 - #define CEPH_PG_POOL_VERSION 2 87 - struct ceph_pg_pool { 88 - __u8 type; /* CEPH_PG_TYPE_* */ 89 - __u8 size; /* number of osds in each pg */ 90 - __u8 crush_ruleset; /* crush placement rule */ 91 - __u8 object_hash; /* hash mapping object name to ps */ 92 - __le32 pg_num, pgp_num; /* number of pg's */ 93 - __le32 lpg_num, lpgp_num; /* number of localized pg's */ 94 - __le32 last_change; /* most recent epoch changed */ 95 - __le64 snap_seq; /* seq for per-pool snapshot */ 96 - __le32 snap_epoch; /* epoch of last snap */ 97 - __le32 num_snaps; 98 - __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ 99 - __le64 auid; /* who owns the pg */ 100 - } __attribute__ ((packed)); 101 94 102 95 /* 103 96 * stable_mod func is used to control number of placement groups.
+2 -4
net/ceph/ceph_common.c
··· 601 601 if (ret < 0) 602 602 goto out_crypto; 603 603 604 - pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", 605 - CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, 606 - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, 607 - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); 604 + pr_info("loaded (mon/osd proto %d/%d)\n", 605 + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); 608 606 609 607 return 0; 610 608
+3 -3
net/ceph/debugfs.c
··· 66 66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { 67 67 struct ceph_pg_pool_info *pool = 68 68 rb_entry(n, struct ceph_pg_pool_info, node); 69 - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", 70 - pool->id, pool->v.pg_num, pool->pg_num_mask, 71 - pool->v.lpg_num, pool->lpg_num_mask); 69 + seq_printf(s, "pg_pool %llu pg_num %d / %d\n", 70 + (unsigned long long)pool->id, pool->pg_num, 71 + pool->pg_num_mask); 72 72 } 73 73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 74 74 struct ceph_entity_addr *addr =
+86 -76
net/ceph/osdmap.c
··· 45 45 */ 46 46 static void calc_pg_masks(struct ceph_pg_pool_info *pi) 47 47 { 48 - pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; 49 - pi->pgp_num_mask = 50 - (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; 51 - pi->lpg_num_mask = 52 - (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; 53 - pi->lpgp_num_mask = 54 - (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; 48 + pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; 49 + pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; 55 50 } 56 51 57 52 /* ··· 447 452 return 0; 448 453 } 449 454 450 - static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) 455 + static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) 451 456 { 452 457 struct ceph_pg_pool_info *pi; 453 458 struct rb_node *n = root->rb_node; ··· 503 508 504 509 static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 505 510 { 506 - unsigned int n, m; 511 + u8 ev, cv; 512 + unsigned len, num; 513 + void *pool_end; 507 514 508 - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 509 - calc_pg_masks(pi); 515 + ceph_decode_need(p, end, 2 + 4, bad); 516 + ev = ceph_decode_8(p); /* encoding version */ 517 + cv = ceph_decode_8(p); /* compat version */ 518 + if (ev < 5) { 519 + pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); 520 + return -EINVAL; 521 + } 522 + if (cv > 7) { 523 + pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); 524 + return -EINVAL; 525 + } 526 + len = ceph_decode_32(p); 527 + ceph_decode_need(p, end, len, bad); 528 + pool_end = *p + len; 510 529 511 - /* num_snaps * snap_info_t */ 512 - n = le32_to_cpu(pi->v.num_snaps); 513 - while (n--) { 514 - ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + 515 - sizeof(struct ceph_timespec), bad); 516 - *p += sizeof(u64) + /* key */ 517 - 1 + sizeof(u64) + /* u8, snapid */ 518 - sizeof(struct ceph_timespec); 519 - m = ceph_decode_32(p); /* snap name */ 520 - *p 
+= m; 530 + pi->type = ceph_decode_8(p); 531 + pi->size = ceph_decode_8(p); 532 + pi->crush_ruleset = ceph_decode_8(p); 533 + pi->object_hash = ceph_decode_8(p); 534 + 535 + pi->pg_num = ceph_decode_32(p); 536 + pi->pgp_num = ceph_decode_32(p); 537 + 538 + *p += 4 + 4; /* skip lpg* */ 539 + *p += 4; /* skip last_change */ 540 + *p += 8 + 4; /* skip snap_seq, snap_epoch */ 541 + 542 + /* skip snaps */ 543 + num = ceph_decode_32(p); 544 + while (num--) { 545 + *p += 8; /* snapid key */ 546 + *p += 1 + 1; /* versions */ 547 + len = ceph_decode_32(p); 548 + *p += len; 521 549 } 522 550 523 - *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 551 + /* skip removed snaps */ 552 + num = ceph_decode_32(p); 553 + *p += num * (8 + 8); 554 + 555 + *p += 8; /* skip auid */ 556 + pi->flags = ceph_decode_64(p); 557 + 558 + /* ignore the rest */ 559 + 560 + *p = pool_end; 561 + calc_pg_masks(pi); 524 562 return 0; 525 563 526 564 bad: ··· 563 535 static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 564 536 { 565 537 struct ceph_pg_pool_info *pi; 566 - u32 num, len, pool; 538 + u32 num, len; 539 + u64 pool; 567 540 568 541 ceph_decode_32_safe(p, end, num, bad); 569 542 dout(" %d pool names\n", num); 570 543 while (num--) { 571 - ceph_decode_32_safe(p, end, pool, bad); 544 + ceph_decode_64_safe(p, end, pool, bad); 572 545 ceph_decode_32_safe(p, end, len, bad); 573 - dout(" pool %d len %d\n", pool, len); 546 + dout(" pool %llu len %d\n", pool, len); 574 547 ceph_decode_need(p, end, len, bad); 575 548 pi = __lookup_pg_pool(&map->pg_pools, pool); 576 549 if (pi) { ··· 662 633 struct ceph_osdmap *map; 663 634 u16 version; 664 635 u32 len, max, i; 665 - u8 ev; 666 636 int err = -EINVAL; 667 637 void *start = *p; 668 638 struct ceph_pg_pool_info *pi; ··· 674 646 map->pg_temp = RB_ROOT; 675 647 676 648 ceph_decode_16_safe(p, end, version, bad); 677 - if (version > CEPH_OSDMAP_VERSION) { 678 - pr_warning("got unknown v %d > %d of osdmap\n", 
version, 679 - CEPH_OSDMAP_VERSION); 649 + if (version > 6) { 650 + pr_warning("got unknown v %d > 6 of osdmap\n", version); 651 + goto bad; 652 + } 653 + if (version < 6) { 654 + pr_warning("got old v %d < 6 of osdmap\n", version); 680 655 goto bad; 681 656 } 682 657 ··· 691 660 692 661 ceph_decode_32_safe(p, end, max, bad); 693 662 while (max--) { 694 - ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 663 + ceph_decode_need(p, end, 8 + 2, bad); 695 664 err = -ENOMEM; 696 665 pi = kzalloc(sizeof(*pi), GFP_NOFS); 697 666 if (!pi) 698 667 goto bad; 699 - pi->id = ceph_decode_32(p); 700 - err = -EINVAL; 701 - ev = ceph_decode_8(p); /* encoding version */ 702 - if (ev > CEPH_PG_POOL_VERSION) { 703 - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 704 - ev, CEPH_PG_POOL_VERSION); 705 - kfree(pi); 706 - goto bad; 707 - } 668 + pi->id = ceph_decode_64(p); 708 669 err = __decode_pool(p, end, pi); 709 670 if (err < 0) { 710 671 kfree(pi); ··· 705 682 __insert_pg_pool(&map->pg_pools, pi); 706 683 } 707 684 708 - if (version >= 5) { 709 - err = __decode_pool_names(p, end, map); 710 - if (err < 0) { 711 - dout("fail to decode pool names"); 712 - goto bad; 713 - } 685 + err = __decode_pool_names(p, end, map); 686 + if (err < 0) { 687 + dout("fail to decode pool names"); 688 + goto bad; 714 689 } 715 690 716 691 ceph_decode_32_safe(p, end, map->pool_max, bad); ··· 809 788 struct ceph_fsid fsid; 810 789 u32 epoch = 0; 811 790 struct ceph_timespec modified; 812 - u32 len, pool; 813 - __s32 new_pool_max, new_flags, max; 791 + s32 len; 792 + u64 pool; 793 + __s64 new_pool_max; 794 + __s32 new_flags, max; 814 795 void *start = *p; 815 796 int err = -EINVAL; 816 797 u16 version; 817 798 818 799 ceph_decode_16_safe(p, end, version, bad); 819 - if (version > CEPH_OSDMAP_INC_VERSION) { 820 - pr_warning("got unknown v %d > %d of inc osdmap\n", version, 821 - CEPH_OSDMAP_INC_VERSION); 800 + if (version > 6) { 801 + pr_warning("got unknown v %d > %d of inc osdmap\n", version, 
6); 822 802 goto bad; 823 803 } 824 804 ··· 829 807 epoch = ceph_decode_32(p); 830 808 BUG_ON(epoch != map->epoch+1); 831 809 ceph_decode_copy(p, &modified, sizeof(modified)); 832 - new_pool_max = ceph_decode_32(p); 810 + new_pool_max = ceph_decode_64(p); 833 811 new_flags = ceph_decode_32(p); 834 812 835 813 /* full map? */ ··· 879 857 /* new_pool */ 880 858 ceph_decode_32_safe(p, end, len, bad); 881 859 while (len--) { 882 - __u8 ev; 883 860 struct ceph_pg_pool_info *pi; 884 861 885 - ceph_decode_32_safe(p, end, pool, bad); 886 - ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); 887 - ev = ceph_decode_8(p); /* encoding version */ 888 - if (ev > CEPH_PG_POOL_VERSION) { 889 - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 890 - ev, CEPH_PG_POOL_VERSION); 891 - err = -EINVAL; 892 - goto bad; 893 - } 862 + ceph_decode_64_safe(p, end, pool, bad); 894 863 pi = __lookup_pg_pool(&map->pg_pools, pool); 895 864 if (!pi) { 896 865 pi = kzalloc(sizeof(*pi), GFP_NOFS); ··· 907 894 while (len--) { 908 895 struct ceph_pg_pool_info *pi; 909 896 910 - ceph_decode_32_safe(p, end, pool, bad); 897 + ceph_decode_64_safe(p, end, pool, bad); 911 898 pi = __lookup_pg_pool(&map->pg_pools, pool); 912 899 if (pi) 913 900 __remove_pg_pool(&map->pg_pools, pi); ··· 1110 1097 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1111 1098 if (!pool) 1112 1099 return -EIO; 1113 - pgid.seed = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 1114 - num = le32_to_cpu(pool->v.pg_num); 1100 + pgid.seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); 1101 + num = pool->pg_num; 1115 1102 num_mask = pool->pg_num_mask; 1116 1103 1117 1104 dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pgid.pool, ··· 1145 1132 return NULL; 1146 1133 1147 1134 /* pg_temp? 
*/ 1148 - t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), 1149 - pool->pgp_num_mask); 1135 + t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask); 1150 1136 pgid.seed = t; 1151 1137 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1152 1138 if (pg) { ··· 1154 1142 } 1155 1143 1156 1144 /* crush */ 1157 - ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1158 - pool->v.type, pool->v.size); 1145 + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1146 + pool->type, pool->size); 1159 1147 if (ruleno < 0) { 1160 1148 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1161 - poolid, pool->v.crush_ruleset, pool->v.type, 1162 - pool->v.size); 1149 + poolid, pool->crush_ruleset, pool->type, 1150 + pool->size); 1163 1151 return NULL; 1164 1152 } 1165 1153 1166 - pps = ceph_stable_mod(ps, 1167 - le32_to_cpu(pool->v.pgp_num), 1168 - pool->pgp_num_mask); 1154 + pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask); 1169 1155 pps += poolid; 1170 1156 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1171 - min_t(int, pool->v.size, *num), 1157 + min_t(int, pool->size, *num), 1172 1158 osdmap->osd_weight); 1173 1159 if (r < 0) { 1174 1160 pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1175 - " size %d\n", r, poolid, pool->v.crush_ruleset, 1176 - pool->v.type, pool->v.size); 1161 + " size %d\n", r, poolid, pool->crush_ruleset, 1162 + pool->type, pool->size); 1177 1163 return NULL; 1178 1164 } 1179 1165 *num = r;