Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: add support for HASHPSPOOL pool flag

The legacy behavior adds the pgid seed and pool together as the input for
CRUSH. That is problematic because each pool's PGs end up mapping to the
same OSDs: 1.5 == 2.4 == 3.3 == ...

Instead, if the HASHPSPOOL flag is set, we hash the ps and pool together and
feed that into CRUSH. This ensures that two adjacent pools will map to
an independent pseudorandom set of OSDs.

Advertise our support for this via a protocol feature flag.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>

Sage Weil 83ca14fd 1b83bef2

+31 -14
+3 -1
include/linux/ceph/ceph_features.h
··· 34 34 #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) 35 35 #define CEPH_FEATURE_OSD_HBMSGS (1<<28) 36 36 #define CEPH_FEATURE_MDSENC (1<<29) 37 + #define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) 37 38 38 39 /* 39 40 * Features supported. ··· 46 45 CEPH_FEATURE_OSDENC | \ 47 46 CEPH_FEATURE_CRUSH_TUNABLES | \ 48 47 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 49 - CEPH_FEATURE_REPLY_CREATE_INODE) 48 + CEPH_FEATURE_REPLY_CREATE_INODE | \ 49 + CEPH_FEATURE_OSDHASHPSPOOL) 50 50 51 51 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 52 52 (CEPH_FEATURE_NOSRCADDR | \
+2
include/linux/ceph/osdmap.h
··· 23 23 uint32_t seed; 24 24 }; 25 25 26 + #define CEPH_POOL_FLAG_HASHPSPOOL 1 27 + 26 28 struct ceph_pg_pool_info { 27 29 struct rb_node node; 28 30 s64 id;
+26 -13
net/ceph/osdmap.c
··· 1127 1127 struct ceph_pg_mapping *pg; 1128 1128 struct ceph_pg_pool_info *pool; 1129 1129 int ruleno; 1130 - unsigned int poolid, ps, pps, t, r; 1130 + int r; 1131 + u32 pps; 1131 1132 1132 - poolid = pgid.pool; 1133 - ps = pgid.seed; 1134 - 1135 - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1133 + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1136 1134 if (!pool) 1137 1135 return NULL; 1138 1136 1139 1137 /* pg_temp? */ 1140 - t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask); 1141 - pgid.seed = t; 1138 + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1139 + pool->pgp_num_mask); 1142 1140 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1143 1141 if (pg) { 1144 1142 *num = pg->len; ··· 1147 1149 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1148 1150 pool->type, pool->size); 1149 1151 if (ruleno < 0) { 1150 - pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1151 - poolid, pool->crush_ruleset, pool->type, 1152 + pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", 1153 + pgid.pool, pool->crush_ruleset, pool->type, 1152 1154 pool->size); 1153 1155 return NULL; 1154 1156 } 1155 1157 1156 - pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask); 1157 - pps += poolid; 1158 + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 1159 + /* hash pool id and seed so that pool PGs do not overlap */ 1160 + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 1161 + ceph_stable_mod(pgid.seed, pool->pgp_num, 1162 + pool->pgp_num_mask), 1163 + pgid.pool); 1164 + } else { 1165 + /* 1166 + * legacy behavior: add ps and pool together. this is 1167 + * not a great approach because the PGs from each pool 1168 + * will overlap on top of each other: 0.5 == 1.4 == 1169 + * 2.3 == ... 
1170 + */ 1171 + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, 1172 + pool->pgp_num_mask) + 1173 + (unsigned)pgid.pool; 1174 + } 1158 1175 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1159 1176 min_t(int, pool->size, *num), 1160 1177 osdmap->osd_weight); 1161 1178 if (r < 0) { 1162 - pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1163 - " size %d\n", r, poolid, pool->crush_ruleset, 1179 + pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1180 + " size %d\n", r, pgid.pool, pool->crush_ruleset, 1164 1181 pool->type, pool->size); 1165 1182 return NULL; 1166 1183 }