include/linux/ceph/osdmap.h at v4.7 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / include / linux / ceph / osdmap.h
at v4.7 7.5 kB view raw
  1#ifndef _FS_CEPH_OSDMAP_H
  2#define _FS_CEPH_OSDMAP_H
  3
  4#include <linux/rbtree.h>
  5#include <linux/ceph/types.h>
  6#include <linux/ceph/decode.h>
  7#include <linux/ceph/ceph_fs.h>
  8#include <linux/crush/crush.h>
  9
 10/*
 11 * The osd map describes the current membership of the osd cluster and
 12 * specifies the mapping of objects to placement groups and placement
 13 * groups to (sets of) osds.  That is, it completely specifies the
 14 * (desired) distribution of all data objects in the system at some
 15 * point in time.
 16 *
 17 * Each map version is identified by an epoch, which increases monotonically.
 18 *
 19 * The map can be updated either via an incremental map (diff) describing
 20 * the change between two successive epochs, or as a fully encoded map.
 21 */
 22struct ceph_pg {
 23	uint64_t pool;
 24	uint32_t seed;
 25};
 26
 27int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
 28
 29#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
 30						       together */
 31#define CEPH_POOL_FLAG_FULL		(1ULL << 1) /* pool is full */
 32
 33struct ceph_pg_pool_info {
 34	struct rb_node node;
 35	s64 id;
 36	u8 type; /* CEPH_POOL_TYPE_* */
 37	u8 size;
 38	u8 min_size;
 39	u8 crush_ruleset;
 40	u8 object_hash;
 41	u32 last_force_request_resend;
 42	u32 pg_num, pgp_num;
 43	int pg_num_mask, pgp_num_mask;
 44	s64 read_tier;
 45	s64 write_tier; /* wins for read+write ops */
 46	u64 flags; /* CEPH_POOL_FLAG_* */
 47	char *name;
 48
 49	bool was_full;  /* for handle_one_map() */
 50};
 51
 52static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
 53{
 54	switch (pool->type) {
 55	case CEPH_POOL_TYPE_REP:
 56		return true;
 57	case CEPH_POOL_TYPE_EC:
 58		return false;
 59	default:
 60		BUG_ON(1);
 61	}
 62}
 63
 64struct ceph_object_locator {
 65	s64 pool;
 66};
 67
 68static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
 69{
 70	oloc->pool = -1;
 71}
 72
 73static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
 74{
 75	return oloc->pool == -1;
 76}
 77
 78static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
 79				  const struct ceph_object_locator *src)
 80{
 81	dest->pool = src->pool;
 82}
 83
 84/*
 85 * Maximum supported by kernel client object name length
 86 *
 87 * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
 88 */
 89#define CEPH_MAX_OID_NAME_LEN 100
 90
 91/*
 92 * 51-char inline_name is long enough for all cephfs and all but one
 93 * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
 94 * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
 95 * other rbd requests fit into inline_name.
 96 *
 97 * Makes ceph_object_id 64 bytes on 64-bit.
 98 */
 99#define CEPH_OID_INLINE_LEN 52
100
101/*
102 * Both inline and external buffers have space for a NUL-terminator,
103 * which is carried around.  It's not required though - RADOS object
104 * names don't have to be NUL-terminated and may contain NULs.
105 */
106struct ceph_object_id {
107	char *name;
108	char inline_name[CEPH_OID_INLINE_LEN];
109	int name_len;
110};
111
112static inline void ceph_oid_init(struct ceph_object_id *oid)
113{
114	oid->name = oid->inline_name;
115	oid->name_len = 0;
116}
117
118static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
119{
120	return oid->name == oid->inline_name && !oid->name_len;
121}
122
123void ceph_oid_copy(struct ceph_object_id *dest,
124		   const struct ceph_object_id *src);
125__printf(2, 3)
126void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
127__printf(3, 4)
128int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
129		     const char *fmt, ...);
130void ceph_oid_destroy(struct ceph_object_id *oid);
131
132struct ceph_pg_mapping {
133	struct rb_node node;
134	struct ceph_pg pgid;
135
136	union {
137		struct {
138			int len;
139			int osds[];
140		} pg_temp;
141		struct {
142			int osd;
143		} primary_temp;
144	};
145};
146
147struct ceph_osdmap {
148	struct ceph_fsid fsid;
149	u32 epoch;
150	struct ceph_timespec created, modified;
151
152	u32 flags;         /* CEPH_OSDMAP_* */
153
154	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
155	u8 *osd_state;     /* CEPH_OSD_* */
156	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
157	struct ceph_entity_addr *osd_addr;
158
159	struct rb_root pg_temp;
160	struct rb_root primary_temp;
161
162	u32 *osd_primary_affinity;
163
164	struct rb_root pg_pools;
165	u32 pool_max;
166
167	/* the CRUSH map specifies the mapping of placement groups to
168	 * the list of osds that store+replicate them. */
169	struct crush_map *crush;
170
171	struct mutex crush_scratch_mutex;
172	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
173};
174
175static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
176{
177	return osd >= 0 && osd < map->max_osd &&
178	       (map->osd_state[osd] & CEPH_OSD_EXISTS);
179}
180
181static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
182{
183	return ceph_osd_exists(map, osd) &&
184	       (map->osd_state[osd] & CEPH_OSD_UP);
185}
186
/*
 * Inverse of ceph_osd_is_up().  Note that an osd which does not exist
 * in @map (or whose id is out of range) is therefore reported as down.
 */
static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
	const bool up = ceph_osd_is_up(map, osd);

	return !up;
}
191
192extern char *ceph_osdmap_state_str(char *str, int len, int state);
193extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
194
195static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
196						     int osd)
197{
198	if (osd >= map->max_osd)
199		return NULL;
200	return &map->osd_addr[osd];
201}
202
203static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
204{
205	__u8 version;
206
207	if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
208		pr_warn("incomplete pg encoding\n");
209		return -EINVAL;
210	}
211	version = ceph_decode_8(p);
212	if (version > 1) {
213		pr_warn("do not understand pg encoding %d > 1\n",
214			(int)version);
215		return -EINVAL;
216	}
217
218	pgid->pool = ceph_decode_64(p);
219	pgid->seed = ceph_decode_32(p);
220	*p += 4;	/* skip deprecated preferred value */
221
222	return 0;
223}
224
/*
 * osdmap lifecycle (all defined out of line): allocate an empty map,
 * decode a map from the buffer [*p, end), apply an incremental from
 * [*p, end) to @map, and destroy a map.
 *
 * NOTE(review): how failures are reported (NULL vs ERR_PTR) and who
 * owns @map after osdmap_apply_incremental() is not visible here --
 * check the implementation.
 */
struct ceph_osdmap *ceph_osdmap_alloc(void);
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map);
void ceph_osdmap_destroy(struct ceph_osdmap *map);
230
231struct ceph_osds {
232	int osds[CEPH_PG_MAX_SIZE];
233	int size;
234	int primary; /* id, NOT index */
235};
236
237static inline void ceph_osds_init(struct ceph_osds *set)
238{
239	set->size = 0;
240	set->primary = -1;
241}
242
243void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
244
245bool ceph_is_new_interval(const struct ceph_osds *old_acting,
246			  const struct ceph_osds *new_acting,
247			  const struct ceph_osds *old_up,
248			  const struct ceph_osds *new_up,
249			  int old_size,
250			  int new_size,
251			  int old_min_size,
252			  int new_min_size,
253			  u32 old_pg_num,
254			  u32 new_pg_num,
255			  bool old_sort_bitwise,
256			  bool new_sort_bitwise,
257			  const struct ceph_pg *pgid);
258bool ceph_osds_changed(const struct ceph_osds *old_acting,
259		       const struct ceph_osds *new_acting,
260		       bool any_change);
261
262/* calculate mapping of a file extent to an object */
263extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
264					 u64 off, u64 len,
265					 u64 *bno, u64 *oxoff, u64 *oxlen);
266
/*
 * Object -> pg -> osds mapping helpers (defined out of line).
 *
 * ceph_object_locator_to_pg() fills in @raw_pgid for (@oid, @oloc).
 * ceph_pg_to_up_acting_osds() fills in the @up and @acting sets for
 * @raw_pgid, and ceph_pg_to_acting_primary() returns just the acting
 * primary.
 *
 * NOTE(review): error return values are not visible from this header.
 */
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      struct ceph_object_id *oid,
			      struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid);

void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			       const struct ceph_pg *raw_pgid,
			       struct ceph_osds *up,
			       struct ceph_osds *acting);

int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			      const struct ceph_pg *raw_pgid);
278
279extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
280						    u64 id);
281
282extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
283extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
284
285#endif