/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_OSDMAP_H
#define _FS_CEPH_OSDMAP_H

#include <linux/rbtree.h>
#include <linux/ceph/types.h>
#include <linux/ceph/decode.h>
#include <linux/crush/crush.h>

/*
 * The osd map describes the current membership of the osd cluster and
 * specifies the mapping of objects to placement groups and placement
 * groups to (sets of) osds.  That is, it completely specifies the
 * (desired) distribution of all data objects in the system at some
 * point in time.
 *
 * Each map version is identified by an epoch, which increases monotonically.
 *
 * The map can be updated either via an incremental map (diff) describing
 * the change between two successive epochs, or as a fully encoded map.
 */
struct ceph_pg {
	uint64_t pool;
	uint32_t seed;
};

#define CEPH_SPG_NOSHARD	-1

struct ceph_spg {
	struct ceph_pg pgid;
	s8 shard;
};

int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);

#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
						       together */
#define CEPH_POOL_FLAG_FULL		(1ULL << 1) /* pool is full */

struct ceph_pg_pool_info {
	struct rb_node node;
	s64 id;
	u8 type; /* CEPH_POOL_TYPE_* */
	u8 size;
	u8 min_size;
	u8 crush_ruleset;
	u8 object_hash;
	u32 last_force_request_resend;
	u32 pg_num, pgp_num;
	int pg_num_mask, pgp_num_mask;
	s64 read_tier;
	s64 write_tier; /* wins for read+write ops */
	u64 flags; /* CEPH_POOL_FLAG_* */
	char *name;

	bool was_full;	/* for handle_one_map() */
};

static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
{
	switch (pool->type) {
	case CEPH_POOL_TYPE_REP:
		return true;
	case CEPH_POOL_TYPE_EC:
		return false;
	default:
		BUG();
	}
}

struct ceph_object_locator {
	s64 pool;
	struct ceph_string *pool_ns;
};

static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
{
	oloc->pool = -1;
	oloc->pool_ns = NULL;
}

static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
{
	return oloc->pool == -1;
}

void ceph_oloc_copy(struct ceph_object_locator *dest,
		    const struct ceph_object_locator *src);
void ceph_oloc_destroy(struct ceph_object_locator *oloc);

/*
 * A 51-char inline_name is long enough for all cephfs requests and all
 * but one kind of rbd request: <imgname> in "<imgname>.rbd"/
 * "rbd_id.<imgname>" can be arbitrarily long (~PAGE_SIZE).  That one is
 * issued only once, during rbd map; all other rbd requests fit into
 * inline_name.
 *
 * Makes ceph_object_id 64 bytes on 64-bit.
 */
#define CEPH_OID_INLINE_LEN 52

/*
 * Both inline and external buffers have space for a NUL-terminator,
 * which is carried around.  It's not required though - RADOS object
 * names don't have to be NUL-terminated and may contain NULs.
 */
struct ceph_object_id {
	char *name;
	char inline_name[CEPH_OID_INLINE_LEN];
	int name_len;
};

static inline void ceph_oid_init(struct ceph_object_id *oid)
{
	oid->name = oid->inline_name;
	oid->name_len = 0;
}

#define CEPH_OID_INIT_ONSTACK(oid)				\
	({ ceph_oid_init(&oid); oid; })
#define CEPH_DEFINE_OID_ONSTACK(oid)				\
	struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)

static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
{
	return oid->name == oid->inline_name && !oid->name_len;
}

void ceph_oid_copy(struct ceph_object_id *dest,
		   const struct ceph_object_id *src);
__printf(2, 3)
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
__printf(3, 4)
int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
		     const char *fmt, ...);
void ceph_oid_destroy(struct ceph_object_id *oid);
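/*
 * Usage sketch (illustrative only, not part of the upstream header):
 * an object id normally lives on the stack, with the name held in
 * inline_name.  ceph_oid_aprintf() falls back to an allocated external
 * buffer when the formatted name does not fit inline;
 * ceph_oid_destroy() frees that buffer if one was allocated.  The
 * function and image name below are made up for illustration.
 */
static inline int ceph_oid_usage_example(void)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	int ret;

	/* may allocate if "rbd_id.<imgname>" exceeds the inline buffer */
	ret = ceph_oid_aprintf(&oid, GFP_NOIO, "rbd_id.%s", "myimage");
	if (ret)
		return ret;

	/* ... hand oid to an osd request ... */

	ceph_oid_destroy(&oid);
	return 0;
}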
struct ceph_pg_mapping {
	struct rb_node node;
	struct ceph_pg pgid;

	union {
		struct {
			int len;
			int osds[];
		} pg_temp, pg_upmap;
		struct {
			int osd;
		} primary_temp;
		struct {
			int len;
			int from_to[][2];
		} pg_upmap_items;
	};
};

struct ceph_osdmap {
	struct ceph_fsid fsid;
	u32 epoch;
	struct ceph_timespec created, modified;

	u32 flags;         /* CEPH_OSDMAP_* */

	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
	u32 *osd_state;    /* CEPH_OSD_* */
	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
	struct ceph_entity_addr *osd_addr;

	struct rb_root pg_temp;
	struct rb_root primary_temp;

	/* remap (post-CRUSH, pre-up) */
	struct rb_root pg_upmap;	/* PG := raw set */
	struct rb_root pg_upmap_items;	/* from -> to within raw set */

	u32 *osd_primary_affinity;

	struct rb_root pg_pools;
	u32 pool_max;

	/* the CRUSH map specifies the mapping of placement groups to
	 * the list of osds that store+replicate them. */
	struct crush_map *crush;

	struct mutex crush_workspace_mutex;
	void *crush_workspace;
};

static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
{
	return osd >= 0 && osd < map->max_osd &&
	       (map->osd_state[osd] & CEPH_OSD_EXISTS);
}

static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
	return ceph_osd_exists(map, osd) &&
	       (map->osd_state[osd] & CEPH_OSD_UP);
}

static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
	return !ceph_osd_is_up(map, osd);
}

char *ceph_osdmap_state_str(char *str, int len, u32 state);
extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);

static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
						     int osd)
{
	if (osd >= map->max_osd)
		return NULL;
	return &map->osd_addr[osd];
}

#define CEPH_PGID_ENCODING_LEN		(1 + 8 + 4 + 4)

static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
{
	__u8 version;

	if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
		pr_warn("incomplete pg encoding\n");
		return -EINVAL;
	}
	version = ceph_decode_8(p);
	if (version > 1) {
		pr_warn("do not understand pg encoding %d > 1\n",
			(int)version);
		return -EINVAL;
	}

	pgid->pool = ceph_decode_64(p);
	pgid->seed = ceph_decode_32(p);
	*p += 4;	/* skip deprecated preferred value */

	return 0;
}
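/*
 * Decoding sketch (illustrative only): ceph_decode_pgid() follows the
 * cursor/end convention used by the other ceph decode helpers - on
 * success *p is advanced past the CEPH_PGID_ENCODING_LEN bytes
 * (version byte, 64-bit pool, 32-bit seed, deprecated 32-bit preferred
 * value).  The wrapper below is hypothetical, showing that convention
 * applied to a plain buffer.
 */
static inline int ceph_decode_pgid_buf(void *buf, size_t len,
				       struct ceph_pg *pgid)
{
	void *p = buf;

	return ceph_decode_pgid(&p, buf + len, pgid);
}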
struct ceph_osdmap *ceph_osdmap_alloc(void);
extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);

struct ceph_osds {
	int osds[CEPH_PG_MAX_SIZE];
	int size;
	int primary;		/* id, NOT index */
};

static inline void ceph_osds_init(struct ceph_osds *set)
{
	set->size = 0;
	set->primary = -1;
}

void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);

bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
		      u32 new_pg_num);
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
			  const struct ceph_osds *new_acting,
			  const struct ceph_osds *old_up,
			  const struct ceph_osds *new_up,
			  int old_size,
			  int new_size,
			  int old_min_size,
			  int new_min_size,
			  u32 old_pg_num,
			  u32 new_pg_num,
			  bool old_sort_bitwise,
			  bool new_sort_bitwise,
			  bool old_recovery_deletes,
			  bool new_recovery_deletes,
			  const struct ceph_pg *pgid);
bool ceph_osds_changed(const struct ceph_osds *old_acting,
		       const struct ceph_osds *new_acting,
		       bool any_change);

void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
				 const struct ceph_object_id *oid,
				 const struct ceph_object_locator *oloc,
				 struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      const struct ceph_object_id *oid,
			      const struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid);

void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			       struct ceph_pg_pool_info *pi,
			       const struct ceph_pg *raw_pgid,
			       struct ceph_osds *up,
			       struct ceph_osds *acting);
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
			      struct ceph_pg_pool_info *pi,
			      const struct ceph_pg *raw_pgid,
			      struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			      const struct ceph_pg *raw_pgid);

extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
						    u64 id);

extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
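/*
 * Mapping sketch (illustrative only): how a client-side caller might go
 * from an object name to the OSD that should service a request, i.e.
 * object -> raw PG -> up/acting sets.  This mirrors what
 * net/ceph/osd_client.c does when computing a request target; the
 * function itself is made up for illustration.
 */
static inline int ceph_map_example(struct ceph_osdmap *map,
				   const struct ceph_object_id *oid,
				   const struct ceph_object_locator *oloc)
{
	struct ceph_pg raw_pgid;
	struct ceph_pg_pool_info *pi;
	struct ceph_osds up, acting;
	int ret;

	/* hash the object name into a raw PG in oloc->pool */
	ret = ceph_object_locator_to_pg(map, oid, oloc, &raw_pgid);
	if (ret)
		return ret;

	pi = ceph_pg_pool_by_id(map, raw_pgid.pool);
	if (!pi)
		return -ENOENT;

	/* run CRUSH, then apply pg_temp/primary_temp/upmap overrides */
	ceph_pg_to_up_acting_osds(map, pi, &raw_pgid, &up, &acting);

	/* acting.primary is an OSD id (not an index into osds[]) */
	return acting.primary;
}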
#endif