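/* NOTE(review): web-listing residue ("at v3.3 13 kB view raw"); presumably this copy was taken at tag v3.3 -- confirm. */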
1#ifndef CEPH_RADOS_H 2#define CEPH_RADOS_H 3 4/* 5 * Data types for the Ceph distributed object storage layer RADOS 6 * (Reliable Autonomic Distributed Object Store). 7 */ 8 9#include "msgr.h" 10 11/* 12 * osdmap encoding versions 13 */ 14#define CEPH_OSDMAP_INC_VERSION 5 15#define CEPH_OSDMAP_INC_VERSION_EXT 6 16#define CEPH_OSDMAP_VERSION 5 17#define CEPH_OSDMAP_VERSION_EXT 6 18 19/* 20 * fs id 21 */ 22struct ceph_fsid { 23 unsigned char fsid[16]; 24}; 25 26static inline int ceph_fsid_compare(const struct ceph_fsid *a, 27 const struct ceph_fsid *b) 28{ 29 return memcmp(a, b, sizeof(*a)); 30} 31 32/* 33 * ino, object, etc. 34 */ 35typedef __le64 ceph_snapid_t; 36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ 37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ 38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ 39 40struct ceph_timespec { 41 __le32 tv_sec; 42 __le32 tv_nsec; 43} __attribute__ ((packed)); 44 45 46/* 47 * object layout - how objects are mapped into PGs 48 */ 49#define CEPH_OBJECT_LAYOUT_HASH 1 50#define CEPH_OBJECT_LAYOUT_LINEAR 2 51#define CEPH_OBJECT_LAYOUT_HASHINO 3 52 53/* 54 * pg layout -- how PGs are mapped onto (sets of) OSDs 55 */ 56#define CEPH_PG_LAYOUT_CRUSH 0 57#define CEPH_PG_LAYOUT_HASH 1 58#define CEPH_PG_LAYOUT_LINEAR 2 59#define CEPH_PG_LAYOUT_HYBRID 3 60 61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ 62 63/* 64 * placement group. 65 * we encode this into one __le64. 66 */ 67struct ceph_pg { 68 __le16 preferred; /* preferred primary osd */ 69 __le16 ps; /* placement seed */ 70 __le32 pool; /* object pool */ 71} __attribute__ ((packed)); 72 73/* 74 * pg_pool is a set of pgs storing a pool of objects 75 * 76 * pg_num -- base number of pseudorandomly placed pgs 77 * 78 * pgp_num -- effective number when calculating pg placement. this 79 * is used for pg_num increases. new pgs result in data being "split" 80 * into new pgs. 
for this to proceed smoothly, new pgs are intiially 81 * colocated with their parents; that is, pgp_num doesn't increase 82 * until the new pgs have successfully split. only _then_ are the new 83 * pgs placed independently. 84 * 85 * lpg_num -- localized pg count (per device). replicas are randomly 86 * selected. 87 * 88 * lpgp_num -- as above. 89 */ 90#define CEPH_PG_TYPE_REP 1 91#define CEPH_PG_TYPE_RAID4 2 92#define CEPH_PG_POOL_VERSION 2 93struct ceph_pg_pool { 94 __u8 type; /* CEPH_PG_TYPE_* */ 95 __u8 size; /* number of osds in each pg */ 96 __u8 crush_ruleset; /* crush placement rule */ 97 __u8 object_hash; /* hash mapping object name to ps */ 98 __le32 pg_num, pgp_num; /* number of pg's */ 99 __le32 lpg_num, lpgp_num; /* number of localized pg's */ 100 __le32 last_change; /* most recent epoch changed */ 101 __le64 snap_seq; /* seq for per-pool snapshot */ 102 __le32 snap_epoch; /* epoch of last snap */ 103 __le32 num_snaps; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ 105 __le64 auid; /* who owns the pg */ 106} __attribute__ ((packed)); 107 108/* 109 * stable_mod func is used to control number of placement groups. 110 * similar to straight-up modulo, but produces a stable mapping as b 111 * increases over time. b is the number of bins, and bmask is the 112 * containing power of 2 minus 1. 113 * 114 * b <= bmask and bmask=(2**n)-1 115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127 116 */ 117static inline int ceph_stable_mod(int x, int b, int bmask) 118{ 119 if ((x & bmask) < b) 120 return x & bmask; 121 else 122 return x & (bmask >> 1); 123} 124 125/* 126 * object layout - how a given object should be stored. 127 */ 128struct ceph_object_layout { 129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. 
*/ 130 __le32 ol_stripe_unit; /* for per-object parity, if any */ 131} __attribute__ ((packed)); 132 133/* 134 * compound epoch+version, used by storage layer to serialize mutations 135 */ 136struct ceph_eversion { 137 __le32 epoch; 138 __le64 version; 139} __attribute__ ((packed)); 140 141/* 142 * osd map bits 143 */ 144 145/* status bits */ 146#define CEPH_OSD_EXISTS 1 147#define CEPH_OSD_UP 2 148 149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 150#define CEPH_OSD_IN 0x10000 151#define CEPH_OSD_OUT 0 152 153 154/* 155 * osd map flag bits 156 */ 157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ 158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ 159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ 160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ 161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ 162 163/* 164 * osd ops 165 */ 166#define CEPH_OSD_OP_MODE 0xf000 167#define CEPH_OSD_OP_MODE_RD 0x1000 168#define CEPH_OSD_OP_MODE_WR 0x2000 169#define CEPH_OSD_OP_MODE_RMW 0x3000 170#define CEPH_OSD_OP_MODE_SUB 0x4000 171 172#define CEPH_OSD_OP_TYPE 0x0f00 173#define CEPH_OSD_OP_TYPE_LOCK 0x0100 174#define CEPH_OSD_OP_TYPE_DATA 0x0200 175#define CEPH_OSD_OP_TYPE_ATTR 0x0300 176#define CEPH_OSD_OP_TYPE_EXEC 0x0400 177#define CEPH_OSD_OP_TYPE_PG 0x0500 178 179enum { 180 /** data **/ 181 /* read */ 182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, 183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, 184 CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, 185 186 /* fancy read */ 187 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, 188 CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5, 189 190 CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6, 191 CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7, 192 193 /* versioning */ 194 
CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8, 195 196 /* write */ 197 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, 198 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, 199 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, 200 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, 201 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, 202 203 /* fancy write */ 204 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, 205 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, 206 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, 207 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, 208 209 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, 210 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, 211 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 212 213 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 214 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, 215 216 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, 217 218 /** attrs **/ 219 /* read */ 220 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 221 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 222 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, 223 224 /* write */ 225 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 226 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, 227 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, 228 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, 229 230 /** subop **/ 231 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, 232 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, 233 
CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, 234 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, 235 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, 236 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, 237 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, 238 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, 239 240 /** lock **/ 241 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, 242 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, 243 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, 244 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, 245 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, 246 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, 247 248 /** exec **/ 249 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, 250 251 /** pg **/ 252 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, 253}; 254 255static inline int ceph_osd_op_type_lock(int op) 256{ 257 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; 258} 259static inline int ceph_osd_op_type_data(int op) 260{ 261 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; 262} 263static inline int ceph_osd_op_type_attr(int op) 264{ 265 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; 266} 267static inline int ceph_osd_op_type_exec(int op) 268{ 269 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; 270} 271static inline int ceph_osd_op_type_pg(int op) 272{ 273 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; 274} 275 276static inline int ceph_osd_op_mode_subop(int op) 277{ 278 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; 279} 280static inline int ceph_osd_op_mode_read(int op) 281{ 282 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; 283} 284static inline int ceph_osd_op_mode_modify(int op) 285{ 286 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 287} 288 289/* 290 * note that the 
following tmap stuff is also defined in the ceph librados.h 291 * any modification here needs to be updated there 292 */ 293#define CEPH_OSD_TMAP_HDR 'h' 294#define CEPH_OSD_TMAP_SET 's' 295#define CEPH_OSD_TMAP_RM 'r' 296 297extern const char *ceph_osd_op_name(int op); 298 299 300/* 301 * osd op flags 302 * 303 * An op may be READ, WRITE, or READ|WRITE. 304 */ 305enum { 306 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ 307 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ 308 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ 309 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ 310 CEPH_OSD_FLAG_READ = 16, /* op may read */ 311 CEPH_OSD_FLAG_WRITE = 32, /* op may write */ 312 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ 313 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ 314 CEPH_OSD_FLAG_BALANCE_READS = 256, 315 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 316 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 317 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 318 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ 319}; 320 321enum { 322 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ 323}; 324 325#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 326#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 327 328/* xattr comparison */ 329enum { 330 CEPH_OSD_CMPXATTR_OP_NOP = 0, 331 CEPH_OSD_CMPXATTR_OP_EQ = 1, 332 CEPH_OSD_CMPXATTR_OP_NE = 2, 333 CEPH_OSD_CMPXATTR_OP_GT = 3, 334 CEPH_OSD_CMPXATTR_OP_GTE = 4, 335 CEPH_OSD_CMPXATTR_OP_LT = 5, 336 CEPH_OSD_CMPXATTR_OP_LTE = 6 337}; 338 339enum { 340 CEPH_OSD_CMPXATTR_MODE_STRING = 1, 341 CEPH_OSD_CMPXATTR_MODE_U64 = 2 342}; 343 344#define RADOS_NOTIFY_VER 1 345 346/* 347 * an individual object operation. 
each may be accompanied by some data 348 * payload 349 */ 350struct ceph_osd_op { 351 __le16 op; /* CEPH_OSD_OP_* */ 352 __le32 flags; /* CEPH_OSD_FLAG_* */ 353 union { 354 struct { 355 __le64 offset, length; 356 __le64 truncate_size; 357 __le32 truncate_seq; 358 } __attribute__ ((packed)) extent; 359 struct { 360 __le32 name_len; 361 __le32 value_len; 362 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 363 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 364 } __attribute__ ((packed)) xattr; 365 struct { 366 __u8 class_len; 367 __u8 method_len; 368 __u8 argc; 369 __le32 indata_len; 370 } __attribute__ ((packed)) cls; 371 struct { 372 __le64 cookie, count; 373 } __attribute__ ((packed)) pgls; 374 struct { 375 __le64 snapid; 376 } __attribute__ ((packed)) snap; 377 struct { 378 __le64 cookie; 379 __le64 ver; 380 __u8 flag; /* 0 = unwatch, 1 = watch */ 381 } __attribute__ ((packed)) watch; 382}; 383 __le32 payload_len; 384} __attribute__ ((packed)); 385 386/* 387 * osd request message header. each request may include multiple 388 * ceph_osd_op object operations. 
389 */ 390struct ceph_osd_request_head { 391 __le32 client_inc; /* client incarnation */ 392 struct ceph_object_layout layout; /* pgid */ 393 __le32 osdmap_epoch; /* client's osdmap epoch */ 394 395 __le32 flags; 396 397 struct ceph_timespec mtime; /* for mutations only */ 398 struct ceph_eversion reassert_version; /* if we are replaying op */ 399 400 __le32 object_len; /* length of object name */ 401 402 __le64 snapid; /* snapid to read */ 403 __le64 snap_seq; /* writer's snap context */ 404 __le32 num_snaps; 405 406 __le16 num_ops; 407 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ 408} __attribute__ ((packed)); 409 410struct ceph_osd_reply_head { 411 __le32 client_inc; /* client incarnation */ 412 __le32 flags; 413 struct ceph_object_layout layout; 414 __le32 osdmap_epoch; 415 struct ceph_eversion reassert_version; /* for replaying uncommitted */ 416 417 __le32 result; /* result code */ 418 419 __le32 object_len; /* length of object name */ 420 __le32 num_ops; 421 struct ceph_osd_op ops[0]; /* ops[], object */ 422} __attribute__ ((packed)); 423 424 425 426#endif