Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull ceph updates from Sage Weil:
"The bulk of this pull is a series from Alex that refactors and cleans
up the RBD code to lay the groundwork for supporting the new image
format and evolving feature set. There are also some cleanups in
libceph, and for ceph there's fixed validation of file striping
layouts and a bugfix in the code handling a shrinking MDS cluster."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
ceph: avoid 32-bit page index overflow
ceph: return EIO on invalid layout on GET_DATALOC ioctl
rbd: BUG on invalid layout
ceph: propagate layout error on osd request creation
libceph: check for invalid mapping
ceph: convert to use le32_add_cpu()
ceph: Fix oops when handling mdsmap that decreases max_mds
rbd: update remaining header fields for v2
rbd: get snapshot name for a v2 image
rbd: get the snapshot context for a v2 image
rbd: get image features for a v2 image
rbd: get the object prefix for a v2 rbd image
rbd: add code to get the size of a v2 rbd image
rbd: lay out header probe infrastructure
rbd: encapsulate code that gets snapshot info
rbd: add an rbd features field
rbd: don't use index in __rbd_add_snap_dev()
rbd: kill create_snap sysfs entry
rbd: define rbd_dev_image_id()
rbd: define some new format constants
...

+1322 -712
+12 -6
Documentation/ABI/testing/sysfs-bus-rbd
··· 25 25 26 26 The ceph unique client id that was assigned for this specific session. 27 27 28 + features 29 + 30 + A hexadecimal encoding of the feature bits for this image. 31 + 28 32 major 29 33 30 34 The block device major number. ··· 36 32 name 37 33 38 34 The name of the rbd image. 35 + 36 + image_id 37 + 38 + The unique id for the rbd image. (For rbd image format 1 39 + this is empty.) 39 40 40 41 pool 41 42 ··· 66 57 67 58 The current snapshot for which the device is mapped. 68 59 69 - create_snap 70 - 71 - Create a snapshot: 72 - 73 - $ echo <snap-name> > /sys/bus/rbd/devices/<dev-id>/snap_create 74 - 75 60 snap_* 76 61 77 62 A directory per each snapshot ··· 82 79 83 80 The size of the image when this snapshot was taken. 84 81 82 + snap_features 83 + 84 + A hexadecimal encoding of the feature bits for this snapshot. 85 85
+1189 -641
drivers/block/rbd.c
··· 41 41 42 42 #include "rbd_types.h" 43 43 44 + #define RBD_DEBUG /* Activate rbd_assert() calls */ 45 + 44 46 /* 45 47 * The basic unit of block I/O is a sector. It is interpreted in a 46 48 * number of contexts in Linux (blk, bio, genhd), but the default is ··· 52 50 #define SECTOR_SHIFT 9 53 51 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54 52 53 + /* It might be useful to have this defined elsewhere too */ 54 + 55 + #define U64_MAX ((u64) (~0ULL)) 56 + 55 57 #define RBD_DRV_NAME "rbd" 56 58 #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57 59 58 60 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59 61 60 62 #define RBD_MAX_SNAP_NAME_LEN 32 63 + #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 61 64 #define RBD_MAX_OPT_LEN 1024 62 65 63 66 #define RBD_SNAP_HEAD_NAME "-" 67 + 68 + #define RBD_IMAGE_ID_LEN_MAX 64 69 + #define RBD_OBJ_PREFIX_LEN_MAX 64 64 70 65 71 /* 66 72 * An RBD device name will be "rbd#", where the "rbd" comes from ··· 79 69 #define DEV_NAME_LEN 32 80 70 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 81 71 82 - #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 72 + #define RBD_READ_ONLY_DEFAULT false 83 73 84 74 /* 85 75 * block device image metadata (in-memory version) 86 76 */ 87 77 struct rbd_image_header { 88 - u64 image_size; 78 + /* These four fields never change for a given rbd image */ 89 79 char *object_prefix; 80 + u64 features; 90 81 __u8 obj_order; 91 82 __u8 crypt_type; 92 83 __u8 comp_type; 93 - struct ceph_snap_context *snapc; 94 - size_t snap_names_len; 95 - u32 total_snaps; 96 84 85 + /* The remaining fields need to be updated occasionally */ 86 + u64 image_size; 87 + struct ceph_snap_context *snapc; 97 88 char *snap_names; 98 89 u64 *snap_sizes; 99 90 ··· 102 91 }; 103 92 104 93 struct rbd_options { 105 - int notify_timeout; 94 + bool read_only; 106 95 }; 107 96 108 97 /* ··· 110 99 */ 111 100 struct rbd_client { 112 101 struct ceph_client *client; 113 - struct rbd_options *rbd_opts; 114 
102 struct kref kref; 115 103 struct list_head node; 116 104 }; ··· 151 141 u64 size; 152 142 struct list_head node; 153 143 u64 id; 144 + u64 features; 145 + }; 146 + 147 + struct rbd_mapping { 148 + char *snap_name; 149 + u64 snap_id; 150 + u64 size; 151 + u64 features; 152 + bool snap_exists; 153 + bool read_only; 154 154 }; 155 155 156 156 /* ··· 171 151 172 152 int major; /* blkdev assigned major */ 173 153 struct gendisk *disk; /* blkdev's gendisk and rq */ 174 - struct request_queue *q; 175 154 155 + u32 image_format; /* Either 1 or 2 */ 156 + struct rbd_options rbd_opts; 176 157 struct rbd_client *rbd_client; 177 158 178 159 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ ··· 181 160 spinlock_t lock; /* queue lock */ 182 161 183 162 struct rbd_image_header header; 163 + char *image_id; 164 + size_t image_id_len; 184 165 char *image_name; 185 166 size_t image_name_len; 186 167 char *header_name; ··· 194 171 195 172 /* protects updating the header */ 196 173 struct rw_semaphore header_rwsem; 197 - /* name of the snapshot this device reads from */ 198 - char *snap_name; 199 - /* id of the snapshot this device reads from */ 200 - u64 snap_id; /* current snapshot id */ 201 - /* whether the snap_id this device reads from still exists */ 202 - bool snap_exists; 203 - int read_only; 174 + 175 + struct rbd_mapping mapping; 204 176 205 177 struct list_head node; 206 178 ··· 214 196 static LIST_HEAD(rbd_client_list); /* clients */ 215 197 static DEFINE_SPINLOCK(rbd_client_list_lock); 216 198 217 - static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 199 + static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 200 + static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 201 + 218 202 static void rbd_dev_release(struct device *dev); 219 - static ssize_t rbd_snap_add(struct device *dev, 220 - struct device_attribute *attr, 221 - const char *buf, 222 - size_t count); 223 203 static void __rbd_remove_snap_dev(struct rbd_snap *snap); 224 204 
225 205 static ssize_t rbd_add(struct bus_type *bus, const char *buf, ··· 245 229 .release = rbd_root_dev_release, 246 230 }; 247 231 232 + #ifdef RBD_DEBUG 233 + #define rbd_assert(expr) \ 234 + if (unlikely(!(expr))) { \ 235 + printk(KERN_ERR "\nAssertion failure in %s() " \ 236 + "at line %d:\n\n" \ 237 + "\trbd_assert(%s);\n\n", \ 238 + __func__, __LINE__, #expr); \ 239 + BUG(); \ 240 + } 241 + #else /* !RBD_DEBUG */ 242 + # define rbd_assert(expr) ((void) 0) 243 + #endif /* !RBD_DEBUG */ 248 244 249 245 static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 250 246 { ··· 274 246 { 275 247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 276 248 277 - if ((mode & FMODE_WRITE) && rbd_dev->read_only) 249 + if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 278 250 return -EROFS; 279 251 280 252 rbd_get_dev(rbd_dev); 281 - set_device_ro(bdev, rbd_dev->read_only); 253 + set_device_ro(bdev, rbd_dev->mapping.read_only); 282 254 283 255 return 0; 284 256 } ··· 302 274 * Initialize an rbd client instance. 303 275 * We own *ceph_opts. 304 276 */ 305 - static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, 306 - struct rbd_options *rbd_opts) 277 + static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 307 278 { 308 279 struct rbd_client *rbdc; 309 280 int ret = -ENOMEM; ··· 326 299 if (ret < 0) 327 300 goto out_err; 328 301 329 - rbdc->rbd_opts = rbd_opts; 330 - 331 302 spin_lock(&rbd_client_list_lock); 332 303 list_add_tail(&rbdc->node, &rbd_client_list); 333 304 spin_unlock(&rbd_client_list_lock); ··· 347 322 } 348 323 349 324 /* 350 - * Find a ceph client with specific addr and configuration. 325 + * Find a ceph client with specific addr and configuration. If 326 + * found, bump its reference count. 
351 327 */ 352 - static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) 328 + static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 353 329 { 354 330 struct rbd_client *client_node; 331 + bool found = false; 355 332 356 333 if (ceph_opts->flags & CEPH_OPT_NOSHARE) 357 334 return NULL; 358 335 359 - list_for_each_entry(client_node, &rbd_client_list, node) 360 - if (!ceph_compare_options(ceph_opts, client_node->client)) 361 - return client_node; 362 - return NULL; 336 + spin_lock(&rbd_client_list_lock); 337 + list_for_each_entry(client_node, &rbd_client_list, node) { 338 + if (!ceph_compare_options(ceph_opts, client_node->client)) { 339 + kref_get(&client_node->kref); 340 + found = true; 341 + break; 342 + } 343 + } 344 + spin_unlock(&rbd_client_list_lock); 345 + 346 + return found ? client_node : NULL; 363 347 } 364 348 365 349 /* 366 350 * mount options 367 351 */ 368 352 enum { 369 - Opt_notify_timeout, 370 353 Opt_last_int, 371 354 /* int args above */ 372 355 Opt_last_string, 373 356 /* string args above */ 357 + Opt_read_only, 358 + Opt_read_write, 359 + /* Boolean args above */ 360 + Opt_last_bool, 374 361 }; 375 362 376 363 static match_table_t rbd_opts_tokens = { 377 - {Opt_notify_timeout, "notify_timeout=%d"}, 378 364 /* int args above */ 379 365 /* string args above */ 366 + {Opt_read_only, "read_only"}, 367 + {Opt_read_only, "ro"}, /* Alternate spelling */ 368 + {Opt_read_write, "read_write"}, 369 + {Opt_read_write, "rw"}, /* Alternate spelling */ 370 + /* Boolean args above */ 380 371 {-1, NULL} 381 372 }; 382 373 ··· 417 376 } else if (token > Opt_last_int && token < Opt_last_string) { 418 377 dout("got string token %d val %s\n", token, 419 378 argstr[0].from); 379 + } else if (token > Opt_last_string && token < Opt_last_bool) { 380 + dout("got Boolean token %d\n", token); 420 381 } else { 421 382 dout("got token %d\n", token); 422 383 } 423 384 424 385 switch (token) { 425 - case Opt_notify_timeout: 426 
- rbd_opts->notify_timeout = intval; 386 + case Opt_read_only: 387 + rbd_opts->read_only = true; 388 + break; 389 + case Opt_read_write: 390 + rbd_opts->read_only = false; 427 391 break; 428 392 default: 429 - BUG_ON(token); 393 + rbd_assert(false); 394 + break; 430 395 } 431 396 return 0; 432 397 } ··· 441 394 * Get a ceph client with specific addr and configuration, if one does 442 395 * not exist create it. 443 396 */ 444 - static struct rbd_client *rbd_get_client(const char *mon_addr, 445 - size_t mon_addr_len, 446 - char *options) 397 + static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 398 + size_t mon_addr_len, char *options) 447 399 { 448 - struct rbd_client *rbdc; 400 + struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; 449 401 struct ceph_options *ceph_opts; 450 - struct rbd_options *rbd_opts; 402 + struct rbd_client *rbdc; 451 403 452 - rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); 453 - if (!rbd_opts) 454 - return ERR_PTR(-ENOMEM); 455 - 456 - rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 404 + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 457 405 458 406 ceph_opts = ceph_parse_options(options, mon_addr, 459 407 mon_addr + mon_addr_len, 460 408 parse_rbd_opts_token, rbd_opts); 461 - if (IS_ERR(ceph_opts)) { 462 - kfree(rbd_opts); 463 - return ERR_CAST(ceph_opts); 464 - } 409 + if (IS_ERR(ceph_opts)) 410 + return PTR_ERR(ceph_opts); 465 411 466 - spin_lock(&rbd_client_list_lock); 467 - rbdc = __rbd_client_find(ceph_opts); 412 + rbdc = rbd_client_find(ceph_opts); 468 413 if (rbdc) { 469 414 /* using an existing client */ 470 - kref_get(&rbdc->kref); 471 - spin_unlock(&rbd_client_list_lock); 472 - 473 415 ceph_destroy_options(ceph_opts); 474 - kfree(rbd_opts); 475 - 476 - return rbdc; 416 + } else { 417 + rbdc = rbd_client_create(ceph_opts); 418 + if (IS_ERR(rbdc)) 419 + return PTR_ERR(rbdc); 477 420 } 478 - spin_unlock(&rbd_client_list_lock); 421 + rbd_dev->rbd_client = rbdc; 479 422 480 - rbdc = 
rbd_client_create(ceph_opts, rbd_opts); 481 - 482 - if (IS_ERR(rbdc)) 483 - kfree(rbd_opts); 484 - 485 - return rbdc; 423 + return 0; 486 424 } 487 425 488 426 /* ··· 485 453 spin_unlock(&rbd_client_list_lock); 486 454 487 455 ceph_destroy_client(rbdc->client); 488 - kfree(rbdc->rbd_opts); 489 456 kfree(rbdc); 490 457 } 491 458 ··· 510 479 kfree(coll); 511 480 } 512 481 482 + static bool rbd_image_format_valid(u32 image_format) 483 + { 484 + return image_format == 1 || image_format == 2; 485 + } 486 + 513 487 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 514 488 { 515 - return !memcmp(&ondisk->text, 516 - RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); 489 + size_t size; 490 + u32 snap_count; 491 + 492 + /* The header has to start with the magic rbd header text */ 493 + if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 494 + return false; 495 + 496 + /* 497 + * The size of a snapshot header has to fit in a size_t, and 498 + * that limits the number of snapshots. 499 + */ 500 + snap_count = le32_to_cpu(ondisk->snap_count); 501 + size = SIZE_MAX - sizeof (struct ceph_snap_context); 502 + if (snap_count > size / sizeof (__le64)) 503 + return false; 504 + 505 + /* 506 + * Not only that, but the size of the entire the snapshot 507 + * header must also be representable in a size_t. 508 + */ 509 + size -= snap_count * sizeof (__le64); 510 + if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 511 + return false; 512 + 513 + return true; 517 514 } 518 515 519 516 /* ··· 549 490 * header. 
550 491 */ 551 492 static int rbd_header_from_disk(struct rbd_image_header *header, 552 - struct rbd_image_header_ondisk *ondisk, 553 - u32 allocated_snaps) 493 + struct rbd_image_header_ondisk *ondisk) 554 494 { 555 495 u32 snap_count; 496 + size_t len; 497 + size_t size; 498 + u32 i; 556 499 557 - if (!rbd_dev_ondisk_valid(ondisk)) 558 - return -ENXIO; 500 + memset(header, 0, sizeof (*header)); 559 501 560 502 snap_count = le32_to_cpu(ondisk->snap_count); 561 - if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) 562 - / sizeof (u64)) 563 - return -EINVAL; 564 - header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 565 - snap_count * sizeof(u64), 566 - GFP_KERNEL); 567 - if (!header->snapc) 503 + 504 + len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 505 + header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 506 + if (!header->object_prefix) 568 507 return -ENOMEM; 508 + memcpy(header->object_prefix, ondisk->object_prefix, len); 509 + header->object_prefix[len] = '\0'; 569 510 570 511 if (snap_count) { 571 - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 572 - header->snap_names = kmalloc(header->snap_names_len, 573 - GFP_KERNEL); 512 + u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 513 + 514 + /* Save a copy of the snapshot names */ 515 + 516 + if (snap_names_len > (u64) SIZE_MAX) 517 + return -EIO; 518 + header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 574 519 if (!header->snap_names) 575 - goto err_snapc; 576 - header->snap_sizes = kmalloc(snap_count * sizeof(u64), 577 - GFP_KERNEL); 520 + goto out_err; 521 + /* 522 + * Note that rbd_dev_v1_header_read() guarantees 523 + * the ondisk buffer we're working with has 524 + * snap_names_len bytes beyond the end of the 525 + * snapshot id array, this memcpy() is safe. 
526 + */ 527 + memcpy(header->snap_names, &ondisk->snaps[snap_count], 528 + snap_names_len); 529 + 530 + /* Record each snapshot's size */ 531 + 532 + size = snap_count * sizeof (*header->snap_sizes); 533 + header->snap_sizes = kmalloc(size, GFP_KERNEL); 578 534 if (!header->snap_sizes) 579 - goto err_names; 535 + goto out_err; 536 + for (i = 0; i < snap_count; i++) 537 + header->snap_sizes[i] = 538 + le64_to_cpu(ondisk->snaps[i].image_size); 580 539 } else { 581 540 WARN_ON(ondisk->snap_names_len); 582 - header->snap_names_len = 0; 583 541 header->snap_names = NULL; 584 542 header->snap_sizes = NULL; 585 543 } 586 544 587 - header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, 588 - GFP_KERNEL); 589 - if (!header->object_prefix) 590 - goto err_sizes; 591 - 592 - memcpy(header->object_prefix, ondisk->block_name, 593 - sizeof(ondisk->block_name)); 594 - header->object_prefix[sizeof (ondisk->block_name)] = '\0'; 595 - 596 - header->image_size = le64_to_cpu(ondisk->image_size); 545 + header->features = 0; /* No features support in v1 images */ 597 546 header->obj_order = ondisk->options.order; 598 547 header->crypt_type = ondisk->options.crypt_type; 599 548 header->comp_type = ondisk->options.comp_type; 600 549 550 + /* Allocate and fill in the snapshot context */ 551 + 552 + header->image_size = le64_to_cpu(ondisk->image_size); 553 + size = sizeof (struct ceph_snap_context); 554 + size += snap_count * sizeof (header->snapc->snaps[0]); 555 + header->snapc = kzalloc(size, GFP_KERNEL); 556 + if (!header->snapc) 557 + goto out_err; 558 + 601 559 atomic_set(&header->snapc->nref, 1); 602 560 header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 603 561 header->snapc->num_snaps = snap_count; 604 - header->total_snaps = snap_count; 605 - 606 - if (snap_count && allocated_snaps == snap_count) { 607 - int i; 608 - 609 - for (i = 0; i < snap_count; i++) { 610 - header->snapc->snaps[i] = 611 - le64_to_cpu(ondisk->snaps[i].id); 612 - header->snap_sizes[i] = 613 - 
le64_to_cpu(ondisk->snaps[i].image_size); 614 - } 615 - 616 - /* copy snapshot names */ 617 - memcpy(header->snap_names, &ondisk->snaps[snap_count], 618 - header->snap_names_len); 619 - } 562 + for (i = 0; i < snap_count; i++) 563 + header->snapc->snaps[i] = 564 + le64_to_cpu(ondisk->snaps[i].id); 620 565 621 566 return 0; 622 567 623 - err_sizes: 568 + out_err: 624 569 kfree(header->snap_sizes); 625 570 header->snap_sizes = NULL; 626 - err_names: 627 571 kfree(header->snap_names); 628 572 header->snap_names = NULL; 629 - err_snapc: 630 - kfree(header->snapc); 631 - header->snapc = NULL; 573 + kfree(header->object_prefix); 574 + header->object_prefix = NULL; 632 575 633 576 return -ENOMEM; 634 577 } 635 578 636 - static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 637 - u64 *seq, u64 *size) 579 + static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 638 580 { 639 - int i; 640 - char *p = header->snap_names; 641 581 642 - for (i = 0; i < header->total_snaps; i++) { 643 - if (!strcmp(snap_name, p)) { 582 + struct rbd_snap *snap; 644 583 645 - /* Found it. 
Pass back its id and/or size */ 584 + list_for_each_entry(snap, &rbd_dev->snaps, node) { 585 + if (!strcmp(snap_name, snap->name)) { 586 + rbd_dev->mapping.snap_id = snap->id; 587 + rbd_dev->mapping.size = snap->size; 588 + rbd_dev->mapping.features = snap->features; 646 589 647 - if (seq) 648 - *seq = header->snapc->snaps[i]; 649 - if (size) 650 - *size = header->snap_sizes[i]; 651 - return i; 590 + return 0; 652 591 } 653 - p += strlen(p) + 1; /* Skip ahead to the next name */ 654 592 } 593 + 655 594 return -ENOENT; 656 595 } 657 596 658 - static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) 597 + static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) 659 598 { 660 599 int ret; 661 600 662 - down_write(&rbd_dev->header_rwsem); 663 - 664 - if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 601 + if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, 665 602 sizeof (RBD_SNAP_HEAD_NAME))) { 666 - rbd_dev->snap_id = CEPH_NOSNAP; 667 - rbd_dev->snap_exists = false; 668 - rbd_dev->read_only = 0; 669 - if (size) 670 - *size = rbd_dev->header.image_size; 603 + rbd_dev->mapping.snap_id = CEPH_NOSNAP; 604 + rbd_dev->mapping.size = rbd_dev->header.image_size; 605 + rbd_dev->mapping.features = rbd_dev->header.features; 606 + rbd_dev->mapping.snap_exists = false; 607 + rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; 608 + ret = 0; 671 609 } else { 672 - u64 snap_id = 0; 673 - 674 - ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, 675 - &snap_id, size); 610 + ret = snap_by_name(rbd_dev, snap_name); 676 611 if (ret < 0) 677 612 goto done; 678 - rbd_dev->snap_id = snap_id; 679 - rbd_dev->snap_exists = true; 680 - rbd_dev->read_only = 1; 613 + rbd_dev->mapping.snap_exists = true; 614 + rbd_dev->mapping.read_only = true; 681 615 } 682 - 683 - ret = 0; 616 + rbd_dev->mapping.snap_name = snap_name; 684 617 done: 685 - up_write(&rbd_dev->header_rwsem); 686 618 return ret; 687 619 } 688 620 689 621 static void rbd_header_free(struct 
rbd_image_header *header) 690 622 { 691 623 kfree(header->object_prefix); 624 + header->object_prefix = NULL; 692 625 kfree(header->snap_sizes); 626 + header->snap_sizes = NULL; 693 627 kfree(header->snap_names); 628 + header->snap_names = NULL; 694 629 ceph_put_snap_context(header->snapc); 630 + header->snapc = NULL; 695 631 } 696 632 697 - /* 698 - * get the actual striped segment name, offset and length 699 - */ 700 - static u64 rbd_get_segment(struct rbd_image_header *header, 701 - const char *object_prefix, 702 - u64 ofs, u64 len, 703 - char *seg_name, u64 *segofs) 633 + static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 704 634 { 705 - u64 seg = ofs >> header->obj_order; 635 + char *name; 636 + u64 segment; 637 + int ret; 706 638 707 - if (seg_name) 708 - snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 709 - "%s.%012llx", object_prefix, seg); 639 + name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 640 + if (!name) 641 + return NULL; 642 + segment = offset >> rbd_dev->header.obj_order; 643 + ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", 644 + rbd_dev->header.object_prefix, segment); 645 + if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { 646 + pr_err("error formatting segment name for #%llu (%d)\n", 647 + segment, ret); 648 + kfree(name); 649 + name = NULL; 650 + } 710 651 711 - ofs = ofs & ((1 << header->obj_order) - 1); 712 - len = min_t(u64, len, (1 << header->obj_order) - ofs); 652 + return name; 653 + } 713 654 714 - if (segofs) 715 - *segofs = ofs; 655 + static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 656 + { 657 + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 716 658 717 - return len; 659 + return offset & (segment_size - 1); 660 + } 661 + 662 + static u64 rbd_segment_length(struct rbd_device *rbd_dev, 663 + u64 offset, u64 length) 664 + { 665 + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 666 + 667 + offset &= segment_size - 1; 668 + 669 + rbd_assert(length <= U64_MAX - offset); 670 + if 
(offset + length > segment_size) 671 + length = segment_size - offset; 672 + 673 + return length; 718 674 } 719 675 720 676 static int rbd_get_num_segments(struct rbd_image_header *header, 721 677 u64 ofs, u64 len) 722 678 { 723 - u64 start_seg = ofs >> header->obj_order; 724 - u64 end_seg = (ofs + len - 1) >> header->obj_order; 679 + u64 start_seg; 680 + u64 end_seg; 681 + 682 + if (!len) 683 + return 0; 684 + if (len - 1 > U64_MAX - ofs) 685 + return -ERANGE; 686 + 687 + start_seg = ofs >> header->obj_order; 688 + end_seg = (ofs + len - 1) >> header->obj_order; 689 + 725 690 return end_seg - start_seg + 1; 726 691 } 727 692 ··· 807 724 struct bio_pair **bp, 808 725 int len, gfp_t gfpmask) 809 726 { 810 - struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 727 + struct bio *old_chain = *old; 728 + struct bio *new_chain = NULL; 729 + struct bio *tail; 811 730 int total = 0; 812 731 813 732 if (*bp) { ··· 818 733 } 819 734 820 735 while (old_chain && (total < len)) { 736 + struct bio *tmp; 737 + 821 738 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 822 739 if (!tmp) 823 740 goto err_out; 741 + gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ 824 742 825 743 if (total + old_chain->bi_size > len) { 826 744 struct bio_pair *bp; ··· 851 763 } 852 764 853 765 tmp->bi_bdev = NULL; 854 - gfpmask &= ~__GFP_WAIT; 855 766 tmp->bi_next = NULL; 856 - 857 - if (!new_chain) { 858 - new_chain = tail = tmp; 859 - } else { 767 + if (new_chain) 860 768 tail->bi_next = tmp; 861 - tail = tmp; 862 - } 769 + else 770 + new_chain = tmp; 771 + tail = tmp; 863 772 old_chain = old_chain->bi_next; 864 773 865 774 total += tmp->bi_size; 866 775 } 867 776 868 - BUG_ON(total < len); 869 - 870 - if (tail) 871 - tail->bi_next = NULL; 777 + rbd_assert(total == len); 872 778 873 779 *old = old_chain; 874 780 ··· 1020 938 layout->fl_stripe_count = cpu_to_le32(1); 1021 939 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1022 940 layout->fl_pg_pool = 
cpu_to_le32(rbd_dev->pool_id); 1023 - ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 1024 - req, ops); 941 + ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 942 + req, ops); 943 + rbd_assert(ret == 0); 1025 944 1026 945 ceph_osdc_build_request(req, ofs, &len, 1027 946 ops, ··· 1113 1030 int flags, 1114 1031 struct ceph_osd_req_op *ops, 1115 1032 const char *object_name, 1116 - u64 ofs, u64 len, 1117 - char *buf, 1033 + u64 ofs, u64 inbound_size, 1034 + char *inbound, 1118 1035 struct ceph_osd_request **linger_req, 1119 1036 u64 *ver) 1120 1037 { ··· 1122 1039 struct page **pages; 1123 1040 int num_pages; 1124 1041 1125 - BUG_ON(ops == NULL); 1042 + rbd_assert(ops != NULL); 1126 1043 1127 - num_pages = calc_pages_for(ofs , len); 1044 + num_pages = calc_pages_for(ofs, inbound_size); 1128 1045 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1129 1046 if (IS_ERR(pages)) 1130 1047 return PTR_ERR(pages); 1131 1048 1132 1049 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1133 - object_name, ofs, len, NULL, 1050 + object_name, ofs, inbound_size, NULL, 1134 1051 pages, num_pages, 1135 1052 flags, 1136 1053 ops, ··· 1140 1057 if (ret < 0) 1141 1058 goto done; 1142 1059 1143 - if ((flags & CEPH_OSD_FLAG_READ) && buf) 1144 - ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1060 + if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1061 + ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1145 1062 1146 1063 done: 1147 1064 ceph_release_page_vector(pages, num_pages); ··· 1168 1085 struct ceph_osd_req_op *ops; 1169 1086 u32 payload_len; 1170 1087 1171 - seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1088 + seg_name = rbd_segment_name(rbd_dev, ofs); 1172 1089 if (!seg_name) 1173 1090 return -ENOMEM; 1174 - 1175 - seg_len = rbd_get_segment(&rbd_dev->header, 1176 - rbd_dev->header.object_prefix, 1177 - ofs, len, 1178 - seg_name, &seg_ofs); 1091 + seg_len = rbd_segment_length(rbd_dev, ofs, len); 1092 + seg_ofs = 
rbd_segment_offset(rbd_dev, ofs); 1179 1093 1180 1094 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1181 1095 ··· 1184 1104 /* we've taken care of segment sizes earlier when we 1185 1105 cloned the bios. We should never have a segment 1186 1106 truncated at this point */ 1187 - BUG_ON(seg_len < len); 1107 + rbd_assert(seg_len == len); 1188 1108 1189 1109 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1190 1110 seg_name, seg_ofs, seg_len, ··· 1386 1306 return ret; 1387 1307 } 1388 1308 1389 - struct rbd_notify_info { 1390 - struct rbd_device *rbd_dev; 1391 - }; 1392 - 1393 - static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 1394 - { 1395 - struct rbd_device *rbd_dev = (struct rbd_device *)data; 1396 - if (!rbd_dev) 1397 - return; 1398 - 1399 - dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", 1400 - rbd_dev->header_name, (unsigned long long) notify_id, 1401 - (unsigned int) opcode); 1402 - } 1403 - 1404 1309 /* 1405 - * Request sync osd notify 1406 - */ 1407 - static int rbd_req_sync_notify(struct rbd_device *rbd_dev) 1408 - { 1409 - struct ceph_osd_req_op *ops; 1410 - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1411 - struct ceph_osd_event *event; 1412 - struct rbd_notify_info info; 1413 - int payload_len = sizeof(u32) + sizeof(u32); 1414 - int ret; 1415 - 1416 - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); 1417 - if (!ops) 1418 - return -ENOMEM; 1419 - 1420 - info.rbd_dev = rbd_dev; 1421 - 1422 - ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, 1423 - (void *)&info, &event); 1424 - if (ret < 0) 1425 - goto fail; 1426 - 1427 - ops[0].watch.ver = 1; 1428 - ops[0].watch.flag = 1; 1429 - ops[0].watch.cookie = event->cookie; 1430 - ops[0].watch.prot_ver = RADOS_NOTIFY_VER; 1431 - ops[0].watch.timeout = 12; 1432 - 1433 - ret = rbd_req_sync_op(rbd_dev, NULL, 1434 - CEPH_NOSNAP, 1435 - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1436 - ops, 1437 - rbd_dev->header_name, 1438 - 0, 0, NULL, 
NULL, NULL); 1439 - if (ret < 0) 1440 - goto fail_event; 1441 - 1442 - ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); 1443 - dout("ceph_osdc_wait_event returned %d\n", ret); 1444 - rbd_destroy_ops(ops); 1445 - return 0; 1446 - 1447 - fail_event: 1448 - ceph_osdc_cancel_event(event); 1449 - fail: 1450 - rbd_destroy_ops(ops); 1451 - return ret; 1452 - } 1453 - 1454 - /* 1455 - * Request sync osd read 1310 + * Synchronous osd object method call 1456 1311 */ 1457 1312 static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1458 1313 const char *object_name, 1459 1314 const char *class_name, 1460 1315 const char *method_name, 1461 - const char *data, 1462 - int len, 1316 + const char *outbound, 1317 + size_t outbound_size, 1318 + char *inbound, 1319 + size_t inbound_size, 1320 + int flags, 1463 1321 u64 *ver) 1464 1322 { 1465 1323 struct ceph_osd_req_op *ops; 1466 1324 int class_name_len = strlen(class_name); 1467 1325 int method_name_len = strlen(method_name); 1326 + int payload_size; 1468 1327 int ret; 1469 1328 1470 - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, 1471 - class_name_len + method_name_len + len); 1329 + /* 1330 + * Any input parameters required by the method we're calling 1331 + * will be sent along with the class and method names as 1332 + * part of the message payload. That data and its size are 1333 + * supplied via the indata and indata_len fields (named from 1334 + * the perspective of the server side) in the OSD request 1335 + * operation. 
1336 + */ 1337 + payload_size = class_name_len + method_name_len + outbound_size; 1338 + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 1472 1339 if (!ops) 1473 1340 return -ENOMEM; 1474 1341 ··· 1424 1397 ops[0].cls.method_name = method_name; 1425 1398 ops[0].cls.method_len = (__u8) method_name_len; 1426 1399 ops[0].cls.argc = 0; 1427 - ops[0].cls.indata = data; 1428 - ops[0].cls.indata_len = len; 1400 + ops[0].cls.indata = outbound; 1401 + ops[0].cls.indata_len = outbound_size; 1429 1402 1430 1403 ret = rbd_req_sync_op(rbd_dev, NULL, 1431 1404 CEPH_NOSNAP, 1432 - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1433 - ops, 1434 - object_name, 0, 0, NULL, NULL, ver); 1405 + flags, ops, 1406 + object_name, 0, inbound_size, inbound, 1407 + NULL, ver); 1435 1408 1436 1409 rbd_destroy_ops(ops); 1437 1410 ··· 1473 1446 struct rbd_req_coll *coll; 1474 1447 struct ceph_snap_context *snapc; 1475 1448 1476 - /* peek at request from block layer */ 1477 - if (!rq) 1478 - break; 1479 - 1480 1449 dout("fetched request\n"); 1481 1450 1482 1451 /* filter out block requests we don't understand */ ··· 1487 1464 size = blk_rq_bytes(rq); 1488 1465 ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1489 1466 rq_bio = rq->bio; 1490 - if (do_write && rbd_dev->read_only) { 1467 + if (do_write && rbd_dev->mapping.read_only) { 1491 1468 __blk_end_request_all(rq, -EROFS); 1492 1469 continue; 1493 1470 } ··· 1496 1473 1497 1474 down_read(&rbd_dev->header_rwsem); 1498 1475 1499 - if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { 1476 + if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && 1477 + !rbd_dev->mapping.snap_exists) { 1500 1478 up_read(&rbd_dev->header_rwsem); 1501 1479 dout("request for non-existent snapshot"); 1502 1480 spin_lock_irq(q->queue_lock); ··· 1514 1490 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1515 1491 1516 1492 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1493 + if (num_segs <= 0) { 1494 + spin_lock_irq(q->queue_lock); 1495 + 
__blk_end_request_all(rq, num_segs); 1496 + ceph_put_snap_context(snapc); 1497 + continue; 1498 + } 1517 1499 coll = rbd_alloc_coll(num_segs); 1518 1500 if (!coll) { 1519 1501 spin_lock_irq(q->queue_lock); ··· 1531 1501 do { 1532 1502 /* a bio clone to be passed down to OSD req */ 1533 1503 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1534 - op_size = rbd_get_segment(&rbd_dev->header, 1535 - rbd_dev->header.object_prefix, 1536 - ofs, size, 1537 - NULL, NULL); 1504 + op_size = rbd_segment_length(rbd_dev, ofs, size); 1538 1505 kref_get(&coll->kref); 1539 1506 bio = bio_chain_clone(&rq_bio, &next_bio, &bp, 1540 1507 op_size, GFP_ATOMIC); ··· 1551 1524 coll, cur_seg); 1552 1525 else 1553 1526 rbd_req_read(rq, rbd_dev, 1554 - rbd_dev->snap_id, 1527 + rbd_dev->mapping.snap_id, 1555 1528 ofs, 1556 1529 op_size, bio, 1557 1530 coll, cur_seg); ··· 1607 1580 if (!disk) 1608 1581 return; 1609 1582 1610 - rbd_header_free(&rbd_dev->header); 1611 - 1612 1583 if (disk->flags & GENHD_FL_UP) 1613 1584 del_gendisk(disk); 1614 1585 if (disk->queue) ··· 1615 1590 } 1616 1591 1617 1592 /* 1618 - * reload the ondisk the header 1593 + * Read the complete header for the given rbd device. 1594 + * 1595 + * Returns a pointer to a dynamically-allocated buffer containing 1596 + * the complete and validated header. Caller can pass the address 1597 + * of a variable that will be filled in with the version of the 1598 + * header object at the time it was read. 1599 + * 1600 + * Returns a pointer-coded errno if a failure occurs. 
1601 + */ 1602 + static struct rbd_image_header_ondisk * 1603 + rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 1604 + { 1605 + struct rbd_image_header_ondisk *ondisk = NULL; 1606 + u32 snap_count = 0; 1607 + u64 names_size = 0; 1608 + u32 want_count; 1609 + int ret; 1610 + 1611 + /* 1612 + * The complete header will include an array of its 64-bit 1613 + * snapshot ids, followed by the names of those snapshots as 1614 + * a contiguous block of NUL-terminated strings. Note that 1615 + * the number of snapshots could change by the time we read 1616 + * it in, in which case we re-read it. 1617 + */ 1618 + do { 1619 + size_t size; 1620 + 1621 + kfree(ondisk); 1622 + 1623 + size = sizeof (*ondisk); 1624 + size += snap_count * sizeof (struct rbd_image_snap_ondisk); 1625 + size += names_size; 1626 + ondisk = kmalloc(size, GFP_KERNEL); 1627 + if (!ondisk) 1628 + return ERR_PTR(-ENOMEM); 1629 + 1630 + ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 1631 + rbd_dev->header_name, 1632 + 0, size, 1633 + (char *) ondisk, version); 1634 + 1635 + if (ret < 0) 1636 + goto out_err; 1637 + if (WARN_ON((size_t) ret < size)) { 1638 + ret = -ENXIO; 1639 + pr_warning("short header read for image %s" 1640 + " (want %zd got %d)\n", 1641 + rbd_dev->image_name, size, ret); 1642 + goto out_err; 1643 + } 1644 + if (!rbd_dev_ondisk_valid(ondisk)) { 1645 + ret = -ENXIO; 1646 + pr_warning("invalid header for image %s\n", 1647 + rbd_dev->image_name); 1648 + goto out_err; 1649 + } 1650 + 1651 + names_size = le64_to_cpu(ondisk->snap_names_len); 1652 + want_count = snap_count; 1653 + snap_count = le32_to_cpu(ondisk->snap_count); 1654 + } while (snap_count != want_count); 1655 + 1656 + return ondisk; 1657 + 1658 + out_err: 1659 + kfree(ondisk); 1660 + 1661 + return ERR_PTR(ret); 1662 + } 1663 + 1664 + /* 1665 + * reload the ondisk the header 1619 1666 */ 1620 1667 static int rbd_read_header(struct rbd_device *rbd_dev, 1621 1668 struct rbd_image_header *header) 1622 1669 { 1623 - ssize_t 
rc; 1624 - struct rbd_image_header_ondisk *dh; 1625 - u32 snap_count = 0; 1626 - u64 ver; 1627 - size_t len; 1628 - 1629 - /* 1630 - * First reads the fixed-size header to determine the number 1631 - * of snapshots, then re-reads it, along with all snapshot 1632 - * records as well as their stored names. 1633 - */ 1634 - len = sizeof (*dh); 1635 - while (1) { 1636 - dh = kmalloc(len, GFP_KERNEL); 1637 - if (!dh) 1638 - return -ENOMEM; 1639 - 1640 - rc = rbd_req_sync_read(rbd_dev, 1641 - CEPH_NOSNAP, 1642 - rbd_dev->header_name, 1643 - 0, len, 1644 - (char *)dh, &ver); 1645 - if (rc < 0) 1646 - goto out_dh; 1647 - 1648 - rc = rbd_header_from_disk(header, dh, snap_count); 1649 - if (rc < 0) { 1650 - if (rc == -ENXIO) 1651 - pr_warning("unrecognized header format" 1652 - " for image %s\n", 1653 - rbd_dev->image_name); 1654 - goto out_dh; 1655 - } 1656 - 1657 - if (snap_count == header->total_snaps) 1658 - break; 1659 - 1660 - snap_count = header->total_snaps; 1661 - len = sizeof (*dh) + 1662 - snap_count * sizeof(struct rbd_image_snap_ondisk) + 1663 - header->snap_names_len; 1664 - 1665 - rbd_header_free(header); 1666 - kfree(dh); 1667 - } 1668 - header->obj_version = ver; 1669 - 1670 - out_dh: 1671 - kfree(dh); 1672 - return rc; 1673 - } 1674 - 1675 - /* 1676 - * create a snapshot 1677 - */ 1678 - static int rbd_header_add_snap(struct rbd_device *rbd_dev, 1679 - const char *snap_name, 1680 - gfp_t gfp_flags) 1681 - { 1682 - int name_len = strlen(snap_name); 1683 - u64 new_snapid; 1670 + struct rbd_image_header_ondisk *ondisk; 1671 + u64 ver = 0; 1684 1672 int ret; 1685 - void *data, *p, *e; 1686 - struct ceph_mon_client *monc; 1687 1673 1688 - /* we should create a snapshot only if we're pointing at the head */ 1689 - if (rbd_dev->snap_id != CEPH_NOSNAP) 1690 - return -EINVAL; 1674 + ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 1675 + if (IS_ERR(ondisk)) 1676 + return PTR_ERR(ondisk); 1677 + ret = rbd_header_from_disk(header, ondisk); 1678 + if (ret >= 0) 1679 + 
header->obj_version = ver; 1680 + kfree(ondisk); 1691 1681 1692 - monc = &rbd_dev->rbd_client->client->monc; 1693 - ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); 1694 - dout("created snapid=%llu\n", (unsigned long long) new_snapid); 1695 - if (ret < 0) 1696 - return ret; 1697 - 1698 - data = kmalloc(name_len + 16, gfp_flags); 1699 - if (!data) 1700 - return -ENOMEM; 1701 - 1702 - p = data; 1703 - e = data + name_len + 16; 1704 - 1705 - ceph_encode_string_safe(&p, e, snap_name, name_len, bad); 1706 - ceph_encode_64_safe(&p, e, new_snapid, bad); 1707 - 1708 - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 1709 - "rbd", "snap_add", 1710 - data, p - data, NULL); 1711 - 1712 - kfree(data); 1713 - 1714 - return ret < 0 ? ret : 0; 1715 - bad: 1716 - return -ERANGE; 1682 + return ret; 1717 1683 } 1718 1684 1719 1685 static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) ··· 1731 1715 down_write(&rbd_dev->header_rwsem); 1732 1716 1733 1717 /* resized? */ 1734 - if (rbd_dev->snap_id == CEPH_NOSNAP) { 1718 + if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { 1735 1719 sector_t size = (sector_t) h.image_size / SECTOR_SIZE; 1736 1720 1737 - dout("setting size to %llu sectors", (unsigned long long) size); 1738 - set_capacity(rbd_dev->disk, size); 1721 + if (size != (sector_t) rbd_dev->mapping.size) { 1722 + dout("setting size to %llu sectors", 1723 + (unsigned long long) size); 1724 + rbd_dev->mapping.size = (u64) size; 1725 + set_capacity(rbd_dev->disk, size); 1726 + } 1739 1727 } 1740 1728 1741 1729 /* rbd_dev->header.object_prefix shouldn't change */ ··· 1752 1732 *hver = h.obj_version; 1753 1733 rbd_dev->header.obj_version = h.obj_version; 1754 1734 rbd_dev->header.image_size = h.image_size; 1755 - rbd_dev->header.total_snaps = h.total_snaps; 1756 1735 rbd_dev->header.snapc = h.snapc; 1757 1736 rbd_dev->header.snap_names = h.snap_names; 1758 - rbd_dev->header.snap_names_len = h.snap_names_len; 1759 1737 rbd_dev->header.snap_sizes = 
h.snap_sizes; 1760 1738 /* Free the extra copy of the object prefix */ 1761 1739 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1762 1740 kfree(h.object_prefix); 1763 1741 1764 - ret = __rbd_init_snaps_header(rbd_dev); 1742 + ret = rbd_dev_snaps_update(rbd_dev); 1743 + if (!ret) 1744 + ret = rbd_dev_snaps_register(rbd_dev); 1765 1745 1766 1746 up_write(&rbd_dev->header_rwsem); 1767 1747 ··· 1783 1763 { 1784 1764 struct gendisk *disk; 1785 1765 struct request_queue *q; 1786 - int rc; 1787 1766 u64 segment_size; 1788 - u64 total_size = 0; 1789 - 1790 - /* contact OSD, request size info about the object being mapped */ 1791 - rc = rbd_read_header(rbd_dev, &rbd_dev->header); 1792 - if (rc) 1793 - return rc; 1794 - 1795 - /* no need to lock here, as rbd_dev is not registered yet */ 1796 - rc = __rbd_init_snaps_header(rbd_dev); 1797 - if (rc) 1798 - return rc; 1799 - 1800 - rc = rbd_header_set_snap(rbd_dev, &total_size); 1801 - if (rc) 1802 - return rc; 1803 1767 1804 1768 /* create gendisk info */ 1805 - rc = -ENOMEM; 1806 1769 disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1807 1770 if (!disk) 1808 - goto out; 1771 + return -ENOMEM; 1809 1772 1810 1773 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1811 1774 rbd_dev->dev_id); ··· 1798 1795 disk->private_data = rbd_dev; 1799 1796 1800 1797 /* init rq */ 1801 - rc = -ENOMEM; 1802 1798 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1803 1799 if (!q) 1804 1800 goto out_disk; ··· 1818 1816 q->queuedata = rbd_dev; 1819 1817 1820 1818 rbd_dev->disk = disk; 1821 - rbd_dev->q = q; 1822 1819 1823 - /* finally, announce the disk to the world */ 1824 - set_capacity(disk, total_size / SECTOR_SIZE); 1825 - add_disk(disk); 1820 + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 1826 1821 1827 - pr_info("%s: added with size 0x%llx\n", 1828 - disk->disk_name, (unsigned long long)total_size); 1829 1822 return 0; 1830 - 1831 1823 out_disk: 1832 1824 put_disk(disk); 1833 - out: 1834 - 
return rc; 1825 + 1826 + return -ENOMEM; 1835 1827 } 1836 1828 1837 1829 /* ··· 1848 1852 up_read(&rbd_dev->header_rwsem); 1849 1853 1850 1854 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1855 + } 1856 + 1857 + /* 1858 + * Note this shows the features for whatever's mapped, which is not 1859 + * necessarily the base image. 1860 + */ 1861 + static ssize_t rbd_features_show(struct device *dev, 1862 + struct device_attribute *attr, char *buf) 1863 + { 1864 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1865 + 1866 + return sprintf(buf, "0x%016llx\n", 1867 + (unsigned long long) rbd_dev->mapping.features); 1851 1868 } 1852 1869 1853 1870 static ssize_t rbd_major_show(struct device *dev, ··· 1904 1895 return sprintf(buf, "%s\n", rbd_dev->image_name); 1905 1896 } 1906 1897 1898 + static ssize_t rbd_image_id_show(struct device *dev, 1899 + struct device_attribute *attr, char *buf) 1900 + { 1901 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1902 + 1903 + return sprintf(buf, "%s\n", rbd_dev->image_id); 1904 + } 1905 + 1906 + /* 1907 + * Shows the name of the currently-mapped snapshot (or 1908 + * RBD_SNAP_HEAD_NAME for the base image). 
1909 + */ 1907 1910 static ssize_t rbd_snap_show(struct device *dev, 1908 1911 struct device_attribute *attr, 1909 1912 char *buf) 1910 1913 { 1911 1914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1912 1915 1913 - return sprintf(buf, "%s\n", rbd_dev->snap_name); 1916 + return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); 1914 1917 } 1915 1918 1916 1919 static ssize_t rbd_image_refresh(struct device *dev, ··· 1939 1918 } 1940 1919 1941 1920 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 1921 + static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 1942 1922 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 1943 1923 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 1944 1924 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 1945 1925 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 1946 1926 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 1927 + static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 1947 1928 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 1948 1929 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 1949 - static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); 1950 1930 1951 1931 static struct attribute *rbd_attrs[] = { 1952 1932 &dev_attr_size.attr, 1933 + &dev_attr_features.attr, 1953 1934 &dev_attr_major.attr, 1954 1935 &dev_attr_client_id.attr, 1955 1936 &dev_attr_pool.attr, 1956 1937 &dev_attr_pool_id.attr, 1957 1938 &dev_attr_name.attr, 1939 + &dev_attr_image_id.attr, 1958 1940 &dev_attr_current_snap.attr, 1959 1941 &dev_attr_refresh.attr, 1960 - &dev_attr_create_snap.attr, 1961 1942 NULL 1962 1943 }; 1963 1944 ··· 2005 1982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2006 1983 } 2007 1984 1985 + static ssize_t rbd_snap_features_show(struct device *dev, 1986 + struct device_attribute *attr, 1987 + char *buf) 1988 + { 1989 + struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 1990 + 1991 + 
return sprintf(buf, "0x%016llx\n", 1992 + (unsigned long long) snap->features); 1993 + } 1994 + 2008 1995 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2009 1996 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 1997 + static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2010 1998 2011 1999 static struct attribute *rbd_snap_attrs[] = { 2012 2000 &dev_attr_snap_size.attr, 2013 2001 &dev_attr_snap_id.attr, 2002 + &dev_attr_snap_features.attr, 2014 2003 NULL, 2015 2004 }; 2016 2005 ··· 2047 2012 .release = rbd_snap_dev_release, 2048 2013 }; 2049 2014 2015 + static bool rbd_snap_registered(struct rbd_snap *snap) 2016 + { 2017 + bool ret = snap->dev.type == &rbd_snap_device_type; 2018 + bool reg = device_is_registered(&snap->dev); 2019 + 2020 + rbd_assert(!ret ^ reg); 2021 + 2022 + return ret; 2023 + } 2024 + 2050 2025 static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2051 2026 { 2052 2027 list_del(&snap->node); 2053 - device_unregister(&snap->dev); 2028 + if (device_is_registered(&snap->dev)) 2029 + device_unregister(&snap->dev); 2054 2030 } 2055 2031 2056 2032 static int rbd_register_snap_dev(struct rbd_snap *snap, ··· 2074 2028 dev->parent = parent; 2075 2029 dev->release = rbd_snap_dev_release; 2076 2030 dev_set_name(dev, "snap_%s", snap->name); 2031 + dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2032 + 2077 2033 ret = device_register(dev); 2078 2034 2079 2035 return ret; 2080 2036 } 2081 2037 2082 2038 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2083 - int i, const char *name) 2039 + const char *snap_name, 2040 + u64 snap_id, u64 snap_size, 2041 + u64 snap_features) 2084 2042 { 2085 2043 struct rbd_snap *snap; 2086 2044 int ret; ··· 2094 2044 return ERR_PTR(-ENOMEM); 2095 2045 2096 2046 ret = -ENOMEM; 2097 - snap->name = kstrdup(name, GFP_KERNEL); 2047 + snap->name = kstrdup(snap_name, GFP_KERNEL); 2098 2048 if (!snap->name) 2099 2049 goto err; 2100 
2050 2101 - snap->size = rbd_dev->header.snap_sizes[i]; 2102 - snap->id = rbd_dev->header.snapc->snaps[i]; 2103 - if (device_is_registered(&rbd_dev->dev)) { 2104 - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2105 - if (ret < 0) 2106 - goto err; 2107 - } 2051 + snap->id = snap_id; 2052 + snap->size = snap_size; 2053 + snap->features = snap_features; 2108 2054 2109 2055 return snap; 2110 2056 ··· 2111 2065 return ERR_PTR(ret); 2112 2066 } 2113 2067 2114 - /* 2115 - * search for the previous snap in a null delimited string list 2116 - */ 2117 - const char *rbd_prev_snap_name(const char *name, const char *start) 2068 + static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2069 + u64 *snap_size, u64 *snap_features) 2118 2070 { 2119 - if (name < start + 2) 2120 - return NULL; 2071 + char *snap_name; 2121 2072 2122 - name -= 2; 2123 - while (*name) { 2124 - if (name == start) 2125 - return start; 2126 - name--; 2127 - } 2128 - return name + 1; 2073 + rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2074 + 2075 + *snap_size = rbd_dev->header.snap_sizes[which]; 2076 + *snap_features = 0; /* No features for v1 */ 2077 + 2078 + /* Skip over names until we find the one we are looking for */ 2079 + 2080 + snap_name = rbd_dev->header.snap_names; 2081 + while (which--) 2082 + snap_name += strlen(snap_name) + 1; 2083 + 2084 + return snap_name; 2129 2085 } 2130 2086 2131 2087 /* 2132 - * compare the old list of snapshots that we have to what's in the header 2133 - * and update it accordingly. Note that the header holds the snapshots 2134 - * in a reverse order (from newest to oldest) and we need to go from 2135 - * older to new so that we don't get a duplicate snap name when 2136 - * doing the process (e.g., removed snapshot and recreated a new 2137 - * one with the same name. 2088 + * Get the size and object order for an image snapshot, or if 2089 + * snap_id is CEPH_NOSNAP, gets this information for the base 2090 + * image. 
2138 2091 */ 2139 - static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) 2092 + static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 2093 + u8 *order, u64 *snap_size) 2140 2094 { 2141 - const char *name, *first_name; 2142 - int i = rbd_dev->header.total_snaps; 2143 - struct rbd_snap *snap, *old_snap = NULL; 2144 - struct list_head *p, *n; 2095 + __le64 snapid = cpu_to_le64(snap_id); 2096 + int ret; 2097 + struct { 2098 + u8 order; 2099 + __le64 size; 2100 + } __attribute__ ((packed)) size_buf = { 0 }; 2145 2101 2146 - first_name = rbd_dev->header.snap_names; 2147 - name = first_name + rbd_dev->header.snap_names_len; 2102 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2103 + "rbd", "get_size", 2104 + (char *) &snapid, sizeof (snapid), 2105 + (char *) &size_buf, sizeof (size_buf), 2106 + CEPH_OSD_FLAG_READ, NULL); 2107 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2108 + if (ret < 0) 2109 + return ret; 2148 2110 2149 - list_for_each_prev_safe(p, n, &rbd_dev->snaps) { 2150 - u64 cur_id; 2111 + *order = size_buf.order; 2112 + *snap_size = le64_to_cpu(size_buf.size); 2151 2113 2152 - old_snap = list_entry(p, struct rbd_snap, node); 2153 - 2154 - if (i) 2155 - cur_id = rbd_dev->header.snapc->snaps[i - 1]; 2156 - 2157 - if (!i || old_snap->id < cur_id) { 2158 - /* 2159 - * old_snap->id was skipped, thus was 2160 - * removed. If this rbd_dev is mapped to 2161 - * the removed snapshot, record that it no 2162 - * longer exists, to prevent further I/O. 
2163 - */ 2164 - if (rbd_dev->snap_id == old_snap->id) 2165 - rbd_dev->snap_exists = false; 2166 - __rbd_remove_snap_dev(old_snap); 2167 - continue; 2168 - } 2169 - if (old_snap->id == cur_id) { 2170 - /* we have this snapshot already */ 2171 - i--; 2172 - name = rbd_prev_snap_name(name, first_name); 2173 - continue; 2174 - } 2175 - for (; i > 0; 2176 - i--, name = rbd_prev_snap_name(name, first_name)) { 2177 - if (!name) { 2178 - WARN_ON(1); 2179 - return -EINVAL; 2180 - } 2181 - cur_id = rbd_dev->header.snapc->snaps[i]; 2182 - /* snapshot removal? handle it above */ 2183 - if (cur_id >= old_snap->id) 2184 - break; 2185 - /* a new snapshot */ 2186 - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2187 - if (IS_ERR(snap)) 2188 - return PTR_ERR(snap); 2189 - 2190 - /* note that we add it backward so using n and not p */ 2191 - list_add(&snap->node, n); 2192 - p = &snap->node; 2193 - } 2194 - } 2195 - /* we're done going over the old snap list, just add what's left */ 2196 - for (; i > 0; i--) { 2197 - name = rbd_prev_snap_name(name, first_name); 2198 - if (!name) { 2199 - WARN_ON(1); 2200 - return -EINVAL; 2201 - } 2202 - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2203 - if (IS_ERR(snap)) 2204 - return PTR_ERR(snap); 2205 - list_add(&snap->node, &rbd_dev->snaps); 2206 - } 2114 + dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 2115 + (unsigned long long) snap_id, (unsigned int) *order, 2116 + (unsigned long long) *snap_size); 2207 2117 2208 2118 return 0; 2209 2119 } 2210 2120 2121 + static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 2122 + { 2123 + return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 2124 + &rbd_dev->header.obj_order, 2125 + &rbd_dev->header.image_size); 2126 + } 2127 + 2128 + static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 2129 + { 2130 + void *reply_buf; 2131 + int ret; 2132 + void *p; 2133 + 2134 + reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 2135 + if (!reply_buf) 2136 + return -ENOMEM; 2137 + 
2138 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2139 + "rbd", "get_object_prefix", 2140 + NULL, 0, 2141 + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 2142 + CEPH_OSD_FLAG_READ, NULL); 2143 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2144 + if (ret < 0) 2145 + goto out; 2146 + 2147 + p = reply_buf; 2148 + rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 2149 + p + RBD_OBJ_PREFIX_LEN_MAX, 2150 + NULL, GFP_NOIO); 2151 + 2152 + if (IS_ERR(rbd_dev->header.object_prefix)) { 2153 + ret = PTR_ERR(rbd_dev->header.object_prefix); 2154 + rbd_dev->header.object_prefix = NULL; 2155 + } else { 2156 + dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 2157 + } 2158 + 2159 + out: 2160 + kfree(reply_buf); 2161 + 2162 + return ret; 2163 + } 2164 + 2165 + static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2166 + u64 *snap_features) 2167 + { 2168 + __le64 snapid = cpu_to_le64(snap_id); 2169 + struct { 2170 + __le64 features; 2171 + __le64 incompat; 2172 + } features_buf = { 0 }; 2173 + int ret; 2174 + 2175 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2176 + "rbd", "get_features", 2177 + (char *) &snapid, sizeof (snapid), 2178 + (char *) &features_buf, sizeof (features_buf), 2179 + CEPH_OSD_FLAG_READ, NULL); 2180 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2181 + if (ret < 0) 2182 + return ret; 2183 + *snap_features = le64_to_cpu(features_buf.features); 2184 + 2185 + dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2186 + (unsigned long long) snap_id, 2187 + (unsigned long long) *snap_features, 2188 + (unsigned long long) le64_to_cpu(features_buf.incompat)); 2189 + 2190 + return 0; 2191 + } 2192 + 2193 + static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2194 + { 2195 + return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2196 + &rbd_dev->header.features); 2197 + } 2198 + 2199 + static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 
2200 + { 2201 + size_t size; 2202 + int ret; 2203 + void *reply_buf; 2204 + void *p; 2205 + void *end; 2206 + u64 seq; 2207 + u32 snap_count; 2208 + struct ceph_snap_context *snapc; 2209 + u32 i; 2210 + 2211 + /* 2212 + * We'll need room for the seq value (maximum snapshot id), 2213 + * snapshot count, and array of that many snapshot ids. 2214 + * For now we have a fixed upper limit on the number we're 2215 + * prepared to receive. 2216 + */ 2217 + size = sizeof (__le64) + sizeof (__le32) + 2218 + RBD_MAX_SNAP_COUNT * sizeof (__le64); 2219 + reply_buf = kzalloc(size, GFP_KERNEL); 2220 + if (!reply_buf) 2221 + return -ENOMEM; 2222 + 2223 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2224 + "rbd", "get_snapcontext", 2225 + NULL, 0, 2226 + reply_buf, size, 2227 + CEPH_OSD_FLAG_READ, ver); 2228 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2229 + if (ret < 0) 2230 + goto out; 2231 + 2232 + ret = -ERANGE; 2233 + p = reply_buf; 2234 + end = (char *) reply_buf + size; 2235 + ceph_decode_64_safe(&p, end, seq, out); 2236 + ceph_decode_32_safe(&p, end, snap_count, out); 2237 + 2238 + /* 2239 + * Make sure the reported number of snapshot ids wouldn't go 2240 + * beyond the end of our buffer. But before checking that, 2241 + * make sure the computed size of the snapshot context we 2242 + * allocate is representable in a size_t. 
2243 + */ 2244 + if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 2245 + / sizeof (u64)) { 2246 + ret = -EINVAL; 2247 + goto out; 2248 + } 2249 + if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 2250 + goto out; 2251 + 2252 + size = sizeof (struct ceph_snap_context) + 2253 + snap_count * sizeof (snapc->snaps[0]); 2254 + snapc = kmalloc(size, GFP_KERNEL); 2255 + if (!snapc) { 2256 + ret = -ENOMEM; 2257 + goto out; 2258 + } 2259 + 2260 + atomic_set(&snapc->nref, 1); 2261 + snapc->seq = seq; 2262 + snapc->num_snaps = snap_count; 2263 + for (i = 0; i < snap_count; i++) 2264 + snapc->snaps[i] = ceph_decode_64(&p); 2265 + 2266 + rbd_dev->header.snapc = snapc; 2267 + 2268 + dout(" snap context seq = %llu, snap_count = %u\n", 2269 + (unsigned long long) seq, (unsigned int) snap_count); 2270 + 2271 + out: 2272 + kfree(reply_buf); 2273 + 2274 + return 0; 2275 + } 2276 + 2277 + static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2278 + { 2279 + size_t size; 2280 + void *reply_buf; 2281 + __le64 snap_id; 2282 + int ret; 2283 + void *p; 2284 + void *end; 2285 + size_t snap_name_len; 2286 + char *snap_name; 2287 + 2288 + size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2289 + reply_buf = kmalloc(size, GFP_KERNEL); 2290 + if (!reply_buf) 2291 + return ERR_PTR(-ENOMEM); 2292 + 2293 + snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2294 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2295 + "rbd", "get_snapshot_name", 2296 + (char *) &snap_id, sizeof (snap_id), 2297 + reply_buf, size, 2298 + CEPH_OSD_FLAG_READ, NULL); 2299 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2300 + if (ret < 0) 2301 + goto out; 2302 + 2303 + p = reply_buf; 2304 + end = (char *) reply_buf + size; 2305 + snap_name_len = 0; 2306 + snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, 2307 + GFP_KERNEL); 2308 + if (IS_ERR(snap_name)) { 2309 + ret = PTR_ERR(snap_name); 2310 + goto out; 2311 + } else { 2312 + dout(" 
snap_id 0x%016llx snap_name = %s\n", 2313 + (unsigned long long) le64_to_cpu(snap_id), snap_name); 2314 + } 2315 + kfree(reply_buf); 2316 + 2317 + return snap_name; 2318 + out: 2319 + kfree(reply_buf); 2320 + 2321 + return ERR_PTR(ret); 2322 + } 2323 + 2324 + static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2325 + u64 *snap_size, u64 *snap_features) 2326 + { 2327 + __le64 snap_id; 2328 + u8 order; 2329 + int ret; 2330 + 2331 + snap_id = rbd_dev->header.snapc->snaps[which]; 2332 + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2333 + if (ret) 2334 + return ERR_PTR(ret); 2335 + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2336 + if (ret) 2337 + return ERR_PTR(ret); 2338 + 2339 + return rbd_dev_v2_snap_name(rbd_dev, which); 2340 + } 2341 + 2342 + static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2343 + u64 *snap_size, u64 *snap_features) 2344 + { 2345 + if (rbd_dev->image_format == 1) 2346 + return rbd_dev_v1_snap_info(rbd_dev, which, 2347 + snap_size, snap_features); 2348 + if (rbd_dev->image_format == 2) 2349 + return rbd_dev_v2_snap_info(rbd_dev, which, 2350 + snap_size, snap_features); 2351 + return ERR_PTR(-EINVAL); 2352 + } 2353 + 2354 + /* 2355 + * Scan the rbd device's current snapshot list and compare it to the 2356 + * newly-received snapshot context. Remove any existing snapshots 2357 + * not present in the new snapshot context. Add a new snapshot for 2358 + * any snaphots in the snapshot context not in the current list. 2359 + * And verify there are no changes to snapshots we already know 2360 + * about. 2361 + * 2362 + * Assumes the snapshots in the snapshot context are sorted by 2363 + * snapshot id, highest id first. (Snapshots in the rbd_dev's list 2364 + * are also maintained in that order.) 
2365 + */ 2366 + static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2367 + { 2368 + struct ceph_snap_context *snapc = rbd_dev->header.snapc; 2369 + const u32 snap_count = snapc->num_snaps; 2370 + struct list_head *head = &rbd_dev->snaps; 2371 + struct list_head *links = head->next; 2372 + u32 index = 0; 2373 + 2374 + dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 2375 + while (index < snap_count || links != head) { 2376 + u64 snap_id; 2377 + struct rbd_snap *snap; 2378 + char *snap_name; 2379 + u64 snap_size = 0; 2380 + u64 snap_features = 0; 2381 + 2382 + snap_id = index < snap_count ? snapc->snaps[index] 2383 + : CEPH_NOSNAP; 2384 + snap = links != head ? list_entry(links, struct rbd_snap, node) 2385 + : NULL; 2386 + rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2387 + 2388 + if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 2389 + struct list_head *next = links->next; 2390 + 2391 + /* Existing snapshot not in the new snap context */ 2392 + 2393 + if (rbd_dev->mapping.snap_id == snap->id) 2394 + rbd_dev->mapping.snap_exists = false; 2395 + __rbd_remove_snap_dev(snap); 2396 + dout("%ssnap id %llu has been removed\n", 2397 + rbd_dev->mapping.snap_id == snap->id ? 
2398 + "mapped " : "", 2399 + (unsigned long long) snap->id); 2400 + 2401 + /* Done with this list entry; advance */ 2402 + 2403 + links = next; 2404 + continue; 2405 + } 2406 + 2407 + snap_name = rbd_dev_snap_info(rbd_dev, index, 2408 + &snap_size, &snap_features); 2409 + if (IS_ERR(snap_name)) 2410 + return PTR_ERR(snap_name); 2411 + 2412 + dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 2413 + (unsigned long long) snap_id); 2414 + if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 2415 + struct rbd_snap *new_snap; 2416 + 2417 + /* We haven't seen this snapshot before */ 2418 + 2419 + new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2420 + snap_id, snap_size, snap_features); 2421 + if (IS_ERR(new_snap)) { 2422 + int err = PTR_ERR(new_snap); 2423 + 2424 + dout(" failed to add dev, error %d\n", err); 2425 + 2426 + return err; 2427 + } 2428 + 2429 + /* New goes before existing, or at end of list */ 2430 + 2431 + dout(" added dev%s\n", snap ? "" : " at end\n"); 2432 + if (snap) 2433 + list_add_tail(&new_snap->node, &snap->node); 2434 + else 2435 + list_add_tail(&new_snap->node, head); 2436 + } else { 2437 + /* Already have this one */ 2438 + 2439 + dout(" already present\n"); 2440 + 2441 + rbd_assert(snap->size == snap_size); 2442 + rbd_assert(!strcmp(snap->name, snap_name)); 2443 + rbd_assert(snap->features == snap_features); 2444 + 2445 + /* Done with this list entry; advance */ 2446 + 2447 + links = links->next; 2448 + } 2449 + 2450 + /* Advance to the next entry in the snapshot context */ 2451 + 2452 + index++; 2453 + } 2454 + dout("%s: done\n", __func__); 2455 + 2456 + return 0; 2457 + } 2458 + 2459 + /* 2460 + * Scan the list of snapshots and register the devices for any that 2461 + * have not already been registered. 
2462 + */ 2463 + static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2464 + { 2465 + struct rbd_snap *snap; 2466 + int ret = 0; 2467 + 2468 + dout("%s called\n", __func__); 2469 + if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 2470 + return -EIO; 2471 + 2472 + list_for_each_entry(snap, &rbd_dev->snaps, node) { 2473 + if (!rbd_snap_registered(snap)) { 2474 + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2475 + if (ret < 0) 2476 + break; 2477 + } 2478 + } 2479 + dout("%s: returning %d\n", __func__, ret); 2480 + 2481 + return ret; 2482 + } 2483 + 2211 2484 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2212 2485 { 2213 - int ret; 2214 2486 struct device *dev; 2215 - struct rbd_snap *snap; 2487 + int ret; 2216 2488 2217 2489 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2218 - dev = &rbd_dev->dev; 2219 2490 2491 + dev = &rbd_dev->dev; 2220 2492 dev->bus = &rbd_bus_type; 2221 2493 dev->type = &rbd_device_type; 2222 2494 dev->parent = &rbd_root_dev; 2223 2495 dev->release = rbd_dev_release; 2224 2496 dev_set_name(dev, "%d", rbd_dev->dev_id); 2225 2497 ret = device_register(dev); 2226 - if (ret < 0) 2227 - goto out; 2228 2498 2229 - list_for_each_entry(snap, &rbd_dev->snaps, node) { 2230 - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2231 - if (ret < 0) 2232 - break; 2233 - } 2234 - out: 2235 2499 mutex_unlock(&ctl_mutex); 2500 + 2236 2501 return ret; 2237 2502 } 2238 2503 ··· 2568 2211 return ret; 2569 2212 } 2570 2213 2571 - static atomic64_t rbd_id_max = ATOMIC64_INIT(0); 2214 + static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 2572 2215 2573 2216 /* 2574 2217 * Get a unique rbd identifier for the given new rbd_dev, and add 2575 2218 * the rbd_dev to the global list. The minimum rbd id is 1. 
2576 2219 */ 2577 - static void rbd_id_get(struct rbd_device *rbd_dev) 2220 + static void rbd_dev_id_get(struct rbd_device *rbd_dev) 2578 2221 { 2579 - rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); 2222 + rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 2580 2223 2581 2224 spin_lock(&rbd_dev_list_lock); 2582 2225 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2583 2226 spin_unlock(&rbd_dev_list_lock); 2227 + dout("rbd_dev %p given dev id %llu\n", rbd_dev, 2228 + (unsigned long long) rbd_dev->dev_id); 2584 2229 } 2585 2230 2586 2231 /* 2587 2232 * Remove an rbd_dev from the global list, and record that its 2588 2233 * identifier is no longer in use. 2589 2234 */ 2590 - static void rbd_id_put(struct rbd_device *rbd_dev) 2235 + static void rbd_dev_id_put(struct rbd_device *rbd_dev) 2591 2236 { 2592 2237 struct list_head *tmp; 2593 2238 int rbd_id = rbd_dev->dev_id; 2594 2239 int max_id; 2595 2240 2596 - BUG_ON(rbd_id < 1); 2241 + rbd_assert(rbd_id > 0); 2597 2242 2243 + dout("rbd_dev %p released dev id %llu\n", rbd_dev, 2244 + (unsigned long long) rbd_dev->dev_id); 2598 2245 spin_lock(&rbd_dev_list_lock); 2599 2246 list_del_init(&rbd_dev->node); 2600 2247 ··· 2606 2245 * If the id being "put" is not the current maximum, there 2607 2246 * is nothing special we need to do. 2608 2247 */ 2609 - if (rbd_id != atomic64_read(&rbd_id_max)) { 2248 + if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 2610 2249 spin_unlock(&rbd_dev_list_lock); 2611 2250 return; 2612 2251 } ··· 2627 2266 spin_unlock(&rbd_dev_list_lock); 2628 2267 2629 2268 /* 2630 - * The max id could have been updated by rbd_id_get(), in 2269 + * The max id could have been updated by rbd_dev_id_get(), in 2631 2270 * which case it now accurately reflects the new maximum. 2632 2271 * Be careful not to overwrite the maximum value in that 2633 2272 * case. 
2634 2273 */ 2635 - atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); 2274 + atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 2275 + dout(" max dev id has been reset\n"); 2636 2276 } 2637 2277 2638 2278 /* ··· 2722 2360 } 2723 2361 2724 2362 /* 2725 - * This fills in the pool_name, image_name, image_name_len, snap_name, 2726 - * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based 2727 - * on the list of monitor addresses and other options provided via 2728 - * /sys/bus/rbd/add. 2363 + * This fills in the pool_name, image_name, image_name_len, rbd_dev, 2364 + * rbd_md_name, and name fields of the given rbd_dev, based on the 2365 + * list of monitor addresses and other options provided via 2366 + * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated 2367 + * copy of the snapshot name to map if successful, or a 2368 + * pointer-coded error otherwise. 2729 2369 * 2730 2370 * Note: rbd_dev is assumed to have been initially zero-filled. 2731 2371 */ 2732 - static int rbd_add_parse_args(struct rbd_device *rbd_dev, 2733 - const char *buf, 2734 - const char **mon_addrs, 2735 - size_t *mon_addrs_size, 2736 - char *options, 2737 - size_t options_size) 2372 + static char *rbd_add_parse_args(struct rbd_device *rbd_dev, 2373 + const char *buf, 2374 + const char **mon_addrs, 2375 + size_t *mon_addrs_size, 2376 + char *options, 2377 + size_t options_size) 2738 2378 { 2739 2379 size_t len; 2740 - int ret; 2380 + char *err_ptr = ERR_PTR(-EINVAL); 2381 + char *snap_name; 2741 2382 2742 2383 /* The first four tokens are required */ 2743 2384 2744 2385 len = next_token(&buf); 2745 2386 if (!len) 2746 - return -EINVAL; 2387 + return err_ptr; 2747 2388 *mon_addrs_size = len + 1; 2748 2389 *mon_addrs = buf; 2749 2390 ··· 2754 2389 2755 2390 len = copy_token(&buf, options, options_size); 2756 2391 if (!len || len >= options_size) 2757 - return -EINVAL; 2392 + return err_ptr; 2758 2393 2759 - ret = -ENOMEM; 2394 + err_ptr = ERR_PTR(-ENOMEM); 2760 2395 
rbd_dev->pool_name = dup_token(&buf, NULL); 2761 2396 if (!rbd_dev->pool_name) 2762 2397 goto out_err; ··· 2765 2400 if (!rbd_dev->image_name) 2766 2401 goto out_err; 2767 2402 2768 - /* Create the name of the header object */ 2769 - 2770 - rbd_dev->header_name = kmalloc(rbd_dev->image_name_len 2771 - + sizeof (RBD_SUFFIX), 2772 - GFP_KERNEL); 2773 - if (!rbd_dev->header_name) 2403 + /* Snapshot name is optional */ 2404 + len = next_token(&buf); 2405 + if (!len) { 2406 + buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 2407 + len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 2408 + } 2409 + snap_name = kmalloc(len + 1, GFP_KERNEL); 2410 + if (!snap_name) 2774 2411 goto out_err; 2775 - sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2412 + memcpy(snap_name, buf, len); 2413 + *(snap_name + len) = '\0'; 2414 + 2415 + dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); 2416 + 2417 + return snap_name; 2418 + 2419 + out_err: 2420 + kfree(rbd_dev->image_name); 2421 + rbd_dev->image_name = NULL; 2422 + rbd_dev->image_name_len = 0; 2423 + kfree(rbd_dev->pool_name); 2424 + rbd_dev->pool_name = NULL; 2425 + 2426 + return err_ptr; 2427 + } 2428 + 2429 + /* 2430 + * An rbd format 2 image has a unique identifier, distinct from the 2431 + * name given to it by the user. Internally, that identifier is 2432 + * what's used to specify the names of objects related to the image. 2433 + * 2434 + * A special "rbd id" object is used to map an rbd image name to its 2435 + * id. If that object doesn't exist, then there is no v2 rbd image 2436 + * with the supplied name. 2437 + * 2438 + * This function will record the given rbd_dev's image_id field if 2439 + * it can be determined, and in that case will return 0. If any 2440 + * errors occur a negative errno will be returned and the rbd_dev's 2441 + * image_id field will be unchanged (and should be NULL). 
2442 + */ 2443 + static int rbd_dev_image_id(struct rbd_device *rbd_dev) 2444 + { 2445 + int ret; 2446 + size_t size; 2447 + char *object_name; 2448 + void *response; 2449 + void *p; 2776 2450 2777 2451 /* 2778 - * The snapshot name is optional. If none is is supplied, 2779 - * we use the default value. 2452 + * First, see if the format 2 image id file exists, and if 2453 + * so, get the image's persistent id from it. 2780 2454 */ 2781 - rbd_dev->snap_name = dup_token(&buf, &len); 2782 - if (!rbd_dev->snap_name) 2783 - goto out_err; 2784 - if (!len) { 2785 - /* Replace the empty name with the default */ 2786 - kfree(rbd_dev->snap_name); 2787 - rbd_dev->snap_name 2788 - = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); 2789 - if (!rbd_dev->snap_name) 2790 - goto out_err; 2455 + size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; 2456 + object_name = kmalloc(size, GFP_NOIO); 2457 + if (!object_name) 2458 + return -ENOMEM; 2459 + sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); 2460 + dout("rbd id object name is %s\n", object_name); 2791 2461 2792 - memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 2793 - sizeof (RBD_SNAP_HEAD_NAME)); 2462 + /* Response will be an encoded string, which includes a length */ 2463 + 2464 + size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 2465 + response = kzalloc(size, GFP_NOIO); 2466 + if (!response) { 2467 + ret = -ENOMEM; 2468 + goto out; 2794 2469 } 2470 + 2471 + ret = rbd_req_sync_exec(rbd_dev, object_name, 2472 + "rbd", "get_id", 2473 + NULL, 0, 2474 + response, RBD_IMAGE_ID_LEN_MAX, 2475 + CEPH_OSD_FLAG_READ, NULL); 2476 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2477 + if (ret < 0) 2478 + goto out; 2479 + 2480 + p = response; 2481 + rbd_dev->image_id = ceph_extract_encoded_string(&p, 2482 + p + RBD_IMAGE_ID_LEN_MAX, 2483 + &rbd_dev->image_id_len, 2484 + GFP_NOIO); 2485 + if (IS_ERR(rbd_dev->image_id)) { 2486 + ret = PTR_ERR(rbd_dev->image_id); 2487 + rbd_dev->image_id = NULL; 2488 + } else { 
2489 + dout("image_id is %s\n", rbd_dev->image_id); 2490 + } 2491 + out: 2492 + kfree(response); 2493 + kfree(object_name); 2494 + 2495 + return ret; 2496 + } 2497 + 2498 + static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 2499 + { 2500 + int ret; 2501 + size_t size; 2502 + 2503 + /* Version 1 images have no id; empty string is used */ 2504 + 2505 + rbd_dev->image_id = kstrdup("", GFP_KERNEL); 2506 + if (!rbd_dev->image_id) 2507 + return -ENOMEM; 2508 + rbd_dev->image_id_len = 0; 2509 + 2510 + /* Record the header object name for this rbd image. */ 2511 + 2512 + size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); 2513 + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2514 + if (!rbd_dev->header_name) { 2515 + ret = -ENOMEM; 2516 + goto out_err; 2517 + } 2518 + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2519 + 2520 + /* Populate rbd image metadata */ 2521 + 2522 + ret = rbd_read_header(rbd_dev, &rbd_dev->header); 2523 + if (ret < 0) 2524 + goto out_err; 2525 + rbd_dev->image_format = 1; 2526 + 2527 + dout("discovered version 1 image, header name is %s\n", 2528 + rbd_dev->header_name); 2795 2529 2796 2530 return 0; 2797 2531 2798 2532 out_err: 2799 2533 kfree(rbd_dev->header_name); 2800 - kfree(rbd_dev->image_name); 2801 - kfree(rbd_dev->pool_name); 2802 - rbd_dev->pool_name = NULL; 2534 + rbd_dev->header_name = NULL; 2535 + kfree(rbd_dev->image_id); 2536 + rbd_dev->image_id = NULL; 2537 + 2538 + return ret; 2539 + } 2540 + 2541 + static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 2542 + { 2543 + size_t size; 2544 + int ret; 2545 + u64 ver = 0; 2546 + 2547 + /* 2548 + * Image id was filled in by the caller. Record the header 2549 + * object name for this rbd image. 
2550 + */ 2551 + size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; 2552 + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2553 + if (!rbd_dev->header_name) 2554 + return -ENOMEM; 2555 + sprintf(rbd_dev->header_name, "%s%s", 2556 + RBD_HEADER_PREFIX, rbd_dev->image_id); 2557 + 2558 + /* Get the size and object order for the image */ 2559 + 2560 + ret = rbd_dev_v2_image_size(rbd_dev); 2561 + if (ret < 0) 2562 + goto out_err; 2563 + 2564 + /* Get the object prefix (a.k.a. block_name) for the image */ 2565 + 2566 + ret = rbd_dev_v2_object_prefix(rbd_dev); 2567 + if (ret < 0) 2568 + goto out_err; 2569 + 2570 + /* Get the features for the image */ 2571 + 2572 + ret = rbd_dev_v2_features(rbd_dev); 2573 + if (ret < 0) 2574 + goto out_err; 2575 + 2576 + /* crypto and compression type aren't (yet) supported for v2 images */ 2577 + 2578 + rbd_dev->header.crypt_type = 0; 2579 + rbd_dev->header.comp_type = 0; 2580 + 2581 + /* Get the snapshot context, plus the header version */ 2582 + 2583 + ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 2584 + if (ret) 2585 + goto out_err; 2586 + rbd_dev->header.obj_version = ver; 2587 + 2588 + rbd_dev->image_format = 2; 2589 + 2590 + dout("discovered version 2 image, header name is %s\n", 2591 + rbd_dev->header_name); 2592 + 2593 + return -ENOTSUPP; 2594 + out_err: 2595 + kfree(rbd_dev->header_name); 2596 + rbd_dev->header_name = NULL; 2597 + kfree(rbd_dev->header.object_prefix); 2598 + rbd_dev->header.object_prefix = NULL; 2599 + 2600 + return ret; 2601 + } 2602 + 2603 + /* 2604 + * Probe for the existence of the header object for the given rbd 2605 + * device. For format 2 images this includes determining the image 2606 + * id. 2607 + */ 2608 + static int rbd_dev_probe(struct rbd_device *rbd_dev) 2609 + { 2610 + int ret; 2611 + 2612 + /* 2613 + * Get the id from the image id object. If it's not a 2614 + * format 2 image, we'll get ENOENT back, and we'll assume 2615 + * it's a format 1 image. 
2616 + */ 2617 + ret = rbd_dev_image_id(rbd_dev); 2618 + if (ret) 2619 + ret = rbd_dev_v1_probe(rbd_dev); 2620 + else 2621 + ret = rbd_dev_v2_probe(rbd_dev); 2622 + if (ret) 2623 + dout("probe failed, returning %d\n", ret); 2803 2624 2804 2625 return ret; 2805 2626 } ··· 3000 2449 size_t mon_addrs_size = 0; 3001 2450 struct ceph_osd_client *osdc; 3002 2451 int rc = -ENOMEM; 2452 + char *snap_name; 3003 2453 3004 2454 if (!try_module_get(THIS_MODULE)) 3005 2455 return -ENODEV; 3006 2456 3007 2457 options = kmalloc(count, GFP_KERNEL); 3008 2458 if (!options) 3009 - goto err_nomem; 2459 + goto err_out_mem; 3010 2460 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3011 2461 if (!rbd_dev) 3012 - goto err_nomem; 2462 + goto err_out_mem; 3013 2463 3014 2464 /* static rbd_device initialization */ 3015 2465 spin_lock_init(&rbd_dev->lock); ··· 3018 2466 INIT_LIST_HEAD(&rbd_dev->snaps); 3019 2467 init_rwsem(&rbd_dev->header_rwsem); 3020 2468 3021 - /* generate unique id: find highest unique id, add one */ 3022 - rbd_id_get(rbd_dev); 3023 - 3024 - /* Fill in the device name, now that we have its id. 
*/ 3025 - BUILD_BUG_ON(DEV_NAME_LEN 3026 - < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 3027 - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 3028 - 3029 2469 /* parse add command */ 3030 - rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, 3031 - options, count); 3032 - if (rc) 3033 - goto err_put_id; 3034 - 3035 - rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, 3036 - options); 3037 - if (IS_ERR(rbd_dev->rbd_client)) { 3038 - rc = PTR_ERR(rbd_dev->rbd_client); 3039 - goto err_put_id; 2470 + snap_name = rbd_add_parse_args(rbd_dev, buf, 2471 + &mon_addrs, &mon_addrs_size, options, count); 2472 + if (IS_ERR(snap_name)) { 2473 + rc = PTR_ERR(snap_name); 2474 + goto err_out_mem; 3040 2475 } 2476 + 2477 + rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); 2478 + if (rc < 0) 2479 + goto err_out_args; 3041 2480 3042 2481 /* pick the pool */ 3043 2482 osdc = &rbd_dev->rbd_client->client->osdc; ··· 3037 2494 goto err_out_client; 3038 2495 rbd_dev->pool_id = rc; 3039 2496 3040 - /* register our block device */ 3041 - rc = register_blkdev(0, rbd_dev->name); 2497 + rc = rbd_dev_probe(rbd_dev); 3042 2498 if (rc < 0) 3043 2499 goto err_out_client; 2500 + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 2501 + 2502 + /* no need to lock here, as rbd_dev is not registered yet */ 2503 + rc = rbd_dev_snaps_update(rbd_dev); 2504 + if (rc) 2505 + goto err_out_header; 2506 + 2507 + rc = rbd_dev_set_mapping(rbd_dev, snap_name); 2508 + if (rc) 2509 + goto err_out_header; 2510 + 2511 + /* generate unique id: find highest unique id, add one */ 2512 + rbd_dev_id_get(rbd_dev); 2513 + 2514 + /* Fill in the device name, now that we have its id. */ 2515 + BUILD_BUG_ON(DEV_NAME_LEN 2516 + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 2517 + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 2518 + 2519 + /* Get our block major device number. 
*/ 2520 + 2521 + rc = register_blkdev(0, rbd_dev->name); 2522 + if (rc < 0) 2523 + goto err_out_id; 3044 2524 rbd_dev->major = rc; 2525 + 2526 + /* Set up the blkdev mapping. */ 2527 + 2528 + rc = rbd_init_disk(rbd_dev); 2529 + if (rc) 2530 + goto err_out_blkdev; 3045 2531 3046 2532 rc = rbd_bus_add_dev(rbd_dev); 3047 2533 if (rc) 3048 - goto err_out_blkdev; 2534 + goto err_out_disk; 3049 2535 3050 2536 /* 3051 2537 * At this point cleanup in the event of an error is the job 3052 2538 * of the sysfs code (initiated by rbd_bus_del_dev()). 3053 - * 3054 - * Set up and announce blkdev mapping. 3055 2539 */ 3056 - rc = rbd_init_disk(rbd_dev); 2540 + 2541 + down_write(&rbd_dev->header_rwsem); 2542 + rc = rbd_dev_snaps_register(rbd_dev); 2543 + up_write(&rbd_dev->header_rwsem); 3057 2544 if (rc) 3058 2545 goto err_out_bus; 3059 2546 3060 2547 rc = rbd_init_watch_dev(rbd_dev); 3061 2548 if (rc) 3062 2549 goto err_out_bus; 2550 + 2551 + /* Everything's ready. Announce the disk to the world. */ 2552 + 2553 + add_disk(rbd_dev->disk); 2554 + 2555 + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 2556 + (unsigned long long) rbd_dev->mapping.size); 3063 2557 3064 2558 return count; 3065 2559 ··· 3107 2527 kfree(options); 3108 2528 return rc; 3109 2529 2530 + err_out_disk: 2531 + rbd_free_disk(rbd_dev); 3110 2532 err_out_blkdev: 3111 2533 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2534 + err_out_id: 2535 + rbd_dev_id_put(rbd_dev); 2536 + err_out_header: 2537 + rbd_header_free(&rbd_dev->header); 3112 2538 err_out_client: 2539 + kfree(rbd_dev->header_name); 3113 2540 rbd_put_client(rbd_dev); 3114 - err_put_id: 3115 - if (rbd_dev->pool_name) { 3116 - kfree(rbd_dev->snap_name); 3117 - kfree(rbd_dev->header_name); 3118 - kfree(rbd_dev->image_name); 3119 - kfree(rbd_dev->pool_name); 3120 - } 3121 - rbd_id_put(rbd_dev); 3122 - err_nomem: 2541 + kfree(rbd_dev->image_id); 2542 + err_out_args: 2543 + kfree(rbd_dev->mapping.snap_name); 2544 + 
kfree(rbd_dev->image_name); 2545 + kfree(rbd_dev->pool_name); 2546 + err_out_mem: 3123 2547 kfree(rbd_dev); 3124 2548 kfree(options); 3125 2549 ··· 3169 2585 rbd_free_disk(rbd_dev); 3170 2586 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3171 2587 2588 + /* release allocated disk header fields */ 2589 + rbd_header_free(&rbd_dev->header); 2590 + 3172 2591 /* done with the id, and with the rbd_dev */ 3173 - kfree(rbd_dev->snap_name); 2592 + kfree(rbd_dev->mapping.snap_name); 2593 + kfree(rbd_dev->image_id); 3174 2594 kfree(rbd_dev->header_name); 3175 2595 kfree(rbd_dev->pool_name); 3176 2596 kfree(rbd_dev->image_name); 3177 - rbd_id_put(rbd_dev); 2597 + rbd_dev_id_put(rbd_dev); 3178 2598 kfree(rbd_dev); 3179 2599 3180 2600 /* release module ref */ ··· 3216 2628 3217 2629 done: 3218 2630 mutex_unlock(&ctl_mutex); 3219 - return ret; 3220 - } 3221 2631 3222 - static ssize_t rbd_snap_add(struct device *dev, 3223 - struct device_attribute *attr, 3224 - const char *buf, 3225 - size_t count) 3226 - { 3227 - struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3228 - int ret; 3229 - char *name = kmalloc(count + 1, GFP_KERNEL); 3230 - if (!name) 3231 - return -ENOMEM; 3232 - 3233 - snprintf(name, count, "%s", buf); 3234 - 3235 - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3236 - 3237 - ret = rbd_header_add_snap(rbd_dev, 3238 - name, GFP_KERNEL); 3239 - if (ret < 0) 3240 - goto err_unlock; 3241 - 3242 - ret = __rbd_refresh_header(rbd_dev, NULL); 3243 - if (ret < 0) 3244 - goto err_unlock; 3245 - 3246 - /* shouldn't hold ctl_mutex when notifying.. notify might 3247 - trigger a watch callback that would need to get that mutex */ 3248 - mutex_unlock(&ctl_mutex); 3249 - 3250 - /* make a best effort, don't error if failed */ 3251 - rbd_req_sync_notify(rbd_dev); 3252 - 3253 - ret = count; 3254 - kfree(name); 3255 - return ret; 3256 - 3257 - err_unlock: 3258 - mutex_unlock(&ctl_mutex); 3259 - kfree(name); 3260 2632 return ret; 3261 2633 } 3262 2634
+21 -6
drivers/block/rbd_types.h
··· 15 15 16 16 #include <linux/types.h> 17 17 18 + /* For format version 2, rbd image 'foo' consists of objects 19 + * rbd_id.foo - id of image 20 + * rbd_header.<id> - image metadata 21 + * rbd_data.<id>.0000000000000000 22 + * rbd_data.<id>.0000000000000001 23 + * ... - data 24 + * Clients do not access header data directly in rbd format 2. 25 + */ 26 + 27 + #define RBD_HEADER_PREFIX "rbd_header." 28 + #define RBD_DATA_PREFIX "rbd_data." 29 + #define RBD_ID_PREFIX "rbd_id." 30 + 18 31 /* 19 - * rbd image 'foo' consists of objects 20 - * foo.rbd - image metadata 21 - * foo.00000000 22 - * foo.00000001 23 - * ... - data 32 + * For format version 1, rbd image 'foo' consists of objects 33 + * foo.rbd - image metadata 34 + * rb.<idhi>.<idlo>.00000000 35 + * rb.<idhi>.<idlo>.00000001 36 + * ... - data 37 + * There is no notion of a persistent image id in rbd format 1. 24 38 */ 25 39 26 40 #define RBD_SUFFIX ".rbd" 41 + 27 42 #define RBD_DIRECTORY "rbd_directory" 28 43 #define RBD_INFO "rbd_info" 29 44 ··· 62 47 63 48 struct rbd_image_header_ondisk { 64 49 char text[40]; 65 - char block_name[24]; 50 + char object_prefix[24]; 66 51 char signature[4]; 67 52 char version[8]; 68 53 struct {
+9 -10
fs/ceph/addr.c
··· 205 205 dout("readpage inode %p file %p page %p index %lu\n", 206 206 inode, filp, page, page->index); 207 207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 208 - page->index << PAGE_CACHE_SHIFT, &len, 208 + (u64) page_offset(page), &len, 209 209 ci->i_truncate_seq, ci->i_truncate_size, 210 210 &page, 1, 0); 211 211 if (err == -ENOENT) ··· 286 286 int nr_pages = 0; 287 287 int ret; 288 288 289 - off = page->index << PAGE_CACHE_SHIFT; 289 + off = (u64) page_offset(page); 290 290 291 291 /* count pages */ 292 292 next_index = page->index; ··· 308 308 NULL, 0, 309 309 ci->i_truncate_seq, ci->i_truncate_size, 310 310 NULL, false, 1, 0); 311 - if (!req) 312 - return -ENOMEM; 311 + if (IS_ERR(req)) 312 + return PTR_ERR(req); 313 313 314 314 /* build page vector */ 315 315 nr_pages = len >> PAGE_CACHE_SHIFT; ··· 426 426 struct ceph_inode_info *ci; 427 427 struct ceph_fs_client *fsc; 428 428 struct ceph_osd_client *osdc; 429 - loff_t page_off = page->index << PAGE_CACHE_SHIFT; 429 + loff_t page_off = page_offset(page); 430 430 int len = PAGE_CACHE_SIZE; 431 431 loff_t i_size; 432 432 int err = 0; ··· 817 817 /* ok */ 818 818 if (locked_pages == 0) { 819 819 /* prepare async write request */ 820 - offset = (unsigned long long)page->index 821 - << PAGE_CACHE_SHIFT; 820 + offset = (u64) page_offset(page); 822 821 len = wsize; 823 822 req = ceph_osdc_new_request(&fsc->client->osdc, 824 823 &ci->i_layout, ··· 831 832 ci->i_truncate_size, 832 833 &inode->i_mtime, true, 1, 0); 833 834 834 - if (!req) { 835 - rc = -ENOMEM; 835 + if (IS_ERR(req)) { 836 + rc = PTR_ERR(req); 836 837 unlock_page(page); 837 838 break; 838 839 } ··· 1179 1180 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1180 1181 struct page *page = vmf->page; 1181 1182 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1182 - loff_t off = page->index << PAGE_CACHE_SHIFT; 1183 + loff_t off = page_offset(page); 1183 1184 loff_t size, len; 1184 1185 int ret; 1185 1186
+1 -1
fs/ceph/caps.c
··· 1005 1005 1006 1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); 1007 1007 head = msg->front.iov_base; 1008 - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); 1008 + le32_add_cpu(&head->num, 1); 1009 1009 item = msg->front.iov_base + msg->front.iov_len; 1010 1010 item->ino = cpu_to_le64(ino); 1011 1011 item->cap_id = cpu_to_le64(cap_id);
+2 -2
fs/ceph/file.c
··· 536 536 do_sync, 537 537 ci->i_truncate_seq, ci->i_truncate_size, 538 538 &mtime, false, 2, page_align); 539 - if (!req) 540 - return -ENOMEM; 539 + if (IS_ERR(req)) 540 + return PTR_ERR(req); 541 541 542 542 if (file->f_flags & O_DIRECT) { 543 543 pages = ceph_get_direct_page_vector(data, num_pages, false);
+6 -2
fs/ceph/ioctl.c
··· 187 187 u64 tmp; 188 188 struct ceph_object_layout ol; 189 189 struct ceph_pg pgid; 190 + int r; 190 191 191 192 /* copy and validate */ 192 193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 194 return -EFAULT; 194 195 195 196 down_read(&osdc->map_sem); 196 - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 197 - &dl.object_no, &dl.object_offset, &olen); 197 + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 198 + &dl.object_no, &dl.object_offset, 199 + &olen); 200 + if (r < 0) 201 + return -EIO; 198 202 dl.file_offset -= dl.object_offset; 199 203 dl.object_size = ceph_file_layout_object_size(ci->i_layout); 200 204 dl.block_size = ceph_file_layout_su(ci->i_layout);
+2 -1
fs/ceph/mds_client.c
··· 2625 2625 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2626 2626 session_state_name(s->s_state)); 2627 2627 2628 - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2628 + if (i >= newmap->m_max_mds || 2629 + memcmp(ceph_mdsmap_get_addr(oldmap, i), 2629 2630 ceph_mdsmap_get_addr(newmap, i), 2630 2631 sizeof(struct ceph_entity_addr))) { 2631 2632 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+26 -11
fs/ceph/super.c
··· 307 307 { 308 308 struct ceph_mount_options *fsopt; 309 309 const char *dev_name_end; 310 - int err = -ENOMEM; 310 + int err; 311 + 312 + if (!dev_name || !*dev_name) 313 + return -EINVAL; 311 314 312 315 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); 313 316 if (!fsopt) ··· 331 328 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 332 329 fsopt->congestion_kb = default_congestion_kb(); 333 330 334 - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 331 + /* 332 + * Distinguish the server list from the path in "dev_name". 333 + * Internally we do not include the leading '/' in the path. 334 + * 335 + * "dev_name" will look like: 336 + * <server_spec>[,<server_spec>...]:[<path>] 337 + * where 338 + * <server_spec> is <ip>[:<port>] 339 + * <path> is optional, but if present must begin with '/' 340 + */ 341 + dev_name_end = strchr(dev_name, '/'); 342 + if (dev_name_end) { 343 + /* skip over leading '/' for path */ 344 + *path = dev_name_end + 1; 345 + } else { 346 + /* path is empty */ 347 + dev_name_end = dev_name + strlen(dev_name); 348 + *path = dev_name_end; 349 + } 335 350 err = -EINVAL; 336 - if (!dev_name) 337 - goto out; 338 - *path = strstr(dev_name, ":/"); 339 - if (*path == NULL) { 340 - pr_err("device name is missing path (no :/ in %s)\n", 351 + dev_name_end--; /* back up to ':' separator */ 352 + if (*dev_name_end != ':') { 353 + pr_err("device name is missing path (no : separator in %s)\n", 341 354 dev_name); 342 355 goto out; 343 356 } 344 - dev_name_end = *path; 345 357 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 346 - 347 - /* path on server */ 348 - *path += 2; 349 358 dout("server path '%s'\n", *path); 350 359 351 360 *popt = ceph_parse_options(options, dev_name, dev_name_end,
-1
include/linux/ceph/mon_client.h
··· 71 71 int cur_mon; /* last monitor i contacted */ 72 72 unsigned long sub_sent, sub_renew_after; 73 73 struct ceph_connection con; 74 - bool have_fsid; 75 74 76 75 /* pending generic requests */ 77 76 struct rb_root generic_request_tree;
+1 -1
include/linux/ceph/osd_client.h
··· 207 207 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 208 208 struct ceph_msg *msg); 209 209 210 - extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, 210 + extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, 211 211 struct ceph_file_layout *layout, 212 212 u64 snapid, 213 213 u64 off, u64 *plen, u64 *bno,
+3 -3
include/linux/ceph/osdmap.h
··· 109 109 extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 110 110 111 111 /* calculate mapping of a file extent to an object */ 112 - extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 113 - u64 off, u64 *plen, 114 - u64 *bno, u64 *oxoff, u64 *oxlen); 112 + extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 113 + u64 off, u64 *plen, 114 + u64 *bno, u64 *oxoff, u64 *oxlen); 115 115 116 116 /* calculate mapping of object to a placement group */ 117 117 extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+3 -4
net/ceph/mon_client.c
··· 637 637 /* 638 638 * Do a synchronous pool op. 639 639 */ 640 - int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, 640 + static int do_poolop(struct ceph_mon_client *monc, u32 op, 641 641 u32 pool, u64 snapid, 642 642 char *buf, int len) 643 643 { ··· 687 687 int ceph_monc_create_snapid(struct ceph_mon_client *monc, 688 688 u32 pool, u64 *snapid) 689 689 { 690 - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 690 + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 691 691 pool, 0, (char *)snapid, sizeof(*snapid)); 692 692 693 693 } ··· 696 696 int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 697 697 u32 pool, u64 snapid) 698 698 { 699 - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 699 + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 700 700 pool, snapid, 0, 0); 701 701 702 702 } ··· 769 769 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 770 770 } 771 771 monc->monmap->num_mon = num_mon; 772 - monc->have_fsid = false; 773 772 return 0; 774 773 } 775 774
+29 -18
net/ceph/osd_client.c
··· 52 52 op == CEPH_OSD_OP_WRITE); 53 53 } 54 54 55 - void ceph_calc_raw_layout(struct ceph_osd_client *osdc, 55 + int ceph_calc_raw_layout(struct ceph_osd_client *osdc, 56 56 struct ceph_file_layout *layout, 57 57 u64 snapid, 58 58 u64 off, u64 *plen, u64 *bno, ··· 62 62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 63 63 u64 orig_len = *plen; 64 64 u64 objoff, objlen; /* extent in object */ 65 + int r; 65 66 66 67 reqhead->snapid = cpu_to_le64(snapid); 67 68 68 69 /* object extent? */ 69 - ceph_calc_file_object_mapping(layout, off, plen, bno, 70 - &objoff, &objlen); 70 + r = ceph_calc_file_object_mapping(layout, off, plen, bno, 71 + &objoff, &objlen); 72 + if (r < 0) 73 + return r; 71 74 if (*plen < orig_len) 72 75 dout(" skipping last %llu, final file extent %llu~%llu\n", 73 76 orig_len - *plen, off, *plen); ··· 86 83 87 84 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 88 85 *bno, objoff, objlen, req->r_num_pages); 89 - 86 + return 0; 90 87 } 91 88 EXPORT_SYMBOL(ceph_calc_raw_layout); 92 89 ··· 115 112 * 116 113 * fill osd op in request message. 
117 114 */ 118 - static void calc_layout(struct ceph_osd_client *osdc, 119 - struct ceph_vino vino, 120 - struct ceph_file_layout *layout, 121 - u64 off, u64 *plen, 122 - struct ceph_osd_request *req, 123 - struct ceph_osd_req_op *op) 115 + static int calc_layout(struct ceph_osd_client *osdc, 116 + struct ceph_vino vino, 117 + struct ceph_file_layout *layout, 118 + u64 off, u64 *plen, 119 + struct ceph_osd_request *req, 120 + struct ceph_osd_req_op *op) 124 121 { 125 122 u64 bno; 123 + int r; 126 124 127 - ceph_calc_raw_layout(osdc, layout, vino.snap, off, 128 - plen, &bno, req, op); 125 + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, 126 + plen, &bno, req, op); 127 + if (r < 0) 128 + return r; 129 129 130 130 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 131 131 req->r_oid_len = strlen(req->r_oid); 132 + 133 + return r; 132 134 } 133 135 134 136 /* ··· 464 456 { 465 457 struct ceph_osd_req_op ops[3]; 466 458 struct ceph_osd_request *req; 459 + int r; 467 460 468 461 ops[0].op = opcode; 469 462 ops[0].extent.truncate_seq = truncate_seq; ··· 483 474 use_mempool, 484 475 GFP_NOFS, NULL, NULL); 485 476 if (!req) 486 - return NULL; 477 + return ERR_PTR(-ENOMEM); 487 478 488 479 /* calculate max write size */ 489 - calc_layout(osdc, vino, layout, off, plen, req, ops); 480 + r = calc_layout(osdc, vino, layout, off, plen, req, ops); 481 + if (r < 0) 482 + return ERR_PTR(r); 490 483 req->r_file_layout = *layout; /* keep a copy */ 491 484 492 485 /* in case it differs from natural (file) alignment that ··· 1931 1920 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1932 1921 NULL, 0, truncate_seq, truncate_size, NULL, 1933 1922 false, 1, page_align); 1934 - if (!req) 1935 - return -ENOMEM; 1923 + if (IS_ERR(req)) 1924 + return PTR_ERR(req); 1936 1925 1937 1926 /* it may be a short read due to an object boundary */ 1938 1927 req->r_pages = pages; ··· 1974 1963 snapc, do_sync, 1975 1964 truncate_seq, truncate_size, mtime, 1976 1965 nofail, 1, 
page_align); 1977 - if (!req) 1978 - return -ENOMEM; 1966 + if (IS_ERR(req)) 1967 + return PTR_ERR(req); 1979 1968 1980 1969 /* it may be a short write due to an object boundary */ 1981 1970 req->r_pages = pages;
+16 -2
net/ceph/osdmap.c
··· 984 984 * for now, we write only a single su, until we can 985 985 * pass a stride back to the caller. 986 986 */ 987 - void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 987 + int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 988 988 u64 off, u64 *plen, 989 989 u64 *ono, 990 990 u64 *oxoff, u64 *oxlen) ··· 998 998 999 999 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 1000 1000 osize, su); 1001 + if (su == 0 || sc == 0) 1002 + goto invalid; 1001 1003 su_per_object = osize / su; 1004 + if (su_per_object == 0) 1005 + goto invalid; 1002 1006 dout("osize %u / su %u = su_per_object %u\n", osize, su, 1003 1007 su_per_object); 1004 1008 1005 - BUG_ON((su & ~PAGE_MASK) != 0); 1009 + if ((su & ~PAGE_MASK) != 0) 1010 + goto invalid; 1011 + 1006 1012 /* bl = *off / su; */ 1007 1013 t = off; 1008 1014 do_div(t, su); ··· 1036 1030 *plen = *oxlen; 1037 1031 1038 1032 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 1033 + return 0; 1034 + 1035 + invalid: 1036 + dout(" invalid layout\n"); 1037 + *ono = 0; 1038 + *oxoff = 0; 1039 + *oxlen = 0; 1040 + return -EINVAL; 1039 1041 } 1040 1042 EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1041 1043
+2 -3
net/ceph/pagelist.c
··· 1 - 2 1 #include <linux/module.h> 3 2 #include <linux/gfp.h> 4 3 #include <linux/pagemap.h> ··· 133 134 ceph_pagelist_unmap_tail(pl); 134 135 while (pl->head.prev != c->page_lru) { 135 136 page = list_entry(pl->head.prev, struct page, lru); 136 - list_del(&page->lru); /* remove from pagelist */ 137 - list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ 137 + /* move from pagelist to reserve */ 138 + list_move_tail(&page->lru, &pl->free_list); 138 139 ++pl->num_pages_free; 139 140 } 140 141 pl->room = c->room;