Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

+12 -6

Documentation/ABI/testing/sysfs-bus-rbd

··· 25 25 26 26 The ceph unique client id that was assigned for this specific session. 27 27 28 + features 29 + 30 + A hexadecimal encoding of the feature bits for this image. 31 + 28 32 major 29 33 30 34 The block device major number. ··· 36 32 name 37 33 38 34 The name of the rbd image. 35 + 36 + image_id 37 + 38 + The unique id for the rbd image. (For rbd image format 1 39 + this is empty.) 39 40 40 41 pool 41 42 ··· 66 57 67 58 The current snapshot for which the device is mapped. 68 59 69 - create_snap 70 - 71 - Create a snapshot: 72 - 73 - $ echo <snap-name> > /sys/bus/rbd/devices/<dev-id>/snap_create 74 - 75 60 snap_* 76 61 77 62 A directory per each snapshot ··· 82 79 83 80 The size of the image when this snapshot was taken. 84 81 82 + snap_features 83 + 84 + A hexadecimal encoding of the feature bits for this snapshot. 85 85

+1189 -641

drivers/block/rbd.c

··· 41 41 42 42 #include "rbd_types.h" 43 43 44 + #define RBD_DEBUG /* Activate rbd_assert() calls */ 45 + 44 46 /* 45 47 * The basic unit of block I/O is a sector. It is interpreted in a 46 48 * number of contexts in Linux (blk, bio, genhd), but the default is ··· 52 50 #define SECTOR_SHIFT 9 53 51 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54 52 53 + /* It might be useful to have this defined elsewhere too */ 54 + 55 + #define U64_MAX ((u64) (~0ULL)) 56 + 55 57 #define RBD_DRV_NAME "rbd" 56 58 #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57 59 58 60 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59 61 60 62 #define RBD_MAX_SNAP_NAME_LEN 32 63 + #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 61 64 #define RBD_MAX_OPT_LEN 1024 62 65 63 66 #define RBD_SNAP_HEAD_NAME "-" 67 + 68 + #define RBD_IMAGE_ID_LEN_MAX 64 69 + #define RBD_OBJ_PREFIX_LEN_MAX 64 64 70 65 71 /* 66 72 * An RBD device name will be "rbd#", where the "rbd" comes from ··· 79 69 #define DEV_NAME_LEN 32 80 70 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 81 71 82 - #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 72 + #define RBD_READ_ONLY_DEFAULT false 83 73 84 74 /* 85 75 * block device image metadata (in-memory version) 86 76 */ 87 77 struct rbd_image_header { 88 - u64 image_size; 78 + /* These four fields never change for a given rbd image */ 89 79 char *object_prefix; 80 + u64 features; 90 81 __u8 obj_order; 91 82 __u8 crypt_type; 92 83 __u8 comp_type; 93 - struct ceph_snap_context *snapc; 94 - size_t snap_names_len; 95 - u32 total_snaps; 96 84 85 + /* The remaining fields need to be updated occasionally */ 86 + u64 image_size; 87 + struct ceph_snap_context *snapc; 97 88 char *snap_names; 98 89 u64 *snap_sizes; 99 90 ··· 102 91 }; 103 92 104 93 struct rbd_options { 105 - int notify_timeout; 94 + bool read_only; 106 95 }; 107 96 108 97 /* ··· 110 99 */ 111 100 struct rbd_client { 112 101 struct ceph_client *client; 113 - struct rbd_options *rbd_opts; 114 102 struct kref kref; 115 103 struct list_head node; 116 104 }; ··· 151 141 u64 size; 152 142 struct list_head node; 153 143 u64 id; 144 + u64 features; 145 + }; 146 + 147 + struct rbd_mapping { 148 + char *snap_name; 149 + u64 snap_id; 150 + u64 size; 151 + u64 features; 152 + bool snap_exists; 153 + bool read_only; 154 154 }; 155 155 156 156 /* ··· 171 151 172 152 int major; /* blkdev assigned major */ 173 153 struct gendisk *disk; /* blkdev's gendisk and rq */ 174 - struct request_queue *q; 175 154 155 + u32 image_format; /* Either 1 or 2 */ 156 + struct rbd_options rbd_opts; 176 157 struct rbd_client *rbd_client; 177 158 178 159 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ ··· 181 160 spinlock_t lock; /* queue lock */ 182 161 183 162 struct rbd_image_header header; 163 + char *image_id; 164 + size_t image_id_len; 184 165 char *image_name; 185 166 size_t image_name_len; 186 167 char *header_name; ··· 194 171 195 172 /* protects updating the header */ 196 173 struct rw_semaphore header_rwsem; 197 - /* name of the snapshot this device reads from */ 198 - char *snap_name; 199 - /* id of the snapshot this device reads from */ 200 - u64 snap_id; /* current snapshot id */ 201 - /* whether the snap_id this device reads from still exists */ 202 - bool snap_exists; 203 - int read_only; 174 + 175 + struct rbd_mapping mapping; 204 176 205 177 struct list_head node; 206 178 ··· 214 196 static LIST_HEAD(rbd_client_list); /* clients */ 215 197 static DEFINE_SPINLOCK(rbd_client_list_lock); 216 198 217 - static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 199 + static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 200 + static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 201 + 218 202 static void rbd_dev_release(struct device *dev); 219 - static ssize_t rbd_snap_add(struct device *dev, 220 - struct device_attribute *attr, 221 - const char *buf, 222 - size_t count); 223 203 static void __rbd_remove_snap_dev(struct rbd_snap *snap); 224 204 225 205 static ssize_t rbd_add(struct bus_type *bus, const char *buf, ··· 245 229 .release = rbd_root_dev_release, 246 230 }; 247 231 232 + #ifdef RBD_DEBUG 233 + #define rbd_assert(expr) \ 234 + if (unlikely(!(expr))) { \ 235 + printk(KERN_ERR "\nAssertion failure in %s() " \ 236 + "at line %d:\n\n" \ 237 + "\trbd_assert(%s);\n\n", \ 238 + __func__, __LINE__, #expr); \ 239 + BUG(); \ 240 + } 241 + #else /* !RBD_DEBUG */ 242 + # define rbd_assert(expr) ((void) 0) 243 + #endif /* !RBD_DEBUG */ 248 244 249 245 static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 250 246 { ··· 274 246 { 275 247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 276 248 277 - if ((mode & FMODE_WRITE) && rbd_dev->read_only) 249 + if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 278 250 return -EROFS; 279 251 280 252 rbd_get_dev(rbd_dev); 281 - set_device_ro(bdev, rbd_dev->read_only); 253 + set_device_ro(bdev, rbd_dev->mapping.read_only); 282 254 283 255 return 0; 284 256 } ··· 302 274 * Initialize an rbd client instance. 303 275 * We own *ceph_opts. 304 276 */ 305 - static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, 306 - struct rbd_options *rbd_opts) 277 + static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 307 278 { 308 279 struct rbd_client *rbdc; 309 280 int ret = -ENOMEM; ··· 326 299 if (ret < 0) 327 300 goto out_err; 328 301 329 - rbdc->rbd_opts = rbd_opts; 330 - 331 302 spin_lock(&rbd_client_list_lock); 332 303 list_add_tail(&rbdc->node, &rbd_client_list); 333 304 spin_unlock(&rbd_client_list_lock); ··· 347 322 } 348 323 349 324 /* 350 - * Find a ceph client with specific addr and configuration. 325 + * Find a ceph client with specific addr and configuration. If 326 + * found, bump its reference count. 351 327 */ 352 - static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) 328 + static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 353 329 { 354 330 struct rbd_client *client_node; 331 + bool found = false; 355 332 356 333 if (ceph_opts->flags & CEPH_OPT_NOSHARE) 357 334 return NULL; 358 335 359 - list_for_each_entry(client_node, &rbd_client_list, node) 360 - if (!ceph_compare_options(ceph_opts, client_node->client)) 361 - return client_node; 362 - return NULL; 336 + spin_lock(&rbd_client_list_lock); 337 + list_for_each_entry(client_node, &rbd_client_list, node) { 338 + if (!ceph_compare_options(ceph_opts, client_node->client)) { 339 + kref_get(&client_node->kref); 340 + found = true; 341 + break; 342 + } 343 + } 344 + spin_unlock(&rbd_client_list_lock); 345 + 346 + return found ? client_node : NULL; 363 347 } 364 348 365 349 /* 366 350 * mount options 367 351 */ 368 352 enum { 369 - Opt_notify_timeout, 370 353 Opt_last_int, 371 354 /* int args above */ 372 355 Opt_last_string, 373 356 /* string args above */ 357 + Opt_read_only, 358 + Opt_read_write, 359 + /* Boolean args above */ 360 + Opt_last_bool, 374 361 }; 375 362 376 363 static match_table_t rbd_opts_tokens = { 377 - {Opt_notify_timeout, "notify_timeout=%d"}, 378 364 /* int args above */ 379 365 /* string args above */ 366 + {Opt_read_only, "mapping.read_only"}, 367 + {Opt_read_only, "ro"}, /* Alternate spelling */ 368 + {Opt_read_write, "read_write"}, 369 + {Opt_read_write, "rw"}, /* Alternate spelling */ 370 + /* Boolean args above */ 380 371 {-1, NULL} 381 372 }; 382 373 ··· 417 376 } else if (token > Opt_last_int && token < Opt_last_string) { 418 377 dout("got string token %d val %s\n", token, 419 378 argstr[0].from); 379 + } else if (token > Opt_last_string && token < Opt_last_bool) { 380 + dout("got Boolean token %d\n", token); 420 381 } else { 421 382 dout("got token %d\n", token); 422 383 } 423 384 424 385 switch (token) { 425 - case Opt_notify_timeout: 426 - rbd_opts->notify_timeout = intval; 386 + case Opt_read_only: 387 + rbd_opts->read_only = true; 388 + break; 389 + case Opt_read_write: 390 + rbd_opts->read_only = false; 427 391 break; 428 392 default: 429 - BUG_ON(token); 393 + rbd_assert(false); 394 + break; 430 395 } 431 396 return 0; 432 397 } ··· 441 394 * Get a ceph client with specific addr and configuration, if one does 442 395 * not exist create it. 443 396 */ 444 - static struct rbd_client *rbd_get_client(const char *mon_addr, 445 - size_t mon_addr_len, 446 - char *options) 397 + static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 398 + size_t mon_addr_len, char *options) 447 399 { 448 - struct rbd_client *rbdc; 400 + struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; 449 401 struct ceph_options *ceph_opts; 450 - struct rbd_options *rbd_opts; 402 + struct rbd_client *rbdc; 451 403 452 - rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); 453 - if (!rbd_opts) 454 - return ERR_PTR(-ENOMEM); 455 - 456 - rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 404 + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 457 405 458 406 ceph_opts = ceph_parse_options(options, mon_addr, 459 407 mon_addr + mon_addr_len, 460 408 parse_rbd_opts_token, rbd_opts); 461 - if (IS_ERR(ceph_opts)) { 462 - kfree(rbd_opts); 463 - return ERR_CAST(ceph_opts); 464 - } 409 + if (IS_ERR(ceph_opts)) 410 + return PTR_ERR(ceph_opts); 465 411 466 - spin_lock(&rbd_client_list_lock); 467 - rbdc = __rbd_client_find(ceph_opts); 412 + rbdc = rbd_client_find(ceph_opts); 468 413 if (rbdc) { 469 414 /* using an existing client */ 470 - kref_get(&rbdc->kref); 471 - spin_unlock(&rbd_client_list_lock); 472 - 473 415 ceph_destroy_options(ceph_opts); 474 - kfree(rbd_opts); 475 - 476 - return rbdc; 416 + } else { 417 + rbdc = rbd_client_create(ceph_opts); 418 + if (IS_ERR(rbdc)) 419 + return PTR_ERR(rbdc); 477 420 } 478 - spin_unlock(&rbd_client_list_lock); 421 + rbd_dev->rbd_client = rbdc; 479 422 480 - rbdc = rbd_client_create(ceph_opts, rbd_opts); 481 - 482 - if (IS_ERR(rbdc)) 483 - kfree(rbd_opts); 484 - 485 - return rbdc; 423 + return 0; 486 424 } 487 425 488 426 /* ··· 485 453 spin_unlock(&rbd_client_list_lock); 486 454 487 455 ceph_destroy_client(rbdc->client); 488 - kfree(rbdc->rbd_opts); 489 456 kfree(rbdc); 490 457 } 491 458 ··· 510 479 kfree(coll); 511 480 } 512 481 482 + static bool rbd_image_format_valid(u32 image_format) 483 + { 484 + return image_format == 1 || image_format == 2; 485 + } 486 + 513 487 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 514 488 { 515 - return !memcmp(&ondisk->text, 516 - RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); 489 + size_t size; 490 + u32 snap_count; 491 + 492 + /* The header has to start with the magic rbd header text */ 493 + if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 494 + return false; 495 + 496 + /* 497 + * The size of a snapshot header has to fit in a size_t, and 498 + * that limits the number of snapshots. 499 + */ 500 + snap_count = le32_to_cpu(ondisk->snap_count); 501 + size = SIZE_MAX - sizeof (struct ceph_snap_context); 502 + if (snap_count > size / sizeof (__le64)) 503 + return false; 504 + 505 + /* 506 + * Not only that, but the size of the entire the snapshot 507 + * header must also be representable in a size_t. 508 + */ 509 + size -= snap_count * sizeof (__le64); 510 + if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 511 + return false; 512 + 513 + return true; 517 514 } 518 515 519 516 /* ··· 549 490 * header. 550 491 */ 551 492 static int rbd_header_from_disk(struct rbd_image_header *header, 552 - struct rbd_image_header_ondisk *ondisk, 553 - u32 allocated_snaps) 493 + struct rbd_image_header_ondisk *ondisk) 554 494 { 555 495 u32 snap_count; 496 + size_t len; 497 + size_t size; 498 + u32 i; 556 499 557 - if (!rbd_dev_ondisk_valid(ondisk)) 558 - return -ENXIO; 500 + memset(header, 0, sizeof (*header)); 559 501 560 502 snap_count = le32_to_cpu(ondisk->snap_count); 561 - if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) 562 - / sizeof (u64)) 563 - return -EINVAL; 564 - header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 565 - snap_count * sizeof(u64), 566 - GFP_KERNEL); 567 - if (!header->snapc) 503 + 504 + len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 505 + header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 506 + if (!header->object_prefix) 568 507 return -ENOMEM; 508 + memcpy(header->object_prefix, ondisk->object_prefix, len); 509 + header->object_prefix[len] = '\0'; 569 510 570 511 if (snap_count) { 571 - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 572 - header->snap_names = kmalloc(header->snap_names_len, 573 - GFP_KERNEL); 512 + u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 513 + 514 + /* Save a copy of the snapshot names */ 515 + 516 + if (snap_names_len > (u64) SIZE_MAX) 517 + return -EIO; 518 + header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 574 519 if (!header->snap_names) 575 - goto err_snapc; 576 - header->snap_sizes = kmalloc(snap_count * sizeof(u64), 577 - GFP_KERNEL); 520 + goto out_err; 521 + /* 522 + * Note that rbd_dev_v1_header_read() guarantees 523 + * the ondisk buffer we're working with has 524 + * snap_names_len bytes beyond the end of the 525 + * snapshot id array, this memcpy() is safe. 526 + */ 527 + memcpy(header->snap_names, &ondisk->snaps[snap_count], 528 + snap_names_len); 529 + 530 + /* Record each snapshot's size */ 531 + 532 + size = snap_count * sizeof (*header->snap_sizes); 533 + header->snap_sizes = kmalloc(size, GFP_KERNEL); 578 534 if (!header->snap_sizes) 579 - goto err_names; 535 + goto out_err; 536 + for (i = 0; i < snap_count; i++) 537 + header->snap_sizes[i] = 538 + le64_to_cpu(ondisk->snaps[i].image_size); 580 539 } else { 581 540 WARN_ON(ondisk->snap_names_len); 582 - header->snap_names_len = 0; 583 541 header->snap_names = NULL; 584 542 header->snap_sizes = NULL; 585 543 } 586 544 587 - header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, 588 - GFP_KERNEL); 589 - if (!header->object_prefix) 590 - goto err_sizes; 591 - 592 - memcpy(header->object_prefix, ondisk->block_name, 593 - sizeof(ondisk->block_name)); 594 - header->object_prefix[sizeof (ondisk->block_name)] = '\0'; 595 - 596 - header->image_size = le64_to_cpu(ondisk->image_size); 545 + header->features = 0; /* No features support in v1 images */ 597 546 header->obj_order = ondisk->options.order; 598 547 header->crypt_type = ondisk->options.crypt_type; 599 548 header->comp_type = ondisk->options.comp_type; 600 549 550 + /* Allocate and fill in the snapshot context */ 551 + 552 + header->image_size = le64_to_cpu(ondisk->image_size); 553 + size = sizeof (struct ceph_snap_context); 554 + size += snap_count * sizeof (header->snapc->snaps[0]); 555 + header->snapc = kzalloc(size, GFP_KERNEL); 556 + if (!header->snapc) 557 + goto out_err; 558 + 601 559 atomic_set(&header->snapc->nref, 1); 602 560 header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 603 561 header->snapc->num_snaps = snap_count; 604 - header->total_snaps = snap_count; 605 - 606 - if (snap_count && allocated_snaps == snap_count) { 607 - int i; 608 - 609 - for (i = 0; i < snap_count; i++) { 610 - header->snapc->snaps[i] = 611 - le64_to_cpu(ondisk->snaps[i].id); 612 - header->snap_sizes[i] = 613 - le64_to_cpu(ondisk->snaps[i].image_size); 614 - } 615 - 616 - /* copy snapshot names */ 617 - memcpy(header->snap_names, &ondisk->snaps[snap_count], 618 - header->snap_names_len); 619 - } 562 + for (i = 0; i < snap_count; i++) 563 + header->snapc->snaps[i] = 564 + le64_to_cpu(ondisk->snaps[i].id); 620 565 621 566 return 0; 622 567 623 - err_sizes: 568 + out_err: 624 569 kfree(header->snap_sizes); 625 570 header->snap_sizes = NULL; 626 - err_names: 627 571 kfree(header->snap_names); 628 572 header->snap_names = NULL; 629 - err_snapc: 630 - kfree(header->snapc); 631 - header->snapc = NULL; 573 + kfree(header->object_prefix); 574 + header->object_prefix = NULL; 632 575 633 576 return -ENOMEM; 634 577 } 635 578 636 - static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 637 - u64 *seq, u64 *size) 579 + static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 638 580 { 639 - int i; 640 - char *p = header->snap_names; 641 581 642 - for (i = 0; i < header->total_snaps; i++) { 643 - if (!strcmp(snap_name, p)) { 582 + struct rbd_snap *snap; 644 583 645 - /* Found it. Pass back its id and/or size */ 584 + list_for_each_entry(snap, &rbd_dev->snaps, node) { 585 + if (!strcmp(snap_name, snap->name)) { 586 + rbd_dev->mapping.snap_id = snap->id; 587 + rbd_dev->mapping.size = snap->size; 588 + rbd_dev->mapping.features = snap->features; 646 589 647 - if (seq) 648 - *seq = header->snapc->snaps[i]; 649 - if (size) 650 - *size = header->snap_sizes[i]; 651 - return i; 590 + return 0; 652 591 } 653 - p += strlen(p) + 1; /* Skip ahead to the next name */ 654 592 } 593 + 655 594 return -ENOENT; 656 595 } 657 596 658 - static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) 597 + static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) 659 598 { 660 599 int ret; 661 600 662 - down_write(&rbd_dev->header_rwsem); 663 - 664 - if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 601 + if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, 665 602 sizeof (RBD_SNAP_HEAD_NAME))) { 666 - rbd_dev->snap_id = CEPH_NOSNAP; 667 - rbd_dev->snap_exists = false; 668 - rbd_dev->read_only = 0; 669 - if (size) 670 - *size = rbd_dev->header.image_size; 603 + rbd_dev->mapping.snap_id = CEPH_NOSNAP; 604 + rbd_dev->mapping.size = rbd_dev->header.image_size; 605 + rbd_dev->mapping.features = rbd_dev->header.features; 606 + rbd_dev->mapping.snap_exists = false; 607 + rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; 608 + ret = 0; 671 609 } else { 672 - u64 snap_id = 0; 673 - 674 - ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, 675 - &snap_id, size); 610 + ret = snap_by_name(rbd_dev, snap_name); 676 611 if (ret < 0) 677 612 goto done; 678 - rbd_dev->snap_id = snap_id; 679 - rbd_dev->snap_exists = true; 680 - rbd_dev->read_only = 1; 613 + rbd_dev->mapping.snap_exists = true; 614 + rbd_dev->mapping.read_only = true; 681 615 } 682 - 683 - ret = 0; 616 + rbd_dev->mapping.snap_name = snap_name; 684 617 done: 685 - up_write(&rbd_dev->header_rwsem); 686 618 return ret; 687 619 } 688 620 689 621 static void rbd_header_free(struct rbd_image_header *header) 690 622 { 691 623 kfree(header->object_prefix); 624 + header->object_prefix = NULL; 692 625 kfree(header->snap_sizes); 626 + header->snap_sizes = NULL; 693 627 kfree(header->snap_names); 628 + header->snap_names = NULL; 694 629 ceph_put_snap_context(header->snapc); 630 + header->snapc = NULL; 695 631 } 696 632 697 - /* 698 - * get the actual striped segment name, offset and length 699 - */ 700 - static u64 rbd_get_segment(struct rbd_image_header *header, 701 - const char *object_prefix, 702 - u64 ofs, u64 len, 703 - char *seg_name, u64 *segofs) 633 + static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 704 634 { 705 - u64 seg = ofs >> header->obj_order; 635 + char *name; 636 + u64 segment; 637 + int ret; 706 638 707 - if (seg_name) 708 - snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 709 - "%s.%012llx", object_prefix, seg); 639 + name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 640 + if (!name) 641 + return NULL; 642 + segment = offset >> rbd_dev->header.obj_order; 643 + ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", 644 + rbd_dev->header.object_prefix, segment); 645 + if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { 646 + pr_err("error formatting segment name for #%llu (%d)\n", 647 + segment, ret); 648 + kfree(name); 649 + name = NULL; 650 + } 710 651 711 - ofs = ofs & ((1 << header->obj_order) - 1); 712 - len = min_t(u64, len, (1 << header->obj_order) - ofs); 652 + return name; 653 + } 713 654 714 - if (segofs) 715 - *segofs = ofs; 655 + static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 656 + { 657 + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 716 658 717 - return len; 659 + return offset & (segment_size - 1); 660 + } 661 + 662 + static u64 rbd_segment_length(struct rbd_device *rbd_dev, 663 + u64 offset, u64 length) 664 + { 665 + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 666 + 667 + offset &= segment_size - 1; 668 + 669 + rbd_assert(length <= U64_MAX - offset); 670 + if (offset + length > segment_size) 671 + length = segment_size - offset; 672 + 673 + return length; 718 674 } 719 675 720 676 static int rbd_get_num_segments(struct rbd_image_header *header, 721 677 u64 ofs, u64 len) 722 678 { 723 - u64 start_seg = ofs >> header->obj_order; 724 - u64 end_seg = (ofs + len - 1) >> header->obj_order; 679 + u64 start_seg; 680 + u64 end_seg; 681 + 682 + if (!len) 683 + return 0; 684 + if (len - 1 > U64_MAX - ofs) 685 + return -ERANGE; 686 + 687 + start_seg = ofs >> header->obj_order; 688 + end_seg = (ofs + len - 1) >> header->obj_order; 689 + 725 690 return end_seg - start_seg + 1; 726 691 } 727 692 ··· 807 724 struct bio_pair **bp, 808 725 int len, gfp_t gfpmask) 809 726 { 810 - struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 727 + struct bio *old_chain = *old; 728 + struct bio *new_chain = NULL; 729 + struct bio *tail; 811 730 int total = 0; 812 731 813 732 if (*bp) { ··· 818 733 } 819 734 820 735 while (old_chain && (total < len)) { 736 + struct bio *tmp; 737 + 821 738 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 822 739 if (!tmp) 823 740 goto err_out; 741 + gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ 824 742 825 743 if (total + old_chain->bi_size > len) { 826 744 struct bio_pair *bp; ··· 851 763 } 852 764 853 765 tmp->bi_bdev = NULL; 854 - gfpmask &= ~__GFP_WAIT; 855 766 tmp->bi_next = NULL; 856 - 857 - if (!new_chain) { 858 - new_chain = tail = tmp; 859 - } else { 767 + if (new_chain) 860 768 tail->bi_next = tmp; 861 - tail = tmp; 862 - } 769 + else 770 + new_chain = tmp; 771 + tail = tmp; 863 772 old_chain = old_chain->bi_next; 864 773 865 774 total += tmp->bi_size; 866 775 } 867 776 868 - BUG_ON(total < len); 869 - 870 - if (tail) 871 - tail->bi_next = NULL; 777 + rbd_assert(total == len); 872 778 873 779 *old = old_chain; 874 780 ··· 1020 938 layout->fl_stripe_count = cpu_to_le32(1); 1021 939 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1022 940 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); 1023 - ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 1024 - req, ops); 941 + ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 942 + req, ops); 943 + rbd_assert(ret == 0); 1025 944 1026 945 ceph_osdc_build_request(req, ofs, &len, 1027 946 ops, ··· 1113 1030 int flags, 1114 1031 struct ceph_osd_req_op *ops, 1115 1032 const char *object_name, 1116 - u64 ofs, u64 len, 1117 - char *buf, 1033 + u64 ofs, u64 inbound_size, 1034 + char *inbound, 1118 1035 struct ceph_osd_request **linger_req, 1119 1036 u64 *ver) 1120 1037 { ··· 1122 1039 struct page **pages; 1123 1040 int num_pages; 1124 1041 1125 - BUG_ON(ops == NULL); 1042 + rbd_assert(ops != NULL); 1126 1043 1127 - num_pages = calc_pages_for(ofs , len); 1044 + num_pages = calc_pages_for(ofs, inbound_size); 1128 1045 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1129 1046 if (IS_ERR(pages)) 1130 1047 return PTR_ERR(pages); 1131 1048 1132 1049 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1133 - object_name, ofs, len, NULL, 1050 + object_name, ofs, inbound_size, NULL, 1134 1051 pages, num_pages, 1135 1052 flags, 1136 1053 ops, ··· 1140 1057 if (ret < 0) 1141 1058 goto done; 1142 1059 1143 - if ((flags & CEPH_OSD_FLAG_READ) && buf) 1144 - ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1060 + if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1061 + ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1145 1062 1146 1063 done: 1147 1064 ceph_release_page_vector(pages, num_pages); ··· 1168 1085 struct ceph_osd_req_op *ops; 1169 1086 u32 payload_len; 1170 1087 1171 - seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1088 + seg_name = rbd_segment_name(rbd_dev, ofs); 1172 1089 if (!seg_name) 1173 1090 return -ENOMEM; 1174 - 1175 - seg_len = rbd_get_segment(&rbd_dev->header, 1176 - rbd_dev->header.object_prefix, 1177 - ofs, len, 1178 - seg_name, &seg_ofs); 1091 + seg_len = rbd_segment_length(rbd_dev, ofs, len); 1092 + seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1179 1093 1180 1094 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1181 1095 ··· 1184 1104 /* we've taken care of segment sizes earlier when we 1185 1105 cloned the bios. We should never have a segment 1186 1106 truncated at this point */ 1187 - BUG_ON(seg_len < len); 1107 + rbd_assert(seg_len == len); 1188 1108 1189 1109 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1190 1110 seg_name, seg_ofs, seg_len, ··· 1386 1306 return ret; 1387 1307 } 1388 1308 1389 - struct rbd_notify_info { 1390 - struct rbd_device *rbd_dev; 1391 - }; 1392 - 1393 - static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 1394 - { 1395 - struct rbd_device *rbd_dev = (struct rbd_device *)data; 1396 - if (!rbd_dev) 1397 - return; 1398 - 1399 - dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", 1400 - rbd_dev->header_name, (unsigned long long) notify_id, 1401 - (unsigned int) opcode); 1402 - } 1403 - 1404 1309 /* 1405 - * Request sync osd notify 1406 - */ 1407 - static int rbd_req_sync_notify(struct rbd_device *rbd_dev) 1408 - { 1409 - struct ceph_osd_req_op *ops; 1410 - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1411 - struct ceph_osd_event *event; 1412 - struct rbd_notify_info info; 1413 - int payload_len = sizeof(u32) + sizeof(u32); 1414 - int ret; 1415 - 1416 - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); 1417 - if (!ops) 1418 - return -ENOMEM; 1419 - 1420 - info.rbd_dev = rbd_dev; 1421 - 1422 - ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, 1423 - (void *)&info, &event); 1424 - if (ret < 0) 1425 - goto fail; 1426 - 1427 - ops[0].watch.ver = 1; 1428 - ops[0].watch.flag = 1; 1429 - ops[0].watch.cookie = event->cookie; 1430 - ops[0].watch.prot_ver = RADOS_NOTIFY_VER; 1431 - ops[0].watch.timeout = 12; 1432 - 1433 - ret = rbd_req_sync_op(rbd_dev, NULL, 1434 - CEPH_NOSNAP, 1435 - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1436 - ops, 1437 - rbd_dev->header_name, 1438 - 0, 0, NULL, NULL, NULL); 1439 - if (ret < 0) 1440 - goto fail_event; 1441 - 1442 - ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); 1443 - dout("ceph_osdc_wait_event returned %d\n", ret); 1444 - rbd_destroy_ops(ops); 1445 - return 0; 1446 - 1447 - fail_event: 1448 - ceph_osdc_cancel_event(event); 1449 - fail: 1450 - rbd_destroy_ops(ops); 1451 - return ret; 1452 - } 1453 - 1454 - /* 1455 - * Request sync osd read 1310 + * Synchronous osd object method call 1456 1311 */ 1457 1312 static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1458 1313 const char *object_name, 1459 1314 const char *class_name, 1460 1315 const char *method_name, 1461 - const char *data, 1462 - int len, 1316 + const char *outbound, 1317 + size_t outbound_size, 1318 + char *inbound, 1319 + size_t inbound_size, 1320 + int flags, 1463 1321 u64 *ver) 1464 1322 { 1465 1323 struct ceph_osd_req_op *ops; 1466 1324 int class_name_len = strlen(class_name); 1467 1325 int method_name_len = strlen(method_name); 1326 + int payload_size; 1468 1327 int ret; 1469 1328 1470 - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, 1471 - class_name_len + method_name_len + len); 1329 + /* 1330 + * Any input parameters required by the method we're calling 1331 + * will be sent along with the class and method names as 1332 + * part of the message payload. That data and its size are 1333 + * supplied via the indata and indata_len fields (named from 1334 + * the perspective of the server side) in the OSD request 1335 + * operation. 1336 + */ 1337 + payload_size = class_name_len + method_name_len + outbound_size; 1338 + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 1472 1339 if (!ops) 1473 1340 return -ENOMEM; 1474 1341 ··· 1424 1397 ops[0].cls.method_name = method_name; 1425 1398 ops[0].cls.method_len = (__u8) method_name_len; 1426 1399 ops[0].cls.argc = 0; 1427 - ops[0].cls.indata = data; 1428 - ops[0].cls.indata_len = len; 1400 + ops[0].cls.indata = outbound; 1401 + ops[0].cls.indata_len = outbound_size; 1429 1402 1430 1403 ret = rbd_req_sync_op(rbd_dev, NULL, 1431 1404 CEPH_NOSNAP, 1432 - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1433 - ops, 1434 - object_name, 0, 0, NULL, NULL, ver); 1405 + flags, ops, 1406 + object_name, 0, inbound_size, inbound, 1407 + NULL, ver); 1435 1408 1436 1409 rbd_destroy_ops(ops); 1437 1410 ··· 1473 1446 struct rbd_req_coll *coll; 1474 1447 struct ceph_snap_context *snapc; 1475 1448 1476 - /* peek at request from block layer */ 1477 - if (!rq) 1478 - break; 1479 - 1480 1449 dout("fetched request\n"); 1481 1450 1482 1451 /* filter out block requests we don't understand */ ··· 1487 1464 size = blk_rq_bytes(rq); 1488 1465 ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1489 1466 rq_bio = rq->bio; 1490 - if (do_write && rbd_dev->read_only) { 1467 + if (do_write && rbd_dev->mapping.read_only) { 1491 1468 __blk_end_request_all(rq, -EROFS); 1492 1469 continue; 1493 1470 } ··· 1496 1473 1497 1474 down_read(&rbd_dev->header_rwsem); 1498 1475 1499 - if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { 1476 + if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && 1477 + !rbd_dev->mapping.snap_exists) { 1500 1478 up_read(&rbd_dev->header_rwsem); 1501 1479 dout("request for non-existent snapshot"); 1502 1480 spin_lock_irq(q->queue_lock); ··· 1514 1490 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1515 1491 1516 1492 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1493 + if (num_segs <= 0) { 1494 + spin_lock_irq(q->queue_lock); 1495 + __blk_end_request_all(rq, num_segs); 1496 + ceph_put_snap_context(snapc); 1497 + continue; 1498 + } 1517 1499 coll = rbd_alloc_coll(num_segs); 1518 1500 if (!coll) { 1519 1501 spin_lock_irq(q->queue_lock); ··· 1531 1501 do { 1532 1502 /* a bio clone to be passed down to OSD req */ 1533 1503 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1534 - op_size = rbd_get_segment(&rbd_dev->header, 1535 - rbd_dev->header.object_prefix, 1536 - ofs, size, 1537 - NULL, NULL); 1504 + op_size = rbd_segment_length(rbd_dev, ofs, size); 1538 1505 kref_get(&coll->kref); 1539 1506 bio = bio_chain_clone(&rq_bio, &next_bio, &bp, 1540 1507 op_size, GFP_ATOMIC); ··· 1551 1524 coll, cur_seg); 1552 1525 else 1553 1526 rbd_req_read(rq, rbd_dev, 1554 - rbd_dev->snap_id, 1527 + rbd_dev->mapping.snap_id, 1555 1528 ofs, 1556 1529 op_size, bio, 1557 1530 coll, cur_seg); ··· 1607 1580 if (!disk) 1608 1581 return; 1609 1582 1610 - rbd_header_free(&rbd_dev->header); 1611 - 1612 1583 if (disk->flags & GENHD_FL_UP) 1613 1584 del_gendisk(disk); 1614 1585 if (disk->queue) ··· 1615 1590 } 1616 1591 1617 1592 /* 1618 - * reload the ondisk the header 1593 + * Read the complete header for the given rbd device. 1594 + * 1595 + * Returns a pointer to a dynamically-allocated buffer containing 1596 + * the complete and validated header. Caller can pass the address 1597 + * of a variable that will be filled in with the version of the 1598 + * header object at the time it was read. 1599 + * 1600 + * Returns a pointer-coded errno if a failure occurs. 1601 + */ 1602 + static struct rbd_image_header_ondisk * 1603 + rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 1604 + { 1605 + struct rbd_image_header_ondisk *ondisk = NULL; 1606 + u32 snap_count = 0; 1607 + u64 names_size = 0; 1608 + u32 want_count; 1609 + int ret; 1610 + 1611 + /* 1612 + * The complete header will include an array of its 64-bit 1613 + * snapshot ids, followed by the names of those snapshots as 1614 + * a contiguous block of NUL-terminated strings. Note that 1615 + * the number of snapshots could change by the time we read 1616 + * it in, in which case we re-read it. 1617 + */ 1618 + do { 1619 + size_t size; 1620 + 1621 + kfree(ondisk); 1622 + 1623 + size = sizeof (*ondisk); 1624 + size += snap_count * sizeof (struct rbd_image_snap_ondisk); 1625 + size += names_size; 1626 + ondisk = kmalloc(size, GFP_KERNEL); 1627 + if (!ondisk) 1628 + return ERR_PTR(-ENOMEM); 1629 + 1630 + ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 1631 + rbd_dev->header_name, 1632 + 0, size, 1633 + (char *) ondisk, version); 1634 + 1635 + if (ret < 0) 1636 + goto out_err; 1637 + if (WARN_ON((size_t) ret < size)) { 1638 + ret = -ENXIO; 1639 + pr_warning("short header read for image %s" 1640 + " (want %zd got %d)\n", 1641 + rbd_dev->image_name, size, ret); 1642 + goto out_err; 1643 + } 1644 + if (!rbd_dev_ondisk_valid(ondisk)) { 1645 + ret = -ENXIO; 1646 + pr_warning("invalid header for image %s\n", 1647 + rbd_dev->image_name); 1648 + goto out_err; 1649 + } 1650 + 1651 + names_size = le64_to_cpu(ondisk->snap_names_len); 1652 + want_count = snap_count; 1653 + snap_count = le32_to_cpu(ondisk->snap_count); 1654 + } while (snap_count != want_count); 1655 + 1656 + return ondisk; 1657 + 1658 + out_err: 1659 + kfree(ondisk); 1660 + 1661 + return ERR_PTR(ret); 1662 + } 1663 + 1664 + /* 1665 + * reload the ondisk the header 1619 1666 */ 1620 1667 static int rbd_read_header(struct rbd_device *rbd_dev, 1621 1668 struct rbd_image_header *header) 1622 1669 { 1623 - ssize_t rc; 1624 - struct rbd_image_header_ondisk *dh; 1625 - u32 snap_count = 0; 1626 - u64 ver; 1627 - size_t len; 1628 - 1629 - /* 1630 - * First reads the fixed-size header to determine the number 1631 - * of snapshots, then re-reads it, along with all snapshot 1632 - * records as well as their stored names. 1633 - */ 1634 - len = sizeof (*dh); 1635 - while (1) { 1636 - dh = kmalloc(len, GFP_KERNEL); 1637 - if (!dh) 1638 - return -ENOMEM; 1639 - 1640 - rc = rbd_req_sync_read(rbd_dev, 1641 - CEPH_NOSNAP, 1642 - rbd_dev->header_name, 1643 - 0, len, 1644 - (char *)dh, &ver); 1645 - if (rc < 0) 1646 - goto out_dh; 1647 - 1648 - rc = rbd_header_from_disk(header, dh, snap_count); 1649 - if (rc < 0) { 1650 - if (rc == -ENXIO) 1651 - pr_warning("unrecognized header format" 1652 - " for image %s\n", 1653 - rbd_dev->image_name); 1654 - goto out_dh; 1655 - } 1656 - 1657 - if (snap_count == header->total_snaps) 1658 - break; 1659 - 1660 - snap_count = header->total_snaps; 1661 - len = sizeof (*dh) + 1662 - snap_count * sizeof(struct rbd_image_snap_ondisk) + 1663 - header->snap_names_len; 1664 - 1665 - rbd_header_free(header); 1666 - kfree(dh); 1667 - } 1668 - header->obj_version = ver; 1669 - 1670 - out_dh: 1671 - kfree(dh); 1672 - return rc; 1673 - } 1674 - 1675 - /* 1676 - * create a snapshot 1677 - */ 1678 - static int rbd_header_add_snap(struct rbd_device *rbd_dev, 1679 - const char *snap_name, 1680 - gfp_t gfp_flags) 1681 - { 1682 - int name_len = strlen(snap_name); 1683 - u64 new_snapid; 1670 + struct rbd_image_header_ondisk *ondisk; 1671 + u64 ver = 0; 1684 1672 int ret; 1685 - void *data, *p, *e; 1686 - struct ceph_mon_client *monc; 1687 1673 1688 - /* we should create a snapshot only if we're pointing at the head */ 1689 - if (rbd_dev->snap_id != CEPH_NOSNAP) 1690 - return -EINVAL; 1674 + ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 1675 + if (IS_ERR(ondisk)) 1676 + return PTR_ERR(ondisk); 1677 + ret = rbd_header_from_disk(header, ondisk); 1678 + if (ret >= 0) 1679 + header->obj_version = ver; 1680 + kfree(ondisk); 1691 1681 1692 - monc = &rbd_dev->rbd_client->client->monc; 1693 - ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); 1694 - dout("created snapid=%llu\n", (unsigned long long) new_snapid); 1695 - if (ret < 0) 1696 - return ret; 1697 - 1698 - data = kmalloc(name_len + 16, gfp_flags); 1699 - if (!data) 1700 - return -ENOMEM; 1701 - 1702 - p = data; 1703 - e = data + name_len + 16; 1704 - 1705 - ceph_encode_string_safe(&p, e, snap_name, name_len, bad); 1706 - ceph_encode_64_safe(&p, e, new_snapid, bad); 1707 - 1708 - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 1709 - "rbd", "snap_add", 1710 - data, p - data, NULL); 1711 - 1712 - kfree(data); 1713 - 1714 - return ret < 0 ? ret : 0; 1715 - bad: 1716 - return -ERANGE; 1682 + return ret; 1717 1683 } 1718 1684 1719 1685 static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) ··· 1731 1715 down_write(&rbd_dev->header_rwsem); 1732 1716 1733 1717 /* resized? */ 1734 - if (rbd_dev->snap_id == CEPH_NOSNAP) { 1718 + if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { 1735 1719 sector_t size = (sector_t) h.image_size / SECTOR_SIZE; 1736 1720 1737 - dout("setting size to %llu sectors", (unsigned long long) size); 1738 - set_capacity(rbd_dev->disk, size); 1721 + if (size != (sector_t) rbd_dev->mapping.size) { 1722 + dout("setting size to %llu sectors", 1723 + (unsigned long long) size); 1724 + rbd_dev->mapping.size = (u64) size; 1725 + set_capacity(rbd_dev->disk, size); 1726 + } 1739 1727 } 1740 1728 1741 1729 /* rbd_dev->header.object_prefix shouldn't change */ ··· 1752 1732 *hver = h.obj_version; 1753 1733 rbd_dev->header.obj_version = h.obj_version; 1754 1734 rbd_dev->header.image_size = h.image_size; 1755 - rbd_dev->header.total_snaps = h.total_snaps; 1756 1735 rbd_dev->header.snapc = h.snapc; 1757 1736 rbd_dev->header.snap_names = h.snap_names; 1758 - rbd_dev->header.snap_names_len = h.snap_names_len; 1759 1737 rbd_dev->header.snap_sizes = h.snap_sizes; 1760 1738 /* Free the extra copy of the object prefix */ 1761 1739 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1762 1740 kfree(h.object_prefix); 1763 1741 1764 - ret = __rbd_init_snaps_header(rbd_dev); 1742 + ret = rbd_dev_snaps_update(rbd_dev); 1743 + if (!ret) 1744 + ret = rbd_dev_snaps_register(rbd_dev); 1765 1745 1766 1746 up_write(&rbd_dev->header_rwsem); 1767 1747 ··· 1783 1763 { 1784 1764 struct gendisk *disk; 1785 1765 struct request_queue *q; 1786 - int rc; 1787 1766 u64 segment_size; 1788 - u64 total_size = 0; 1789 - 1790 - /* contact OSD, request size info about the object being mapped */ 1791 - rc = rbd_read_header(rbd_dev, &rbd_dev->header); 1792 - if (rc) 1793 - return rc; 1794 - 1795 - /* no need to lock here, as rbd_dev is not registered yet */ 1796 - rc = __rbd_init_snaps_header(rbd_dev); 1797 - if (rc) 1798 - return rc; 1799 - 1800 - rc = rbd_header_set_snap(rbd_dev, &total_size); 1801 - if (rc) 1802 - return rc; 1803 1767 1804 1768 /* create gendisk info */ 1805 - rc = -ENOMEM; 1806 1769 disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1807 1770 if (!disk) 1808 - goto out; 1771 + return -ENOMEM; 1809 1772 1810 1773 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1811 1774 rbd_dev->dev_id); ··· 1798 1795 disk->private_data = rbd_dev; 1799 1796 1800 1797 /* init rq */ 1801 - rc = -ENOMEM; 1802 1798 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1803 1799 if (!q) 1804 1800 goto out_disk; ··· 1818 1816 q->queuedata = rbd_dev; 1819 1817 1820 1818 rbd_dev->disk = disk; 1821 - rbd_dev->q = q; 1822 1819 1823 - /* finally, announce the disk to the world */ 1824 - set_capacity(disk, total_size / SECTOR_SIZE); 1825 - add_disk(disk); 1820 + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 1826 1821 1827 - pr_info("%s: added with size 0x%llx\n", 1828 - disk->disk_name, (unsigned long long)total_size); 1829 1822 return 0; 1830 - 1831 1823 out_disk: 1832 1824 put_disk(disk); 1833 - out: 1834 - return rc; 1825 + 1826 + return -ENOMEM; 1835 1827 } 1836 1828 1837 1829 /* ··· 1848 1852 up_read(&rbd_dev->header_rwsem); 1849 1853 1850 1854 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1855 + } 1856 + 1857 + /* 1858 + * Note this shows the features for whatever's mapped, which is not 1859 + * necessarily the base image. 1860 + */ 1861 + static ssize_t rbd_features_show(struct device *dev, 1862 + struct device_attribute *attr, char *buf) 1863 + { 1864 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1865 + 1866 + return sprintf(buf, "0x%016llx\n", 1867 + (unsigned long long) rbd_dev->mapping.features); 1851 1868 } 1852 1869 1853 1870 static ssize_t rbd_major_show(struct device *dev, ··· 1904 1895 return sprintf(buf, "%s\n", rbd_dev->image_name); 1905 1896 } 1906 1897 1898 + static ssize_t rbd_image_id_show(struct device *dev, 1899 + struct device_attribute *attr, char *buf) 1900 + { 1901 + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1902 + 1903 + return sprintf(buf, "%s\n", rbd_dev->image_id); 1904 + } 1905 + 1906 + /* 1907 + * Shows the name of the currently-mapped snapshot (or 1908 + * RBD_SNAP_HEAD_NAME for the base image). 1909 + */ 1907 1910 static ssize_t rbd_snap_show(struct device *dev, 1908 1911 struct device_attribute *attr, 1909 1912 char *buf) 1910 1913 { 1911 1914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1912 1915 1913 - return sprintf(buf, "%s\n", rbd_dev->snap_name); 1916 + return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); 1914 1917 } 1915 1918 1916 1919 static ssize_t rbd_image_refresh(struct device *dev, ··· 1939 1918 } 1940 1919 1941 1920 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 1921 + static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 1942 1922 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 1943 1923 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 1944 1924 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 1945 1925 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 1946 1926 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 1927 + static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 1947 1928 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 1948 1929 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 1949 - static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); 1950 1930 1951 1931 static struct attribute *rbd_attrs[] = { 1952 1932 &dev_attr_size.attr, 1933 + &dev_attr_features.attr, 1953 1934 &dev_attr_major.attr, 1954 1935 &dev_attr_client_id.attr, 1955 1936 &dev_attr_pool.attr, 1956 1937 &dev_attr_pool_id.attr, 1957 1938 &dev_attr_name.attr, 1939 + &dev_attr_image_id.attr, 1958 1940 &dev_attr_current_snap.attr, 1959 1941 &dev_attr_refresh.attr, 1960 - &dev_attr_create_snap.attr, 1961 1942 NULL 1962 1943 }; 1963 1944 ··· 2005 1982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2006 1983 } 2007 1984 1985 + static ssize_t rbd_snap_features_show(struct device *dev, 1986 + struct device_attribute *attr, 1987 + char *buf) 1988 + { 1989 + struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 1990 + 1991 + return sprintf(buf, "0x%016llx\n", 1992 + (unsigned long long) snap->features); 1993 + } 1994 + 2008 1995 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2009 1996 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 1997 + static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2010 1998 2011 1999 static struct attribute *rbd_snap_attrs[] = { 2012 2000 &dev_attr_snap_size.attr, 2013 2001 &dev_attr_snap_id.attr, 2002 + &dev_attr_snap_features.attr, 2014 2003 NULL, 2015 2004 }; 2016 2005 ··· 2047 2012 .release = rbd_snap_dev_release, 2048 2013 }; 2049 2014 2015 + static bool rbd_snap_registered(struct rbd_snap *snap) 2016 + { 2017 + bool ret = snap->dev.type == &rbd_snap_device_type; 2018 + bool reg = device_is_registered(&snap->dev); 2019 + 2020 + rbd_assert(!ret ^ reg); 2021 + 2022 + return ret; 2023 + } 2024 + 2050 2025 static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2051 2026 { 2052 2027 list_del(&snap->node); 2053 - device_unregister(&snap->dev); 2028 + if (device_is_registered(&snap->dev)) 2029 + device_unregister(&snap->dev); 2054 2030 } 2055 2031 2056 2032 static int rbd_register_snap_dev(struct rbd_snap *snap, ··· 2074 2028 dev->parent = parent; 2075 2029 dev->release = rbd_snap_dev_release; 2076 2030 dev_set_name(dev, "snap_%s", snap->name); 2031 + dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2032 + 2077 2033 ret = device_register(dev); 2078 2034 2079 2035 return ret; 2080 2036 } 2081 2037 2082 2038 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2083 - int i, const char *name) 2039 + const char *snap_name, 2040 + u64 snap_id, u64 snap_size, 2041 + u64 snap_features) 2084 2042 { 2085 2043 struct rbd_snap *snap; 2086 2044 int ret; ··· 2094 2044 return ERR_PTR(-ENOMEM); 2095 2045 2096 2046 ret = -ENOMEM; 2097 - snap->name = kstrdup(name, GFP_KERNEL); 2047 + snap->name = kstrdup(snap_name, GFP_KERNEL); 2098 2048 if (!snap->name) 2099 2049 goto err; 2100 2050 2101 - snap->size = rbd_dev->header.snap_sizes[i]; 2102 - snap->id = rbd_dev->header.snapc->snaps[i]; 2103 - if (device_is_registered(&rbd_dev->dev)) { 2104 - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2105 - if (ret < 0) 2106 - goto err; 2107 - } 2051 + snap->id = snap_id; 2052 + snap->size = snap_size; 2053 + snap->features = snap_features; 2108 2054 2109 2055 return snap; 2110 2056 ··· 2111 2065 return ERR_PTR(ret); 2112 2066 } 2113 2067 2114 - /* 2115 - * search for the previous snap in a null delimited string list 2116 - */ 2117 - const char *rbd_prev_snap_name(const char *name, const char *start) 2068 + static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2069 + u64 *snap_size, u64 *snap_features) 2118 2070 { 2119 - if (name < start + 2) 2120 - return NULL; 2071 + char *snap_name; 2121 2072 2122 - name -= 2; 2123 - while (*name) { 2124 - if (name == start) 2125 - return start; 2126 - name--; 2127 - } 2128 - return name + 1; 2073 + rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2074 + 2075 + *snap_size = rbd_dev->header.snap_sizes[which]; 2076 + *snap_features = 0; /* No features for v1 */ 2077 + 2078 + /* Skip over names until we find the one we are looking for */ 2079 + 2080 + snap_name = rbd_dev->header.snap_names; 2081 + while (which--) 2082 + snap_name += strlen(snap_name) + 1; 2083 + 2084 + return snap_name; 2129 2085 } 2130 2086 2131 2087 /* 2132 - * compare the old list of snapshots that we have to what's in the header 2133 - * and update it accordingly. Note that the header holds the snapshots 2134 - * in a reverse order (from newest to oldest) and we need to go from 2135 - * older to new so that we don't get a duplicate snap name when 2136 - * doing the process (e.g., removed snapshot and recreated a new 2137 - * one with the same name. 2088 + * Get the size and object order for an image snapshot, or if 2089 + * snap_id is CEPH_NOSNAP, gets this information for the base 2090 + * image. 2138 2091 */ 2139 - static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) 2092 + static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 2093 + u8 *order, u64 *snap_size) 2140 2094 { 2141 - const char *name, *first_name; 2142 - int i = rbd_dev->header.total_snaps; 2143 - struct rbd_snap *snap, *old_snap = NULL; 2144 - struct list_head *p, *n; 2095 + __le64 snapid = cpu_to_le64(snap_id); 2096 + int ret; 2097 + struct { 2098 + u8 order; 2099 + __le64 size; 2100 + } __attribute__ ((packed)) size_buf = { 0 }; 2145 2101 2146 - first_name = rbd_dev->header.snap_names; 2147 - name = first_name + rbd_dev->header.snap_names_len; 2102 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2103 + "rbd", "get_size", 2104 + (char *) &snapid, sizeof (snapid), 2105 + (char *) &size_buf, sizeof (size_buf), 2106 + CEPH_OSD_FLAG_READ, NULL); 2107 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2108 + if (ret < 0) 2109 + return ret; 2148 2110 2149 - list_for_each_prev_safe(p, n, &rbd_dev->snaps) { 2150 - u64 cur_id; 2111 + *order = size_buf.order; 2112 + *snap_size = le64_to_cpu(size_buf.size); 2151 2113 2152 - old_snap = list_entry(p, struct rbd_snap, node); 2153 - 2154 - if (i) 2155 - cur_id = rbd_dev->header.snapc->snaps[i - 1]; 2156 - 2157 - if (!i || old_snap->id < cur_id) { 2158 - /* 2159 - * old_snap->id was skipped, thus was 2160 - * removed. If this rbd_dev is mapped to 2161 - * the removed snapshot, record that it no 2162 - * longer exists, to prevent further I/O. 2163 - */ 2164 - if (rbd_dev->snap_id == old_snap->id) 2165 - rbd_dev->snap_exists = false; 2166 - __rbd_remove_snap_dev(old_snap); 2167 - continue; 2168 - } 2169 - if (old_snap->id == cur_id) { 2170 - /* we have this snapshot already */ 2171 - i--; 2172 - name = rbd_prev_snap_name(name, first_name); 2173 - continue; 2174 - } 2175 - for (; i > 0; 2176 - i--, name = rbd_prev_snap_name(name, first_name)) { 2177 - if (!name) { 2178 - WARN_ON(1); 2179 - return -EINVAL; 2180 - } 2181 - cur_id = rbd_dev->header.snapc->snaps[i]; 2182 - /* snapshot removal? handle it above */ 2183 - if (cur_id >= old_snap->id) 2184 - break; 2185 - /* a new snapshot */ 2186 - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2187 - if (IS_ERR(snap)) 2188 - return PTR_ERR(snap); 2189 - 2190 - /* note that we add it backward so using n and not p */ 2191 - list_add(&snap->node, n); 2192 - p = &snap->node; 2193 - } 2194 - } 2195 - /* we're done going over the old snap list, just add what's left */ 2196 - for (; i > 0; i--) { 2197 - name = rbd_prev_snap_name(name, first_name); 2198 - if (!name) { 2199 - WARN_ON(1); 2200 - return -EINVAL; 2201 - } 2202 - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2203 - if (IS_ERR(snap)) 2204 - return PTR_ERR(snap); 2205 - list_add(&snap->node, &rbd_dev->snaps); 2206 - } 2114 + dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 2115 + (unsigned long long) snap_id, (unsigned int) *order, 2116 + (unsigned long long) *snap_size); 2207 2117 2208 2118 return 0; 2209 2119 } 2210 2120 2121 + static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 2122 + { 2123 + return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 2124 + &rbd_dev->header.obj_order, 2125 + &rbd_dev->header.image_size); 2126 + } 2127 + 2128 + static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 2129 + { 2130 + void *reply_buf; 2131 + int ret; 2132 + void *p; 2133 + 2134 + reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 2135 + if (!reply_buf) 2136 + return -ENOMEM; 2137 + 2138 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2139 + "rbd", "get_object_prefix", 2140 + NULL, 0, 2141 + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 2142 + CEPH_OSD_FLAG_READ, NULL); 2143 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2144 + if (ret < 0) 2145 + goto out; 2146 + 2147 + p = reply_buf; 2148 + rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 2149 + p + RBD_OBJ_PREFIX_LEN_MAX, 2150 + NULL, GFP_NOIO); 2151 + 2152 + if (IS_ERR(rbd_dev->header.object_prefix)) { 2153 + ret = PTR_ERR(rbd_dev->header.object_prefix); 2154 + rbd_dev->header.object_prefix = NULL; 2155 + } else { 2156 + dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 2157 + } 2158 + 2159 + out: 2160 + kfree(reply_buf); 2161 + 2162 + return ret; 2163 + } 2164 + 2165 + static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2166 + u64 *snap_features) 2167 + { 2168 + __le64 snapid = cpu_to_le64(snap_id); 2169 + struct { 2170 + __le64 features; 2171 + __le64 incompat; 2172 + } features_buf = { 0 }; 2173 + int ret; 2174 + 2175 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2176 + "rbd", "get_features", 2177 + (char *) &snapid, sizeof (snapid), 2178 + (char *) &features_buf, sizeof (features_buf), 2179 + CEPH_OSD_FLAG_READ, NULL); 2180 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2181 + if (ret < 0) 2182 + return ret; 2183 + *snap_features = le64_to_cpu(features_buf.features); 2184 + 2185 + dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2186 + (unsigned long long) snap_id, 2187 + (unsigned long long) *snap_features, 2188 + (unsigned long long) le64_to_cpu(features_buf.incompat)); 2189 + 2190 + return 0; 2191 + } 2192 + 2193 + static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2194 + { 2195 + return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2196 + &rbd_dev->header.features); 2197 + } 2198 + 2199 + static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 2200 + { 2201 + size_t size; 2202 + int ret; 2203 + void *reply_buf; 2204 + void *p; 2205 + void *end; 2206 + u64 seq; 2207 + u32 snap_count; 2208 + struct ceph_snap_context *snapc; 2209 + u32 i; 2210 + 2211 + /* 2212 + * We'll need room for the seq value (maximum snapshot id), 2213 + * snapshot count, and array of that many snapshot ids. 2214 + * For now we have a fixed upper limit on the number we're 2215 + * prepared to receive. 2216 + */ 2217 + size = sizeof (__le64) + sizeof (__le32) + 2218 + RBD_MAX_SNAP_COUNT * sizeof (__le64); 2219 + reply_buf = kzalloc(size, GFP_KERNEL); 2220 + if (!reply_buf) 2221 + return -ENOMEM; 2222 + 2223 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2224 + "rbd", "get_snapcontext", 2225 + NULL, 0, 2226 + reply_buf, size, 2227 + CEPH_OSD_FLAG_READ, ver); 2228 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2229 + if (ret < 0) 2230 + goto out; 2231 + 2232 + ret = -ERANGE; 2233 + p = reply_buf; 2234 + end = (char *) reply_buf + size; 2235 + ceph_decode_64_safe(&p, end, seq, out); 2236 + ceph_decode_32_safe(&p, end, snap_count, out); 2237 + 2238 + /* 2239 + * Make sure the reported number of snapshot ids wouldn't go 2240 + * beyond the end of our buffer. But before checking that, 2241 + * make sure the computed size of the snapshot context we 2242 + * allocate is representable in a size_t. 2243 + */ 2244 + if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 2245 + / sizeof (u64)) { 2246 + ret = -EINVAL; 2247 + goto out; 2248 + } 2249 + if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 2250 + goto out; 2251 + 2252 + size = sizeof (struct ceph_snap_context) + 2253 + snap_count * sizeof (snapc->snaps[0]); 2254 + snapc = kmalloc(size, GFP_KERNEL); 2255 + if (!snapc) { 2256 + ret = -ENOMEM; 2257 + goto out; 2258 + } 2259 + 2260 + atomic_set(&snapc->nref, 1); 2261 + snapc->seq = seq; 2262 + snapc->num_snaps = snap_count; 2263 + for (i = 0; i < snap_count; i++) 2264 + snapc->snaps[i] = ceph_decode_64(&p); 2265 + 2266 + rbd_dev->header.snapc = snapc; 2267 + 2268 + dout(" snap context seq = %llu, snap_count = %u\n", 2269 + (unsigned long long) seq, (unsigned int) snap_count); 2270 + 2271 + out: 2272 + kfree(reply_buf); 2273 + 2274 + return 0; 2275 + } 2276 + 2277 + static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2278 + { 2279 + size_t size; 2280 + void *reply_buf; 2281 + __le64 snap_id; 2282 + int ret; 2283 + void *p; 2284 + void *end; 2285 + size_t snap_name_len; 2286 + char *snap_name; 2287 + 2288 + size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2289 + reply_buf = kmalloc(size, GFP_KERNEL); 2290 + if (!reply_buf) 2291 + return ERR_PTR(-ENOMEM); 2292 + 2293 + snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2294 + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2295 + "rbd", "get_snapshot_name", 2296 + (char *) &snap_id, sizeof (snap_id), 2297 + reply_buf, size, 2298 + CEPH_OSD_FLAG_READ, NULL); 2299 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2300 + if (ret < 0) 2301 + goto out; 2302 + 2303 + p = reply_buf; 2304 + end = (char *) reply_buf + size; 2305 + snap_name_len = 0; 2306 + snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, 2307 + GFP_KERNEL); 2308 + if (IS_ERR(snap_name)) { 2309 + ret = PTR_ERR(snap_name); 2310 + goto out; 2311 + } else { 2312 + dout(" snap_id 0x%016llx snap_name = %s\n", 2313 + (unsigned long long) le64_to_cpu(snap_id), snap_name); 2314 + } 2315 + kfree(reply_buf); 2316 + 2317 + return snap_name; 2318 + out: 2319 + kfree(reply_buf); 2320 + 2321 + return ERR_PTR(ret); 2322 + } 2323 + 2324 + static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2325 + u64 *snap_size, u64 *snap_features) 2326 + { 2327 + __le64 snap_id; 2328 + u8 order; 2329 + int ret; 2330 + 2331 + snap_id = rbd_dev->header.snapc->snaps[which]; 2332 + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2333 + if (ret) 2334 + return ERR_PTR(ret); 2335 + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2336 + if (ret) 2337 + return ERR_PTR(ret); 2338 + 2339 + return rbd_dev_v2_snap_name(rbd_dev, which); 2340 + } 2341 + 2342 + static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2343 + u64 *snap_size, u64 *snap_features) 2344 + { 2345 + if (rbd_dev->image_format == 1) 2346 + return rbd_dev_v1_snap_info(rbd_dev, which, 2347 + snap_size, snap_features); 2348 + if (rbd_dev->image_format == 2) 2349 + return rbd_dev_v2_snap_info(rbd_dev, which, 2350 + snap_size, snap_features); 2351 + return ERR_PTR(-EINVAL); 2352 + } 2353 + 2354 + /* 2355 + * Scan the rbd device's current snapshot list and compare it to the 2356 + * newly-received snapshot context. Remove any existing snapshots 2357 + * not present in the new snapshot context. Add a new snapshot for 2358 + * any snaphots in the snapshot context not in the current list. 2359 + * And verify there are no changes to snapshots we already know 2360 + * about. 2361 + * 2362 + * Assumes the snapshots in the snapshot context are sorted by 2363 + * snapshot id, highest id first. (Snapshots in the rbd_dev's list 2364 + * are also maintained in that order.) 2365 + */ 2366 + static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2367 + { 2368 + struct ceph_snap_context *snapc = rbd_dev->header.snapc; 2369 + const u32 snap_count = snapc->num_snaps; 2370 + struct list_head *head = &rbd_dev->snaps; 2371 + struct list_head *links = head->next; 2372 + u32 index = 0; 2373 + 2374 + dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 2375 + while (index < snap_count || links != head) { 2376 + u64 snap_id; 2377 + struct rbd_snap *snap; 2378 + char *snap_name; 2379 + u64 snap_size = 0; 2380 + u64 snap_features = 0; 2381 + 2382 + snap_id = index < snap_count ? snapc->snaps[index] 2383 + : CEPH_NOSNAP; 2384 + snap = links != head ? list_entry(links, struct rbd_snap, node) 2385 + : NULL; 2386 + rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2387 + 2388 + if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 2389 + struct list_head *next = links->next; 2390 + 2391 + /* Existing snapshot not in the new snap context */ 2392 + 2393 + if (rbd_dev->mapping.snap_id == snap->id) 2394 + rbd_dev->mapping.snap_exists = false; 2395 + __rbd_remove_snap_dev(snap); 2396 + dout("%ssnap id %llu has been removed\n", 2397 + rbd_dev->mapping.snap_id == snap->id ? 2398 + "mapped " : "", 2399 + (unsigned long long) snap->id); 2400 + 2401 + /* Done with this list entry; advance */ 2402 + 2403 + links = next; 2404 + continue; 2405 + } 2406 + 2407 + snap_name = rbd_dev_snap_info(rbd_dev, index, 2408 + &snap_size, &snap_features); 2409 + if (IS_ERR(snap_name)) 2410 + return PTR_ERR(snap_name); 2411 + 2412 + dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 2413 + (unsigned long long) snap_id); 2414 + if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 2415 + struct rbd_snap *new_snap; 2416 + 2417 + /* We haven't seen this snapshot before */ 2418 + 2419 + new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2420 + snap_id, snap_size, snap_features); 2421 + if (IS_ERR(new_snap)) { 2422 + int err = PTR_ERR(new_snap); 2423 + 2424 + dout(" failed to add dev, error %d\n", err); 2425 + 2426 + return err; 2427 + } 2428 + 2429 + /* New goes before existing, or at end of list */ 2430 + 2431 + dout(" added dev%s\n", snap ? "" : " at end\n"); 2432 + if (snap) 2433 + list_add_tail(&new_snap->node, &snap->node); 2434 + else 2435 + list_add_tail(&new_snap->node, head); 2436 + } else { 2437 + /* Already have this one */ 2438 + 2439 + dout(" already present\n"); 2440 + 2441 + rbd_assert(snap->size == snap_size); 2442 + rbd_assert(!strcmp(snap->name, snap_name)); 2443 + rbd_assert(snap->features == snap_features); 2444 + 2445 + /* Done with this list entry; advance */ 2446 + 2447 + links = links->next; 2448 + } 2449 + 2450 + /* Advance to the next entry in the snapshot context */ 2451 + 2452 + index++; 2453 + } 2454 + dout("%s: done\n", __func__); 2455 + 2456 + return 0; 2457 + } 2458 + 2459 + /* 2460 + * Scan the list of snapshots and register the devices for any that 2461 + * have not already been registered. 2462 + */ 2463 + static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2464 + { 2465 + struct rbd_snap *snap; 2466 + int ret = 0; 2467 + 2468 + dout("%s called\n", __func__); 2469 + if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 2470 + return -EIO; 2471 + 2472 + list_for_each_entry(snap, &rbd_dev->snaps, node) { 2473 + if (!rbd_snap_registered(snap)) { 2474 + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2475 + if (ret < 0) 2476 + break; 2477 + } 2478 + } 2479 + dout("%s: returning %d\n", __func__, ret); 2480 + 2481 + return ret; 2482 + } 2483 + 2211 2484 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2212 2485 { 2213 - int ret; 2214 2486 struct device *dev; 2215 - struct rbd_snap *snap; 2487 + int ret; 2216 2488 2217 2489 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2218 - dev = &rbd_dev->dev; 2219 2490 2491 + dev = &rbd_dev->dev; 2220 2492 dev->bus = &rbd_bus_type; 2221 2493 dev->type = &rbd_device_type; 2222 2494 dev->parent = &rbd_root_dev; 2223 2495 dev->release = rbd_dev_release; 2224 2496 dev_set_name(dev, "%d", rbd_dev->dev_id); 2225 2497 ret = device_register(dev); 2226 - if (ret < 0) 2227 - goto out; 2228 2498 2229 - list_for_each_entry(snap, &rbd_dev->snaps, node) { 2230 - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2231 - if (ret < 0) 2232 - break; 2233 - } 2234 - out: 2235 2499 mutex_unlock(&ctl_mutex); 2500 + 2236 2501 return ret; 2237 2502 } 2238 2503 ··· 2568 2211 return ret; 2569 2212 } 2570 2213 2571 - static atomic64_t rbd_id_max = ATOMIC64_INIT(0); 2214 + static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 2572 2215 2573 2216 /* 2574 2217 * Get a unique rbd identifier for the given new rbd_dev, and add 2575 2218 * the rbd_dev to the global list. The minimum rbd id is 1. 2576 2219 */ 2577 - static void rbd_id_get(struct rbd_device *rbd_dev) 2220 + static void rbd_dev_id_get(struct rbd_device *rbd_dev) 2578 2221 { 2579 - rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); 2222 + rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 2580 2223 2581 2224 spin_lock(&rbd_dev_list_lock); 2582 2225 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2583 2226 spin_unlock(&rbd_dev_list_lock); 2227 + dout("rbd_dev %p given dev id %llu\n", rbd_dev, 2228 + (unsigned long long) rbd_dev->dev_id); 2584 2229 } 2585 2230 2586 2231 /* 2587 2232 * Remove an rbd_dev from the global list, and record that its 2588 2233 * identifier is no longer in use. 2589 2234 */ 2590 - static void rbd_id_put(struct rbd_device *rbd_dev) 2235 + static void rbd_dev_id_put(struct rbd_device *rbd_dev) 2591 2236 { 2592 2237 struct list_head *tmp; 2593 2238 int rbd_id = rbd_dev->dev_id; 2594 2239 int max_id; 2595 2240 2596 - BUG_ON(rbd_id < 1); 2241 + rbd_assert(rbd_id > 0); 2597 2242 2243 + dout("rbd_dev %p released dev id %llu\n", rbd_dev, 2244 + (unsigned long long) rbd_dev->dev_id); 2598 2245 spin_lock(&rbd_dev_list_lock); 2599 2246 list_del_init(&rbd_dev->node); 2600 2247 ··· 2606 2245 * If the id being "put" is not the current maximum, there 2607 2246 * is nothing special we need to do. 2608 2247 */ 2609 - if (rbd_id != atomic64_read(&rbd_id_max)) { 2248 + if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 2610 2249 spin_unlock(&rbd_dev_list_lock); 2611 2250 return; 2612 2251 } ··· 2627 2266 spin_unlock(&rbd_dev_list_lock); 2628 2267 2629 2268 /* 2630 - * The max id could have been updated by rbd_id_get(), in 2269 + * The max id could have been updated by rbd_dev_id_get(), in 2631 2270 * which case it now accurately reflects the new maximum. 2632 2271 * Be careful not to overwrite the maximum value in that 2633 2272 * case. 2634 2273 */ 2635 - atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); 2274 + atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 2275 + dout(" max dev id has been reset\n"); 2636 2276 } 2637 2277 2638 2278 /* ··· 2722 2360 } 2723 2361 2724 2362 /* 2725 - * This fills in the pool_name, image_name, image_name_len, snap_name, 2726 - * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based 2727 - * on the list of monitor addresses and other options provided via 2728 - * /sys/bus/rbd/add. 2363 + * This fills in the pool_name, image_name, image_name_len, rbd_dev, 2364 + * rbd_md_name, and name fields of the given rbd_dev, based on the 2365 + * list of monitor addresses and other options provided via 2366 + * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated 2367 + * copy of the snapshot name to map if successful, or a 2368 + * pointer-coded error otherwise. 2729 2369 * 2730 2370 * Note: rbd_dev is assumed to have been initially zero-filled. 2731 2371 */ 2732 - static int rbd_add_parse_args(struct rbd_device *rbd_dev, 2733 - const char *buf, 2734 - const char **mon_addrs, 2735 - size_t *mon_addrs_size, 2736 - char *options, 2737 - size_t options_size) 2372 + static char *rbd_add_parse_args(struct rbd_device *rbd_dev, 2373 + const char *buf, 2374 + const char **mon_addrs, 2375 + size_t *mon_addrs_size, 2376 + char *options, 2377 + size_t options_size) 2738 2378 { 2739 2379 size_t len; 2740 - int ret; 2380 + char *err_ptr = ERR_PTR(-EINVAL); 2381 + char *snap_name; 2741 2382 2742 2383 /* The first four tokens are required */ 2743 2384 2744 2385 len = next_token(&buf); 2745 2386 if (!len) 2746 - return -EINVAL; 2387 + return err_ptr; 2747 2388 *mon_addrs_size = len + 1; 2748 2389 *mon_addrs = buf; 2749 2390 ··· 2754 2389 2755 2390 len = copy_token(&buf, options, options_size); 2756 2391 if (!len || len >= options_size) 2757 - return -EINVAL; 2392 + return err_ptr; 2758 2393 2759 - ret = -ENOMEM; 2394 + err_ptr = ERR_PTR(-ENOMEM); 2760 2395 rbd_dev->pool_name = dup_token(&buf, NULL); 2761 2396 if (!rbd_dev->pool_name) 2762 2397 goto out_err; ··· 2765 2400 if (!rbd_dev->image_name) 2766 2401 goto out_err; 2767 2402 2768 - /* Create the name of the header object */ 2769 - 2770 - rbd_dev->header_name = kmalloc(rbd_dev->image_name_len 2771 - + sizeof (RBD_SUFFIX), 2772 - GFP_KERNEL); 2773 - if (!rbd_dev->header_name) 2403 + /* Snapshot name is optional */ 2404 + len = next_token(&buf); 2405 + if (!len) { 2406 + buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 2407 + len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 2408 + } 2409 + snap_name = kmalloc(len + 1, GFP_KERNEL); 2410 + if (!snap_name) 2774 2411 goto out_err; 2775 - sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2412 + memcpy(snap_name, buf, len); 2413 + *(snap_name + len) = '\0'; 2414 + 2415 + dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); 2416 + 2417 + return snap_name; 2418 + 2419 + out_err: 2420 + kfree(rbd_dev->image_name); 2421 + rbd_dev->image_name = NULL; 2422 + rbd_dev->image_name_len = 0; 2423 + kfree(rbd_dev->pool_name); 2424 + rbd_dev->pool_name = NULL; 2425 + 2426 + return err_ptr; 2427 + } 2428 + 2429 + /* 2430 + * An rbd format 2 image has a unique identifier, distinct from the 2431 + * name given to it by the user. Internally, that identifier is 2432 + * what's used to specify the names of objects related to the image. 2433 + * 2434 + * A special "rbd id" object is used to map an rbd image name to its 2435 + * id. If that object doesn't exist, then there is no v2 rbd image 2436 + * with the supplied name. 2437 + * 2438 + * This function will record the given rbd_dev's image_id field if 2439 + * it can be determined, and in that case will return 0. If any 2440 + * errors occur a negative errno will be returned and the rbd_dev's 2441 + * image_id field will be unchanged (and should be NULL). 2442 + */ 2443 + static int rbd_dev_image_id(struct rbd_device *rbd_dev) 2444 + { 2445 + int ret; 2446 + size_t size; 2447 + char *object_name; 2448 + void *response; 2449 + void *p; 2776 2450 2777 2451 /* 2778 - * The snapshot name is optional. If none is is supplied, 2779 - * we use the default value. 2452 + * First, see if the format 2 image id file exists, and if 2453 + * so, get the image's persistent id from it. 2780 2454 */ 2781 - rbd_dev->snap_name = dup_token(&buf, &len); 2782 - if (!rbd_dev->snap_name) 2783 - goto out_err; 2784 - if (!len) { 2785 - /* Replace the empty name with the default */ 2786 - kfree(rbd_dev->snap_name); 2787 - rbd_dev->snap_name 2788 - = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); 2789 - if (!rbd_dev->snap_name) 2790 - goto out_err; 2455 + size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; 2456 + object_name = kmalloc(size, GFP_NOIO); 2457 + if (!object_name) 2458 + return -ENOMEM; 2459 + sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); 2460 + dout("rbd id object name is %s\n", object_name); 2791 2461 2792 - memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 2793 - sizeof (RBD_SNAP_HEAD_NAME)); 2462 + /* Response will be an encoded string, which includes a length */ 2463 + 2464 + size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 2465 + response = kzalloc(size, GFP_NOIO); 2466 + if (!response) { 2467 + ret = -ENOMEM; 2468 + goto out; 2794 2469 } 2470 + 2471 + ret = rbd_req_sync_exec(rbd_dev, object_name, 2472 + "rbd", "get_id", 2473 + NULL, 0, 2474 + response, RBD_IMAGE_ID_LEN_MAX, 2475 + CEPH_OSD_FLAG_READ, NULL); 2476 + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2477 + if (ret < 0) 2478 + goto out; 2479 + 2480 + p = response; 2481 + rbd_dev->image_id = ceph_extract_encoded_string(&p, 2482 + p + RBD_IMAGE_ID_LEN_MAX, 2483 + &rbd_dev->image_id_len, 2484 + GFP_NOIO); 2485 + if (IS_ERR(rbd_dev->image_id)) { 2486 + ret = PTR_ERR(rbd_dev->image_id); 2487 + rbd_dev->image_id = NULL; 2488 + } else { 2489 + dout("image_id is %s\n", rbd_dev->image_id); 2490 + } 2491 + out: 2492 + kfree(response); 2493 + kfree(object_name); 2494 + 2495 + return ret; 2496 + } 2497 + 2498 + static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 2499 + { 2500 + int ret; 2501 + size_t size; 2502 + 2503 + /* Version 1 images have no id; empty string is used */ 2504 + 2505 + rbd_dev->image_id = kstrdup("", GFP_KERNEL); 2506 + if (!rbd_dev->image_id) 2507 + return -ENOMEM; 2508 + rbd_dev->image_id_len = 0; 2509 + 2510 + /* Record the header object name for this rbd image. */ 2511 + 2512 + size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); 2513 + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2514 + if (!rbd_dev->header_name) { 2515 + ret = -ENOMEM; 2516 + goto out_err; 2517 + } 2518 + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2519 + 2520 + /* Populate rbd image metadata */ 2521 + 2522 + ret = rbd_read_header(rbd_dev, &rbd_dev->header); 2523 + if (ret < 0) 2524 + goto out_err; 2525 + rbd_dev->image_format = 1; 2526 + 2527 + dout("discovered version 1 image, header name is %s\n", 2528 + rbd_dev->header_name); 2795 2529 2796 2530 return 0; 2797 2531 2798 2532 out_err: 2799 2533 kfree(rbd_dev->header_name); 2800 - kfree(rbd_dev->image_name); 2801 - kfree(rbd_dev->pool_name); 2802 - rbd_dev->pool_name = NULL; 2534 + rbd_dev->header_name = NULL; 2535 + kfree(rbd_dev->image_id); 2536 + rbd_dev->image_id = NULL; 2537 + 2538 + return ret; 2539 + } 2540 + 2541 + static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 2542 + { 2543 + size_t size; 2544 + int ret; 2545 + u64 ver = 0; 2546 + 2547 + /* 2548 + * Image id was filled in by the caller. Record the header 2549 + * object name for this rbd image. 2550 + */ 2551 + size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; 2552 + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2553 + if (!rbd_dev->header_name) 2554 + return -ENOMEM; 2555 + sprintf(rbd_dev->header_name, "%s%s", 2556 + RBD_HEADER_PREFIX, rbd_dev->image_id); 2557 + 2558 + /* Get the size and object order for the image */ 2559 + 2560 + ret = rbd_dev_v2_image_size(rbd_dev); 2561 + if (ret < 0) 2562 + goto out_err; 2563 + 2564 + /* Get the object prefix (a.k.a. block_name) for the image */ 2565 + 2566 + ret = rbd_dev_v2_object_prefix(rbd_dev); 2567 + if (ret < 0) 2568 + goto out_err; 2569 + 2570 + /* Get the features for the image */ 2571 + 2572 + ret = rbd_dev_v2_features(rbd_dev); 2573 + if (ret < 0) 2574 + goto out_err; 2575 + 2576 + /* crypto and compression type aren't (yet) supported for v2 images */ 2577 + 2578 + rbd_dev->header.crypt_type = 0; 2579 + rbd_dev->header.comp_type = 0; 2580 + 2581 + /* Get the snapshot context, plus the header version */ 2582 + 2583 + ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 2584 + if (ret) 2585 + goto out_err; 2586 + rbd_dev->header.obj_version = ver; 2587 + 2588 + rbd_dev->image_format = 2; 2589 + 2590 + dout("discovered version 2 image, header name is %s\n", 2591 + rbd_dev->header_name); 2592 + 2593 + return -ENOTSUPP; 2594 + out_err: 2595 + kfree(rbd_dev->header_name); 2596 + rbd_dev->header_name = NULL; 2597 + kfree(rbd_dev->header.object_prefix); 2598 + rbd_dev->header.object_prefix = NULL; 2599 + 2600 + return ret; 2601 + } 2602 + 2603 + /* 2604 + * Probe for the existence of the header object for the given rbd 2605 + * device. For format 2 images this includes determining the image 2606 + * id. 2607 + */ 2608 + static int rbd_dev_probe(struct rbd_device *rbd_dev) 2609 + { 2610 + int ret; 2611 + 2612 + /* 2613 + * Get the id from the image id object. If it's not a 2614 + * format 2 image, we'll get ENOENT back, and we'll assume 2615 + * it's a format 1 image. 2616 + */ 2617 + ret = rbd_dev_image_id(rbd_dev); 2618 + if (ret) 2619 + ret = rbd_dev_v1_probe(rbd_dev); 2620 + else 2621 + ret = rbd_dev_v2_probe(rbd_dev); 2622 + if (ret) 2623 + dout("probe failed, returning %d\n", ret); 2803 2624 2804 2625 return ret; 2805 2626 } ··· 3000 2449 size_t mon_addrs_size = 0; 3001 2450 struct ceph_osd_client *osdc; 3002 2451 int rc = -ENOMEM; 2452 + char *snap_name; 3003 2453 3004 2454 if (!try_module_get(THIS_MODULE)) 3005 2455 return -ENODEV; 3006 2456 3007 2457 options = kmalloc(count, GFP_KERNEL); 3008 2458 if (!options) 3009 - goto err_nomem; 2459 + goto err_out_mem; 3010 2460 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3011 2461 if (!rbd_dev) 3012 - goto err_nomem; 2462 + goto err_out_mem; 3013 2463 3014 2464 /* static rbd_device initialization */ 3015 2465 spin_lock_init(&rbd_dev->lock); ··· 3018 2466 INIT_LIST_HEAD(&rbd_dev->snaps); 3019 2467 init_rwsem(&rbd_dev->header_rwsem); 3020 2468 3021 - /* generate unique id: find highest unique id, add one */ 3022 - rbd_id_get(rbd_dev); 3023 - 3024 - /* Fill in the device name, now that we have its id. */ 3025 - BUILD_BUG_ON(DEV_NAME_LEN 3026 - < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 3027 - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 3028 - 3029 2469 /* parse add command */ 3030 - rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, 3031 - options, count); 3032 - if (rc) 3033 - goto err_put_id; 3034 - 3035 - rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, 3036 - options); 3037 - if (IS_ERR(rbd_dev->rbd_client)) { 3038 - rc = PTR_ERR(rbd_dev->rbd_client); 3039 - goto err_put_id; 2470 + snap_name = rbd_add_parse_args(rbd_dev, buf, 2471 + &mon_addrs, &mon_addrs_size, options, count); 2472 + if (IS_ERR(snap_name)) { 2473 + rc = PTR_ERR(snap_name); 2474 + goto err_out_mem; 3040 2475 } 2476 + 2477 + rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); 2478 + if (rc < 0) 2479 + goto err_out_args; 3041 2480 3042 2481 /* pick the pool */ 3043 2482 osdc = &rbd_dev->rbd_client->client->osdc; ··· 3037 2494 goto err_out_client; 3038 2495 rbd_dev->pool_id = rc; 3039 2496 3040 - /* register our block device */ 3041 - rc = register_blkdev(0, rbd_dev->name); 2497 + rc = rbd_dev_probe(rbd_dev); 3042 2498 if (rc < 0) 3043 2499 goto err_out_client; 2500 + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 2501 + 2502 + /* no need to lock here, as rbd_dev is not registered yet */ 2503 + rc = rbd_dev_snaps_update(rbd_dev); 2504 + if (rc) 2505 + goto err_out_header; 2506 + 2507 + rc = rbd_dev_set_mapping(rbd_dev, snap_name); 2508 + if (rc) 2509 + goto err_out_header; 2510 + 2511 + /* generate unique id: find highest unique id, add one */ 2512 + rbd_dev_id_get(rbd_dev); 2513 + 2514 + /* Fill in the device name, now that we have its id. */ 2515 + BUILD_BUG_ON(DEV_NAME_LEN 2516 + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 2517 + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 2518 + 2519 + /* Get our block major device number. */ 2520 + 2521 + rc = register_blkdev(0, rbd_dev->name); 2522 + if (rc < 0) 2523 + goto err_out_id; 3044 2524 rbd_dev->major = rc; 2525 + 2526 + /* Set up the blkdev mapping. */ 2527 + 2528 + rc = rbd_init_disk(rbd_dev); 2529 + if (rc) 2530 + goto err_out_blkdev; 3045 2531 3046 2532 rc = rbd_bus_add_dev(rbd_dev); 3047 2533 if (rc) 3048 - goto err_out_blkdev; 2534 + goto err_out_disk; 3049 2535 3050 2536 /* 3051 2537 * At this point cleanup in the event of an error is the job 3052 2538 * of the sysfs code (initiated by rbd_bus_del_dev()). 3053 - * 3054 - * Set up and announce blkdev mapping. 3055 2539 */ 3056 - rc = rbd_init_disk(rbd_dev); 2540 + 2541 + down_write(&rbd_dev->header_rwsem); 2542 + rc = rbd_dev_snaps_register(rbd_dev); 2543 + up_write(&rbd_dev->header_rwsem); 3057 2544 if (rc) 3058 2545 goto err_out_bus; 3059 2546 3060 2547 rc = rbd_init_watch_dev(rbd_dev); 3061 2548 if (rc) 3062 2549 goto err_out_bus; 2550 + 2551 + /* Everything's ready. Announce the disk to the world. */ 2552 + 2553 + add_disk(rbd_dev->disk); 2554 + 2555 + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 2556 + (unsigned long long) rbd_dev->mapping.size); 3063 2557 3064 2558 return count; 3065 2559 ··· 3107 2527 kfree(options); 3108 2528 return rc; 3109 2529 2530 + err_out_disk: 2531 + rbd_free_disk(rbd_dev); 3110 2532 err_out_blkdev: 3111 2533 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2534 + err_out_id: 2535 + rbd_dev_id_put(rbd_dev); 2536 + err_out_header: 2537 + rbd_header_free(&rbd_dev->header); 3112 2538 err_out_client: 2539 + kfree(rbd_dev->header_name); 3113 2540 rbd_put_client(rbd_dev); 3114 - err_put_id: 3115 - if (rbd_dev->pool_name) { 3116 - kfree(rbd_dev->snap_name); 3117 - kfree(rbd_dev->header_name); 3118 - kfree(rbd_dev->image_name); 3119 - kfree(rbd_dev->pool_name); 3120 - } 3121 - rbd_id_put(rbd_dev); 3122 - err_nomem: 2541 + kfree(rbd_dev->image_id); 2542 + err_out_args: 2543 + kfree(rbd_dev->mapping.snap_name); 2544 + kfree(rbd_dev->image_name); 2545 + kfree(rbd_dev->pool_name); 2546 + err_out_mem: 3123 2547 kfree(rbd_dev); 3124 2548 kfree(options); 3125 2549 ··· 3169 2585 rbd_free_disk(rbd_dev); 3170 2586 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3171 2587 2588 + /* release allocated disk header fields */ 2589 + rbd_header_free(&rbd_dev->header); 2590 + 3172 2591 /* done with the id, and with the rbd_dev */ 3173 - kfree(rbd_dev->snap_name); 2592 + kfree(rbd_dev->mapping.snap_name); 2593 + kfree(rbd_dev->image_id); 3174 2594 kfree(rbd_dev->header_name); 3175 2595 kfree(rbd_dev->pool_name); 3176 2596 kfree(rbd_dev->image_name); 3177 - rbd_id_put(rbd_dev); 2597 + rbd_dev_id_put(rbd_dev); 3178 2598 kfree(rbd_dev); 3179 2599 3180 2600 /* release module ref */ ··· 3216 2628 3217 2629 done: 3218 2630 mutex_unlock(&ctl_mutex); 3219 - return ret; 3220 - } 3221 2631 3222 - static ssize_t rbd_snap_add(struct device *dev, 3223 - struct device_attribute *attr, 3224 - const char *buf, 3225 - size_t count) 3226 - { 3227 - struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3228 - int ret; 3229 - char *name = kmalloc(count + 1, GFP_KERNEL); 3230 - if (!name) 3231 - return -ENOMEM; 3232 - 3233 - snprintf(name, count, "%s", buf); 3234 - 3235 - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3236 - 3237 - ret = rbd_header_add_snap(rbd_dev, 3238 - name, GFP_KERNEL); 3239 - if (ret < 0) 3240 - goto err_unlock; 3241 - 3242 - ret = __rbd_refresh_header(rbd_dev, NULL); 3243 - if (ret < 0) 3244 - goto err_unlock; 3245 - 3246 - /* shouldn't hold ctl_mutex when notifying.. notify might 3247 - trigger a watch callback that would need to get that mutex */ 3248 - mutex_unlock(&ctl_mutex); 3249 - 3250 - /* make a best effort, don't error if failed */ 3251 - rbd_req_sync_notify(rbd_dev); 3252 - 3253 - ret = count; 3254 - kfree(name); 3255 - return ret; 3256 - 3257 - err_unlock: 3258 - mutex_unlock(&ctl_mutex); 3259 - kfree(name); 3260 2632 return ret; 3261 2633 } 3262 2634

+21 -6

drivers/block/rbd_types.h

··· 15 15 16 16 #include <linux/types.h> 17 17 18 + /* For format version 2, rbd image 'foo' consists of objects 19 + * rbd_id.foo - id of image 20 + * rbd_header.<id> - image metadata 21 + * rbd_data.<id>.0000000000000000 22 + * rbd_data.<id>.0000000000000001 23 + * ... - data 24 + * Clients do not access header data directly in rbd format 2. 25 + */ 26 + 27 + #define RBD_HEADER_PREFIX "rbd_header." 28 + #define RBD_DATA_PREFIX "rbd_data." 29 + #define RBD_ID_PREFIX "rbd_id." 30 + 18 31 /* 19 - * rbd image 'foo' consists of objects 20 - * foo.rbd - image metadata 21 - * foo.00000000 22 - * foo.00000001 23 - * ... - data 32 + * For format version 1, rbd image 'foo' consists of objects 33 + * foo.rbd - image metadata 34 + * rb.<idhi>.<idlo>.00000000 35 + * rb.<idhi>.<idlo>.00000001 36 + * ... - data 37 + * There is no notion of a persistent image id in rbd format 1. 24 38 */ 25 39 26 40 #define RBD_SUFFIX ".rbd" 41 + 27 42 #define RBD_DIRECTORY "rbd_directory" 28 43 #define RBD_INFO "rbd_info" 29 44 ··· 62 47 63 48 struct rbd_image_header_ondisk { 64 49 char text[40]; 65 - char block_name[24]; 50 + char object_prefix[24]; 66 51 char signature[4]; 67 52 char version[8]; 68 53 struct {

+9 -10

fs/ceph/addr.c

··· 205 205 dout("readpage inode %p file %p page %p index %lu\n", 206 206 inode, filp, page, page->index); 207 207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 208 - page->index << PAGE_CACHE_SHIFT, &len, 208 + (u64) page_offset(page), &len, 209 209 ci->i_truncate_seq, ci->i_truncate_size, 210 210 &page, 1, 0); 211 211 if (err == -ENOENT) ··· 286 286 int nr_pages = 0; 287 287 int ret; 288 288 289 - off = page->index << PAGE_CACHE_SHIFT; 289 + off = (u64) page_offset(page); 290 290 291 291 /* count pages */ 292 292 next_index = page->index; ··· 308 308 NULL, 0, 309 309 ci->i_truncate_seq, ci->i_truncate_size, 310 310 NULL, false, 1, 0); 311 - if (!req) 312 - return -ENOMEM; 311 + if (IS_ERR(req)) 312 + return PTR_ERR(req); 313 313 314 314 /* build page vector */ 315 315 nr_pages = len >> PAGE_CACHE_SHIFT; ··· 426 426 struct ceph_inode_info *ci; 427 427 struct ceph_fs_client *fsc; 428 428 struct ceph_osd_client *osdc; 429 - loff_t page_off = page->index << PAGE_CACHE_SHIFT; 429 + loff_t page_off = page_offset(page); 430 430 int len = PAGE_CACHE_SIZE; 431 431 loff_t i_size; 432 432 int err = 0; ··· 817 817 /* ok */ 818 818 if (locked_pages == 0) { 819 819 /* prepare async write request */ 820 - offset = (unsigned long long)page->index 821 - << PAGE_CACHE_SHIFT; 820 + offset = (u64) page_offset(page); 822 821 len = wsize; 823 822 req = ceph_osdc_new_request(&fsc->client->osdc, 824 823 &ci->i_layout, ··· 831 832 ci->i_truncate_size, 832 833 &inode->i_mtime, true, 1, 0); 833 834 834 - if (!req) { 835 - rc = -ENOMEM; 835 + if (IS_ERR(req)) { 836 + rc = PTR_ERR(req); 836 837 unlock_page(page); 837 838 break; 838 839 } ··· 1179 1180 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1180 1181 struct page *page = vmf->page; 1181 1182 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1182 - loff_t off = page->index << PAGE_CACHE_SHIFT; 1183 + loff_t off = page_offset(page); 1183 1184 loff_t size, len; 1184 1185 int ret; 1185 1186

+1 -1

fs/ceph/caps.c

··· 1005 1005 1006 1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); 1007 1007 head = msg->front.iov_base; 1008 - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); 1008 + le32_add_cpu(&head->num, 1); 1009 1009 item = msg->front.iov_base + msg->front.iov_len; 1010 1010 item->ino = cpu_to_le64(ino); 1011 1011 item->cap_id = cpu_to_le64(cap_id);

+2 -2

fs/ceph/file.c

··· 536 536 do_sync, 537 537 ci->i_truncate_seq, ci->i_truncate_size, 538 538 &mtime, false, 2, page_align); 539 - if (!req) 540 - return -ENOMEM; 539 + if (IS_ERR(req)) 540 + return PTR_ERR(req); 541 541 542 542 if (file->f_flags & O_DIRECT) { 543 543 pages = ceph_get_direct_page_vector(data, num_pages, false);

+6 -2

fs/ceph/ioctl.c

··· 187 187 u64 tmp; 188 188 struct ceph_object_layout ol; 189 189 struct ceph_pg pgid; 190 + int r; 190 191 191 192 /* copy and validate */ 192 193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 194 return -EFAULT; 194 195 195 196 down_read(&osdc->map_sem); 196 - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 197 - &dl.object_no, &dl.object_offset, &olen); 197 + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 198 + &dl.object_no, &dl.object_offset, 199 + &olen); 200 + if (r < 0) 201 + return -EIO; 198 202 dl.file_offset -= dl.object_offset; 199 203 dl.object_size = ceph_file_layout_object_size(ci->i_layout); 200 204 dl.block_size = ceph_file_layout_su(ci->i_layout);

+2 -1

fs/ceph/mds_client.c

··· 2625 2625 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2626 2626 session_state_name(s->s_state)); 2627 2627 2628 - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2628 + if (i >= newmap->m_max_mds || 2629 + memcmp(ceph_mdsmap_get_addr(oldmap, i), 2629 2630 ceph_mdsmap_get_addr(newmap, i), 2630 2631 sizeof(struct ceph_entity_addr))) { 2631 2632 if (s->s_state == CEPH_MDS_SESSION_OPENING) {

+26 -11

fs/ceph/super.c

··· 307 307 { 308 308 struct ceph_mount_options *fsopt; 309 309 const char *dev_name_end; 310 - int err = -ENOMEM; 310 + int err; 311 + 312 + if (!dev_name || !*dev_name) 313 + return -EINVAL; 311 314 312 315 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); 313 316 if (!fsopt) ··· 331 328 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 332 329 fsopt->congestion_kb = default_congestion_kb(); 333 330 334 - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 331 + /* 332 + * Distinguish the server list from the path in "dev_name". 333 + * Internally we do not include the leading '/' in the path. 334 + * 335 + * "dev_name" will look like: 336 + * <server_spec>[,<server_spec>...]:[<path>] 337 + * where 338 + * <server_spec> is <ip>[:<port>] 339 + * <path> is optional, but if present must begin with '/' 340 + */ 341 + dev_name_end = strchr(dev_name, '/'); 342 + if (dev_name_end) { 343 + /* skip over leading '/' for path */ 344 + *path = dev_name_end + 1; 345 + } else { 346 + /* path is empty */ 347 + dev_name_end = dev_name + strlen(dev_name); 348 + *path = dev_name_end; 349 + } 335 350 err = -EINVAL; 336 - if (!dev_name) 337 - goto out; 338 - *path = strstr(dev_name, ":/"); 339 - if (*path == NULL) { 340 - pr_err("device name is missing path (no :/ in %s)\n", 351 + dev_name_end--; /* back up to ':' separator */ 352 + if (*dev_name_end != ':') { 353 + pr_err("device name is missing path (no : separator in %s)\n", 341 354 dev_name); 342 355 goto out; 343 356 } 344 - dev_name_end = *path; 345 357 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 346 - 347 - /* path on server */ 348 - *path += 2; 349 358 dout("server path '%s'\n", *path); 350 359 351 360 *popt = ceph_parse_options(options, dev_name, dev_name_end,

-1

include/linux/ceph/mon_client.h

··· 71 71 int cur_mon; /* last monitor i contacted */ 72 72 unsigned long sub_sent, sub_renew_after; 73 73 struct ceph_connection con; 74 - bool have_fsid; 75 74 76 75 /* pending generic requests */ 77 76 struct rb_root generic_request_tree;

+1 -1

include/linux/ceph/osd_client.h

··· 207 207 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 208 208 struct ceph_msg *msg); 209 209 210 - extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, 210 + extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, 211 211 struct ceph_file_layout *layout, 212 212 u64 snapid, 213 213 u64 off, u64 *plen, u64 *bno,

+3 -3

include/linux/ceph/osdmap.h

··· 109 109 extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 110 110 111 111 /* calculate mapping of a file extent to an object */ 112 - extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 113 - u64 off, u64 *plen, 114 - u64 *bno, u64 *oxoff, u64 *oxlen); 112 + extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 113 + u64 off, u64 *plen, 114 + u64 *bno, u64 *oxoff, u64 *oxlen); 115 115 116 116 /* calculate mapping of object to a placement group */ 117 117 extern int ceph_calc_object_layout(struct ceph_object_layout *ol,

+3 -4

net/ceph/mon_client.c

··· 637 637 /* 638 638 * Do a synchronous pool op. 639 639 */ 640 - int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, 640 + static int do_poolop(struct ceph_mon_client *monc, u32 op, 641 641 u32 pool, u64 snapid, 642 642 char *buf, int len) 643 643 { ··· 687 687 int ceph_monc_create_snapid(struct ceph_mon_client *monc, 688 688 u32 pool, u64 *snapid) 689 689 { 690 - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 690 + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 691 691 pool, 0, (char *)snapid, sizeof(*snapid)); 692 692 693 693 } ··· 696 696 int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 697 697 u32 pool, u64 snapid) 698 698 { 699 - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 699 + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 700 700 pool, snapid, 0, 0); 701 701 702 702 } ··· 769 769 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 770 770 } 771 771 monc->monmap->num_mon = num_mon; 772 - monc->have_fsid = false; 773 772 return 0; 774 773 } 775 774

+29 -18

net/ceph/osd_client.c

··· 52 52 op == CEPH_OSD_OP_WRITE); 53 53 } 54 54 55 - void ceph_calc_raw_layout(struct ceph_osd_client *osdc, 55 + int ceph_calc_raw_layout(struct ceph_osd_client *osdc, 56 56 struct ceph_file_layout *layout, 57 57 u64 snapid, 58 58 u64 off, u64 *plen, u64 *bno, ··· 62 62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 63 63 u64 orig_len = *plen; 64 64 u64 objoff, objlen; /* extent in object */ 65 + int r; 65 66 66 67 reqhead->snapid = cpu_to_le64(snapid); 67 68 68 69 /* object extent? */ 69 - ceph_calc_file_object_mapping(layout, off, plen, bno, 70 - &objoff, &objlen); 70 + r = ceph_calc_file_object_mapping(layout, off, plen, bno, 71 + &objoff, &objlen); 72 + if (r < 0) 73 + return r; 71 74 if (*plen < orig_len) 72 75 dout(" skipping last %llu, final file extent %llu~%llu\n", 73 76 orig_len - *plen, off, *plen); ··· 86 83 87 84 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 88 85 *bno, objoff, objlen, req->r_num_pages); 89 - 86 + return 0; 90 87 } 91 88 EXPORT_SYMBOL(ceph_calc_raw_layout); 92 89 ··· 115 112 * 116 113 * fill osd op in request message. 117 114 */ 118 - static void calc_layout(struct ceph_osd_client *osdc, 119 - struct ceph_vino vino, 120 - struct ceph_file_layout *layout, 121 - u64 off, u64 *plen, 122 - struct ceph_osd_request *req, 123 - struct ceph_osd_req_op *op) 115 + static int calc_layout(struct ceph_osd_client *osdc, 116 + struct ceph_vino vino, 117 + struct ceph_file_layout *layout, 118 + u64 off, u64 *plen, 119 + struct ceph_osd_request *req, 120 + struct ceph_osd_req_op *op) 124 121 { 125 122 u64 bno; 123 + int r; 126 124 127 - ceph_calc_raw_layout(osdc, layout, vino.snap, off, 128 - plen, &bno, req, op); 125 + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, 126 + plen, &bno, req, op); 127 + if (r < 0) 128 + return r; 129 129 130 130 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 131 131 req->r_oid_len = strlen(req->r_oid); 132 + 133 + return r; 132 134 } 133 135 134 136 /* ··· 464 456 { 465 457 struct ceph_osd_req_op ops[3]; 466 458 struct ceph_osd_request *req; 459 + int r; 467 460 468 461 ops[0].op = opcode; 469 462 ops[0].extent.truncate_seq = truncate_seq; ··· 483 474 use_mempool, 484 475 GFP_NOFS, NULL, NULL); 485 476 if (!req) 486 - return NULL; 477 + return ERR_PTR(-ENOMEM); 487 478 488 479 /* calculate max write size */ 489 - calc_layout(osdc, vino, layout, off, plen, req, ops); 480 + r = calc_layout(osdc, vino, layout, off, plen, req, ops); 481 + if (r < 0) 482 + return ERR_PTR(r); 490 483 req->r_file_layout = *layout; /* keep a copy */ 491 484 492 485 /* in case it differs from natural (file) alignment that ··· 1931 1920 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1932 1921 NULL, 0, truncate_seq, truncate_size, NULL, 1933 1922 false, 1, page_align); 1934 - if (!req) 1935 - return -ENOMEM; 1923 + if (IS_ERR(req)) 1924 + return PTR_ERR(req); 1936 1925 1937 1926 /* it may be a short read due to an object boundary */ 1938 1927 req->r_pages = pages; ··· 1974 1963 snapc, do_sync, 1975 1964 truncate_seq, truncate_size, mtime, 1976 1965 nofail, 1, page_align); 1977 - if (!req) 1978 - return -ENOMEM; 1966 + if (IS_ERR(req)) 1967 + return PTR_ERR(req); 1979 1968 1980 1969 /* it may be a short write due to an object boundary */ 1981 1970 req->r_pages = pages;

+16 -2

net/ceph/osdmap.c

··· 984 984 * for now, we write only a single su, until we can 985 985 * pass a stride back to the caller. 986 986 */ 987 - void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 987 + int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 988 988 u64 off, u64 *plen, 989 989 u64 *ono, 990 990 u64 *oxoff, u64 *oxlen) ··· 998 998 999 999 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 1000 1000 osize, su); 1001 + if (su == 0 || sc == 0) 1002 + goto invalid; 1001 1003 su_per_object = osize / su; 1004 + if (su_per_object == 0) 1005 + goto invalid; 1002 1006 dout("osize %u / su %u = su_per_object %u\n", osize, su, 1003 1007 su_per_object); 1004 1008 1005 - BUG_ON((su & ~PAGE_MASK) != 0); 1009 + if ((su & ~PAGE_MASK) != 0) 1010 + goto invalid; 1011 + 1006 1012 /* bl = *off / su; */ 1007 1013 t = off; 1008 1014 do_div(t, su); ··· 1036 1030 *plen = *oxlen; 1037 1031 1038 1032 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 1033 + return 0; 1034 + 1035 + invalid: 1036 + dout(" invalid layout\n"); 1037 + *ono = 0; 1038 + *oxoff = 0; 1039 + *oxlen = 0; 1040 + return -EINVAL; 1039 1041 } 1040 1042 EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1041 1043

+2 -3

net/ceph/pagelist.c

··· 1 - 2 1 #include <linux/module.h> 3 2 #include <linux/gfp.h> 4 3 #include <linux/pagemap.h> ··· 133 134 ceph_pagelist_unmap_tail(pl); 134 135 while (pl->head.prev != c->page_lru) { 135 136 page = list_entry(pl->head.prev, struct page, lru); 136 - list_del(&page->lru); /* remove from pagelist */ 137 - list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ 137 + /* move from pagelist to reserve */ 138 + list_move_tail(&page->lru, &pl->free_list); 138 139 ++pl->num_pages_free; 139 140 } 140 141 pl->room = c->room;