Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block

Pull block IO driver changes from Jens Axboe:

- bcache update from Kent Overstreet.

- two bcache fixes from Nicholas Swenson.

- cciss pci init error fix from Andrew.

- underflow fix in the parallel IDE pg_write code from Dan Carpenter.
I'm sure the 1 (or 0) users of that are now happy.

- two PCI related fixes for sx8 from Jingoo Han.

- floppy init fix for first block read from Jiri Kosina.

- pktcdvd error return miss fix from Julia Lawall.

- removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
Michael Opdenacker.

- comment typo fix for the loop driver from Olaf Hering.

- potential oops fix for null_blk from Raghavendra K T.

- two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
an OOM problem and a problem with handling security locked conditions.

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
mg_disk: Spelling s/finised/finished/
null_blk: Null pointer dereference problem in alloc_page_buffers
mtip32xx: Correctly handle security locked condition
mtip32xx: Make SGL container per-command to eliminate high order dma allocation
drivers/block/loop.c: fix comment typo in loop_config_discard
drivers/block/cciss.c:cciss_init_one(): use proper errnos
drivers/block/paride/pg.c: underflow bug in pg_write()
drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
drivers/block/sx8.c: use module_pci_driver()
floppy: bail out in open() if drive is not responding to block0 read
bcache: Fix auxiliary search trees for key size > cacheline size
bcache: Don't return -EINTR when insert finished
bcache: Improve bucket_prio() calculation
bcache: Add bch_bkey_equal_header()
bcache: update bch_bkey_try_merge
bcache: Move insert_fixup() to btree_keys_ops
bcache: Convert sorting to btree_keys
bcache: Convert debug code to btree_keys
bcache: Convert btree_iter to struct btree_keys
bcache: Refactor bset_tree sysfs stats
...

+2453 -1913
+4
block/blk-settings.c
··· 592 592 ret = -1; 593 593 } 594 594 595 + t->raid_partial_stripes_expensive = 596 + max(t->raid_partial_stripes_expensive, 597 + b->raid_partial_stripes_expensive); 598 + 595 599 /* Find lowest common alignment_offset */ 596 600 t->alignment_offset = lcm(t->alignment_offset, alignment) 597 601 & (max(t->physical_block_size, t->io_min) - 1);
+2 -2
drivers/block/cciss.c
··· 5004 5004 5005 5005 i = alloc_cciss_hba(pdev); 5006 5006 if (i < 0) 5007 - return -1; 5007 + return -ENOMEM; 5008 5008 5009 5009 h = hba[i]; 5010 5010 h->pdev = pdev; ··· 5205 5205 */ 5206 5206 pci_set_drvdata(pdev, NULL); 5207 5207 free_hba(h); 5208 - return -1; 5208 + return -ENODEV; 5209 5209 } 5210 5210 5211 5211 static void cciss_shutdown(struct pci_dev *pdev)
+27 -9
drivers/block/floppy.c
··· 3691 3691 if (!(mode & FMODE_NDELAY)) { 3692 3692 if (mode & (FMODE_READ|FMODE_WRITE)) { 3693 3693 UDRS->last_checked = 0; 3694 + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); 3694 3695 check_disk_change(bdev); 3695 3696 if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) 3697 + goto out; 3698 + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) 3696 3699 goto out; 3697 3700 } 3698 3701 res = -EROFS; ··· 3749 3746 * a disk in the drive, and whether that disk is writable. 3750 3747 */ 3751 3748 3752 - static void floppy_rb0_complete(struct bio *bio, int err) 3749 + struct rb0_cbdata { 3750 + int drive; 3751 + struct completion complete; 3752 + }; 3753 + 3754 + static void floppy_rb0_cb(struct bio *bio, int err) 3753 3755 { 3754 - complete((struct completion *)bio->bi_private); 3756 + struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; 3757 + int drive = cbdata->drive; 3758 + 3759 + if (err) { 3760 + pr_info("floppy: error %d while reading block 0", err); 3761 + set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); 3762 + } 3763 + complete(&cbdata->complete); 3755 3764 } 3756 3765 3757 - static int __floppy_read_block_0(struct block_device *bdev) 3766 + static int __floppy_read_block_0(struct block_device *bdev, int drive) 3758 3767 { 3759 3768 struct bio bio; 3760 3769 struct bio_vec bio_vec; 3761 - struct completion complete; 3762 3770 struct page *page; 3771 + struct rb0_cbdata cbdata; 3763 3772 size_t size; 3764 3773 3765 3774 page = alloc_page(GFP_NOIO); ··· 3784 3769 if (!size) 3785 3770 size = 1024; 3786 3771 3772 + cbdata.drive = drive; 3773 + 3787 3774 bio_init(&bio); 3788 3775 bio.bi_io_vec = &bio_vec; 3789 3776 bio_vec.bv_page = page; ··· 3796 3779 bio.bi_bdev = bdev; 3797 3780 bio.bi_iter.bi_sector = 0; 3798 3781 bio.bi_flags = (1 << BIO_QUIET); 3799 - init_completion(&complete); 3800 - bio.bi_private = &complete; 3801 - bio.bi_end_io = floppy_rb0_complete; 3782 + bio.bi_private = &cbdata; 3783 + bio.bi_end_io = floppy_rb0_cb; 3802 3784 
3803 3785 submit_bio(READ, &bio); 3804 3786 process_fd_request(); 3805 - wait_for_completion(&complete); 3787 + 3788 + init_completion(&cbdata.complete); 3789 + wait_for_completion(&cbdata.complete); 3806 3790 3807 3791 __free_page(page); 3808 3792 ··· 3845 3827 UDRS->generation++; 3846 3828 if (drive_no_geom(drive)) { 3847 3829 /* auto-sensing */ 3848 - res = __floppy_read_block_0(opened_bdev[drive]); 3830 + res = __floppy_read_block_0(opened_bdev[drive], drive); 3849 3831 } else { 3850 3832 if (cf) 3851 3833 poll_drive(false, FD_RAW_NEED_DISK);
+1 -1
drivers/block/loop.c
··· 799 799 800 800 /* 801 801 * We use punch hole to reclaim the free space used by the 802 - * image a.k.a. discard. However we do support discard if 802 + * image a.k.a. discard. However we do not support discard if 803 803 * encryption is enabled, because it may give an attacker 804 804 * useful information. 805 805 */
+1 -1
drivers/block/mg_disk.c
··· 915 915 916 916 /* disk reset */ 917 917 if (prv_data->dev_attr == MG_STORAGE_DEV) { 918 - /* If POR seq. not yet finised, wait */ 918 + /* If POR seq. not yet finished, wait */ 919 919 err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); 920 920 if (err) 921 921 goto probe_err_3b;
+158 -92
drivers/block/mtip32xx/mtip32xx.c
··· 41 41 #include "mtip32xx.h" 42 42 43 43 #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) 44 - #define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16)) 45 - #define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS) 46 - #define HW_PORT_PRIV_DMA_SZ \ 47 - (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) 44 + 45 + /* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */ 46 + #define AHCI_RX_FIS_SZ 0x100 47 + #define AHCI_RX_FIS_OFFSET 0x0 48 + #define AHCI_IDFY_SZ ATA_SECT_SIZE 49 + #define AHCI_IDFY_OFFSET 0x400 50 + #define AHCI_SECTBUF_SZ ATA_SECT_SIZE 51 + #define AHCI_SECTBUF_OFFSET 0x800 52 + #define AHCI_SMARTBUF_SZ ATA_SECT_SIZE 53 + #define AHCI_SMARTBUF_OFFSET 0xC00 54 + /* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */ 55 + #define BLOCK_DMA_ALLOC_SZ 4096 56 + 57 + /* DMA region containing command table (should be 8192 bytes) */ 58 + #define AHCI_CMD_SLOT_SZ sizeof(struct mtip_cmd_hdr) 59 + #define AHCI_CMD_TBL_SZ (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ) 60 + #define AHCI_CMD_TBL_OFFSET 0x0 61 + 62 + /* DMA region per command (contains header and SGL) */ 63 + #define AHCI_CMD_TBL_HDR_SZ 0x80 64 + #define AHCI_CMD_TBL_HDR_OFFSET 0x0 65 + #define AHCI_CMD_TBL_SGL_SZ (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg)) 66 + #define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ 67 + #define CMD_DMA_ALLOC_SZ (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ) 68 + 48 69 49 70 #define HOST_CAP_NZDMA (1 << 19) 50 71 #define HOST_HSORG 0xFC ··· 920 899 fail_reason = "thermal shutdown"; 921 900 } 922 901 if (buf[288] == 0xBF) { 902 + set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); 923 903 dev_info(&dd->pdev->dev, 924 - "Drive indicates rebuild has failed.\n"); 904 + "Drive indicates rebuild has failed. 
Secure erase required.\n"); 925 905 fail_all_ncq_cmds = 1; 926 906 fail_reason = "rebuild failed"; 927 907 } ··· 1588 1566 } 1589 1567 #endif 1590 1568 1569 + /* Check security locked state */ 1570 + if (port->identify[128] & 0x4) 1571 + set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); 1572 + else 1573 + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); 1574 + 1591 1575 #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ 1592 1576 /* Demux ID.DRAT & ID.RZAT to determine trim support */ 1593 1577 if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) ··· 1914 1886 1915 1887 strlcpy(cbuf, (char *)(port->identify+27), 41); 1916 1888 dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); 1889 + 1890 + dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", 1891 + port->identify[128], 1892 + port->identify[128] & 0x4 ? "(LOCKED)" : ""); 1917 1893 1918 1894 if (mtip_hw_get_capacity(port->dd, &sectors)) 1919 1895 dev_info(&port->dd->pdev->dev, ··· 3345 3313 } 3346 3314 3347 3315 /* 3316 + * DMA region teardown 3317 + * 3318 + * @dd Pointer to driver_data structure 3319 + * 3320 + * return value 3321 + * None 3322 + */ 3323 + static void mtip_dma_free(struct driver_data *dd) 3324 + { 3325 + int i; 3326 + struct mtip_port *port = dd->port; 3327 + 3328 + if (port->block1) 3329 + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, 3330 + port->block1, port->block1_dma); 3331 + 3332 + if (port->command_list) { 3333 + dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, 3334 + port->command_list, port->command_list_dma); 3335 + } 3336 + 3337 + for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { 3338 + if (port->commands[i].command) 3339 + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, 3340 + port->commands[i].command, 3341 + port->commands[i].command_dma); 3342 + } 3343 + } 3344 + 3345 + /* 3346 + * DMA region setup 3347 + * 3348 + * @dd Pointer to driver_data structure 3349 + * 3350 + * return value 3351 + * -ENOMEM Not enough free DMA region space to 
initialize driver 3352 + */ 3353 + static int mtip_dma_alloc(struct driver_data *dd) 3354 + { 3355 + struct mtip_port *port = dd->port; 3356 + int i, rv = 0; 3357 + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; 3358 + 3359 + /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ 3360 + port->block1 = 3361 + dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, 3362 + &port->block1_dma, GFP_KERNEL); 3363 + if (!port->block1) 3364 + return -ENOMEM; 3365 + memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ); 3366 + 3367 + /* Allocate dma memory for command list */ 3368 + port->command_list = 3369 + dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, 3370 + &port->command_list_dma, GFP_KERNEL); 3371 + if (!port->command_list) { 3372 + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, 3373 + port->block1, port->block1_dma); 3374 + port->block1 = NULL; 3375 + port->block1_dma = 0; 3376 + return -ENOMEM; 3377 + } 3378 + memset(port->command_list, 0, AHCI_CMD_TBL_SZ); 3379 + 3380 + /* Setup all pointers into first DMA region */ 3381 + port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET; 3382 + port->rxfis_dma = port->block1_dma + AHCI_RX_FIS_OFFSET; 3383 + port->identify = port->block1 + AHCI_IDFY_OFFSET; 3384 + port->identify_dma = port->block1_dma + AHCI_IDFY_OFFSET; 3385 + port->log_buf = port->block1 + AHCI_SECTBUF_OFFSET; 3386 + port->log_buf_dma = port->block1_dma + AHCI_SECTBUF_OFFSET; 3387 + port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; 3388 + port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; 3389 + 3390 + /* Setup per command SGL DMA region */ 3391 + 3392 + /* Point the command headers at the command tables */ 3393 + for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { 3394 + port->commands[i].command = 3395 + dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, 3396 + &port->commands[i].command_dma, GFP_KERNEL); 3397 + if (!port->commands[i].command) { 3398 + rv = -ENOMEM; 3399 + mtip_dma_free(dd); 3400 + return rv; 3401 + } 
3402 + memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); 3403 + 3404 + port->commands[i].command_header = port->command_list + 3405 + (sizeof(struct mtip_cmd_hdr) * i); 3406 + port->commands[i].command_header_dma = 3407 + dd->port->command_list_dma + 3408 + (sizeof(struct mtip_cmd_hdr) * i); 3409 + 3410 + if (host_cap_64) 3411 + port->commands[i].command_header->ctbau = 3412 + __force_bit2int cpu_to_le32( 3413 + (port->commands[i].command_dma >> 16) >> 16); 3414 + 3415 + port->commands[i].command_header->ctba = 3416 + __force_bit2int cpu_to_le32( 3417 + port->commands[i].command_dma & 0xFFFFFFFF); 3418 + 3419 + sg_init_table(port->commands[i].sg, MTIP_MAX_SG); 3420 + 3421 + /* Mark command as currently inactive */ 3422 + atomic_set(&dd->port->commands[i].active, 0); 3423 + } 3424 + return 0; 3425 + } 3426 + 3427 + /* 3348 3428 * Called once for each card. 3349 3429 * 3350 3430 * @dd Pointer to the driver data structure. ··· 3514 3370 dd->port->mmio = dd->mmio + PORT_OFFSET; 3515 3371 dd->port->dd = dd; 3516 3372 3517 - /* Allocate memory for the command list. */ 3518 - dd->port->command_list = 3519 - dmam_alloc_coherent(&dd->pdev->dev, 3520 - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), 3521 - &dd->port->command_list_dma, 3522 - GFP_KERNEL); 3523 - if (!dd->port->command_list) { 3524 - dev_err(&dd->pdev->dev, 3525 - "Memory allocation: command list\n"); 3526 - rv = -ENOMEM; 3373 + /* DMA allocations */ 3374 + rv = mtip_dma_alloc(dd); 3375 + if (rv < 0) 3527 3376 goto out1; 3528 - } 3529 - 3530 - /* Clear the memory we have allocated. */ 3531 - memset(dd->port->command_list, 3532 - 0, 3533 - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4)); 3534 - 3535 - /* Setup the addresse of the RX FIS. */ 3536 - dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ; 3537 - dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ; 3538 - 3539 - /* Setup the address of the command tables. 
*/ 3540 - dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ; 3541 - dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ; 3542 - 3543 - /* Setup the address of the identify data. */ 3544 - dd->port->identify = dd->port->command_table + 3545 - HW_CMD_TBL_AR_SZ; 3546 - dd->port->identify_dma = dd->port->command_tbl_dma + 3547 - HW_CMD_TBL_AR_SZ; 3548 - 3549 - /* Setup the address of the sector buffer - for some non-ncq cmds */ 3550 - dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE; 3551 - dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; 3552 - 3553 - /* Setup the address of the log buf - for read log command */ 3554 - dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE; 3555 - dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE; 3556 - 3557 - /* Setup the address of the smart buf - for smart read data command */ 3558 - dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE; 3559 - dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE; 3560 - 3561 - 3562 - /* Point the command headers at the command tables. 
*/ 3563 - for (i = 0; i < num_command_slots; i++) { 3564 - dd->port->commands[i].command_header = 3565 - dd->port->command_list + 3566 - (sizeof(struct mtip_cmd_hdr) * i); 3567 - dd->port->commands[i].command_header_dma = 3568 - dd->port->command_list_dma + 3569 - (sizeof(struct mtip_cmd_hdr) * i); 3570 - 3571 - dd->port->commands[i].command = 3572 - dd->port->command_table + (HW_CMD_TBL_SZ * i); 3573 - dd->port->commands[i].command_dma = 3574 - dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i); 3575 - 3576 - if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) 3577 - dd->port->commands[i].command_header->ctbau = 3578 - __force_bit2int cpu_to_le32( 3579 - (dd->port->commands[i].command_dma >> 16) >> 16); 3580 - dd->port->commands[i].command_header->ctba = 3581 - __force_bit2int cpu_to_le32( 3582 - dd->port->commands[i].command_dma & 0xFFFFFFFF); 3583 - 3584 - /* 3585 - * If this is not done, a bug is reported by the stock 3586 - * FC11 i386. Due to the fact that it has lots of kernel 3587 - * debugging enabled. 3588 - */ 3589 - sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG); 3590 - 3591 - /* Mark all commands as currently inactive.*/ 3592 - atomic_set(&dd->port->commands[i].active, 0); 3593 - } 3594 3377 3595 3378 /* Setup the pointers to the extended s_active and CI registers. */ 3596 3379 for (i = 0; i < dd->slot_groups; i++) { ··· 3665 3594 3666 3595 out2: 3667 3596 mtip_deinit_port(dd->port); 3597 + mtip_dma_free(dd); 3668 3598 3669 - /* Free the command/command header memory. */ 3670 - dmam_free_coherent(&dd->pdev->dev, 3671 - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), 3672 - dd->port->command_list, 3673 - dd->port->command_list_dma); 3674 3599 out1: 3675 3600 /* Free the memory allocated for the for structure. */ 3676 3601 kfree(dd->port); ··· 3689 3622 * saves its state. 
3690 3623 */ 3691 3624 if (!dd->sr) { 3692 - if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) 3625 + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && 3626 + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) 3693 3627 if (mtip_standby_immediate(dd->port)) 3694 3628 dev_warn(&dd->pdev->dev, 3695 3629 "STANDBY IMMEDIATE failed\n"); ··· 3709 3641 irq_set_affinity_hint(dd->pdev->irq, NULL); 3710 3642 devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); 3711 3643 3712 - /* Free the command/command header memory. */ 3713 - dmam_free_coherent(&dd->pdev->dev, 3714 - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), 3715 - dd->port->command_list, 3716 - dd->port->command_list_dma); 3644 + /* Free dma regions */ 3645 + mtip_dma_free(dd); 3646 + 3717 3647 /* Free the memory allocated for the for structure. */ 3718 3648 kfree(dd->port); 3719 3649 dd->port = NULL;
+6 -8
drivers/block/mtip32xx/mtip32xx.h
··· 69 69 * Maximum number of scatter gather entries 70 70 * a single command may have. 71 71 */ 72 - #define MTIP_MAX_SG 128 72 + #define MTIP_MAX_SG 504 73 73 74 74 /* 75 75 * Maximum number of slot groups (Command Issue & s_active registers) ··· 92 92 93 93 /* Driver name and version strings */ 94 94 #define MTIP_DRV_NAME "mtip32xx" 95 - #define MTIP_DRV_VERSION "1.2.6os3" 95 + #define MTIP_DRV_VERSION "1.3.0" 96 96 97 97 /* Maximum number of minor device numbers per device. */ 98 98 #define MTIP_MAX_MINORS 16 ··· 391 391 */ 392 392 dma_addr_t rxfis_dma; 393 393 /* 394 - * Pointer to the beginning of the command table memory as used 395 - * by the driver. 394 + * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART 396 395 */ 397 - void *command_table; 396 + void *block1; 398 397 /* 399 - * Pointer to the beginning of the command table memory as used 400 - * by the DMA. 398 + * DMA address of region for RX Fis, Identify, RLE10, and SMART 401 399 */ 402 - dma_addr_t command_tbl_dma; 400 + dma_addr_t block1_dma; 403 401 /* 404 402 * Pointer to the beginning of the identify data memory as used 405 403 * by the driver.
+5
drivers/block/null_blk.c
··· 616 616 irqmode = NULL_IRQ_NONE; 617 617 } 618 618 #endif 619 + if (bs > PAGE_SIZE) { 620 + pr_warn("null_blk: invalid block size\n"); 621 + pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); 622 + bs = PAGE_SIZE; 623 + } 619 624 620 625 if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { 621 626 if (submit_queues < nr_online_nodes) {
+1 -1
drivers/block/paride/pg.c
··· 581 581 582 582 if (hdr.magic != PG_MAGIC) 583 583 return -EINVAL; 584 - if (hdr.dlen > PG_MAX_DATA) 584 + if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA) 585 585 return -EINVAL; 586 586 if ((count - hs) > PG_MAX_DATA) 587 587 return -EINVAL;
+3 -1
drivers/block/pktcdvd.c
··· 706 706 WRITE : READ, __GFP_WAIT); 707 707 708 708 if (cgc->buflen) { 709 - if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT)) 709 + ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, 710 + __GFP_WAIT); 711 + if (ret) 710 712 goto out; 711 713 } 712 714
+1 -15
drivers/block/sx8.c
··· 1744 1744 kfree(host); 1745 1745 pci_release_regions(pdev); 1746 1746 pci_disable_device(pdev); 1747 - pci_set_drvdata(pdev, NULL); 1748 1747 } 1749 1748 1750 - static int __init carm_init(void) 1751 - { 1752 - return pci_register_driver(&carm_driver); 1753 - } 1754 - 1755 - static void __exit carm_exit(void) 1756 - { 1757 - pci_unregister_driver(&carm_driver); 1758 - } 1759 - 1760 - module_init(carm_init); 1761 - module_exit(carm_exit); 1762 - 1763 - 1749 + module_pci_driver(carm_driver);
+2 -2
drivers/cdrom/gdrom.c
··· 561 561 int err; 562 562 563 563 err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt, 564 - IRQF_DISABLED, "gdrom_command", &gd); 564 + 0, "gdrom_command", &gd); 565 565 if (err) 566 566 return err; 567 567 err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt, 568 - IRQF_DISABLED, "gdrom_dma", &gd); 568 + 0, "gdrom_dma", &gd); 569 569 if (err) 570 570 free_irq(HW_EVENT_GDROM_CMD, &gd); 571 571 return err;
+3 -2
drivers/md/bcache/Makefile
··· 1 1 2 2 obj-$(CONFIG_BCACHE) += bcache.o 3 3 4 - bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ 5 - movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o 4 + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ 5 + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ 6 + util.o writeback.o 6 7 7 8 CFLAGS_request.o += -Iblock
+59 -30
drivers/md/bcache/alloc.c
··· 132 132 { 133 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 134 134 135 - if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && 136 - CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) 137 - return false; 135 + if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { 136 + unsigned i; 138 137 138 + for (i = 0; i < RESERVE_NONE; i++) 139 + if (!fifo_full(&ca->free[i])) 140 + goto add; 141 + 142 + return false; 143 + } 144 + add: 139 145 b->prio = 0; 140 146 141 147 if (can_inc_bucket_gen(b) && ··· 168 162 fifo_push(&ca->free_inc, b - ca->buckets); 169 163 } 170 164 171 - #define bucket_prio(b) \ 172 - (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) 165 + /* 166 + * Determines what order we're going to reuse buckets, smallest bucket_prio() 167 + * first: we also take into account the number of sectors of live data in that 168 + * bucket, and in order for that multiply to make sense we have to scale bucket 169 + * 170 + * Thus, we scale the bucket priorities so that the bucket with the smallest 171 + * prio is worth 1/8th of what INITIAL_PRIO is worth. 
172 + */ 173 + 174 + #define bucket_prio(b) \ 175 + ({ \ 176 + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ 177 + \ 178 + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ 179 + }) 173 180 174 181 #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 175 182 #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) ··· 323 304 __set_current_state(TASK_RUNNING); \ 324 305 } while (0) 325 306 307 + static int bch_allocator_push(struct cache *ca, long bucket) 308 + { 309 + unsigned i; 310 + 311 + /* Prios/gens are actually the most important reserve */ 312 + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) 313 + return true; 314 + 315 + for (i = 0; i < RESERVE_NR; i++) 316 + if (fifo_push(&ca->free[i], bucket)) 317 + return true; 318 + 319 + return false; 320 + } 321 + 326 322 static int bch_allocator_thread(void *arg) 327 323 { 328 324 struct cache *ca = arg; ··· 370 336 mutex_lock(&ca->set->bucket_lock); 371 337 } 372 338 373 - allocator_wait(ca, !fifo_full(&ca->free)); 374 - 375 - fifo_push(&ca->free, bucket); 339 + allocator_wait(ca, bch_allocator_push(ca, bucket)); 376 340 wake_up(&ca->set->bucket_wait); 377 341 } 378 342 ··· 397 365 } 398 366 } 399 367 400 - long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 368 + long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) 401 369 { 402 370 DEFINE_WAIT(w); 403 371 struct bucket *b; 404 372 long r; 405 373 406 374 /* fastpath */ 407 - if (fifo_used(&ca->free) > ca->watermark[watermark]) { 408 - fifo_pop(&ca->free, r); 375 + if (fifo_pop(&ca->free[RESERVE_NONE], r) || 376 + fifo_pop(&ca->free[reserve], r)) 409 377 goto out; 410 - } 411 378 412 379 if (!wait) 413 380 return -1; 414 381 415 - while (1) { 416 - if (fifo_used(&ca->free) > ca->watermark[watermark]) { 417 - fifo_pop(&ca->free, r); 418 - break; 419 - } 420 - 382 + do { 421 383 prepare_to_wait(&ca->set->bucket_wait, &w, 422 384 TASK_UNINTERRUPTIBLE); 423 385 424 386 
mutex_unlock(&ca->set->bucket_lock); 425 387 schedule(); 426 388 mutex_lock(&ca->set->bucket_lock); 427 - } 389 + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && 390 + !fifo_pop(&ca->free[reserve], r)); 428 391 429 392 finish_wait(&ca->set->bucket_wait, &w); 430 393 out: ··· 428 401 if (expensive_debug_checks(ca->set)) { 429 402 size_t iter; 430 403 long i; 404 + unsigned j; 431 405 432 406 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 433 407 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 434 408 435 - fifo_for_each(i, &ca->free, iter) 436 - BUG_ON(i == r); 409 + for (j = 0; j < RESERVE_NR; j++) 410 + fifo_for_each(i, &ca->free[j], iter) 411 + BUG_ON(i == r); 437 412 fifo_for_each(i, &ca->free_inc, iter) 438 413 BUG_ON(i == r); 439 414 fifo_for_each(i, &ca->unused, iter) ··· 448 419 449 420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 450 421 451 - if (watermark <= WATERMARK_METADATA) { 422 + if (reserve <= RESERVE_PRIO) { 452 423 SET_GC_MARK(b, GC_MARK_METADATA); 453 424 SET_GC_MOVE(b, 0); 454 425 b->prio = BTREE_PRIO; ··· 474 445 } 475 446 } 476 447 477 - int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 448 + int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 478 449 struct bkey *k, int n, bool wait) 479 450 { 480 451 int i; ··· 488 459 489 460 for (i = 0; i < n; i++) { 490 461 struct cache *ca = c->cache_by_alloc[i]; 491 - long b = bch_bucket_alloc(ca, watermark, wait); 462 + long b = bch_bucket_alloc(ca, reserve, wait); 492 463 493 464 if (b == -1) 494 465 goto err; ··· 507 478 return -1; 508 479 } 509 480 510 - int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 481 + int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 511 482 struct bkey *k, int n, bool wait) 512 483 { 513 484 int ret; 514 485 mutex_lock(&c->bucket_lock); 515 - ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 486 + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); 516 487 mutex_unlock(&c->bucket_lock); 517 488 
return ret; 518 489 } ··· 602 573 603 574 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 604 575 unsigned watermark = write_prio 605 - ? WATERMARK_MOVINGGC 606 - : WATERMARK_NONE; 576 + ? RESERVE_MOVINGGC 577 + : RESERVE_NONE; 607 578 608 579 spin_unlock(&c->data_bucket_lock); 609 580 ··· 718 689 * Then 8 for btree allocations 719 690 * Then half for the moving garbage collector 720 691 */ 721 - 692 + #if 0 722 693 ca->watermark[WATERMARK_PRIO] = 0; 723 694 724 695 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); ··· 728 699 729 700 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 730 701 ca->watermark[WATERMARK_MOVINGGC]; 731 - 702 + #endif 732 703 return 0; 733 704 }
+37 -47
drivers/md/bcache/bcache.h
··· 187 187 #include <linux/types.h> 188 188 #include <linux/workqueue.h> 189 189 190 + #include "bset.h" 190 191 #include "util.h" 191 192 #include "closure.h" 192 193 ··· 310 309 struct cache_sb sb; 311 310 struct bio sb_bio; 312 311 struct bio_vec sb_bv[1]; 313 - struct closure_with_waitlist sb_write; 312 + struct closure sb_write; 313 + struct semaphore sb_write_mutex; 314 314 315 315 /* Refcount on the cache set. Always nonzero when we're caching. */ 316 316 atomic_t count; ··· 384 382 unsigned writeback_rate_p_term_inverse; 385 383 }; 386 384 387 - enum alloc_watermarks { 388 - WATERMARK_PRIO, 389 - WATERMARK_METADATA, 390 - WATERMARK_MOVINGGC, 391 - WATERMARK_NONE, 392 - WATERMARK_MAX 385 + enum alloc_reserve { 386 + RESERVE_BTREE, 387 + RESERVE_PRIO, 388 + RESERVE_MOVINGGC, 389 + RESERVE_NONE, 390 + RESERVE_NR, 393 391 }; 394 392 395 393 struct cache { ··· 400 398 401 399 struct kobject kobj; 402 400 struct block_device *bdev; 403 - 404 - unsigned watermark[WATERMARK_MAX]; 405 401 406 402 struct task_struct *alloc_thread; 407 403 ··· 429 429 * because all the data they contained was overwritten), so we only 430 430 * need to discard them before they can be moved to the free list. 
431 431 */ 432 - DECLARE_FIFO(long, free); 432 + DECLARE_FIFO(long, free)[RESERVE_NR]; 433 433 DECLARE_FIFO(long, free_inc); 434 434 DECLARE_FIFO(long, unused); 435 435 ··· 514 514 uint64_t cached_dev_sectors; 515 515 struct closure caching; 516 516 517 - struct closure_with_waitlist sb_write; 517 + struct closure sb_write; 518 + struct semaphore sb_write_mutex; 518 519 519 520 mempool_t *search; 520 521 mempool_t *bio_meta; ··· 630 629 631 630 #ifdef CONFIG_BCACHE_DEBUG 632 631 struct btree *verify_data; 632 + struct bset *verify_ondisk; 633 633 struct mutex verify_lock; 634 634 #endif 635 635 636 636 unsigned nr_uuids; 637 637 struct uuid_entry *uuids; 638 638 BKEY_PADDED(uuid_bucket); 639 - struct closure_with_waitlist uuid_write; 639 + struct closure uuid_write; 640 + struct semaphore uuid_write_mutex; 640 641 641 642 /* 642 643 * A btree node on disk could have too many bsets for an iterator to fit ··· 646 643 */ 647 644 mempool_t *fill_iter; 648 645 649 - /* 650 - * btree_sort() is a merge sort and requires temporary space - single 651 - * element mempool 652 - */ 653 - struct mutex sort_lock; 654 - struct bset *sort; 655 - unsigned sort_crit_factor; 646 + struct bset_sort_state sort; 656 647 657 648 /* List of buckets we're currently writing data to */ 658 649 struct list_head data_buckets; ··· 662 665 unsigned congested_read_threshold_us; 663 666 unsigned congested_write_threshold_us; 664 667 665 - struct time_stats sort_time; 666 668 struct time_stats btree_gc_time; 667 669 struct time_stats btree_split_time; 668 670 struct time_stats btree_read_time; ··· 679 683 unsigned error_decay; 680 684 681 685 unsigned short journal_delay_ms; 686 + bool expensive_debug_checks; 682 687 unsigned verify:1; 683 688 unsigned key_merging_disabled:1; 684 - unsigned expensive_debug_checks:1; 685 689 unsigned gc_always_rewrite:1; 686 690 unsigned shrinker_disabled:1; 687 691 unsigned copy_gc_enabled:1; ··· 703 707 struct bio bio; 704 708 }; 705 709 706 - static inline 
unsigned local_clock_us(void) 707 - { 708 - return local_clock() >> 10; 709 - } 710 - 711 710 #define BTREE_PRIO USHRT_MAX 712 - #define INITIAL_PRIO 32768 711 + #define INITIAL_PRIO 32768U 713 712 714 713 #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) 715 714 #define btree_blocks(b) \ ··· 716 725 #define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) 717 726 #define bucket_bytes(c) ((c)->sb.bucket_size << 9) 718 727 #define block_bytes(c) ((c)->sb.block_size << 9) 719 - 720 - #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) 721 - #define set_bytes(i) __set_bytes(i, i->keys) 722 - 723 - #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) 724 - #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) 725 - 726 - #define node(i, j) ((struct bkey *) ((i)->d + (j))) 727 - #define end(i) node(i, (i)->keys) 728 - 729 - #define index(i, b) \ 730 - ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ 731 - block_bytes(b->c))) 732 - 733 - #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) 734 728 735 729 #define prios_per_bucket(c) \ 736 730 ((bucket_bytes(c) - sizeof(struct prio_set)) / \ ··· 759 783 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); 760 784 } 761 785 762 - /* Btree key macros */ 763 - 764 - static inline void bkey_init(struct bkey *k) 786 + static inline uint8_t gen_after(uint8_t a, uint8_t b) 765 787 { 766 - *k = ZERO_KEY; 788 + uint8_t r = a - b; 789 + return r > 128U ? 
0 : r; 767 790 } 791 + 792 + static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, 793 + unsigned i) 794 + { 795 + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); 796 + } 797 + 798 + static inline bool ptr_available(struct cache_set *c, const struct bkey *k, 799 + unsigned i) 800 + { 801 + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); 802 + } 803 + 804 + /* Btree key macros */ 768 805 769 806 /* 770 807 * This is used for various on disk data structures - cache_sb, prio_set, bset, ··· 785 796 */ 786 797 #define csum_set(i) \ 787 798 bch_crc64(((void *) (i)) + sizeof(uint64_t), \ 788 - ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) 799 + ((void *) bset_bkey_last(i)) - \ 800 + (((void *) (i)) + sizeof(uint64_t))) 789 801 790 802 /* Error handling macros */ 791 803
+509 -407
drivers/md/bcache/bset.c
··· 5 5 * Copyright 2012 Google, Inc. 6 6 */ 7 7 8 - #include "bcache.h" 9 - #include "btree.h" 10 - #include "debug.h" 8 + #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 11 9 10 + #include "util.h" 11 + #include "bset.h" 12 + 13 + #include <linux/console.h> 12 14 #include <linux/random.h> 13 15 #include <linux/prefetch.h> 14 16 17 + #ifdef CONFIG_BCACHE_DEBUG 18 + 19 + void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) 20 + { 21 + struct bkey *k, *next; 22 + 23 + for (k = i->start; k < bset_bkey_last(i); k = next) { 24 + next = bkey_next(k); 25 + 26 + printk(KERN_ERR "block %u key %zi/%u: ", set, 27 + (uint64_t *) k - i->d, i->keys); 28 + 29 + if (b->ops->key_dump) 30 + b->ops->key_dump(b, k); 31 + else 32 + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); 33 + 34 + if (next < bset_bkey_last(i) && 35 + bkey_cmp(k, b->ops->is_extents ? 36 + &START_KEY(next) : next) > 0) 37 + printk(KERN_ERR "Key skipped backwards\n"); 38 + } 39 + } 40 + 41 + void bch_dump_bucket(struct btree_keys *b) 42 + { 43 + unsigned i; 44 + 45 + console_lock(); 46 + for (i = 0; i <= b->nsets; i++) 47 + bch_dump_bset(b, b->set[i].data, 48 + bset_sector_offset(b, b->set[i].data)); 49 + console_unlock(); 50 + } 51 + 52 + int __bch_count_data(struct btree_keys *b) 53 + { 54 + unsigned ret = 0; 55 + struct btree_iter iter; 56 + struct bkey *k; 57 + 58 + if (b->ops->is_extents) 59 + for_each_key(b, k, &iter) 60 + ret += KEY_SIZE(k); 61 + return ret; 62 + } 63 + 64 + void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
65 + { 66 + va_list args; 67 + struct bkey *k, *p = NULL; 68 + struct btree_iter iter; 69 + const char *err; 70 + 71 + for_each_key(b, k, &iter) { 72 + if (b->ops->is_extents) { 73 + err = "Keys out of order"; 74 + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) 75 + goto bug; 76 + 77 + if (bch_ptr_invalid(b, k)) 78 + continue; 79 + 80 + err = "Overlapping keys"; 81 + if (p && bkey_cmp(p, &START_KEY(k)) > 0) 82 + goto bug; 83 + } else { 84 + if (bch_ptr_bad(b, k)) 85 + continue; 86 + 87 + err = "Duplicate keys"; 88 + if (p && !bkey_cmp(p, k)) 89 + goto bug; 90 + } 91 + p = k; 92 + } 93 + #if 0 94 + err = "Key larger than btree node key"; 95 + if (p && bkey_cmp(p, &b->key) > 0) 96 + goto bug; 97 + #endif 98 + return; 99 + bug: 100 + bch_dump_bucket(b); 101 + 102 + va_start(args, fmt); 103 + vprintk(fmt, args); 104 + va_end(args); 105 + 106 + panic("bch_check_keys error: %s:\n", err); 107 + } 108 + 109 + static void bch_btree_iter_next_check(struct btree_iter *iter) 110 + { 111 + struct bkey *k = iter->data->k, *next = bkey_next(k); 112 + 113 + if (next < iter->data->end && 114 + bkey_cmp(k, iter->b->ops->is_extents ? 115 + &START_KEY(next) : next) > 0) { 116 + bch_dump_bucket(iter->b); 117 + panic("Key skipped backwards\n"); 118 + } 119 + } 120 + 121 + #else 122 + 123 + static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} 124 + 125 + #endif 126 + 15 127 /* Keylists */ 16 128 17 - int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 129 + int __bch_keylist_realloc(struct keylist *l, unsigned u64s) 18 130 { 19 131 size_t oldsize = bch_keylist_nkeys(l); 20 - size_t newsize = oldsize + 2 + nptrs; 132 + size_t newsize = oldsize + u64s; 21 133 uint64_t *old_keys = l->keys_p == l->inline_keys ? 
NULL : l->keys_p; 22 134 uint64_t *new_keys; 23 - 24 - /* The journalling code doesn't handle the case where the keys to insert 25 - * is bigger than an empty write: If we just return -ENOMEM here, 26 - * bio_insert() and bio_invalidate() will insert the keys created so far 27 - * and finish the rest when the keylist is empty. 28 - */ 29 - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) 30 - return -ENOMEM; 31 135 32 136 newsize = roundup_pow_of_two(newsize); 33 137 ··· 173 69 memmove(l->keys, 174 70 bkey_next(l->keys), 175 71 bch_keylist_bytes(l)); 176 - } 177 - 178 - /* Pointer validation */ 179 - 180 - static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) 181 - { 182 - unsigned i; 183 - 184 - for (i = 0; i < KEY_PTRS(k); i++) 185 - if (ptr_available(c, k, i)) { 186 - struct cache *ca = PTR_CACHE(c, k, i); 187 - size_t bucket = PTR_BUCKET_NR(c, k, i); 188 - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 189 - 190 - if (KEY_SIZE(k) + r > c->sb.bucket_size || 191 - bucket < ca->sb.first_bucket || 192 - bucket >= ca->sb.nbuckets) 193 - return true; 194 - } 195 - 196 - return false; 197 - } 198 - 199 - bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) 200 - { 201 - char buf[80]; 202 - 203 - if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) 204 - goto bad; 205 - 206 - if (__ptr_invalid(c, k)) 207 - goto bad; 208 - 209 - return false; 210 - bad: 211 - bch_bkey_to_text(buf, sizeof(buf), k); 212 - cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); 213 - return true; 214 - } 215 - 216 - bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) 217 - { 218 - char buf[80]; 219 - 220 - if (!KEY_SIZE(k)) 221 - return true; 222 - 223 - if (KEY_SIZE(k) > KEY_OFFSET(k)) 224 - goto bad; 225 - 226 - if (__ptr_invalid(c, k)) 227 - goto bad; 228 - 229 - return false; 230 - bad: 231 - bch_bkey_to_text(buf, sizeof(buf), k); 232 - cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); 
233 - return true; 234 - } 235 - 236 - static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, 237 - unsigned ptr) 238 - { 239 - struct bucket *g = PTR_BUCKET(b->c, k, ptr); 240 - char buf[80]; 241 - 242 - if (mutex_trylock(&b->c->bucket_lock)) { 243 - if (b->level) { 244 - if (KEY_DIRTY(k) || 245 - g->prio != BTREE_PRIO || 246 - (b->c->gc_mark_valid && 247 - GC_MARK(g) != GC_MARK_METADATA)) 248 - goto err; 249 - 250 - } else { 251 - if (g->prio == BTREE_PRIO) 252 - goto err; 253 - 254 - if (KEY_DIRTY(k) && 255 - b->c->gc_mark_valid && 256 - GC_MARK(g) != GC_MARK_DIRTY) 257 - goto err; 258 - } 259 - mutex_unlock(&b->c->bucket_lock); 260 - } 261 - 262 - return false; 263 - err: 264 - mutex_unlock(&b->c->bucket_lock); 265 - bch_bkey_to_text(buf, sizeof(buf), k); 266 - btree_bug(b, 267 - "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 268 - buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), 269 - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 270 - return true; 271 - } 272 - 273 - bool bch_ptr_bad(struct btree *b, const struct bkey *k) 274 - { 275 - struct bucket *g; 276 - unsigned i, stale; 277 - 278 - if (!bkey_cmp(k, &ZERO_KEY) || 279 - !KEY_PTRS(k) || 280 - bch_ptr_invalid(b, k)) 281 - return true; 282 - 283 - for (i = 0; i < KEY_PTRS(k); i++) { 284 - if (!ptr_available(b->c, k, i)) 285 - return true; 286 - 287 - g = PTR_BUCKET(b->c, k, i); 288 - stale = ptr_stale(b->c, k, i); 289 - 290 - btree_bug_on(stale > 96, b, 291 - "key too stale: %i, need_gc %u", 292 - stale, b->c->need_gc); 293 - 294 - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 295 - b, "stale dirty pointer"); 296 - 297 - if (stale) 298 - return true; 299 - 300 - if (expensive_debug_checks(b->c) && 301 - ptr_bad_expensive_checks(b, k, i)) 302 - return true; 303 - } 304 - 305 - return false; 306 72 } 307 73 308 74 /* Key/pointer manipulation */ ··· 229 255 return true; 230 256 } 231 257 232 - static uint64_t 
merge_chksums(struct bkey *l, struct bkey *r) 258 + /* Auxiliary search trees */ 259 + 260 + /* 32 bits total: */ 261 + #define BKEY_MID_BITS 3 262 + #define BKEY_EXPONENT_BITS 7 263 + #define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) 264 + #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) 265 + 266 + struct bkey_float { 267 + unsigned exponent:BKEY_EXPONENT_BITS; 268 + unsigned m:BKEY_MID_BITS; 269 + unsigned mantissa:BKEY_MANTISSA_BITS; 270 + } __packed; 271 + 272 + /* 273 + * BSET_CACHELINE was originally intended to match the hardware cacheline size - 274 + * it used to be 64, but I realized the lookup code would touch slightly less 275 + * memory if it was 128. 276 + * 277 + * It definites the number of bytes (in struct bset) per struct bkey_float in 278 + * the auxiliar search tree - when we're done searching the bset_float tree we 279 + * have this many bytes left that we do a linear search over. 280 + * 281 + * Since (after level 5) every level of the bset_tree is on a new cacheline, 282 + * we're touching one fewer cacheline in the bset tree in exchange for one more 283 + * cacheline in the linear search - but the linear search might stop before it 284 + * gets to the second cacheline. 285 + */ 286 + 287 + #define BSET_CACHELINE 128 288 + 289 + /* Space required for the btree node keys */ 290 + static inline size_t btree_keys_bytes(struct btree_keys *b) 233 291 { 234 - return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 235 - ~((uint64_t)1 << 63); 292 + return PAGE_SIZE << b->page_order; 236 293 } 237 294 238 - /* Tries to merge l and r: l should be lower than r 239 - * Returns true if we were able to merge. If we did merge, l will be the merged 240 - * key, r will be untouched. 
241 - */ 242 - bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) 295 + static inline size_t btree_keys_cachelines(struct btree_keys *b) 296 + { 297 + return btree_keys_bytes(b) / BSET_CACHELINE; 298 + } 299 + 300 + /* Space required for the auxiliary search trees */ 301 + static inline size_t bset_tree_bytes(struct btree_keys *b) 302 + { 303 + return btree_keys_cachelines(b) * sizeof(struct bkey_float); 304 + } 305 + 306 + /* Space required for the prev pointers */ 307 + static inline size_t bset_prev_bytes(struct btree_keys *b) 308 + { 309 + return btree_keys_cachelines(b) * sizeof(uint8_t); 310 + } 311 + 312 + /* Memory allocation */ 313 + 314 + void bch_btree_keys_free(struct btree_keys *b) 315 + { 316 + struct bset_tree *t = b->set; 317 + 318 + if (bset_prev_bytes(b) < PAGE_SIZE) 319 + kfree(t->prev); 320 + else 321 + free_pages((unsigned long) t->prev, 322 + get_order(bset_prev_bytes(b))); 323 + 324 + if (bset_tree_bytes(b) < PAGE_SIZE) 325 + kfree(t->tree); 326 + else 327 + free_pages((unsigned long) t->tree, 328 + get_order(bset_tree_bytes(b))); 329 + 330 + free_pages((unsigned long) t->data, b->page_order); 331 + 332 + t->prev = NULL; 333 + t->tree = NULL; 334 + t->data = NULL; 335 + } 336 + EXPORT_SYMBOL(bch_btree_keys_free); 337 + 338 + int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) 339 + { 340 + struct bset_tree *t = b->set; 341 + 342 + BUG_ON(t->data); 343 + 344 + b->page_order = page_order; 345 + 346 + t->data = (void *) __get_free_pages(gfp, b->page_order); 347 + if (!t->data) 348 + goto err; 349 + 350 + t->tree = bset_tree_bytes(b) < PAGE_SIZE 351 + ? kmalloc(bset_tree_bytes(b), gfp) 352 + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 353 + if (!t->tree) 354 + goto err; 355 + 356 + t->prev = bset_prev_bytes(b) < PAGE_SIZE 357 + ? 
kmalloc(bset_prev_bytes(b), gfp) 358 + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 359 + if (!t->prev) 360 + goto err; 361 + 362 + return 0; 363 + err: 364 + bch_btree_keys_free(b); 365 + return -ENOMEM; 366 + } 367 + EXPORT_SYMBOL(bch_btree_keys_alloc); 368 + 369 + void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, 370 + bool *expensive_debug_checks) 243 371 { 244 372 unsigned i; 245 373 246 - if (key_merging_disabled(b->c)) 247 - return false; 374 + b->ops = ops; 375 + b->expensive_debug_checks = expensive_debug_checks; 376 + b->nsets = 0; 377 + b->last_set_unwritten = 0; 248 378 249 - if (KEY_PTRS(l) != KEY_PTRS(r) || 250 - KEY_DIRTY(l) != KEY_DIRTY(r) || 251 - bkey_cmp(l, &START_KEY(r))) 252 - return false; 253 - 254 - for (i = 0; i < KEY_PTRS(l); i++) 255 - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || 256 - PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) 257 - return false; 258 - 259 - /* Keys with no pointers aren't restricted to one bucket and could 260 - * overflow KEY_SIZE 379 + /* XXX: shouldn't be needed */ 380 + for (i = 0; i < MAX_BSETS; i++) 381 + b->set[i].size = 0; 382 + /* 383 + * Second loop starts at 1 because b->keys[0]->data is the memory we 384 + * allocated 261 385 */ 262 - if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { 263 - SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); 264 - SET_KEY_SIZE(l, USHRT_MAX); 265 - 266 - bch_cut_front(l, r); 267 - return false; 268 - } 269 - 270 - if (KEY_CSUM(l)) { 271 - if (KEY_CSUM(r)) 272 - l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 273 - else 274 - SET_KEY_CSUM(l, 0); 275 - } 276 - 277 - SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 278 - SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); 279 - 280 - return true; 386 + for (i = 1; i < MAX_BSETS; i++) 387 + b->set[i].data = NULL; 281 388 } 389 + EXPORT_SYMBOL(bch_btree_keys_init); 282 390 283 391 /* Binary tree stuff for auxiliary search trees */ 284 392 ··· 511 455 return ((void *) k 
- (void *) t->data) / BSET_CACHELINE; 512 456 } 513 457 514 - static unsigned bkey_to_cacheline_offset(struct bkey *k) 458 + static unsigned bkey_to_cacheline_offset(struct bset_tree *t, 459 + unsigned cacheline, 460 + struct bkey *k) 515 461 { 516 - return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); 462 + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); 517 463 } 518 464 519 465 static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) ··· 562 504 : tree_to_prev_bkey(t, j >> ffs(j)); 563 505 564 506 struct bkey *r = is_power_of_2(j + 1) 565 - ? node(t->data, t->data->keys - bkey_u64s(&t->end)) 507 + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) 566 508 : tree_to_bkey(t, j >> (ffz(j) + 1)); 567 509 568 510 BUG_ON(m < l || m > r); ··· 586 528 f->exponent = 127; 587 529 } 588 530 589 - static void bset_alloc_tree(struct btree *b, struct bset_tree *t) 531 + static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) 590 532 { 591 - if (t != b->sets) { 533 + if (t != b->set) { 592 534 unsigned j = roundup(t[-1].size, 593 535 64 / sizeof(struct bkey_float)); 594 536 ··· 596 538 t->prev = t[-1].prev + j; 597 539 } 598 540 599 - while (t < b->sets + MAX_BSETS) 541 + while (t < b->set + MAX_BSETS) 600 542 t++->size = 0; 601 543 } 602 544 603 - static void bset_build_unwritten_tree(struct btree *b) 545 + static void bch_bset_build_unwritten_tree(struct btree_keys *b) 604 546 { 605 - struct bset_tree *t = b->sets + b->nsets; 547 + struct bset_tree *t = bset_tree_last(b); 548 + 549 + BUG_ON(b->last_set_unwritten); 550 + b->last_set_unwritten = 1; 606 551 607 552 bset_alloc_tree(b, t); 608 553 609 - if (t->tree != b->sets->tree + bset_tree_space(b)) { 610 - t->prev[0] = bkey_to_cacheline_offset(t->data->start); 554 + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { 555 + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); 611 556 t->size = 1; 612 557 } 613 558 } 614 559 615 - static void 
bset_build_written_tree(struct btree *b) 560 + void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) 616 561 { 617 - struct bset_tree *t = b->sets + b->nsets; 618 - struct bkey *k = t->data->start; 562 + if (i != b->set->data) { 563 + b->set[++b->nsets].data = i; 564 + i->seq = b->set->data->seq; 565 + } else 566 + get_random_bytes(&i->seq, sizeof(uint64_t)); 567 + 568 + i->magic = magic; 569 + i->version = 0; 570 + i->keys = 0; 571 + 572 + bch_bset_build_unwritten_tree(b); 573 + } 574 + EXPORT_SYMBOL(bch_bset_init_next); 575 + 576 + void bch_bset_build_written_tree(struct btree_keys *b) 577 + { 578 + struct bset_tree *t = bset_tree_last(b); 579 + struct bkey *prev = NULL, *k = t->data->start; 619 580 unsigned j, cacheline = 1; 581 + 582 + b->last_set_unwritten = 0; 620 583 621 584 bset_alloc_tree(b, t); 622 585 623 586 t->size = min_t(unsigned, 624 - bkey_to_cacheline(t, end(t->data)), 625 - b->sets->tree + bset_tree_space(b) - t->tree); 587 + bkey_to_cacheline(t, bset_bkey_last(t->data)), 588 + b->set->tree + btree_keys_cachelines(b) - t->tree); 626 589 627 590 if (t->size < 2) { 628 591 t->size = 0; ··· 656 577 for (j = inorder_next(0, t->size); 657 578 j; 658 579 j = inorder_next(j, t->size)) { 659 - while (bkey_to_cacheline(t, k) != cacheline) 660 - k = bkey_next(k); 580 + while (bkey_to_cacheline(t, k) < cacheline) 581 + prev = k, k = bkey_next(k); 661 582 662 - t->prev[j] = bkey_u64s(k); 663 - k = bkey_next(k); 664 - cacheline++; 665 - t->tree[j].m = bkey_to_cacheline_offset(k); 583 + t->prev[j] = bkey_u64s(prev); 584 + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); 666 585 } 667 586 668 - while (bkey_next(k) != end(t->data)) 587 + while (bkey_next(k) != bset_bkey_last(t->data)) 669 588 k = bkey_next(k); 670 589 671 590 t->end = *k; ··· 674 597 j = inorder_next(j, t->size)) 675 598 make_bfloat(t, j); 676 599 } 600 + EXPORT_SYMBOL(bch_bset_build_written_tree); 677 601 678 - void bch_bset_fix_invalidated_key(struct btree 
*b, struct bkey *k) 602 + /* Insert */ 603 + 604 + void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) 679 605 { 680 606 struct bset_tree *t; 681 607 unsigned inorder, j = 1; 682 608 683 - for (t = b->sets; t <= &b->sets[b->nsets]; t++) 684 - if (k < end(t->data)) 609 + for (t = b->set; t <= bset_tree_last(b); t++) 610 + if (k < bset_bkey_last(t->data)) 685 611 goto found_set; 686 612 687 613 BUG(); ··· 697 617 if (k == t->data->start) 698 618 goto fix_left; 699 619 700 - if (bkey_next(k) == end(t->data)) { 620 + if (bkey_next(k) == bset_bkey_last(t->data)) { 701 621 t->end = *k; 702 622 goto fix_right; 703 623 } ··· 722 642 j = j * 2 + 1; 723 643 } while (j < t->size); 724 644 } 645 + EXPORT_SYMBOL(bch_bset_fix_invalidated_key); 725 646 726 - void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) 647 + static void bch_bset_fix_lookup_table(struct btree_keys *b, 648 + struct bset_tree *t, 649 + struct bkey *k) 727 650 { 728 - struct bset_tree *t = &b->sets[b->nsets]; 729 651 unsigned shift = bkey_u64s(k); 730 652 unsigned j = bkey_to_cacheline(t, k); 731 653 ··· 739 657 * lookup table for the first key that is strictly greater than k: 740 658 * it's either k's cacheline or the next one 741 659 */ 742 - if (j < t->size && 743 - table_to_bkey(t, j) <= k) 660 + while (j < t->size && 661 + table_to_bkey(t, j) <= k) 744 662 j++; 745 663 746 664 /* Adjust all the lookup table entries, and find a new key for any that ··· 755 673 while (k < cacheline_to_bkey(t, j, 0)) 756 674 k = bkey_next(k); 757 675 758 - t->prev[j] = bkey_to_cacheline_offset(k); 676 + t->prev[j] = bkey_to_cacheline_offset(t, j, k); 759 677 } 760 678 } 761 679 762 - if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) 680 + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) 763 681 return; 764 682 765 683 /* Possibly add a new entry to the end of the lookup table */ 766 684 767 685 for (k = table_to_bkey(t, t->size - 1); 768 - k != end(t->data); 686 + k 
!= bset_bkey_last(t->data); 769 687 k = bkey_next(k)) 770 688 if (t->size == bkey_to_cacheline(t, k)) { 771 - t->prev[t->size] = bkey_to_cacheline_offset(k); 689 + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); 772 690 t->size++; 773 691 } 774 692 } 775 693 776 - void bch_bset_init_next(struct btree *b) 694 + /* 695 + * Tries to merge l and r: l should be lower than r 696 + * Returns true if we were able to merge. If we did merge, l will be the merged 697 + * key, r will be untouched. 698 + */ 699 + bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) 777 700 { 778 - struct bset *i = write_block(b); 701 + if (!b->ops->key_merge) 702 + return false; 779 703 780 - if (i != b->sets[0].data) { 781 - b->sets[++b->nsets].data = i; 782 - i->seq = b->sets[0].data->seq; 783 - } else 784 - get_random_bytes(&i->seq, sizeof(uint64_t)); 704 + /* 705 + * Generic header checks 706 + * Assumes left and right are in order 707 + * Left and right must be exactly aligned 708 + */ 709 + if (!bch_bkey_equal_header(l, r) || 710 + bkey_cmp(l, &START_KEY(r))) 711 + return false; 785 712 786 - i->magic = bset_magic(&b->c->sb); 787 - i->version = 0; 788 - i->keys = 0; 789 - 790 - bset_build_unwritten_tree(b); 713 + return b->ops->key_merge(b, l, r); 791 714 } 715 + EXPORT_SYMBOL(bch_bkey_try_merge); 716 + 717 + void bch_bset_insert(struct btree_keys *b, struct bkey *where, 718 + struct bkey *insert) 719 + { 720 + struct bset_tree *t = bset_tree_last(b); 721 + 722 + BUG_ON(!b->last_set_unwritten); 723 + BUG_ON(bset_byte_offset(b, t->data) + 724 + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > 725 + PAGE_SIZE << b->page_order); 726 + 727 + memmove((uint64_t *) where + bkey_u64s(insert), 728 + where, 729 + (void *) bset_bkey_last(t->data) - (void *) where); 730 + 731 + t->data->keys += bkey_u64s(insert); 732 + bkey_copy(where, insert); 733 + bch_bset_fix_lookup_table(b, t, where); 734 + } 735 + EXPORT_SYMBOL(bch_bset_insert); 736 + 737 + unsigned 
bch_btree_insert_key(struct btree_keys *b, struct bkey *k, 738 + struct bkey *replace_key) 739 + { 740 + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; 741 + struct bset *i = bset_tree_last(b)->data; 742 + struct bkey *m, *prev = NULL; 743 + struct btree_iter iter; 744 + 745 + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); 746 + 747 + m = bch_btree_iter_init(b, &iter, b->ops->is_extents 748 + ? PRECEDING_KEY(&START_KEY(k)) 749 + : PRECEDING_KEY(k)); 750 + 751 + if (b->ops->insert_fixup(b, k, &iter, replace_key)) 752 + return status; 753 + 754 + status = BTREE_INSERT_STATUS_INSERT; 755 + 756 + while (m != bset_bkey_last(i) && 757 + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) 758 + prev = m, m = bkey_next(m); 759 + 760 + /* prev is in the tree, if we merge we're done */ 761 + status = BTREE_INSERT_STATUS_BACK_MERGE; 762 + if (prev && 763 + bch_bkey_try_merge(b, prev, k)) 764 + goto merged; 765 + #if 0 766 + status = BTREE_INSERT_STATUS_OVERWROTE; 767 + if (m != bset_bkey_last(i) && 768 + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 769 + goto copy; 770 + #endif 771 + status = BTREE_INSERT_STATUS_FRONT_MERGE; 772 + if (m != bset_bkey_last(i) && 773 + bch_bkey_try_merge(b, k, m)) 774 + goto copy; 775 + 776 + bch_bset_insert(b, m, k); 777 + copy: bkey_copy(m, k); 778 + merged: 779 + return status; 780 + } 781 + EXPORT_SYMBOL(bch_btree_insert_key); 782 + 783 + /* Lookup */ 792 784 793 785 struct bset_search_iter { 794 786 struct bkey *l, *r; 795 787 }; 796 788 797 - static struct bset_search_iter bset_search_write_set(struct btree *b, 798 - struct bset_tree *t, 789 + static struct bset_search_iter bset_search_write_set(struct bset_tree *t, 799 790 const struct bkey *search) 800 791 { 801 792 unsigned li = 0, ri = t->size; 802 - 803 - BUG_ON(!b->nsets && 804 - t->size < bkey_to_cacheline(t, end(t->data))); 805 793 806 794 while (li + 1 != ri) { 807 795 unsigned m = (li + ri) >> 1; ··· 884 732 885 733 return (struct bset_search_iter) { 886 734 table_to_bkey(t, 
li), 887 - ri < t->size ? table_to_bkey(t, ri) : end(t->data) 735 + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) 888 736 }; 889 737 } 890 738 891 - static struct bset_search_iter bset_search_tree(struct btree *b, 892 - struct bset_tree *t, 739 + static struct bset_search_iter bset_search_tree(struct bset_tree *t, 893 740 const struct bkey *search) 894 741 { 895 742 struct bkey *l, *r; ··· 935 784 f = &t->tree[inorder_next(j, t->size)]; 936 785 r = cacheline_to_bkey(t, inorder, f->m); 937 786 } else 938 - r = end(t->data); 787 + r = bset_bkey_last(t->data); 939 788 } else { 940 789 r = cacheline_to_bkey(t, inorder, f->m); 941 790 ··· 949 798 return (struct bset_search_iter) {l, r}; 950 799 } 951 800 952 - struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, 801 + struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, 953 802 const struct bkey *search) 954 803 { 955 804 struct bset_search_iter i; ··· 971 820 972 821 if (unlikely(!t->size)) { 973 822 i.l = t->data->start; 974 - i.r = end(t->data); 823 + i.r = bset_bkey_last(t->data); 975 824 } else if (bset_written(b, t)) { 976 825 /* 977 826 * Each node in the auxiliary search tree covers a certain range ··· 981 830 */ 982 831 983 832 if (unlikely(bkey_cmp(search, &t->end) >= 0)) 984 - return end(t->data); 833 + return bset_bkey_last(t->data); 985 834 986 835 if (unlikely(bkey_cmp(search, t->data->start) < 0)) 987 836 return t->data->start; 988 837 989 - i = bset_search_tree(b, t, search); 990 - } else 991 - i = bset_search_write_set(b, t, search); 838 + i = bset_search_tree(t, search); 839 + } else { 840 + BUG_ON(!b->nsets && 841 + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); 992 842 993 - if (expensive_debug_checks(b->c)) { 843 + i = bset_search_write_set(t, search); 844 + } 845 + 846 + if (btree_keys_expensive_checks(b)) { 994 847 BUG_ON(bset_written(b, t) && 995 848 i.l != t->data->start && 996 849 bkey_cmp(tree_to_prev_bkey(t, 997 850 
inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 998 851 search) > 0); 999 852 1000 - BUG_ON(i.r != end(t->data) && 853 + BUG_ON(i.r != bset_bkey_last(t->data) && 1001 854 bkey_cmp(i.r, search) <= 0); 1002 855 } 1003 856 ··· 1011 856 1012 857 return i.l; 1013 858 } 859 + EXPORT_SYMBOL(__bch_bset_search); 1014 860 1015 861 /* Btree iterator */ 1016 862 1017 - /* 1018 - * Returns true if l > r - unless l == r, in which case returns true if l is 1019 - * older than r. 1020 - * 1021 - * Necessary for btree_sort_fixup() - if there are multiple keys that compare 1022 - * equal in different sets, we have to process them newest to oldest. 1023 - */ 863 + typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, 864 + struct btree_iter_set); 865 + 1024 866 static inline bool btree_iter_cmp(struct btree_iter_set l, 1025 867 struct btree_iter_set r) 1026 868 { 1027 - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 1028 - 1029 - return c ? c > 0 : l.k < r.k; 869 + return bkey_cmp(l.k, r.k) > 0; 1030 870 } 1031 871 1032 872 static inline bool btree_iter_end(struct btree_iter *iter) ··· 1038 888 btree_iter_cmp)); 1039 889 } 1040 890 1041 - struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 1042 - struct bkey *search, struct bset_tree *start) 891 + static struct bkey *__bch_btree_iter_init(struct btree_keys *b, 892 + struct btree_iter *iter, 893 + struct bkey *search, 894 + struct bset_tree *start) 1043 895 { 1044 896 struct bkey *ret = NULL; 1045 897 iter->size = ARRAY_SIZE(iter->data); ··· 1051 899 iter->b = b; 1052 900 #endif 1053 901 1054 - for (; start <= &b->sets[b->nsets]; start++) { 902 + for (; start <= bset_tree_last(b); start++) { 1055 903 ret = bch_bset_search(b, start, search); 1056 - bch_btree_iter_push(iter, ret, end(start->data)); 904 + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); 1057 905 } 1058 906 1059 907 return ret; 1060 908 } 1061 909 1062 - struct bkey *bch_btree_iter_next(struct btree_iter *iter) 910 + struct 
bkey *bch_btree_iter_init(struct btree_keys *b, 911 + struct btree_iter *iter, 912 + struct bkey *search) 913 + { 914 + return __bch_btree_iter_init(b, iter, search, b->set); 915 + } 916 + EXPORT_SYMBOL(bch_btree_iter_init); 917 + 918 + static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, 919 + btree_iter_cmp_fn *cmp) 1063 920 { 1064 921 struct btree_iter_set unused; 1065 922 struct bkey *ret = NULL; ··· 1085 924 } 1086 925 1087 926 if (iter->data->k == iter->data->end) 1088 - heap_pop(iter, unused, btree_iter_cmp); 927 + heap_pop(iter, unused, cmp); 1089 928 else 1090 - heap_sift(iter, 0, btree_iter_cmp); 929 + heap_sift(iter, 0, cmp); 1091 930 } 1092 931 1093 932 return ret; 1094 933 } 1095 934 935 + struct bkey *bch_btree_iter_next(struct btree_iter *iter) 936 + { 937 + return __bch_btree_iter_next(iter, btree_iter_cmp); 938 + 939 + } 940 + EXPORT_SYMBOL(bch_btree_iter_next); 941 + 1096 942 struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 1097 - struct btree *b, ptr_filter_fn fn) 943 + struct btree_keys *b, ptr_filter_fn fn) 1098 944 { 1099 945 struct bkey *ret; 1100 946 ··· 1114 946 1115 947 /* Mergesort */ 1116 948 1117 - static void sort_key_next(struct btree_iter *iter, 1118 - struct btree_iter_set *i) 949 + void bch_bset_sort_state_free(struct bset_sort_state *state) 1119 950 { 1120 - i->k = bkey_next(i->k); 1121 - 1122 - if (i->k == i->end) 1123 - *i = iter->data[--iter->used]; 951 + if (state->pool) 952 + mempool_destroy(state->pool); 1124 953 } 1125 954 1126 - static void btree_sort_fixup(struct btree_iter *iter) 955 + int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) 1127 956 { 1128 - while (iter->used > 1) { 1129 - struct btree_iter_set *top = iter->data, *i = top + 1; 957 + spin_lock_init(&state->time.lock); 1130 958 1131 - if (iter->used > 2 && 1132 - btree_iter_cmp(i[0], i[1])) 1133 - i++; 959 + state->page_order = page_order; 960 + state->crit_factor = int_sqrt(1 << page_order); 
1134 961 1135 - if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) 1136 - break; 962 + state->pool = mempool_create_page_pool(1, page_order); 963 + if (!state->pool) 964 + return -ENOMEM; 1137 965 1138 - if (!KEY_SIZE(i->k)) { 1139 - sort_key_next(iter, i); 1140 - heap_sift(iter, i - top, btree_iter_cmp); 1141 - continue; 1142 - } 1143 - 1144 - if (top->k > i->k) { 1145 - if (bkey_cmp(top->k, i->k) >= 0) 1146 - sort_key_next(iter, i); 1147 - else 1148 - bch_cut_front(top->k, i->k); 1149 - 1150 - heap_sift(iter, i - top, btree_iter_cmp); 1151 - } else { 1152 - /* can't happen because of comparison func */ 1153 - BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); 1154 - bch_cut_back(&START_KEY(i->k), top->k); 1155 - } 1156 - } 966 + return 0; 1157 967 } 968 + EXPORT_SYMBOL(bch_bset_sort_state_init); 1158 969 1159 - static void btree_mergesort(struct btree *b, struct bset *out, 970 + static void btree_mergesort(struct btree_keys *b, struct bset *out, 1160 971 struct btree_iter *iter, 1161 972 bool fixup, bool remove_stale) 1162 973 { 974 + int i; 1163 975 struct bkey *k, *last = NULL; 1164 - bool (*bad)(struct btree *, const struct bkey *) = remove_stale 976 + BKEY_PADDED(k) tmp; 977 + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale 1165 978 ? 
bch_ptr_bad 1166 979 : bch_ptr_invalid; 1167 980 1168 - while (!btree_iter_end(iter)) { 1169 - if (fixup && !b->level) 1170 - btree_sort_fixup(iter); 981 + /* Heapify the iterator, using our comparison function */ 982 + for (i = iter->used / 2 - 1; i >= 0; --i) 983 + heap_sift(iter, i, b->ops->sort_cmp); 1171 984 1172 - k = bch_btree_iter_next(iter); 985 + while (!btree_iter_end(iter)) { 986 + if (b->ops->sort_fixup && fixup) 987 + k = b->ops->sort_fixup(iter, &tmp.k); 988 + else 989 + k = NULL; 990 + 991 + if (!k) 992 + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); 993 + 1173 994 if (bad(b, k)) 1174 995 continue; 1175 996 1176 997 if (!last) { 1177 998 last = out->start; 1178 999 bkey_copy(last, k); 1179 - } else if (b->level || 1180 - !bch_bkey_try_merge(b, last, k)) { 1000 + } else if (!bch_bkey_try_merge(b, last, k)) { 1181 1001 last = bkey_next(last); 1182 1002 bkey_copy(last, k); 1183 1003 } ··· 1176 1020 pr_debug("sorted %i keys", out->keys); 1177 1021 } 1178 1022 1179 - static void __btree_sort(struct btree *b, struct btree_iter *iter, 1180 - unsigned start, unsigned order, bool fixup) 1023 + static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, 1024 + unsigned start, unsigned order, bool fixup, 1025 + struct bset_sort_state *state) 1181 1026 { 1182 1027 uint64_t start_time; 1183 - bool remove_stale = !b->written; 1028 + bool used_mempool = false; 1184 1029 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 1030 order); 1186 1031 if (!out) { 1187 - mutex_lock(&b->c->sort_lock); 1188 - out = b->c->sort; 1189 - order = ilog2(bucket_pages(b->c)); 1032 + BUG_ON(order > state->page_order); 1033 + 1034 + out = page_address(mempool_alloc(state->pool, GFP_NOIO)); 1035 + used_mempool = true; 1036 + order = state->page_order; 1190 1037 } 1191 1038 1192 1039 start_time = local_clock(); 1193 1040 1194 - btree_mergesort(b, out, iter, fixup, remove_stale); 1041 + btree_mergesort(b, out, iter, fixup, false); 1195 1042 b->nsets 
= start; 1196 - 1197 - if (!fixup && !start && b->written) 1198 - bch_btree_verify(b, out); 1199 1043 1200 1044 if (!start && order == b->page_order) { 1201 1045 /* ··· 1204 1048 * memcpy() 1205 1049 */ 1206 1050 1207 - out->magic = bset_magic(&b->c->sb); 1208 - out->seq = b->sets[0].data->seq; 1209 - out->version = b->sets[0].data->version; 1210 - swap(out, b->sets[0].data); 1211 - 1212 - if (b->c->sort == b->sets[0].data) 1213 - b->c->sort = out; 1051 + out->magic = b->set->data->magic; 1052 + out->seq = b->set->data->seq; 1053 + out->version = b->set->data->version; 1054 + swap(out, b->set->data); 1214 1055 } else { 1215 - b->sets[start].data->keys = out->keys; 1216 - memcpy(b->sets[start].data->start, out->start, 1217 - (void *) end(out) - (void *) out->start); 1056 + b->set[start].data->keys = out->keys; 1057 + memcpy(b->set[start].data->start, out->start, 1058 + (void *) bset_bkey_last(out) - (void *) out->start); 1218 1059 } 1219 1060 1220 - if (out == b->c->sort) 1221 - mutex_unlock(&b->c->sort_lock); 1061 + if (used_mempool) 1062 + mempool_free(virt_to_page(out), state->pool); 1222 1063 else 1223 1064 free_pages((unsigned long) out, order); 1224 1065 1225 - if (b->written) 1226 - bset_build_written_tree(b); 1066 + bch_bset_build_written_tree(b); 1227 1067 1228 1068 if (!start) 1229 - bch_time_stats_update(&b->c->sort_time, start_time); 1069 + bch_time_stats_update(&state->time, start_time); 1230 1070 } 1231 1071 1232 - void bch_btree_sort_partial(struct btree *b, unsigned start) 1072 + void bch_btree_sort_partial(struct btree_keys *b, unsigned start, 1073 + struct bset_sort_state *state) 1233 1074 { 1234 1075 size_t order = b->page_order, keys = 0; 1235 1076 struct btree_iter iter; 1236 1077 int oldsize = bch_count_data(b); 1237 1078 1238 - __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1239 - 1240 - BUG_ON(b->sets[b->nsets].data == write_block(b) && 1241 - (b->sets[b->nsets].size || b->nsets)); 1242 - 1079 + __bch_btree_iter_init(b, &iter, NULL, 
&b->set[start]); 1243 1080 1244 1081 if (start) { 1245 1082 unsigned i; 1246 1083 1247 1084 for (i = start; i <= b->nsets; i++) 1248 - keys += b->sets[i].data->keys; 1085 + keys += b->set[i].data->keys; 1249 1086 1250 - order = roundup_pow_of_two(__set_bytes(b->sets->data, 1251 - keys)) / PAGE_SIZE; 1252 - if (order) 1253 - order = ilog2(order); 1087 + order = get_order(__set_bytes(b->set->data, keys)); 1254 1088 } 1255 1089 1256 - __btree_sort(b, &iter, start, order, false); 1090 + __btree_sort(b, &iter, start, order, false, state); 1257 1091 1258 - EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1092 + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); 1259 1093 } 1094 + EXPORT_SYMBOL(bch_btree_sort_partial); 1260 1095 1261 - void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1096 + void bch_btree_sort_and_fix_extents(struct btree_keys *b, 1097 + struct btree_iter *iter, 1098 + struct bset_sort_state *state) 1262 1099 { 1263 - BUG_ON(!b->written); 1264 - __btree_sort(b, iter, 0, b->page_order, true); 1100 + __btree_sort(b, iter, 0, b->page_order, true, state); 1265 1101 } 1266 1102 1267 - void bch_btree_sort_into(struct btree *b, struct btree *new) 1103 + void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, 1104 + struct bset_sort_state *state) 1268 1105 { 1269 1106 uint64_t start_time = local_clock(); 1270 1107 1271 1108 struct btree_iter iter; 1272 1109 bch_btree_iter_init(b, &iter, NULL); 1273 1110 1274 - btree_mergesort(b, new->sets->data, &iter, false, true); 1111 + btree_mergesort(b, new->set->data, &iter, false, true); 1275 1112 1276 - bch_time_stats_update(&b->c->sort_time, start_time); 1113 + bch_time_stats_update(&state->time, start_time); 1277 1114 1278 - bkey_copy_key(&new->key, &b->key); 1279 - new->sets->size = 0; 1115 + new->set->size = 0; // XXX: why? 
1280 1116 } 1281 1117 1282 1118 #define SORT_CRIT (4096 / sizeof(uint64_t)) 1283 1119 1284 - void bch_btree_sort_lazy(struct btree *b) 1120 + void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) 1285 1121 { 1286 1122 unsigned crit = SORT_CRIT; 1287 1123 int i; ··· 1282 1134 if (!b->nsets) 1283 1135 goto out; 1284 1136 1285 - /* If not a leaf node, always sort */ 1286 - if (b->level) { 1287 - bch_btree_sort(b); 1288 - return; 1289 - } 1290 - 1291 1137 for (i = b->nsets - 1; i >= 0; --i) { 1292 - crit *= b->c->sort_crit_factor; 1138 + crit *= state->crit_factor; 1293 1139 1294 - if (b->sets[i].data->keys < crit) { 1295 - bch_btree_sort_partial(b, i); 1140 + if (b->set[i].data->keys < crit) { 1141 + bch_btree_sort_partial(b, i, state); 1296 1142 return; 1297 1143 } 1298 1144 } 1299 1145 1300 1146 /* Sort if we'd overflow */ 1301 1147 if (b->nsets + 1 == MAX_BSETS) { 1302 - bch_btree_sort(b); 1148 + bch_btree_sort(b, state); 1303 1149 return; 1304 1150 } 1305 1151 1306 1152 out: 1307 - bset_build_written_tree(b); 1153 + bch_bset_build_written_tree(b); 1308 1154 } 1155 + EXPORT_SYMBOL(bch_btree_sort_lazy); 1309 1156 1310 - /* Sysfs stuff */ 1311 - 1312 - struct bset_stats { 1313 - struct btree_op op; 1314 - size_t nodes; 1315 - size_t sets_written, sets_unwritten; 1316 - size_t bytes_written, bytes_unwritten; 1317 - size_t floats, failed; 1318 - }; 1319 - 1320 - static int btree_bset_stats(struct btree_op *op, struct btree *b) 1157 + void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) 1321 1158 { 1322 - struct bset_stats *stats = container_of(op, struct bset_stats, op); 1323 1159 unsigned i; 1324 1160 1325 - stats->nodes++; 1326 - 1327 1161 for (i = 0; i <= b->nsets; i++) { 1328 - struct bset_tree *t = &b->sets[i]; 1162 + struct bset_tree *t = &b->set[i]; 1329 1163 size_t bytes = t->data->keys * sizeof(uint64_t); 1330 1164 size_t j; 1331 1165 ··· 1325 1195 stats->bytes_unwritten += bytes; 1326 1196 } 1327 1197 } 1328 - 1329 
- return MAP_CONTINUE; 1330 - } 1331 - 1332 - int bch_bset_print_stats(struct cache_set *c, char *buf) 1333 - { 1334 - struct bset_stats t; 1335 - int ret; 1336 - 1337 - memset(&t, 0, sizeof(struct bset_stats)); 1338 - bch_btree_op_init(&t.op, -1); 1339 - 1340 - ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); 1341 - if (ret < 0) 1342 - return ret; 1343 - 1344 - return snprintf(buf, PAGE_SIZE, 1345 - "btree nodes: %zu\n" 1346 - "written sets: %zu\n" 1347 - "unwritten sets: %zu\n" 1348 - "written key bytes: %zu\n" 1349 - "unwritten key bytes: %zu\n" 1350 - "floats: %zu\n" 1351 - "failed: %zu\n", 1352 - t.nodes, 1353 - t.sets_written, t.sets_unwritten, 1354 - t.bytes_written, t.bytes_unwritten, 1355 - t.floats, t.failed); 1356 1198 }
+309 -141
drivers/md/bcache/bset.h
··· 1 1 #ifndef _BCACHE_BSET_H 2 2 #define _BCACHE_BSET_H 3 3 4 - #include <linux/slab.h> 4 + #include <linux/bcache.h> 5 + #include <linux/kernel.h> 6 + #include <linux/types.h> 7 + 8 + #include "util.h" /* for time_stats */ 5 9 6 10 /* 7 11 * BKEYS: ··· 146 142 * first key in that range of bytes again. 147 143 */ 148 144 149 - /* Btree key comparison/iteration */ 145 + struct btree_keys; 146 + struct btree_iter; 147 + struct btree_iter_set; 148 + struct bkey_float; 150 149 151 150 #define MAX_BSETS 4U 152 - 153 - struct btree_iter { 154 - size_t size, used; 155 - #ifdef CONFIG_BCACHE_DEBUG 156 - struct btree *b; 157 - #endif 158 - struct btree_iter_set { 159 - struct bkey *k, *end; 160 - } data[MAX_BSETS]; 161 - }; 162 151 163 152 struct bset_tree { 164 153 /* ··· 162 165 */ 163 166 164 167 /* size of the binary tree and prev array */ 165 - unsigned size; 168 + unsigned size; 166 169 167 170 /* function of size - precalculated for to_inorder() */ 168 - unsigned extra; 171 + unsigned extra; 169 172 170 173 /* copy of the last key in the set */ 171 - struct bkey end; 172 - struct bkey_float *tree; 174 + struct bkey end; 175 + struct bkey_float *tree; 173 176 174 177 /* 175 178 * The nodes in the bset tree point to specific keys - this ··· 179 182 * to keep bkey_float to 4 bytes and prev isn't used in the fast 180 183 * path. 
181 184 */ 182 - uint8_t *prev; 185 + uint8_t *prev; 183 186 184 187 /* The actual btree node, with pointers to each sorted set */ 185 - struct bset *data; 188 + struct bset *data; 186 189 }; 190 + 191 + struct btree_keys_ops { 192 + bool (*sort_cmp)(struct btree_iter_set, 193 + struct btree_iter_set); 194 + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); 195 + bool (*insert_fixup)(struct btree_keys *, struct bkey *, 196 + struct btree_iter *, struct bkey *); 197 + bool (*key_invalid)(struct btree_keys *, 198 + const struct bkey *); 199 + bool (*key_bad)(struct btree_keys *, const struct bkey *); 200 + bool (*key_merge)(struct btree_keys *, 201 + struct bkey *, struct bkey *); 202 + void (*key_to_text)(char *, size_t, const struct bkey *); 203 + void (*key_dump)(struct btree_keys *, const struct bkey *); 204 + 205 + /* 206 + * Only used for deciding whether to use START_KEY(k) or just the key 207 + * itself in a couple places 208 + */ 209 + bool is_extents; 210 + }; 211 + 212 + struct btree_keys { 213 + const struct btree_keys_ops *ops; 214 + uint8_t page_order; 215 + uint8_t nsets; 216 + unsigned last_set_unwritten:1; 217 + bool *expensive_debug_checks; 218 + 219 + /* 220 + * Sets of sorted keys - the real btree node - plus a binary search tree 221 + * 222 + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point 223 + * to the memory we have allocated for this btree node. Additionally, 224 + * set[0]->data points to the entire btree node as it exists on disk. 
225 + */ 226 + struct bset_tree set[MAX_BSETS]; 227 + }; 228 + 229 + static inline struct bset_tree *bset_tree_last(struct btree_keys *b) 230 + { 231 + return b->set + b->nsets; 232 + } 233 + 234 + static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) 235 + { 236 + return t <= b->set + b->nsets - b->last_set_unwritten; 237 + } 238 + 239 + static inline bool bkey_written(struct btree_keys *b, struct bkey *k) 240 + { 241 + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; 242 + } 243 + 244 + static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) 245 + { 246 + return ((size_t) i) - ((size_t) b->set->data); 247 + } 248 + 249 + static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) 250 + { 251 + return bset_byte_offset(b, i) >> 9; 252 + } 253 + 254 + #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) 255 + #define set_bytes(i) __set_bytes(i, i->keys) 256 + 257 + #define __set_blocks(i, k, block_bytes) \ 258 + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) 259 + #define set_blocks(i, block_bytes) \ 260 + __set_blocks(i, (i)->keys, block_bytes) 261 + 262 + static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) 263 + { 264 + struct bset_tree *t = bset_tree_last(b); 265 + 266 + BUG_ON((PAGE_SIZE << b->page_order) < 267 + (bset_byte_offset(b, t->data) + set_bytes(t->data))); 268 + 269 + if (!b->last_set_unwritten) 270 + return 0; 271 + 272 + return ((PAGE_SIZE << b->page_order) - 273 + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / 274 + sizeof(u64); 275 + } 276 + 277 + static inline struct bset *bset_next_set(struct btree_keys *b, 278 + unsigned block_bytes) 279 + { 280 + struct bset *i = bset_tree_last(b)->data; 281 + 282 + return ((void *) i) + roundup(set_bytes(i), block_bytes); 283 + } 284 + 285 + void bch_btree_keys_free(struct btree_keys *); 286 + int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); 287 + void 
bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, 288 + bool *); 289 + 290 + void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); 291 + void bch_bset_build_written_tree(struct btree_keys *); 292 + void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); 293 + bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); 294 + void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); 295 + unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, 296 + struct bkey *); 297 + 298 + enum { 299 + BTREE_INSERT_STATUS_NO_INSERT = 0, 300 + BTREE_INSERT_STATUS_INSERT, 301 + BTREE_INSERT_STATUS_BACK_MERGE, 302 + BTREE_INSERT_STATUS_OVERWROTE, 303 + BTREE_INSERT_STATUS_FRONT_MERGE, 304 + }; 305 + 306 + /* Btree key iteration */ 307 + 308 + struct btree_iter { 309 + size_t size, used; 310 + #ifdef CONFIG_BCACHE_DEBUG 311 + struct btree_keys *b; 312 + #endif 313 + struct btree_iter_set { 314 + struct bkey *k, *end; 315 + } data[MAX_BSETS]; 316 + }; 317 + 318 + typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); 319 + 320 + struct bkey *bch_btree_iter_next(struct btree_iter *); 321 + struct bkey *bch_btree_iter_next_filter(struct btree_iter *, 322 + struct btree_keys *, ptr_filter_fn); 323 + 324 + void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); 325 + struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, 326 + struct bkey *); 327 + 328 + struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, 329 + const struct bkey *); 330 + 331 + /* 332 + * Returns the first key that is strictly greater than search 333 + */ 334 + static inline struct bkey *bch_bset_search(struct btree_keys *b, 335 + struct bset_tree *t, 336 + const struct bkey *search) 337 + { 338 + return search ? 
__bch_bset_search(b, t, search) : t->data->start; 339 + } 340 + 341 + #define for_each_key_filter(b, k, iter, filter) \ 342 + for (bch_btree_iter_init((b), (iter), NULL); \ 343 + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) 344 + 345 + #define for_each_key(b, k, iter) \ 346 + for (bch_btree_iter_init((b), (iter), NULL); \ 347 + ((k) = bch_btree_iter_next(iter));) 348 + 349 + /* Sorting */ 350 + 351 + struct bset_sort_state { 352 + mempool_t *pool; 353 + 354 + unsigned page_order; 355 + unsigned crit_factor; 356 + 357 + struct time_stats time; 358 + }; 359 + 360 + void bch_bset_sort_state_free(struct bset_sort_state *); 361 + int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); 362 + void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); 363 + void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, 364 + struct bset_sort_state *); 365 + void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, 366 + struct bset_sort_state *); 367 + void bch_btree_sort_partial(struct btree_keys *, unsigned, 368 + struct bset_sort_state *); 369 + 370 + static inline void bch_btree_sort(struct btree_keys *b, 371 + struct bset_sort_state *state) 372 + { 373 + bch_btree_sort_partial(b, 0, state); 374 + } 375 + 376 + struct bset_stats { 377 + size_t sets_written, sets_unwritten; 378 + size_t bytes_written, bytes_unwritten; 379 + size_t floats, failed; 380 + }; 381 + 382 + void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); 383 + 384 + /* Bkey utility code */ 385 + 386 + #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) 387 + 388 + static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) 389 + { 390 + return bkey_idx(i->start, idx); 391 + } 392 + 393 + static inline void bkey_init(struct bkey *k) 394 + { 395 + *k = ZERO_KEY; 396 + } 187 397 188 398 static __always_inline int64_t bkey_cmp(const struct bkey *l, 189 399 const struct bkey *r) ··· 398 194 return 
unlikely(KEY_INODE(l) != KEY_INODE(r)) 399 195 ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) 400 196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 197 + } 198 + 199 + void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, 200 + unsigned); 201 + bool __bch_cut_front(const struct bkey *, struct bkey *); 202 + bool __bch_cut_back(const struct bkey *, struct bkey *); 203 + 204 + static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 205 + { 206 + BUG_ON(bkey_cmp(where, k) > 0); 207 + return __bch_cut_front(where, k); 208 + } 209 + 210 + static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 211 + { 212 + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); 213 + return __bch_cut_back(where, k); 214 + } 215 + 216 + #define PRECEDING_KEY(_k) \ 217 + ({ \ 218 + struct bkey *_ret = NULL; \ 219 + \ 220 + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ 221 + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ 222 + \ 223 + if (!_ret->low) \ 224 + _ret->high--; \ 225 + _ret->low--; \ 226 + } \ 227 + \ 228 + _ret; \ 229 + }) 230 + 231 + static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) 232 + { 233 + return b->ops->key_invalid(b, k); 234 + } 235 + 236 + static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) 237 + { 238 + return b->ops->key_bad(b, k); 239 + } 240 + 241 + static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, 242 + size_t size, const struct bkey *k) 243 + { 244 + return b->ops->key_to_text(buf, size, k); 245 + } 246 + 247 + static inline bool bch_bkey_equal_header(const struct bkey *l, 248 + const struct bkey *r) 249 + { 250 + return (KEY_DIRTY(l) == KEY_DIRTY(r) && 251 + KEY_PTRS(l) == KEY_PTRS(r) && 252 + KEY_CSUM(l) == KEY_CSUM(l)); 401 253 } 402 254 403 255 /* Keylists */ ··· 517 257 518 258 struct bkey *bch_keylist_pop(struct keylist *); 519 259 void bch_keylist_pop_front(struct keylist *); 520 - int bch_keylist_realloc(struct keylist *, int, struct 
cache_set *); 260 + int __bch_keylist_realloc(struct keylist *, unsigned); 521 261 522 - void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, 523 - unsigned); 524 - bool __bch_cut_front(const struct bkey *, struct bkey *); 525 - bool __bch_cut_back(const struct bkey *, struct bkey *); 262 + /* Debug stuff */ 526 263 527 - static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 264 + #ifdef CONFIG_BCACHE_DEBUG 265 + 266 + int __bch_count_data(struct btree_keys *); 267 + void __bch_check_keys(struct btree_keys *, const char *, ...); 268 + void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); 269 + void bch_dump_bucket(struct btree_keys *); 270 + 271 + #else 272 + 273 + static inline int __bch_count_data(struct btree_keys *b) { return -1; } 274 + static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} 275 + static inline void bch_dump_bucket(struct btree_keys *b) {} 276 + void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); 277 + 278 + #endif 279 + 280 + static inline bool btree_keys_expensive_checks(struct btree_keys *b) 528 281 { 529 - BUG_ON(bkey_cmp(where, k) > 0); 530 - return __bch_cut_front(where, k); 282 + #ifdef CONFIG_BCACHE_DEBUG 283 + return *b->expensive_debug_checks; 284 + #else 285 + return false; 286 + #endif 531 287 } 532 288 533 - static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 289 + static inline int bch_count_data(struct btree_keys *b) 534 290 { 535 - BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); 536 - return __bch_cut_back(where, k); 291 + return btree_keys_expensive_checks(b) ? 
__bch_count_data(b) : -1; 537 292 } 538 293 539 - const char *bch_ptr_status(struct cache_set *, const struct bkey *); 540 - bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); 541 - bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); 542 - 543 - bool bch_ptr_bad(struct btree *, const struct bkey *); 544 - 545 - static inline uint8_t gen_after(uint8_t a, uint8_t b) 546 - { 547 - uint8_t r = a - b; 548 - return r > 128U ? 0 : r; 549 - } 550 - 551 - static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, 552 - unsigned i) 553 - { 554 - return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); 555 - } 556 - 557 - static inline bool ptr_available(struct cache_set *c, const struct bkey *k, 558 - unsigned i) 559 - { 560 - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); 561 - } 562 - 563 - 564 - typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); 565 - 566 - struct bkey *bch_btree_iter_next(struct btree_iter *); 567 - struct bkey *bch_btree_iter_next_filter(struct btree_iter *, 568 - struct btree *, ptr_filter_fn); 569 - 570 - void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); 571 - struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, 572 - struct bkey *, struct bset_tree *); 573 - 574 - /* 32 bits total: */ 575 - #define BKEY_MID_BITS 3 576 - #define BKEY_EXPONENT_BITS 7 577 - #define BKEY_MANTISSA_BITS 22 578 - #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) 579 - 580 - struct bkey_float { 581 - unsigned exponent:BKEY_EXPONENT_BITS; 582 - unsigned m:BKEY_MID_BITS; 583 - unsigned mantissa:BKEY_MANTISSA_BITS; 584 - } __packed; 585 - 586 - /* 587 - * BSET_CACHELINE was originally intended to match the hardware cacheline size - 588 - * it used to be 64, but I realized the lookup code would touch slightly less 589 - * memory if it was 128. 
590 - * 591 - * It definites the number of bytes (in struct bset) per struct bkey_float in 592 - * the auxiliar search tree - when we're done searching the bset_float tree we 593 - * have this many bytes left that we do a linear search over. 594 - * 595 - * Since (after level 5) every level of the bset_tree is on a new cacheline, 596 - * we're touching one fewer cacheline in the bset tree in exchange for one more 597 - * cacheline in the linear search - but the linear search might stop before it 598 - * gets to the second cacheline. 599 - */ 600 - 601 - #define BSET_CACHELINE 128 602 - #define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) 603 - 604 - #define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) 605 - #define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) 606 - 607 - void bch_bset_init_next(struct btree *); 608 - 609 - void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); 610 - void bch_bset_fix_lookup_table(struct btree *, struct bkey *); 611 - 612 - struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 613 - const struct bkey *); 614 - 615 - /* 616 - * Returns the first key that is strictly greater than search 617 - */ 618 - static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, 619 - const struct bkey *search) 620 - { 621 - return search ? 
__bch_bset_search(b, t, search) : t->data->start; 622 - } 623 - 624 - #define PRECEDING_KEY(_k) \ 625 - ({ \ 626 - struct bkey *_ret = NULL; \ 627 - \ 628 - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ 629 - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ 630 - \ 631 - if (!_ret->low) \ 632 - _ret->high--; \ 633 - _ret->low--; \ 634 - } \ 635 - \ 636 - _ret; \ 637 - }) 638 - 639 - bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); 640 - void bch_btree_sort_lazy(struct btree *); 641 - void bch_btree_sort_into(struct btree *, struct btree *); 642 - void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); 643 - void bch_btree_sort_partial(struct btree *, unsigned); 644 - 645 - static inline void bch_btree_sort(struct btree *b) 646 - { 647 - bch_btree_sort_partial(b, 0); 648 - } 649 - 650 - int bch_bset_print_stats(struct cache_set *, char *); 294 + #define bch_check_keys(b, ...) \ 295 + do { \ 296 + if (btree_keys_expensive_checks(b)) \ 297 + __bch_check_keys(b, __VA_ARGS__); \ 298 + } while (0) 651 299 652 300 #endif
+251 -427
drivers/md/bcache/btree.c
··· 23 23 #include "bcache.h" 24 24 #include "btree.h" 25 25 #include "debug.h" 26 - #include "writeback.h" 26 + #include "extents.h" 27 27 28 28 #include <linux/slab.h> 29 29 #include <linux/bitops.h> ··· 89 89 * Test module load/unload 90 90 */ 91 91 92 - enum { 93 - BTREE_INSERT_STATUS_INSERT, 94 - BTREE_INSERT_STATUS_BACK_MERGE, 95 - BTREE_INSERT_STATUS_OVERWROTE, 96 - BTREE_INSERT_STATUS_FRONT_MERGE, 97 - }; 98 - 99 92 #define MAX_NEED_GC 64 100 93 #define MAX_SAVE_PRIO 72 101 94 ··· 98 105 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 99 106 100 107 static struct workqueue_struct *btree_io_wq; 101 - 102 - static inline bool should_split(struct btree *b) 103 - { 104 - struct bset *i = write_block(b); 105 - return b->written >= btree_blocks(b) || 106 - (b->written + __set_blocks(i, i->keys + 15, b->c) 107 - > btree_blocks(b)); 108 - } 109 108 110 109 #define insert_lock(s, b) ((b)->level <= (s)->lock) 111 110 ··· 152 167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 153 168 } \ 154 169 rw_unlock(_w, _b); \ 170 + if (_r == -EINTR) \ 171 + schedule(); \ 155 172 bch_cannibalize_unlock(c); \ 156 173 if (_r == -ENOSPC) { \ 157 174 wait_event((c)->try_wait, \ ··· 162 175 } \ 163 176 } while (_r == -EINTR); \ 164 177 \ 178 + finish_wait(&(c)->bucket_wait, &(op)->wait); \ 165 179 _r; \ 166 180 }) 181 + 182 + static inline struct bset *write_block(struct btree *b) 183 + { 184 + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); 185 + } 167 186 168 187 /* Btree key manipulation */ 169 188 ··· 187 194 static uint64_t btree_csum_set(struct btree *b, struct bset *i) 188 195 { 189 196 uint64_t crc = b->key.ptr[0]; 190 - void *data = (void *) i + 8, *end = end(i); 197 + void *data = (void *) i + 8, *end = bset_bkey_last(i); 191 198 192 199 crc = bch_crc64_update(crc, data, end - data); 193 200 return crc ^ 0xffffffffffffffffULL; 194 201 } 195 202 196 - static void bch_btree_node_read_done(struct btree *b) 203 + void bch_btree_node_read_done(struct 
btree *b) 197 204 { 198 205 const char *err = "bad btree header"; 199 - struct bset *i = b->sets[0].data; 206 + struct bset *i = btree_bset_first(b); 200 207 struct btree_iter *iter; 201 208 202 209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); ··· 204 211 iter->used = 0; 205 212 206 213 #ifdef CONFIG_BCACHE_DEBUG 207 - iter->b = b; 214 + iter->b = &b->keys; 208 215 #endif 209 216 210 217 if (!i->seq) 211 218 goto err; 212 219 213 220 for (; 214 - b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 221 + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; 215 222 i = write_block(b)) { 216 223 err = "unsupported bset version"; 217 224 if (i->version > BCACHE_BSET_VERSION) 218 225 goto err; 219 226 220 227 err = "bad btree header"; 221 - if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 228 + if (b->written + set_blocks(i, block_bytes(b->c)) > 229 + btree_blocks(b)) 222 230 goto err; 223 231 224 232 err = "bad magic"; ··· 239 245 } 240 246 241 247 err = "empty set"; 242 - if (i != b->sets[0].data && !i->keys) 248 + if (i != b->keys.set[0].data && !i->keys) 243 249 goto err; 244 250 245 - bch_btree_iter_push(iter, i->start, end(i)); 251 + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); 246 252 247 - b->written += set_blocks(i, b->c); 253 + b->written += set_blocks(i, block_bytes(b->c)); 248 254 } 249 255 250 256 err = "corrupted btree"; 251 257 for (i = write_block(b); 252 - index(i, b) < btree_blocks(b); 258 + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); 253 259 i = ((void *) i) + block_bytes(b->c)) 254 - if (i->seq == b->sets[0].data->seq) 260 + if (i->seq == b->keys.set[0].data->seq) 255 261 goto err; 256 262 257 - bch_btree_sort_and_fix_extents(b, iter); 263 + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); 258 264 259 - i = b->sets[0].data; 265 + i = b->keys.set[0].data; 260 266 err = "short btree key"; 261 - if (b->sets[0].size && 262 - bkey_cmp(&b->key, &b->sets[0].end) < 0) 267 + if 
(b->keys.set[0].size && 268 + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) 263 269 goto err; 264 270 265 271 if (b->written < btree_blocks(b)) 266 - bch_bset_init_next(b); 272 + bch_bset_init_next(&b->keys, write_block(b), 273 + bset_magic(&b->c->sb)); 267 274 out: 268 275 mempool_free(iter, b->c->fill_iter); 269 276 return; 270 277 err: 271 278 set_btree_node_io_error(b); 272 - bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 279 + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", 273 280 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 - index(i, b), i->keys); 281 + bset_block_offset(b, i), i->keys); 275 282 goto out; 276 283 } 277 284 ··· 282 287 closure_put(cl); 283 288 } 284 289 285 - void bch_btree_node_read(struct btree *b) 290 + static void bch_btree_node_read(struct btree *b) 286 291 { 287 292 uint64_t start_time = local_clock(); 288 293 struct closure cl; ··· 298 303 bio->bi_end_io = btree_node_read_endio; 299 304 bio->bi_private = &cl; 300 305 301 - bch_bio_map(bio, b->sets[0].data); 306 + bch_bio_map(bio, b->keys.set[0].data); 302 307 303 308 bch_submit_bbio(bio, b->c, &b->key, 0); 304 309 closure_sync(&cl); ··· 335 340 w->journal = NULL; 336 341 } 337 342 343 + static void btree_node_write_unlock(struct closure *cl) 344 + { 345 + struct btree *b = container_of(cl, struct btree, io); 346 + 347 + up(&b->io_mutex); 348 + } 349 + 338 350 static void __btree_node_write_done(struct closure *cl) 339 351 { 340 - struct btree *b = container_of(cl, struct btree, io.cl); 352 + struct btree *b = container_of(cl, struct btree, io); 341 353 struct btree_write *w = btree_prev_write(b); 342 354 343 355 bch_bbio_free(b->bio, b->c); ··· 355 353 queue_delayed_work(btree_io_wq, &b->work, 356 354 msecs_to_jiffies(30000)); 357 355 358 - closure_return(cl); 356 + closure_return_with_destructor(cl, btree_node_write_unlock); 359 357 } 360 358 361 359 static void btree_node_write_done(struct closure *cl) 362 360 { 363 - struct btree *b = container_of(cl, 
struct btree, io.cl); 361 + struct btree *b = container_of(cl, struct btree, io); 364 362 struct bio_vec *bv; 365 363 int n; 366 364 ··· 373 371 static void btree_node_write_endio(struct bio *bio, int error) 374 372 { 375 373 struct closure *cl = bio->bi_private; 376 - struct btree *b = container_of(cl, struct btree, io.cl); 374 + struct btree *b = container_of(cl, struct btree, io); 377 375 378 376 if (error) 379 377 set_btree_node_io_error(b); ··· 384 382 385 383 static void do_btree_node_write(struct btree *b) 386 384 { 387 - struct closure *cl = &b->io.cl; 388 - struct bset *i = b->sets[b->nsets].data; 385 + struct closure *cl = &b->io; 386 + struct bset *i = btree_bset_last(b); 389 387 BKEY_PADDED(key) k; 390 388 391 389 i->version = BCACHE_BSET_VERSION; ··· 397 395 b->bio->bi_end_io = btree_node_write_endio; 398 396 b->bio->bi_private = cl; 399 397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 400 - b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); 398 + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); 401 399 bch_bio_map(b->bio, i); 402 400 403 401 /* ··· 416 414 */ 417 415 418 416 bkey_copy(&k.key, &b->key); 419 - SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 417 + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 418 + bset_sector_offset(&b->keys, i)); 420 419 421 420 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 421 int j; ··· 438 435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 439 436 440 437 closure_sync(cl); 441 - __btree_node_write_done(cl); 438 + continue_at_nobarrier(cl, __btree_node_write_done, NULL); 442 439 } 443 440 } 444 441 445 442 void bch_btree_node_write(struct btree *b, struct closure *parent) 446 443 { 447 - struct bset *i = b->sets[b->nsets].data; 444 + struct bset *i = btree_bset_last(b); 448 445 449 446 trace_bcache_btree_write(b); 450 447 451 448 BUG_ON(current->bio_list); 452 449 BUG_ON(b->written >= btree_blocks(b)); 453 450 BUG_ON(b->written && !i->keys); 454 - 
BUG_ON(b->sets->data->seq != i->seq); 455 - bch_check_keys(b, "writing"); 451 + BUG_ON(btree_bset_first(b)->seq != i->seq); 452 + bch_check_keys(&b->keys, "writing"); 456 453 457 454 cancel_delayed_work(&b->work); 458 455 459 456 /* If caller isn't waiting for write, parent refcount is cache set */ 460 - closure_lock(&b->io, parent ?: &b->c->cl); 457 + down(&b->io_mutex); 458 + closure_init(&b->io, parent ?: &b->c->cl); 461 459 462 460 clear_bit(BTREE_NODE_dirty, &b->flags); 463 461 change_bit(BTREE_NODE_write_idx, &b->flags); 464 462 465 463 do_btree_node_write(b); 466 464 467 - b->written += set_blocks(i, b->c); 468 - atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 465 + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, 469 466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 470 467 471 - bch_btree_sort_lazy(b); 468 + b->written += set_blocks(i, block_bytes(b->c)); 469 + 470 + /* If not a leaf node, always sort */ 471 + if (b->level && b->keys.nsets) 472 + bch_btree_sort(&b->keys, &b->c->sort); 473 + else 474 + bch_btree_sort_lazy(&b->keys, &b->c->sort); 475 + 476 + /* 477 + * do verify if there was more than one set initially (i.e. 
we did a 478 + * sort) and we sorted down to a single set: 479 + */ 480 + if (i != b->keys.set->data && !b->keys.nsets) 481 + bch_btree_verify(b); 472 482 473 483 if (b->written < btree_blocks(b)) 474 - bch_bset_init_next(b); 484 + bch_bset_init_next(&b->keys, write_block(b), 485 + bset_magic(&b->c->sb)); 475 486 } 476 487 477 488 static void bch_btree_node_write_sync(struct btree *b) ··· 510 493 511 494 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 512 495 { 513 - struct bset *i = b->sets[b->nsets].data; 496 + struct bset *i = btree_bset_last(b); 514 497 struct btree_write *w = btree_current_write(b); 515 498 516 499 BUG_ON(!b->written); ··· 545 528 * mca -> memory cache 546 529 */ 547 530 548 - static void mca_reinit(struct btree *b) 549 - { 550 - unsigned i; 551 - 552 - b->flags = 0; 553 - b->written = 0; 554 - b->nsets = 0; 555 - 556 - for (i = 0; i < MAX_BSETS; i++) 557 - b->sets[i].size = 0; 558 - /* 559 - * Second loop starts at 1 because b->sets[0]->data is the memory we 560 - * allocated 561 - */ 562 - for (i = 1; i < MAX_BSETS; i++) 563 - b->sets[i].data = NULL; 564 - } 565 - 566 531 #define mca_reserve(c) (((c->root && c->root->level) \ 567 532 ? 
c->root->level : 1) * 8 + 16) 568 533 #define mca_can_free(c) \ ··· 552 553 553 554 static void mca_data_free(struct btree *b) 554 555 { 555 - struct bset_tree *t = b->sets; 556 - BUG_ON(!closure_is_unlocked(&b->io.cl)); 556 + BUG_ON(b->io_mutex.count != 1); 557 557 558 - if (bset_prev_bytes(b) < PAGE_SIZE) 559 - kfree(t->prev); 560 - else 561 - free_pages((unsigned long) t->prev, 562 - get_order(bset_prev_bytes(b))); 558 + bch_btree_keys_free(&b->keys); 563 559 564 - if (bset_tree_bytes(b) < PAGE_SIZE) 565 - kfree(t->tree); 566 - else 567 - free_pages((unsigned long) t->tree, 568 - get_order(bset_tree_bytes(b))); 569 - 570 - free_pages((unsigned long) t->data, b->page_order); 571 - 572 - t->prev = NULL; 573 - t->tree = NULL; 574 - t->data = NULL; 575 - list_move(&b->list, &b->c->btree_cache_freed); 576 560 b->c->bucket_cache_used--; 561 + list_move(&b->list, &b->c->btree_cache_freed); 577 562 } 578 563 579 564 static void mca_bucket_free(struct btree *b) ··· 576 593 577 594 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 578 595 { 579 - struct bset_tree *t = b->sets; 580 - BUG_ON(t->data); 581 - 582 - b->page_order = max_t(unsigned, 583 - ilog2(b->c->btree_pages), 584 - btree_order(k)); 585 - 586 - t->data = (void *) __get_free_pages(gfp, b->page_order); 587 - if (!t->data) 588 - goto err; 589 - 590 - t->tree = bset_tree_bytes(b) < PAGE_SIZE 591 - ? kmalloc(bset_tree_bytes(b), gfp) 592 - : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 593 - if (!t->tree) 594 - goto err; 595 - 596 - t->prev = bset_prev_bytes(b) < PAGE_SIZE 597 - ? 
kmalloc(bset_prev_bytes(b), gfp) 598 - : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 599 - if (!t->prev) 600 - goto err; 601 - 602 - list_move(&b->list, &b->c->btree_cache); 603 - b->c->bucket_cache_used++; 604 - return; 605 - err: 606 - mca_data_free(b); 596 + if (!bch_btree_keys_alloc(&b->keys, 597 + max_t(unsigned, 598 + ilog2(b->c->btree_pages), 599 + btree_order(k)), 600 + gfp)) { 601 + b->c->bucket_cache_used++; 602 + list_move(&b->list, &b->c->btree_cache); 603 + } else { 604 + list_move(&b->list, &b->c->btree_cache_freed); 605 + } 607 606 } 608 607 609 608 static struct btree *mca_bucket_alloc(struct cache_set *c, ··· 600 635 INIT_LIST_HEAD(&b->list); 601 636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 602 637 b->c = c; 603 - closure_init_unlocked(&b->io); 638 + sema_init(&b->io_mutex, 1); 604 639 605 640 mca_data_alloc(b, k, gfp); 606 641 return b; ··· 616 651 if (!down_write_trylock(&b->lock)) 617 652 return -ENOMEM; 618 653 619 - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 654 + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); 620 655 621 - if (b->page_order < min_order || 622 - (!flush && 623 - (btree_node_dirty(b) || 624 - atomic_read(&b->io.cl.remaining) != -1))) { 625 - rw_unlock(true, b); 626 - return -ENOMEM; 656 + if (b->keys.page_order < min_order) 657 + goto out_unlock; 658 + 659 + if (!flush) { 660 + if (btree_node_dirty(b)) 661 + goto out_unlock; 662 + 663 + if (down_trylock(&b->io_mutex)) 664 + goto out_unlock; 665 + up(&b->io_mutex); 627 666 } 628 667 629 668 if (btree_node_dirty(b)) 630 669 bch_btree_node_write_sync(b); 631 670 632 671 /* wait for any in flight btree write */ 633 - closure_wait_event(&b->io.wait, &cl, 634 - atomic_read(&b->io.cl.remaining) == -1); 672 + down(&b->io_mutex); 673 + up(&b->io_mutex); 635 674 636 675 return 0; 676 + out_unlock: 677 + rw_unlock(true, b); 678 + return -ENOMEM; 637 679 } 638 680 639 681 static unsigned long bch_mca_scan(struct shrinker *shrink, ··· 686 714 } 687 
715 } 688 716 689 - /* 690 - * Can happen right when we first start up, before we've read in any 691 - * btree nodes 692 - */ 693 - if (list_empty(&c->btree_cache)) 694 - goto out; 695 - 696 717 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 718 + if (list_empty(&c->btree_cache)) 719 + goto out; 720 + 697 721 b = list_first_entry(&c->btree_cache, struct btree, list); 698 722 list_rotate_left(&c->btree_cache); 699 723 ··· 735 767 #ifdef CONFIG_BCACHE_DEBUG 736 768 if (c->verify_data) 737 769 list_move(&c->verify_data->list, &c->btree_cache); 770 + 771 + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); 738 772 #endif 739 773 740 774 list_splice(&c->btree_cache_freeable, ··· 777 807 #ifdef CONFIG_BCACHE_DEBUG 778 808 mutex_init(&c->verify_lock); 779 809 810 + c->verify_ondisk = (void *) 811 + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); 812 + 780 813 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 781 814 782 815 if (c->verify_data && 783 - c->verify_data->sets[0].data) 816 + c->verify_data->keys.set->data) 784 817 list_del_init(&c->verify_data->list); 785 818 else 786 819 c->verify_data = NULL; ··· 881 908 list_for_each_entry(b, &c->btree_cache_freed, list) 882 909 if (!mca_reap(b, 0, false)) { 883 910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 884 - if (!b->sets[0].data) 911 + if (!b->keys.set[0].data) 885 912 goto err; 886 913 else 887 914 goto out; ··· 892 919 goto err; 893 920 894 921 BUG_ON(!down_write_trylock(&b->lock)); 895 - if (!b->sets->data) 922 + if (!b->keys.set->data) 896 923 goto err; 897 924 out: 898 - BUG_ON(!closure_is_unlocked(&b->io.cl)); 925 + BUG_ON(b->io_mutex.count != 1); 899 926 900 927 bkey_copy(&b->key, k); 901 928 list_move(&b->list, &c->btree_cache); ··· 903 930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 904 931 905 932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 906 - b->level = level; 907 933 b->parent = (void *) ~0UL; 934 + b->flags = 0; 935 + b->written = 0; 936 + 
b->level = level; 908 937 909 - mca_reinit(b); 938 + if (!b->level) 939 + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, 940 + &b->c->expensive_debug_checks); 941 + else 942 + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, 943 + &b->c->expensive_debug_checks); 910 944 911 945 return b; 912 946 err: ··· 974 994 975 995 b->accessed = 1; 976 996 977 - for (; i <= b->nsets && b->sets[i].size; i++) { 978 - prefetch(b->sets[i].tree); 979 - prefetch(b->sets[i].data); 997 + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { 998 + prefetch(b->keys.set[i].tree); 999 + prefetch(b->keys.set[i].data); 980 1000 } 981 1001 982 - for (; i <= b->nsets; i++) 983 - prefetch(b->sets[i].data); 1002 + for (; i <= b->keys.nsets; i++) 1003 + prefetch(b->keys.set[i].data); 984 1004 985 1005 if (btree_node_io_error(b)) { 986 1006 rw_unlock(write, b); ··· 1043 1063 1044 1064 mutex_lock(&c->bucket_lock); 1045 1065 retry: 1046 - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1066 + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) 1047 1067 goto err; 1048 1068 1049 1069 bkey_put(c, &k.key); ··· 1060 1080 } 1061 1081 1062 1082 b->accessed = 1; 1063 - bch_bset_init_next(b); 1083 + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); 1064 1084 1065 1085 mutex_unlock(&c->bucket_lock); 1066 1086 ··· 1078 1098 static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1079 1099 { 1080 1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1081 - if (!IS_ERR_OR_NULL(n)) 1082 - bch_btree_sort_into(b, n); 1101 + if (!IS_ERR_OR_NULL(n)) { 1102 + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); 1103 + bkey_copy_key(&n->key, &b->key); 1104 + } 1083 1105 1084 1106 return n; 1085 1107 } ··· 1100 1118 } 1101 1119 1102 1120 atomic_inc(&b->c->prio_blocked); 1121 + } 1122 + 1123 + static int btree_check_reserve(struct btree *b, struct btree_op *op) 1124 + { 1125 + struct cache_set *c = b->c; 1126 + struct 
cache *ca; 1127 + unsigned i, reserve = c->root->level * 2 + 1; 1128 + int ret = 0; 1129 + 1130 + mutex_lock(&c->bucket_lock); 1131 + 1132 + for_each_cache(ca, c, i) 1133 + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { 1134 + if (op) 1135 + prepare_to_wait(&c->bucket_wait, &op->wait, 1136 + TASK_UNINTERRUPTIBLE); 1137 + ret = -EINTR; 1138 + break; 1139 + } 1140 + 1141 + mutex_unlock(&c->bucket_lock); 1142 + return ret; 1103 1143 } 1104 1144 1105 1145 /* Garbage collection */ ··· 1187 1183 1188 1184 gc->nodes++; 1189 1185 1190 - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1186 + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 1191 1187 stale = max(stale, btree_mark_key(b, k)); 1192 1188 keys++; 1193 1189 1194 - if (bch_ptr_bad(b, k)) 1190 + if (bch_ptr_bad(&b->keys, k)) 1195 1191 continue; 1196 1192 1197 1193 gc->key_bytes += bkey_u64s(k); ··· 1201 1197 gc->data += KEY_SIZE(k); 1202 1198 } 1203 1199 1204 - for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1200 + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) 1205 1201 btree_bug_on(t->size && 1206 - bset_written(b, t) && 1202 + bset_written(&b->keys, t) && 1207 1203 bkey_cmp(&b->key, &t->end) < 0, 1208 1204 b, "found short btree key in gc"); 1209 1205 ··· 1247 1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1248 1244 1249 1245 if (nodes < 2 || 1250 - __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1246 + __set_blocks(b->keys.set[0].data, keys, 1247 + block_bytes(b->c)) > blocks * (nodes - 1)) 1251 1248 return 0; 1252 1249 1253 1250 for (i = 0; i < nodes; i++) { ··· 1258 1253 } 1259 1254 1260 1255 for (i = nodes - 1; i > 0; --i) { 1261 - struct bset *n1 = new_nodes[i]->sets->data; 1262 - struct bset *n2 = new_nodes[i - 1]->sets->data; 1256 + struct bset *n1 = btree_bset_first(new_nodes[i]); 1257 + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); 1263 1258 struct bkey *k, *last = NULL; 1264 1259 1265 1260 keys = 0; 1266 1261 1267 1262 if (i > 1) { 1268 1263 
for (k = n2->start; 1269 - k < end(n2); 1264 + k < bset_bkey_last(n2); 1270 1265 k = bkey_next(k)) { 1271 1266 if (__set_blocks(n1, n1->keys + keys + 1272 - bkey_u64s(k), b->c) > blocks) 1267 + bkey_u64s(k), 1268 + block_bytes(b->c)) > blocks) 1273 1269 break; 1274 1270 1275 1271 last = k; ··· 1286 1280 * though) 1287 1281 */ 1288 1282 if (__set_blocks(n1, n1->keys + n2->keys, 1289 - b->c) > btree_blocks(new_nodes[i])) 1283 + block_bytes(b->c)) > 1284 + btree_blocks(new_nodes[i])) 1290 1285 goto out_nocoalesce; 1291 1286 1292 1287 keys = n2->keys; ··· 1295 1288 last = &r->b->key; 1296 1289 } 1297 1290 1298 - BUG_ON(__set_blocks(n1, n1->keys + keys, 1299 - b->c) > btree_blocks(new_nodes[i])); 1291 + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > 1292 + btree_blocks(new_nodes[i])); 1300 1293 1301 1294 if (last) 1302 1295 bkey_copy_key(&new_nodes[i]->key, last); 1303 1296 1304 - memcpy(end(n1), 1297 + memcpy(bset_bkey_last(n1), 1305 1298 n2->start, 1306 - (void *) node(n2, keys) - (void *) n2->start); 1299 + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); 1307 1300 1308 1301 n1->keys += keys; 1309 1302 r[i].keys = n1->keys; 1310 1303 1311 1304 memmove(n2->start, 1312 - node(n2, keys), 1313 - (void *) end(n2) - (void *) node(n2, keys)); 1305 + bset_bkey_idx(n2, keys), 1306 + (void *) bset_bkey_last(n2) - 1307 + (void *) bset_bkey_idx(n2, keys)); 1314 1308 1315 1309 n2->keys -= keys; 1316 1310 1317 - if (bch_keylist_realloc(keylist, 1318 - KEY_PTRS(&new_nodes[i]->key), b->c)) 1311 + if (__bch_keylist_realloc(keylist, 1312 + bkey_u64s(&new_nodes[i]->key))) 1319 1313 goto out_nocoalesce; 1320 1314 1321 1315 bch_btree_node_write(new_nodes[i], &cl); ··· 1324 1316 } 1325 1317 1326 1318 for (i = 0; i < nodes; i++) { 1327 - if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1319 + if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) 1328 1320 goto out_nocoalesce; 1329 1321 1330 1322 make_btree_freeing_key(r[i].b, keylist->top); ··· 
1332 1324 } 1333 1325 1334 1326 /* We emptied out this node */ 1335 - BUG_ON(new_nodes[0]->sets->data->keys); 1327 + BUG_ON(btree_bset_first(new_nodes[0])->keys); 1336 1328 btree_node_free(new_nodes[0]); 1337 1329 rw_unlock(true, new_nodes[0]); 1338 1330 ··· 1378 1370 struct btree_iter iter; 1379 1371 unsigned ret = 0; 1380 1372 1381 - for_each_key_filter(b, k, &iter, bch_ptr_bad) 1373 + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 1382 1374 ret += bkey_u64s(k); 1383 1375 1384 1376 return ret; ··· 1398 1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1399 1391 1400 1392 bch_keylist_init(&keys); 1401 - bch_btree_iter_init(b, &iter, &b->c->gc_done); 1393 + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1402 1394 1403 1395 for (i = 0; i < GC_MERGE_NODES; i++) 1404 1396 r[i].b = ERR_PTR(-EINTR); 1405 1397 1406 1398 while (1) { 1407 - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1399 + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1408 1400 if (k) { 1409 1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1410 1402 if (IS_ERR(r->b)) { ··· 1424 1416 1425 1417 if (!IS_ERR(last->b)) { 1426 1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1427 - if (should_rewrite) { 1419 + if (should_rewrite && 1420 + !btree_check_reserve(b, NULL)) { 1428 1421 n = btree_node_alloc_replacement(last->b, 1429 1422 false); 1430 1423 ··· 1714 1705 struct bucket *g; 1715 1706 struct btree_iter iter; 1716 1707 1717 - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1708 + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 1718 1709 for (i = 0; i < KEY_PTRS(k); i++) { 1719 1710 if (!ptr_available(b->c, k, i)) 1720 1711 continue; ··· 1737 1728 } 1738 1729 1739 1730 if (b->level) { 1740 - bch_btree_iter_init(b, &iter, NULL); 1731 + bch_btree_iter_init(&b->keys, &iter, NULL); 1741 1732 1742 1733 do { 1743 - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1734 + k = bch_btree_iter_next_filter(&iter, &b->keys, 1735 + 
bch_ptr_bad); 1744 1736 if (k) 1745 1737 btree_node_prefetch(b->c, k, b->level - 1); 1746 1738 ··· 1784 1774 1785 1775 /* Btree insertion */ 1786 1776 1787 - static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1777 + static bool btree_insert_key(struct btree *b, struct bkey *k, 1778 + struct bkey *replace_key) 1788 1779 { 1789 - struct bset *i = b->sets[b->nsets].data; 1790 - 1791 - memmove((uint64_t *) where + bkey_u64s(insert), 1792 - where, 1793 - (void *) end(i) - (void *) where); 1794 - 1795 - i->keys += bkey_u64s(insert); 1796 - bkey_copy(where, insert); 1797 - bch_bset_fix_lookup_table(b, where); 1798 - } 1799 - 1800 - static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, 1801 - struct btree_iter *iter, 1802 - struct bkey *replace_key) 1803 - { 1804 - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1805 - { 1806 - if (KEY_DIRTY(k)) 1807 - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1808 - offset, -sectors); 1809 - } 1810 - 1811 - uint64_t old_offset; 1812 - unsigned old_size, sectors_found = 0; 1813 - 1814 - while (1) { 1815 - struct bkey *k = bch_btree_iter_next(iter); 1816 - if (!k || 1817 - bkey_cmp(&START_KEY(k), insert) >= 0) 1818 - break; 1819 - 1820 - if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1821 - continue; 1822 - 1823 - old_offset = KEY_START(k); 1824 - old_size = KEY_SIZE(k); 1825 - 1826 - /* 1827 - * We might overlap with 0 size extents; we can't skip these 1828 - * because if they're in the set we're inserting to we have to 1829 - * adjust them so they don't overlap with the key we're 1830 - * inserting. But we don't want to check them for replace 1831 - * operations. 
1832 - */ 1833 - 1834 - if (replace_key && KEY_SIZE(k)) { 1835 - /* 1836 - * k might have been split since we inserted/found the 1837 - * key we're replacing 1838 - */ 1839 - unsigned i; 1840 - uint64_t offset = KEY_START(k) - 1841 - KEY_START(replace_key); 1842 - 1843 - /* But it must be a subset of the replace key */ 1844 - if (KEY_START(k) < KEY_START(replace_key) || 1845 - KEY_OFFSET(k) > KEY_OFFSET(replace_key)) 1846 - goto check_failed; 1847 - 1848 - /* We didn't find a key that we were supposed to */ 1849 - if (KEY_START(k) > KEY_START(insert) + sectors_found) 1850 - goto check_failed; 1851 - 1852 - if (KEY_PTRS(k) != KEY_PTRS(replace_key) || 1853 - KEY_DIRTY(k) != KEY_DIRTY(replace_key)) 1854 - goto check_failed; 1855 - 1856 - /* skip past gen */ 1857 - offset <<= 8; 1858 - 1859 - BUG_ON(!KEY_PTRS(replace_key)); 1860 - 1861 - for (i = 0; i < KEY_PTRS(replace_key); i++) 1862 - if (k->ptr[i] != replace_key->ptr[i] + offset) 1863 - goto check_failed; 1864 - 1865 - sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1866 - } 1867 - 1868 - if (bkey_cmp(insert, k) < 0 && 1869 - bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { 1870 - /* 1871 - * We overlapped in the middle of an existing key: that 1872 - * means we have to split the old key. But we have to do 1873 - * slightly different things depending on whether the 1874 - * old key has been written out yet. 1875 - */ 1876 - 1877 - struct bkey *top; 1878 - 1879 - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 1880 - 1881 - if (bkey_written(b, k)) { 1882 - /* 1883 - * We insert a new key to cover the top of the 1884 - * old key, and the old key is modified in place 1885 - * to represent the bottom split. 1886 - * 1887 - * It's completely arbitrary whether the new key 1888 - * is the top or the bottom, but it has to match 1889 - * up with what btree_sort_fixup() does - it 1890 - * doesn't check for this kind of overlap, it 1891 - * depends on us inserting a new key for the top 1892 - * here. 
1893 - */ 1894 - top = bch_bset_search(b, &b->sets[b->nsets], 1895 - insert); 1896 - shift_keys(b, top, k); 1897 - } else { 1898 - BKEY_PADDED(key) temp; 1899 - bkey_copy(&temp.key, k); 1900 - shift_keys(b, k, &temp.key); 1901 - top = bkey_next(k); 1902 - } 1903 - 1904 - bch_cut_front(insert, top); 1905 - bch_cut_back(&START_KEY(insert), k); 1906 - bch_bset_fix_invalidated_key(b, k); 1907 - return false; 1908 - } 1909 - 1910 - if (bkey_cmp(insert, k) < 0) { 1911 - bch_cut_front(insert, k); 1912 - } else { 1913 - if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) 1914 - old_offset = KEY_START(insert); 1915 - 1916 - if (bkey_written(b, k) && 1917 - bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1918 - /* 1919 - * Completely overwrote, so we don't have to 1920 - * invalidate the binary search tree 1921 - */ 1922 - bch_cut_front(k, k); 1923 - } else { 1924 - __bch_cut_back(&START_KEY(insert), k); 1925 - bch_bset_fix_invalidated_key(b, k); 1926 - } 1927 - } 1928 - 1929 - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 1930 - } 1931 - 1932 - check_failed: 1933 - if (replace_key) { 1934 - if (!sectors_found) { 1935 - return true; 1936 - } else if (sectors_found < KEY_SIZE(insert)) { 1937 - SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1938 - (KEY_SIZE(insert) - sectors_found)); 1939 - SET_KEY_SIZE(insert, sectors_found); 1940 - } 1941 - } 1942 - 1943 - return false; 1944 - } 1945 - 1946 - static bool btree_insert_key(struct btree *b, struct btree_op *op, 1947 - struct bkey *k, struct bkey *replace_key) 1948 - { 1949 - struct bset *i = b->sets[b->nsets].data; 1950 - struct bkey *m, *prev; 1951 - unsigned status = BTREE_INSERT_STATUS_INSERT; 1780 + unsigned status; 1952 1781 1953 1782 BUG_ON(bkey_cmp(k, &b->key) > 0); 1954 - BUG_ON(b->level && !KEY_PTRS(k)); 1955 - BUG_ON(!b->level && !KEY_OFFSET(k)); 1956 1783 1957 - if (!b->level) { 1958 - struct btree_iter iter; 1784 + status = bch_btree_insert_key(&b->keys, k, replace_key); 1785 + if (status != 
BTREE_INSERT_STATUS_NO_INSERT) { 1786 + bch_check_keys(&b->keys, "%u for %s", status, 1787 + replace_key ? "replace" : "insert"); 1959 1788 1960 - /* 1961 - * bset_search() returns the first key that is strictly greater 1962 - * than the search key - but for back merging, we want to find 1963 - * the previous key. 1964 - */ 1965 - prev = NULL; 1966 - m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); 1789 + trace_bcache_btree_insert_key(b, k, replace_key != NULL, 1790 + status); 1791 + return true; 1792 + } else 1793 + return false; 1794 + } 1967 1795 1968 - if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1969 - op->insert_collision = true; 1970 - return false; 1971 - } 1796 + static size_t insert_u64s_remaining(struct btree *b) 1797 + { 1798 + ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); 1972 1799 1973 - if (KEY_DIRTY(k)) 1974 - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1975 - KEY_START(k), KEY_SIZE(k)); 1800 + /* 1801 + * Might land in the middle of an existing extent and have to split it 1802 + */ 1803 + if (b->keys.ops->is_extents) 1804 + ret -= KEY_MAX_U64S; 1976 1805 1977 - while (m != end(i) && 1978 - bkey_cmp(k, &START_KEY(m)) > 0) 1979 - prev = m, m = bkey_next(m); 1980 - 1981 - if (key_merging_disabled(b->c)) 1982 - goto insert; 1983 - 1984 - /* prev is in the tree, if we merge we're done */ 1985 - status = BTREE_INSERT_STATUS_BACK_MERGE; 1986 - if (prev && 1987 - bch_bkey_try_merge(b, prev, k)) 1988 - goto merged; 1989 - 1990 - status = BTREE_INSERT_STATUS_OVERWROTE; 1991 - if (m != end(i) && 1992 - KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1993 - goto copy; 1994 - 1995 - status = BTREE_INSERT_STATUS_FRONT_MERGE; 1996 - if (m != end(i) && 1997 - bch_bkey_try_merge(b, k, m)) 1998 - goto copy; 1999 - } else { 2000 - BUG_ON(replace_key); 2001 - m = bch_bset_search(b, &b->sets[b->nsets], k); 2002 - } 2003 - 2004 - insert: shift_keys(b, m, k); 2005 - copy: bkey_copy(m, k); 2006 - merged: 2007 - bch_check_keys(b, "%u 
for %s", status, 2008 - replace_key ? "replace" : "insert"); 2009 - 2010 - if (b->level && !KEY_OFFSET(k)) 2011 - btree_current_write(b)->prio_blocked++; 2012 - 2013 - trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); 2014 - 2015 - return true; 1806 + return max(ret, 0L); 2016 1807 } 2017 1808 2018 1809 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, ··· 1821 2010 struct bkey *replace_key) 1822 2011 { 1823 2012 bool ret = false; 1824 - int oldsize = bch_count_data(b); 2013 + int oldsize = bch_count_data(&b->keys); 1825 2014 1826 2015 while (!bch_keylist_empty(insert_keys)) { 1827 - struct bset *i = write_block(b); 1828 2016 struct bkey *k = insert_keys->keys; 1829 2017 1830 - if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1831 - > btree_blocks(b)) 2018 + if (bkey_u64s(k) > insert_u64s_remaining(b)) 1832 2019 break; 1833 2020 1834 2021 if (bkey_cmp(k, &b->key) <= 0) { 1835 2022 if (!b->level) 1836 2023 bkey_put(b->c, k); 1837 2024 1838 - ret |= btree_insert_key(b, op, k, replace_key); 2025 + ret |= btree_insert_key(b, k, replace_key); 1839 2026 bch_keylist_pop_front(insert_keys); 1840 2027 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1841 2028 BKEY_PADDED(key) temp; ··· 1842 2033 bch_cut_back(&b->key, &temp.key); 1843 2034 bch_cut_front(&b->key, insert_keys->keys); 1844 2035 1845 - ret |= btree_insert_key(b, op, &temp.key, replace_key); 2036 + ret |= btree_insert_key(b, &temp.key, replace_key); 1846 2037 break; 1847 2038 } else { 1848 2039 break; 1849 2040 } 1850 2041 } 1851 2042 2043 + if (!ret) 2044 + op->insert_collision = true; 2045 + 1852 2046 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1853 2047 1854 - BUG_ON(bch_count_data(b) < oldsize); 2048 + BUG_ON(bch_count_data(&b->keys) < oldsize); 1855 2049 return ret; 1856 2050 } 1857 2051 ··· 1871 2059 closure_init_stack(&cl); 1872 2060 bch_keylist_init(&parent_keys); 1873 2061 2062 + if (!b->level && 2063 + btree_check_reserve(b, op)) 2064 + 
return -EINTR; 2065 + 1874 2066 n1 = btree_node_alloc_replacement(b, true); 1875 2067 if (IS_ERR(n1)) 1876 2068 goto err; 1877 2069 1878 - split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 2070 + split = set_blocks(btree_bset_first(n1), 2071 + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; 1879 2072 1880 2073 if (split) { 1881 2074 unsigned keys = 0; 1882 2075 1883 - trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 2076 + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 1884 2077 1885 2078 n2 = bch_btree_node_alloc(b->c, b->level, true); 1886 2079 if (IS_ERR(n2)) ··· 1904 2087 * search tree yet 1905 2088 */ 1906 2089 1907 - while (keys < (n1->sets[0].data->keys * 3) / 5) 1908 - keys += bkey_u64s(node(n1->sets[0].data, keys)); 2090 + while (keys < (btree_bset_first(n1)->keys * 3) / 5) 2091 + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), 2092 + keys)); 1909 2093 1910 - bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 - keys += bkey_u64s(node(n1->sets[0].data, keys)); 2094 + bkey_copy_key(&n1->key, 2095 + bset_bkey_idx(btree_bset_first(n1), keys)); 2096 + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); 1912 2097 1913 - n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1914 - n1->sets[0].data->keys = keys; 2098 + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; 2099 + btree_bset_first(n1)->keys = keys; 1915 2100 1916 - memcpy(n2->sets[0].data->start, 1917 - end(n1->sets[0].data), 1918 - n2->sets[0].data->keys * sizeof(uint64_t)); 2101 + memcpy(btree_bset_first(n2)->start, 2102 + bset_bkey_last(btree_bset_first(n1)), 2103 + btree_bset_first(n2)->keys * sizeof(uint64_t)); 1919 2104 1920 2105 bkey_copy_key(&n2->key, &b->key); 1921 2106 ··· 1925 2106 bch_btree_node_write(n2, &cl); 1926 2107 rw_unlock(true, n2); 1927 2108 } else { 1928 - trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 2109 + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); 1929 
2110 1930 2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1931 2112 } ··· 1968 2149 1969 2150 return 0; 1970 2151 err_free2: 2152 + bkey_put(b->c, &n2->key); 1971 2153 btree_node_free(n2); 1972 2154 rw_unlock(true, n2); 1973 2155 err_free1: 2156 + bkey_put(b->c, &n1->key); 1974 2157 btree_node_free(n1); 1975 2158 rw_unlock(true, n1); 1976 2159 err: 2160 + WARN(1, "bcache: btree split failed"); 2161 + 1977 2162 if (n3 == ERR_PTR(-EAGAIN) || 1978 2163 n2 == ERR_PTR(-EAGAIN) || 1979 2164 n1 == ERR_PTR(-EAGAIN)) 1980 2165 return -EAGAIN; 1981 2166 1982 - pr_warn("couldn't split"); 1983 2167 return -ENOMEM; 1984 2168 } 1985 2169 ··· 1993 2171 { 1994 2172 BUG_ON(b->level && replace_key); 1995 2173 1996 - if (should_split(b)) { 2174 + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { 1997 2175 if (current->bio_list) { 1998 2176 op->lock = b->c->root->level + 1; 1999 2177 return -EAGAIN; ··· 2002 2180 return -EINTR; 2003 2181 } else { 2004 2182 /* Invalidated all iterators */ 2005 - return btree_split(b, op, insert_keys, replace_key) ?: 2006 - -EINTR; 2183 + int ret = btree_split(b, op, insert_keys, replace_key); 2184 + 2185 + return bch_keylist_empty(insert_keys) ? 
2186 + 0 : ret ?: -EINTR; 2007 2187 } 2008 2188 } else { 2009 - BUG_ON(write_block(b) != b->sets[b->nsets].data); 2189 + BUG_ON(write_block(b) != btree_bset_last(b)); 2010 2190 2011 2191 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2012 2192 if (!b->level) ··· 2147 2323 struct bkey *k; 2148 2324 struct btree_iter iter; 2149 2325 2150 - bch_btree_iter_init(b, &iter, from); 2326 + bch_btree_iter_init(&b->keys, &iter, from); 2151 2327 2152 - while ((k = bch_btree_iter_next_filter(&iter, b, 2328 + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, 2153 2329 bch_ptr_bad))) { 2154 2330 ret = btree(map_nodes_recurse, k, b, 2155 2331 op, from, fn, flags); ··· 2180 2356 struct bkey *k; 2181 2357 struct btree_iter iter; 2182 2358 2183 - bch_btree_iter_init(b, &iter, from); 2359 + bch_btree_iter_init(&b->keys, &iter, from); 2184 2360 2185 - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2361 + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { 2186 2362 ret = !b->level 2187 2363 ? fn(op, b, k) 2188 2364 : btree(map_keys_recurse, k, b, op, from, fn, flags);
+14 -46
drivers/md/bcache/btree.h
··· 130 130 unsigned long flags; 131 131 uint16_t written; /* would be nice to kill */ 132 132 uint8_t level; 133 - uint8_t nsets; 134 - uint8_t page_order; 135 133 136 - /* 137 - * Set of sorted keys - the real btree node - plus a binary search tree 138 - * 139 - * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point 140 - * to the memory we have allocated for this btree node. Additionally, 141 - * set[0]->data points to the entire btree node as it exists on disk. 142 - */ 143 - struct bset_tree sets[MAX_BSETS]; 134 + struct btree_keys keys; 144 135 145 136 /* For outstanding btree writes, used as a lock - protects write_idx */ 146 - struct closure_with_waitlist io; 137 + struct closure io; 138 + struct semaphore io_mutex; 147 139 148 140 struct list_head list; 149 141 struct delayed_work work; ··· 171 179 return b->writes + (btree_node_write_idx(b) ^ 1); 172 180 } 173 181 174 - static inline unsigned bset_offset(struct btree *b, struct bset *i) 182 + static inline struct bset *btree_bset_first(struct btree *b) 175 183 { 176 - return (((size_t) i) - ((size_t) b->sets->data)) >> 9; 184 + return b->keys.set->data; 177 185 } 178 186 179 - static inline struct bset *write_block(struct btree *b) 187 + static inline struct bset *btree_bset_last(struct btree *b) 180 188 { 181 - return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); 189 + return bset_tree_last(&b->keys)->data; 182 190 } 183 191 184 - static inline bool bset_written(struct btree *b, struct bset_tree *t) 192 + static inline unsigned bset_block_offset(struct btree *b, struct bset *i) 185 193 { 186 - return t->data < write_block(b); 187 - } 188 - 189 - static inline bool bkey_written(struct btree *b, struct bkey *k) 190 - { 191 - return k < write_block(b)->start; 194 + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; 192 195 } 193 196 194 197 static inline void set_gc_sectors(struct cache_set *c) 195 198 { 196 199 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * 
c->nbuckets / 16); 197 - } 198 - 199 - static inline struct bkey *bch_btree_iter_init(struct btree *b, 200 - struct btree_iter *iter, 201 - struct bkey *search) 202 - { 203 - return __bch_btree_iter_init(b, iter, search, b->sets); 204 - } 205 - 206 - static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) 207 - { 208 - if (b->level) 209 - return bch_btree_ptr_invalid(b->c, k); 210 - else 211 - return bch_extent_ptr_invalid(b->c, k); 212 200 } 213 201 214 202 void bkey_put(struct cache_set *c, struct bkey *k); ··· 201 229 iter++) \ 202 230 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 203 231 204 - #define for_each_key_filter(b, k, iter, filter) \ 205 - for (bch_btree_iter_init((b), (iter), NULL); \ 206 - ((k) = bch_btree_iter_next_filter((iter), b, filter));) 207 - 208 - #define for_each_key(b, k, iter) \ 209 - for (bch_btree_iter_init((b), (iter), NULL); \ 210 - ((k) = bch_btree_iter_next(iter));) 211 - 212 232 /* Recursing down the btree */ 213 233 214 234 struct btree_op { 235 + /* for waiting on btree reserve in btree_split() */ 236 + wait_queue_t wait; 237 + 215 238 /* Btree level at which we start taking write locks */ 216 239 short lock; 217 240 ··· 216 249 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 217 250 { 218 251 memset(op, 0, sizeof(struct btree_op)); 252 + init_wait(&op->wait); 219 253 op->lock = write_lock_level; 220 254 } 221 255 ··· 235 267 (w ? up_write : up_read)(&b->lock); 236 268 } 237 269 238 - void bch_btree_node_read(struct btree *); 270 + void bch_btree_node_read_done(struct btree *); 239 271 void bch_btree_node_write(struct btree *, struct closure *); 240 272 241 273 void bch_btree_set_root(struct btree *);
+18 -72
drivers/md/bcache/closure.c
··· 11 11 12 12 #include "closure.h" 13 13 14 - #define CL_FIELD(type, field) \ 15 - case TYPE_ ## type: \ 16 - return &container_of(cl, struct type, cl)->field 17 - 18 - static struct closure_waitlist *closure_waitlist(struct closure *cl) 19 - { 20 - switch (cl->type) { 21 - CL_FIELD(closure_with_waitlist, wait); 22 - default: 23 - return NULL; 24 - } 25 - } 26 - 27 14 static inline void closure_put_after_sub(struct closure *cl, int flags) 28 15 { 29 16 int r = flags & CLOSURE_REMAINING_MASK; ··· 29 42 closure_queue(cl); 30 43 } else { 31 44 struct closure *parent = cl->parent; 32 - struct closure_waitlist *wait = closure_waitlist(cl); 33 45 closure_fn *destructor = cl->fn; 34 46 35 47 closure_debug_destroy(cl); 36 - 37 - smp_mb(); 38 - atomic_set(&cl->remaining, -1); 39 - 40 - if (wait) 41 - closure_wake_up(wait); 42 48 43 49 if (destructor) 44 50 destructor(cl); ··· 49 69 } 50 70 EXPORT_SYMBOL(closure_sub); 51 71 72 + /** 73 + * closure_put - decrement a closure's refcount 74 + */ 52 75 void closure_put(struct closure *cl) 53 76 { 54 77 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 55 78 } 56 79 EXPORT_SYMBOL(closure_put); 57 80 58 - static void set_waiting(struct closure *cl, unsigned long f) 59 - { 60 - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 61 - cl->waiting_on = f; 62 - #endif 63 - } 64 - 81 + /** 82 + * closure_wake_up - wake up all closures on a wait list, without memory barrier 83 + */ 65 84 void __closure_wake_up(struct closure_waitlist *wait_list) 66 85 { 67 86 struct llist_node *list; ··· 85 106 cl = container_of(reverse, struct closure, list); 86 107 reverse = llist_next(reverse); 87 108 88 - set_waiting(cl, 0); 109 + closure_set_waiting(cl, 0); 89 110 closure_sub(cl, CLOSURE_WAITING + 1); 90 111 } 91 112 } 92 113 EXPORT_SYMBOL(__closure_wake_up); 93 114 94 - bool closure_wait(struct closure_waitlist *list, struct closure *cl) 115 + /** 116 + * closure_wait - add a closure to a waitlist 117 + * 118 + * @waitlist will own a ref on @cl, 
which will be released when 119 + * closure_wake_up() is called on @waitlist. 120 + * 121 + */ 122 + bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) 95 123 { 96 124 if (atomic_read(&cl->remaining) & CLOSURE_WAITING) 97 125 return false; 98 126 99 - set_waiting(cl, _RET_IP_); 127 + closure_set_waiting(cl, _RET_IP_); 100 128 atomic_add(CLOSURE_WAITING + 1, &cl->remaining); 101 - llist_add(&cl->list, &list->list); 129 + llist_add(&cl->list, &waitlist->list); 102 130 103 131 return true; 104 132 } 105 133 EXPORT_SYMBOL(closure_wait); 106 134 107 135 /** 108 - * closure_sync() - sleep until a closure a closure has nothing left to wait on 136 + * closure_sync - sleep until a closure a closure has nothing left to wait on 109 137 * 110 138 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 111 139 * the last refcount. ··· 133 147 __closure_end_sleep(cl); 134 148 } 135 149 EXPORT_SYMBOL(closure_sync); 136 - 137 - /** 138 - * closure_trylock() - try to acquire the closure, without waiting 139 - * @cl: closure to lock 140 - * 141 - * Returns true if the closure was succesfully locked. 
142 - */ 143 - bool closure_trylock(struct closure *cl, struct closure *parent) 144 - { 145 - if (atomic_cmpxchg(&cl->remaining, -1, 146 - CLOSURE_REMAINING_INITIALIZER) != -1) 147 - return false; 148 - 149 - smp_mb(); 150 - 151 - cl->parent = parent; 152 - if (parent) 153 - closure_get(parent); 154 - 155 - closure_set_ret_ip(cl); 156 - closure_debug_create(cl); 157 - return true; 158 - } 159 - EXPORT_SYMBOL(closure_trylock); 160 - 161 - void __closure_lock(struct closure *cl, struct closure *parent, 162 - struct closure_waitlist *wait_list) 163 - { 164 - struct closure wait; 165 - closure_init_stack(&wait); 166 - 167 - while (1) { 168 - if (closure_trylock(cl, parent)) 169 - return; 170 - 171 - closure_wait_event(wait_list, &wait, 172 - atomic_read(&cl->remaining) == -1); 173 - } 174 - } 175 - EXPORT_SYMBOL(__closure_lock); 176 150 177 151 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 178 152
+110 -257
drivers/md/bcache/closure.h
··· 72 72 * closure - _always_ use continue_at(). Doing so consistently will help 73 73 * eliminate an entire class of particularly pernicious races. 74 74 * 75 - * For a closure to wait on an arbitrary event, we need to introduce waitlists: 76 - * 77 - * struct closure_waitlist list; 78 - * closure_wait_event(list, cl, condition); 79 - * closure_wake_up(wait_list); 80 - * 81 - * These work analagously to wait_event() and wake_up() - except that instead of 82 - * operating on the current thread (for wait_event()) and lists of threads, they 83 - * operate on an explicit closure and lists of closures. 84 - * 85 - * Because it's a closure we can now wait either synchronously or 86 - * asynchronously. closure_wait_event() returns the current value of the 87 - * condition, and if it returned false continue_at() or closure_sync() can be 88 - * used to wait for it to become true. 89 - * 90 - * It's useful for waiting on things when you can't sleep in the context in 91 - * which you must check the condition (perhaps a spinlock held, or you might be 92 - * beneath generic_make_request() - in which case you can't sleep on IO). 93 - * 94 - * closure_wait_event() will wait either synchronously or asynchronously, 95 - * depending on whether the closure is in blocking mode or not. You can pick a 96 - * mode explicitly with closure_wait_event_sync() and 97 - * closure_wait_event_async(), which do just what you might expect. 98 - * 99 75 * Lastly, you might have a wait list dedicated to a specific event, and have no 100 76 * need for specifying the condition - you just want to wait until someone runs 101 77 * closure_wake_up() on the appropriate wait list. In that case, just use ··· 97 121 * All this implies that a closure should typically be embedded in a particular 98 122 * struct (which its refcount will normally control the lifetime of), and that 99 123 * struct can very much be thought of as a stack frame. 
100 - * 101 - * Locking: 102 - * 103 - * Closures are based on work items but they can be thought of as more like 104 - * threads - in that like threads and unlike work items they have a well 105 - * defined lifetime; they are created (with closure_init()) and eventually 106 - * complete after a continue_at(cl, NULL, NULL). 107 - * 108 - * Suppose you've got some larger structure with a closure embedded in it that's 109 - * used for periodically doing garbage collection. You only want one garbage 110 - * collection happening at a time, so the natural thing to do is protect it with 111 - * a lock. However, it's difficult to use a lock protecting a closure correctly 112 - * because the unlock should come after the last continue_to() (additionally, if 113 - * you're using the closure asynchronously a mutex won't work since a mutex has 114 - * to be unlocked by the same process that locked it). 115 - * 116 - * So to make it less error prone and more efficient, we also have the ability 117 - * to use closures as locks: 118 - * 119 - * closure_init_unlocked(); 120 - * closure_trylock(); 121 - * 122 - * That's all we need for trylock() - the last closure_put() implicitly unlocks 123 - * it for you. But for closure_lock(), we also need a wait list: 124 - * 125 - * struct closure_with_waitlist frobnicator_cl; 126 - * 127 - * closure_init_unlocked(&frobnicator_cl); 128 - * closure_lock(&frobnicator_cl); 129 - * 130 - * A closure_with_waitlist embeds a closure and a wait list - much like struct 131 - * delayed_work embeds a work item and a timer_list. The important thing is, use 132 - * it exactly like you would a regular closure and closure_put() will magically 133 - * handle everything for you. 
134 124 */ 135 125 136 126 struct closure; ··· 104 162 105 163 struct closure_waitlist { 106 164 struct llist_head list; 107 - }; 108 - 109 - enum closure_type { 110 - TYPE_closure = 0, 111 - TYPE_closure_with_waitlist = 1, 112 - MAX_CLOSURE_TYPE = 1, 113 165 }; 114 166 115 167 enum closure_state { ··· 160 224 161 225 atomic_t remaining; 162 226 163 - enum closure_type type; 164 - 165 227 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 166 228 #define CLOSURE_MAGIC_DEAD 0xc054dead 167 229 #define CLOSURE_MAGIC_ALIVE 0xc054a11e ··· 171 237 #endif 172 238 }; 173 239 174 - struct closure_with_waitlist { 175 - struct closure cl; 176 - struct closure_waitlist wait; 177 - }; 178 - 179 - extern unsigned invalid_closure_type(void); 180 - 181 - #define __CLOSURE_TYPE(cl, _t) \ 182 - __builtin_types_compatible_p(typeof(cl), struct _t) \ 183 - ? TYPE_ ## _t : \ 184 - 185 - #define __closure_type(cl) \ 186 - ( \ 187 - __CLOSURE_TYPE(cl, closure) \ 188 - __CLOSURE_TYPE(cl, closure_with_waitlist) \ 189 - invalid_closure_type() \ 190 - ) 191 - 192 240 void closure_sub(struct closure *cl, int v); 193 241 void closure_put(struct closure *cl); 194 242 void __closure_wake_up(struct closure_waitlist *list); 195 243 bool closure_wait(struct closure_waitlist *list, struct closure *cl); 196 244 void closure_sync(struct closure *cl); 197 - 198 - bool closure_trylock(struct closure *cl, struct closure *parent); 199 - void __closure_lock(struct closure *cl, struct closure *parent, 200 - struct closure_waitlist *wait_list); 201 245 202 246 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 203 247 ··· 205 293 #endif 206 294 } 207 295 208 - static inline void closure_get(struct closure *cl) 296 + static inline void closure_set_waiting(struct closure *cl, unsigned long f) 209 297 { 210 298 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 211 - BUG_ON((atomic_inc_return(&cl->remaining) & 212 - CLOSURE_REMAINING_MASK) <= 1); 213 - #else 214 - atomic_inc(&cl->remaining); 299 + cl->waiting_on = f; 215 300 #endif 216 301 } 217 - 218 - 
static inline void closure_set_stopped(struct closure *cl) 219 - { 220 - atomic_sub(CLOSURE_RUNNING, &cl->remaining); 221 - } 222 - 223 - static inline bool closure_is_unlocked(struct closure *cl) 224 - { 225 - return atomic_read(&cl->remaining) == -1; 226 - } 227 - 228 - static inline void do_closure_init(struct closure *cl, struct closure *parent, 229 - bool running) 230 - { 231 - cl->parent = parent; 232 - if (parent) 233 - closure_get(parent); 234 - 235 - if (running) { 236 - closure_debug_create(cl); 237 - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); 238 - } else 239 - atomic_set(&cl->remaining, -1); 240 - 241 - closure_set_ip(cl); 242 - } 243 - 244 - /* 245 - * Hack to get at the embedded closure if there is one, by doing an unsafe cast: 246 - * the result of __closure_type() is thrown away, it's used merely for type 247 - * checking. 248 - */ 249 - #define __to_internal_closure(cl) \ 250 - ({ \ 251 - BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ 252 - (struct closure *) cl; \ 253 - }) 254 - 255 - #define closure_init_type(cl, parent, running) \ 256 - do { \ 257 - struct closure *_cl = __to_internal_closure(cl); \ 258 - _cl->type = __closure_type(*(cl)); \ 259 - do_closure_init(_cl, parent, running); \ 260 - } while (0) 261 - 262 - /** 263 - * __closure_init() - Initialize a closure, skipping the memset() 264 - * 265 - * May be used instead of closure_init() when memory has already been zeroed. 266 - */ 267 - #define __closure_init(cl, parent) \ 268 - closure_init_type(cl, parent, true) 269 - 270 - /** 271 - * closure_init() - Initialize a closure, setting the refcount to 1 272 - * @cl: closure to initialize 273 - * @parent: parent of the new closure. cl will take a refcount on it for its 274 - * lifetime; may be NULL. 
275 - */ 276 - #define closure_init(cl, parent) \ 277 - do { \ 278 - memset((cl), 0, sizeof(*(cl))); \ 279 - __closure_init(cl, parent); \ 280 - } while (0) 281 - 282 - static inline void closure_init_stack(struct closure *cl) 283 - { 284 - memset(cl, 0, sizeof(struct closure)); 285 - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 286 - } 287 - 288 - /** 289 - * closure_init_unlocked() - Initialize a closure but leave it unlocked. 290 - * @cl: closure to initialize 291 - * 292 - * For when the closure will be used as a lock. The closure may not be used 293 - * until after a closure_lock() or closure_trylock(). 294 - */ 295 - #define closure_init_unlocked(cl) \ 296 - do { \ 297 - memset((cl), 0, sizeof(*(cl))); \ 298 - closure_init_type(cl, NULL, false); \ 299 - } while (0) 300 - 301 - /** 302 - * closure_lock() - lock and initialize a closure. 303 - * @cl: the closure to lock 304 - * @parent: the new parent for this closure 305 - * 306 - * The closure must be of one of the types that has a waitlist (otherwise we 307 - * wouldn't be able to sleep on contention). 308 - * 309 - * @parent has exactly the same meaning as in closure_init(); if non null, the 310 - * closure will take a reference on @parent which will be released when it is 311 - * unlocked. 312 - */ 313 - #define closure_lock(cl, parent) \ 314 - __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) 315 302 316 303 static inline void __closure_end_sleep(struct closure *cl) 317 304 { ··· 230 419 atomic_add(CLOSURE_SLEEPING, &cl->remaining); 231 420 } 232 421 233 - /** 234 - * closure_wake_up() - wake up all closures on a wait list. 235 - */ 236 - static inline void closure_wake_up(struct closure_waitlist *list) 422 + static inline void closure_set_stopped(struct closure *cl) 237 423 { 238 - smp_mb(); 239 - __closure_wake_up(list); 240 - } 241 - 242 - /* 243 - * Wait on an event, synchronously or asynchronously - analogous to wait_event() 244 - * but for closures. 
245 - * 246 - * The loop is oddly structured so as to avoid a race; we must check the 247 - * condition again after we've added ourself to the waitlist. We know if we were 248 - * already on the waitlist because closure_wait() returns false; thus, we only 249 - * schedule or break if closure_wait() returns false. If it returns true, we 250 - * just loop again - rechecking the condition. 251 - * 252 - * The __closure_wake_up() is necessary because we may race with the event 253 - * becoming true; i.e. we see event false -> wait -> recheck condition, but the 254 - * thread that made the event true may have called closure_wake_up() before we 255 - * added ourself to the wait list. 256 - * 257 - * We have to call closure_sync() at the end instead of just 258 - * __closure_end_sleep() because a different thread might've called 259 - * closure_wake_up() before us and gotten preempted before they dropped the 260 - * refcount on our closure. If this was a stack allocated closure, that would be 261 - * bad. 
262 - */ 263 - #define closure_wait_event(list, cl, condition) \ 264 - ({ \ 265 - typeof(condition) ret; \ 266 - \ 267 - while (1) { \ 268 - ret = (condition); \ 269 - if (ret) { \ 270 - __closure_wake_up(list); \ 271 - closure_sync(cl); \ 272 - break; \ 273 - } \ 274 - \ 275 - __closure_start_sleep(cl); \ 276 - \ 277 - if (!closure_wait(list, cl)) \ 278 - schedule(); \ 279 - } \ 280 - \ 281 - ret; \ 282 - }) 283 - 284 - static inline void closure_queue(struct closure *cl) 285 - { 286 - struct workqueue_struct *wq = cl->wq; 287 - if (wq) { 288 - INIT_WORK(&cl->work, cl->work.func); 289 - BUG_ON(!queue_work(wq, &cl->work)); 290 - } else 291 - cl->fn(cl); 424 + atomic_sub(CLOSURE_RUNNING, &cl->remaining); 292 425 } 293 426 294 427 static inline void set_closure_fn(struct closure *cl, closure_fn *fn, ··· 246 491 smp_mb__before_atomic_dec(); 247 492 } 248 493 494 + static inline void closure_queue(struct closure *cl) 495 + { 496 + struct workqueue_struct *wq = cl->wq; 497 + if (wq) { 498 + INIT_WORK(&cl->work, cl->work.func); 499 + BUG_ON(!queue_work(wq, &cl->work)); 500 + } else 501 + cl->fn(cl); 502 + } 503 + 504 + /** 505 + * closure_get - increment a closure's refcount 506 + */ 507 + static inline void closure_get(struct closure *cl) 508 + { 509 + #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 510 + BUG_ON((atomic_inc_return(&cl->remaining) & 511 + CLOSURE_REMAINING_MASK) <= 1); 512 + #else 513 + atomic_inc(&cl->remaining); 514 + #endif 515 + } 516 + 517 + /** 518 + * closure_init - Initialize a closure, setting the refcount to 1 519 + * @cl: closure to initialize 520 + * @parent: parent of the new closure. cl will take a refcount on it for its 521 + * lifetime; may be NULL. 
522 + */ 523 + static inline void closure_init(struct closure *cl, struct closure *parent) 524 + { 525 + memset(cl, 0, sizeof(struct closure)); 526 + cl->parent = parent; 527 + if (parent) 528 + closure_get(parent); 529 + 530 + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); 531 + 532 + closure_debug_create(cl); 533 + closure_set_ip(cl); 534 + } 535 + 536 + static inline void closure_init_stack(struct closure *cl) 537 + { 538 + memset(cl, 0, sizeof(struct closure)); 539 + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 540 + } 541 + 542 + /** 543 + * closure_wake_up - wake up all closures on a wait list. 544 + */ 545 + static inline void closure_wake_up(struct closure_waitlist *list) 546 + { 547 + smp_mb(); 548 + __closure_wake_up(list); 549 + } 550 + 551 + /** 552 + * continue_at - jump to another function with barrier 553 + * 554 + * After @cl is no longer waiting on anything (i.e. all outstanding refs have 555 + * been dropped with closure_put()), it will resume execution at @fn running out 556 + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 557 + * 558 + * NOTE: This macro expands to a return in the calling function! 559 + * 560 + * This is because after calling continue_at() you no longer have a ref on @cl, 561 + * and whatever @cl owns may be freed out from under you - a running closure fn 562 + * has a ref on its own closure which continue_at() drops. 563 + */ 249 564 #define continue_at(_cl, _fn, _wq) \ 250 565 do { \ 251 566 set_closure_fn(_cl, _fn, _wq); \ ··· 323 498 return; \ 324 499 } while (0) 325 500 501 + /** 502 + * closure_return - finish execution of a closure 503 + * 504 + * This is used to indicate that @cl is finished: when all outstanding refs on 505 + * @cl have been dropped @cl's ref on its parent closure (as passed to 506 + * closure_init()) will be dropped, if one was specified - thus this can be 507 + * thought of as returning to the parent closure. 
508 + */ 326 509 #define closure_return(_cl) continue_at((_cl), NULL, NULL) 327 510 511 + /** 512 + * continue_at_nobarrier - jump to another function without barrier 513 + * 514 + * Causes @fn to be executed out of @cl, in @wq context (or called directly if 515 + * @wq is NULL). 516 + * 517 + * NOTE: like continue_at(), this macro expands to a return in the caller! 518 + * 519 + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, 520 + * thus it's not safe to touch anything protected by @cl after a 521 + * continue_at_nobarrier(). 522 + */ 328 523 #define continue_at_nobarrier(_cl, _fn, _wq) \ 329 524 do { \ 330 525 set_closure_fn(_cl, _fn, _wq); \ ··· 352 507 return; \ 353 508 } while (0) 354 509 510 + /** 511 + * closure_return - finish execution of a closure, with destructor 512 + * 513 + * Works like closure_return(), except @destructor will be called when all 514 + * outstanding refs on @cl have been dropped; @destructor may be used to safely 515 + * free the memory occupied by @cl, and it is called with the ref on the parent 516 + * closure still held - so @destructor could safely return an item to a 517 + * freelist protected by @cl's parent. 518 + */ 355 519 #define closure_return_with_destructor(_cl, _destructor) \ 356 520 do { \ 357 521 set_closure_fn(_cl, _destructor, NULL); \ ··· 368 514 return; \ 369 515 } while (0) 370 516 517 + /** 518 + * closure_call - execute @fn out of a new, uninitialized closure 519 + * 520 + * Typically used when running out of one closure, and we want to run @fn 521 + * asynchronously out of a new closure - @parent will then wait for @cl to 522 + * finish. 
523 + */ 371 524 static inline void closure_call(struct closure *cl, closure_fn fn, 372 525 struct workqueue_struct *wq, 373 526 struct closure *parent) 374 527 { 375 528 closure_init(cl, parent); 376 529 continue_at_nobarrier(cl, fn, wq); 377 - } 378 - 379 - static inline void closure_trylock_call(struct closure *cl, closure_fn fn, 380 - struct workqueue_struct *wq, 381 - struct closure *parent) 382 - { 383 - if (closure_trylock(cl, parent)) 384 - continue_at_nobarrier(cl, fn, wq); 385 530 } 386 531 387 532 #endif /* _LINUX_CLOSURE_H */
+56 -191
drivers/md/bcache/debug.c
··· 8 8 #include "bcache.h" 9 9 #include "btree.h" 10 10 #include "debug.h" 11 + #include "extents.h" 11 12 12 13 #include <linux/console.h> 13 14 #include <linux/debugfs.h> ··· 18 17 19 18 static struct dentry *debug; 20 19 21 - const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) 22 - { 23 - unsigned i; 24 - 25 - for (i = 0; i < KEY_PTRS(k); i++) 26 - if (ptr_available(c, k, i)) { 27 - struct cache *ca = PTR_CACHE(c, k, i); 28 - size_t bucket = PTR_BUCKET_NR(c, k, i); 29 - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 30 - 31 - if (KEY_SIZE(k) + r > c->sb.bucket_size) 32 - return "bad, length too big"; 33 - if (bucket < ca->sb.first_bucket) 34 - return "bad, short offset"; 35 - if (bucket >= ca->sb.nbuckets) 36 - return "bad, offset past end of device"; 37 - if (ptr_stale(c, k, i)) 38 - return "stale"; 39 - } 40 - 41 - if (!bkey_cmp(k, &ZERO_KEY)) 42 - return "bad, null key"; 43 - if (!KEY_PTRS(k)) 44 - return "bad, no pointers"; 45 - if (!KEY_SIZE(k)) 46 - return "zeroed key"; 47 - return ""; 48 - } 49 - 50 - int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) 51 - { 52 - unsigned i = 0; 53 - char *out = buf, *end = buf + size; 54 - 55 - #define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) 56 - 57 - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); 58 - 59 - if (KEY_PTRS(k)) 60 - while (1) { 61 - p("%llu:%llu gen %llu", 62 - PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); 63 - 64 - if (++i == KEY_PTRS(k)) 65 - break; 66 - 67 - p(", "); 68 - } 69 - 70 - p("]"); 71 - 72 - if (KEY_DIRTY(k)) 73 - p(" dirty"); 74 - if (KEY_CSUM(k)) 75 - p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 76 - #undef p 77 - return out - buf; 78 - } 79 - 80 20 #ifdef CONFIG_BCACHE_DEBUG 81 21 82 - static void dump_bset(struct btree *b, struct bset *i) 83 - { 84 - struct bkey *k, *next; 85 - unsigned j; 86 - char buf[80]; 22 + #define for_each_written_bset(b, start, i) \ 23 + for (i = (start); \ 24 + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ 25 + i->seq == (start)->seq; \ 26 + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ 27 + block_bytes(b->c)) 87 28 88 - for (k = i->start; k < end(i); k = next) { 89 - next = bkey_next(k); 90 - 91 - bch_bkey_to_text(buf, sizeof(buf), k); 92 - printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 93 - (uint64_t *) k - i->d, i->keys, buf); 94 - 95 - for (j = 0; j < KEY_PTRS(k); j++) { 96 - size_t n = PTR_BUCKET_NR(b->c, k, j); 97 - printk(" bucket %zu", n); 98 - 99 - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) 100 - printk(" prio %i", 101 - PTR_BUCKET(b->c, k, j)->prio); 102 - } 103 - 104 - printk(" %s\n", bch_ptr_status(b->c, k)); 105 - 106 - if (next < end(i) && 107 - bkey_cmp(k, !b->level ? 
&START_KEY(next) : next) > 0) 108 - printk(KERN_ERR "Key skipped backwards\n"); 109 - } 110 - } 111 - 112 - static void bch_dump_bucket(struct btree *b) 113 - { 114 - unsigned i; 115 - 116 - console_lock(); 117 - for (i = 0; i <= b->nsets; i++) 118 - dump_bset(b, b->sets[i].data); 119 - console_unlock(); 120 - } 121 - 122 - void bch_btree_verify(struct btree *b, struct bset *new) 29 + void bch_btree_verify(struct btree *b) 123 30 { 124 31 struct btree *v = b->c->verify_data; 125 - struct closure cl; 126 - closure_init_stack(&cl); 32 + struct bset *ondisk, *sorted, *inmemory; 33 + struct bio *bio; 127 34 128 - if (!b->c->verify) 35 + if (!b->c->verify || !b->c->verify_ondisk) 129 36 return; 130 37 131 - closure_wait_event(&b->io.wait, &cl, 132 - atomic_read(&b->io.cl.remaining) == -1); 133 - 38 + down(&b->io_mutex); 134 39 mutex_lock(&b->c->verify_lock); 40 + 41 + ondisk = b->c->verify_ondisk; 42 + sorted = b->c->verify_data->keys.set->data; 43 + inmemory = b->keys.set->data; 135 44 136 45 bkey_copy(&v->key, &b->key); 137 46 v->written = 0; 138 47 v->level = b->level; 48 + v->keys.ops = b->keys.ops; 139 49 140 - bch_btree_node_read(v); 141 - closure_wait_event(&v->io.wait, &cl, 142 - atomic_read(&b->io.cl.remaining) == -1); 50 + bio = bch_bbio_alloc(b->c); 51 + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; 52 + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); 53 + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; 54 + bch_bio_map(bio, sorted); 143 55 144 - if (new->keys != v->sets[0].data->keys || 145 - memcmp(new->start, 146 - v->sets[0].data->start, 147 - (void *) end(new) - (void *) new->start)) { 148 - unsigned i, j; 56 + submit_bio_wait(REQ_META|READ_SYNC, bio); 57 + bch_bbio_free(bio, b->c); 58 + 59 + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); 60 + 61 + bch_btree_node_read_done(v); 62 + sorted = v->keys.set->data; 63 + 64 + if (inmemory->keys != sorted->keys || 65 + memcmp(inmemory->start, 66 + sorted->start, 67 + (void *) bset_bkey_last(inmemory) - 
(void *) inmemory->start)) { 68 + struct bset *i; 69 + unsigned j; 149 70 150 71 console_lock(); 151 72 152 - printk(KERN_ERR "*** original memory node:\n"); 153 - for (i = 0; i <= b->nsets; i++) 154 - dump_bset(b, b->sets[i].data); 73 + printk(KERN_ERR "*** in memory:\n"); 74 + bch_dump_bset(&b->keys, inmemory, 0); 155 75 156 - printk(KERN_ERR "*** sorted memory node:\n"); 157 - dump_bset(b, new); 76 + printk(KERN_ERR "*** read back in:\n"); 77 + bch_dump_bset(&v->keys, sorted, 0); 158 78 159 - printk(KERN_ERR "*** on disk node:\n"); 160 - dump_bset(v, v->sets[0].data); 79 + for_each_written_bset(b, ondisk, i) { 80 + unsigned block = ((void *) i - (void *) ondisk) / 81 + block_bytes(b->c); 161 82 162 - for (j = 0; j < new->keys; j++) 163 - if (new->d[j] != v->sets[0].data->d[j]) 83 + printk(KERN_ERR "*** on disk block %u:\n", block); 84 + bch_dump_bset(&b->keys, i, block); 85 + } 86 + 87 + printk(KERN_ERR "*** block %zu not written\n", 88 + ((void *) i - (void *) ondisk) / block_bytes(b->c)); 89 + 90 + for (j = 0; j < inmemory->keys; j++) 91 + if (inmemory->d[j] != sorted->d[j]) 164 92 break; 93 + 94 + printk(KERN_ERR "b->written %u\n", b->written); 165 95 166 96 console_unlock(); 167 97 panic("verify failed at %u\n", j); 168 98 } 169 99 170 100 mutex_unlock(&b->c->verify_lock); 101 + up(&b->io_mutex); 171 102 } 172 103 173 104 void bch_data_verify(struct cached_dev *dc, struct bio *bio) ··· 138 205 __free_page(bv2->bv_page); 139 206 out_put: 140 207 bio_put(check); 141 - } 142 - 143 - int __bch_count_data(struct btree *b) 144 - { 145 - unsigned ret = 0; 146 - struct btree_iter iter; 147 - struct bkey *k; 148 - 149 - if (!b->level) 150 - for_each_key(b, k, &iter) 151 - ret += KEY_SIZE(k); 152 - return ret; 153 - } 154 - 155 - void __bch_check_keys(struct btree *b, const char *fmt, ...) 
156 - { 157 - va_list args; 158 - struct bkey *k, *p = NULL; 159 - struct btree_iter iter; 160 - const char *err; 161 - 162 - for_each_key(b, k, &iter) { 163 - if (!b->level) { 164 - err = "Keys out of order"; 165 - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) 166 - goto bug; 167 - 168 - if (bch_ptr_invalid(b, k)) 169 - continue; 170 - 171 - err = "Overlapping keys"; 172 - if (p && bkey_cmp(p, &START_KEY(k)) > 0) 173 - goto bug; 174 - } else { 175 - if (bch_ptr_bad(b, k)) 176 - continue; 177 - 178 - err = "Duplicate keys"; 179 - if (p && !bkey_cmp(p, k)) 180 - goto bug; 181 - } 182 - p = k; 183 - } 184 - 185 - err = "Key larger than btree node key"; 186 - if (p && bkey_cmp(p, &b->key) > 0) 187 - goto bug; 188 - 189 - return; 190 - bug: 191 - bch_dump_bucket(b); 192 - 193 - va_start(args, fmt); 194 - vprintk(fmt, args); 195 - va_end(args); 196 - 197 - panic("bcache error: %s:\n", err); 198 - } 199 - 200 - void bch_btree_iter_next_check(struct btree_iter *iter) 201 - { 202 - struct bkey *k = iter->data->k, *next = bkey_next(k); 203 - 204 - if (next < iter->data->end && 205 - bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { 206 - bch_dump_bucket(iter->b); 207 - panic("Key skipped backwards\n"); 208 - } 209 208 } 210 209 211 210 #endif ··· 186 321 if (!w) 187 322 break; 188 323 189 - bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 324 + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); 190 325 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 191 326 bch_keybuf_del(&i->keys, w); 192 327 }
+5 -22
drivers/md/bcache/debug.h
··· 1 1 #ifndef _BCACHE_DEBUG_H 2 2 #define _BCACHE_DEBUG_H 3 3 4 - /* Btree/bkey debug printing */ 5 - 6 - int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 4 + struct bio; 5 + struct cached_dev; 6 + struct cache_set; 7 7 8 8 #ifdef CONFIG_BCACHE_DEBUG 9 9 10 - void bch_btree_verify(struct btree *, struct bset *); 10 + void bch_btree_verify(struct btree *); 11 11 void bch_data_verify(struct cached_dev *, struct bio *); 12 - int __bch_count_data(struct btree *); 13 - void __bch_check_keys(struct btree *, const char *, ...); 14 - void bch_btree_iter_next_check(struct btree_iter *); 15 12 16 - #define EBUG_ON(cond) BUG_ON(cond) 17 13 #define expensive_debug_checks(c) ((c)->expensive_debug_checks) 18 14 #define key_merging_disabled(c) ((c)->key_merging_disabled) 19 15 #define bypass_torture_test(d) ((d)->bypass_torture_test) 20 16 21 17 #else /* DEBUG */ 22 18 23 - static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 19 + static inline void bch_btree_verify(struct btree *b) {} 24 20 static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 25 - static inline int __bch_count_data(struct btree *b) { return -1; } 26 - static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} 27 - static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} 28 21 29 - #define EBUG_ON(cond) do { if (cond); } while (0) 30 22 #define expensive_debug_checks(c) 0 31 23 #define key_merging_disabled(c) 0 32 24 #define bypass_torture_test(d) 0 33 25 34 26 #endif 35 - 36 - #define bch_count_data(b) \ 37 - (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) 38 - 39 - #define bch_check_keys(b, ...) \ 40 - do { \ 41 - if (expensive_debug_checks((b)->c)) \ 42 - __bch_check_keys(b, __VA_ARGS__); \ 43 - } while (0) 44 27 45 28 #ifdef CONFIG_DEBUG_FS 46 29 void bch_debug_init_cache_set(struct cache_set *);
+616
drivers/md/bcache/extents.c
··· 1 + /* 2 + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 3 + * 4 + * Uses a block device as cache for other block devices; optimized for SSDs. 5 + * All allocation is done in buckets, which should match the erase block size 6 + * of the device. 7 + * 8 + * Buckets containing cached data are kept on a heap sorted by priority; 9 + * bucket priority is increased on cache hit, and periodically all the buckets 10 + * on the heap have their priority scaled down. This currently is just used as 11 + * an LRU but in the future should allow for more intelligent heuristics. 12 + * 13 + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 14 + * counter. Garbage collection is used to remove stale pointers. 15 + * 16 + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 17 + * as keys are inserted we only sort the pages that have not yet been written. 18 + * When garbage collection is run, we resort the entire node. 19 + * 20 + * All configuration is done via sysfs; see Documentation/bcache.txt. 21 + */ 22 + 23 + #include "bcache.h" 24 + #include "btree.h" 25 + #include "debug.h" 26 + #include "extents.h" 27 + #include "writeback.h" 28 + 29 + static void sort_key_next(struct btree_iter *iter, 30 + struct btree_iter_set *i) 31 + { 32 + i->k = bkey_next(i->k); 33 + 34 + if (i->k == i->end) 35 + *i = iter->data[--iter->used]; 36 + } 37 + 38 + static bool bch_key_sort_cmp(struct btree_iter_set l, 39 + struct btree_iter_set r) 40 + { 41 + int64_t c = bkey_cmp(l.k, r.k); 42 + 43 + return c ? 
c > 0 : l.k < r.k; 44 + } 45 + 46 + static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) 47 + { 48 + unsigned i; 49 + 50 + for (i = 0; i < KEY_PTRS(k); i++) 51 + if (ptr_available(c, k, i)) { 52 + struct cache *ca = PTR_CACHE(c, k, i); 53 + size_t bucket = PTR_BUCKET_NR(c, k, i); 54 + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 55 + 56 + if (KEY_SIZE(k) + r > c->sb.bucket_size || 57 + bucket < ca->sb.first_bucket || 58 + bucket >= ca->sb.nbuckets) 59 + return true; 60 + } 61 + 62 + return false; 63 + } 64 + 65 + /* Common among btree and extent ptrs */ 66 + 67 + static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) 68 + { 69 + unsigned i; 70 + 71 + for (i = 0; i < KEY_PTRS(k); i++) 72 + if (ptr_available(c, k, i)) { 73 + struct cache *ca = PTR_CACHE(c, k, i); 74 + size_t bucket = PTR_BUCKET_NR(c, k, i); 75 + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 76 + 77 + if (KEY_SIZE(k) + r > c->sb.bucket_size) 78 + return "bad, length too big"; 79 + if (bucket < ca->sb.first_bucket) 80 + return "bad, short offset"; 81 + if (bucket >= ca->sb.nbuckets) 82 + return "bad, offset past end of device"; 83 + if (ptr_stale(c, k, i)) 84 + return "stale"; 85 + } 86 + 87 + if (!bkey_cmp(k, &ZERO_KEY)) 88 + return "bad, null key"; 89 + if (!KEY_PTRS(k)) 90 + return "bad, no pointers"; 91 + if (!KEY_SIZE(k)) 92 + return "zeroed key"; 93 + return ""; 94 + } 95 + 96 + void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) 97 + { 98 + unsigned i = 0; 99 + char *out = buf, *end = buf + size; 100 + 101 + #define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) 102 + 103 + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); 104 + 105 + for (i = 0; i < KEY_PTRS(k); i++) { 106 + if (i) 107 + p(", "); 108 + 109 + if (PTR_DEV(k, i) == PTR_CHECK_DEV) 110 + p("check dev"); 111 + else 112 + p("%llu:%llu gen %llu", PTR_DEV(k, i), 113 + PTR_OFFSET(k, i), PTR_GEN(k, i)); 114 + } 115 + 116 + p("]"); 117 + 118 + if (KEY_DIRTY(k)) 119 + p(" dirty"); 120 + if (KEY_CSUM(k)) 121 + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 122 + #undef p 123 + } 124 + 125 + static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) 126 + { 127 + struct btree *b = container_of(keys, struct btree, keys); 128 + unsigned j; 129 + char buf[80]; 130 + 131 + bch_extent_to_text(buf, sizeof(buf), k); 132 + printk(" %s", buf); 133 + 134 + for (j = 0; j < KEY_PTRS(k); j++) { 135 + size_t n = PTR_BUCKET_NR(b->c, k, j); 136 + printk(" bucket %zu", n); 137 + 138 + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) 139 + printk(" prio %i", 140 + PTR_BUCKET(b->c, k, j)->prio); 141 + } 142 + 143 + printk(" %s\n", bch_ptr_status(b->c, k)); 144 + } 145 + 146 + /* Btree ptrs */ 147 + 148 + bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) 149 + { 150 + char buf[80]; 151 + 152 + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) 153 + goto bad; 154 + 155 + if (__ptr_invalid(c, k)) 156 + goto bad; 157 + 158 + return false; 159 + bad: 160 + bch_extent_to_text(buf, sizeof(buf), k); 161 + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); 162 + return true; 163 + } 164 + 165 + static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) 166 + { 167 + struct btree *b = container_of(bk, struct btree, keys); 168 + return __bch_btree_ptr_invalid(b->c, k); 169 + } 170 + 171 + static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) 172 + { 173 + unsigned i; 174 + char buf[80]; 175 + struct bucket *g; 176 + 177 + if 
(mutex_trylock(&b->c->bucket_lock)) { 178 + for (i = 0; i < KEY_PTRS(k); i++) 179 + if (ptr_available(b->c, k, i)) { 180 + g = PTR_BUCKET(b->c, k, i); 181 + 182 + if (KEY_DIRTY(k) || 183 + g->prio != BTREE_PRIO || 184 + (b->c->gc_mark_valid && 185 + GC_MARK(g) != GC_MARK_METADATA)) 186 + goto err; 187 + } 188 + 189 + mutex_unlock(&b->c->bucket_lock); 190 + } 191 + 192 + return false; 193 + err: 194 + mutex_unlock(&b->c->bucket_lock); 195 + bch_extent_to_text(buf, sizeof(buf), k); 196 + btree_bug(b, 197 + "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 198 + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 199 + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 200 + return true; 201 + } 202 + 203 + static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) 204 + { 205 + struct btree *b = container_of(bk, struct btree, keys); 206 + unsigned i; 207 + 208 + if (!bkey_cmp(k, &ZERO_KEY) || 209 + !KEY_PTRS(k) || 210 + bch_ptr_invalid(bk, k)) 211 + return true; 212 + 213 + for (i = 0; i < KEY_PTRS(k); i++) 214 + if (!ptr_available(b->c, k, i) || 215 + ptr_stale(b->c, k, i)) 216 + return true; 217 + 218 + if (expensive_debug_checks(b->c) && 219 + btree_ptr_bad_expensive(b, k)) 220 + return true; 221 + 222 + return false; 223 + } 224 + 225 + static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, 226 + struct bkey *insert, 227 + struct btree_iter *iter, 228 + struct bkey *replace_key) 229 + { 230 + struct btree *b = container_of(bk, struct btree, keys); 231 + 232 + if (!KEY_OFFSET(insert)) 233 + btree_current_write(b)->prio_blocked++; 234 + 235 + return false; 236 + } 237 + 238 + const struct btree_keys_ops bch_btree_keys_ops = { 239 + .sort_cmp = bch_key_sort_cmp, 240 + .insert_fixup = bch_btree_ptr_insert_fixup, 241 + .key_invalid = bch_btree_ptr_invalid, 242 + .key_bad = bch_btree_ptr_bad, 243 + .key_to_text = bch_extent_to_text, 244 + .key_dump = bch_bkey_dump, 245 + }; 246 + 247 + /* 
Extents */ 248 + 249 + /* 250 + * Returns true if l > r - unless l == r, in which case returns true if l is 251 + * older than r. 252 + * 253 + * Necessary for btree_sort_fixup() - if there are multiple keys that compare 254 + * equal in different sets, we have to process them newest to oldest. 255 + */ 256 + static bool bch_extent_sort_cmp(struct btree_iter_set l, 257 + struct btree_iter_set r) 258 + { 259 + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 260 + 261 + return c ? c > 0 : l.k < r.k; 262 + } 263 + 264 + static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, 265 + struct bkey *tmp) 266 + { 267 + while (iter->used > 1) { 268 + struct btree_iter_set *top = iter->data, *i = top + 1; 269 + 270 + if (iter->used > 2 && 271 + bch_extent_sort_cmp(i[0], i[1])) 272 + i++; 273 + 274 + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) 275 + break; 276 + 277 + if (!KEY_SIZE(i->k)) { 278 + sort_key_next(iter, i); 279 + heap_sift(iter, i - top, bch_extent_sort_cmp); 280 + continue; 281 + } 282 + 283 + if (top->k > i->k) { 284 + if (bkey_cmp(top->k, i->k) >= 0) 285 + sort_key_next(iter, i); 286 + else 287 + bch_cut_front(top->k, i->k); 288 + 289 + heap_sift(iter, i - top, bch_extent_sort_cmp); 290 + } else { 291 + /* can't happen because of comparison func */ 292 + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); 293 + 294 + if (bkey_cmp(i->k, top->k) < 0) { 295 + bkey_copy(tmp, top->k); 296 + 297 + bch_cut_back(&START_KEY(i->k), tmp); 298 + bch_cut_front(i->k, top->k); 299 + heap_sift(iter, 0, bch_extent_sort_cmp); 300 + 301 + return tmp; 302 + } else { 303 + bch_cut_back(&START_KEY(i->k), top->k); 304 + } 305 + } 306 + } 307 + 308 + return NULL; 309 + } 310 + 311 + static bool bch_extent_insert_fixup(struct btree_keys *b, 312 + struct bkey *insert, 313 + struct btree_iter *iter, 314 + struct bkey *replace_key) 315 + { 316 + struct cache_set *c = container_of(b, struct btree, keys)->c; 317 + 318 + void subtract_dirty(struct bkey *k, uint64_t 
offset, int sectors) 319 + { 320 + if (KEY_DIRTY(k)) 321 + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), 322 + offset, -sectors); 323 + } 324 + 325 + uint64_t old_offset; 326 + unsigned old_size, sectors_found = 0; 327 + 328 + BUG_ON(!KEY_OFFSET(insert)); 329 + BUG_ON(!KEY_SIZE(insert)); 330 + 331 + while (1) { 332 + struct bkey *k = bch_btree_iter_next(iter); 333 + if (!k) 334 + break; 335 + 336 + if (bkey_cmp(&START_KEY(k), insert) >= 0) { 337 + if (KEY_SIZE(k)) 338 + break; 339 + else 340 + continue; 341 + } 342 + 343 + if (bkey_cmp(k, &START_KEY(insert)) <= 0) 344 + continue; 345 + 346 + old_offset = KEY_START(k); 347 + old_size = KEY_SIZE(k); 348 + 349 + /* 350 + * We might overlap with 0 size extents; we can't skip these 351 + * because if they're in the set we're inserting to we have to 352 + * adjust them so they don't overlap with the key we're 353 + * inserting. But we don't want to check them for replace 354 + * operations. 355 + */ 356 + 357 + if (replace_key && KEY_SIZE(k)) { 358 + /* 359 + * k might have been split since we inserted/found the 360 + * key we're replacing 361 + */ 362 + unsigned i; 363 + uint64_t offset = KEY_START(k) - 364 + KEY_START(replace_key); 365 + 366 + /* But it must be a subset of the replace key */ 367 + if (KEY_START(k) < KEY_START(replace_key) || 368 + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) 369 + goto check_failed; 370 + 371 + /* We didn't find a key that we were supposed to */ 372 + if (KEY_START(k) > KEY_START(insert) + sectors_found) 373 + goto check_failed; 374 + 375 + if (!bch_bkey_equal_header(k, replace_key)) 376 + goto check_failed; 377 + 378 + /* skip past gen */ 379 + offset <<= 8; 380 + 381 + BUG_ON(!KEY_PTRS(replace_key)); 382 + 383 + for (i = 0; i < KEY_PTRS(replace_key); i++) 384 + if (k->ptr[i] != replace_key->ptr[i] + offset) 385 + goto check_failed; 386 + 387 + sectors_found = KEY_OFFSET(k) - KEY_START(insert); 388 + } 389 + 390 + if (bkey_cmp(insert, k) < 0 && 391 + bkey_cmp(&START_KEY(insert), 
&START_KEY(k)) > 0) { 392 + /* 393 + * We overlapped in the middle of an existing key: that 394 + * means we have to split the old key. But we have to do 395 + * slightly different things depending on whether the 396 + * old key has been written out yet. 397 + */ 398 + 399 + struct bkey *top; 400 + 401 + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 402 + 403 + if (bkey_written(b, k)) { 404 + /* 405 + * We insert a new key to cover the top of the 406 + * old key, and the old key is modified in place 407 + * to represent the bottom split. 408 + * 409 + * It's completely arbitrary whether the new key 410 + * is the top or the bottom, but it has to match 411 + * up with what btree_sort_fixup() does - it 412 + * doesn't check for this kind of overlap, it 413 + * depends on us inserting a new key for the top 414 + * here. 415 + */ 416 + top = bch_bset_search(b, bset_tree_last(b), 417 + insert); 418 + bch_bset_insert(b, top, k); 419 + } else { 420 + BKEY_PADDED(key) temp; 421 + bkey_copy(&temp.key, k); 422 + bch_bset_insert(b, k, &temp.key); 423 + top = bkey_next(k); 424 + } 425 + 426 + bch_cut_front(insert, top); 427 + bch_cut_back(&START_KEY(insert), k); 428 + bch_bset_fix_invalidated_key(b, k); 429 + goto out; 430 + } 431 + 432 + if (bkey_cmp(insert, k) < 0) { 433 + bch_cut_front(insert, k); 434 + } else { 435 + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) 436 + old_offset = KEY_START(insert); 437 + 438 + if (bkey_written(b, k) && 439 + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 440 + /* 441 + * Completely overwrote, so we don't have to 442 + * invalidate the binary search tree 443 + */ 444 + bch_cut_front(k, k); 445 + } else { 446 + __bch_cut_back(&START_KEY(insert), k); 447 + bch_bset_fix_invalidated_key(b, k); 448 + } 449 + } 450 + 451 + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 452 + } 453 + 454 + check_failed: 455 + if (replace_key) { 456 + if (!sectors_found) { 457 + return true; 458 + } else if (sectors_found < 
KEY_SIZE(insert)) { 459 + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 460 + (KEY_SIZE(insert) - sectors_found)); 461 + SET_KEY_SIZE(insert, sectors_found); 462 + } 463 + } 464 + out: 465 + if (KEY_DIRTY(insert)) 466 + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), 467 + KEY_START(insert), 468 + KEY_SIZE(insert)); 469 + 470 + return false; 471 + } 472 + 473 + static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) 474 + { 475 + struct btree *b = container_of(bk, struct btree, keys); 476 + char buf[80]; 477 + 478 + if (!KEY_SIZE(k)) 479 + return true; 480 + 481 + if (KEY_SIZE(k) > KEY_OFFSET(k)) 482 + goto bad; 483 + 484 + if (__ptr_invalid(b->c, k)) 485 + goto bad; 486 + 487 + return false; 488 + bad: 489 + bch_extent_to_text(buf, sizeof(buf), k); 490 + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); 491 + return true; 492 + } 493 + 494 + static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, 495 + unsigned ptr) 496 + { 497 + struct bucket *g = PTR_BUCKET(b->c, k, ptr); 498 + char buf[80]; 499 + 500 + if (mutex_trylock(&b->c->bucket_lock)) { 501 + if (b->c->gc_mark_valid && 502 + ((GC_MARK(g) != GC_MARK_DIRTY && 503 + KEY_DIRTY(k)) || 504 + GC_MARK(g) == GC_MARK_METADATA)) 505 + goto err; 506 + 507 + if (g->prio == BTREE_PRIO) 508 + goto err; 509 + 510 + mutex_unlock(&b->c->bucket_lock); 511 + } 512 + 513 + return false; 514 + err: 515 + mutex_unlock(&b->c->bucket_lock); 516 + bch_extent_to_text(buf, sizeof(buf), k); 517 + btree_bug(b, 518 + "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 519 + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), 520 + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 521 + return true; 522 + } 523 + 524 + static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) 525 + { 526 + struct btree *b = container_of(bk, struct btree, keys); 527 + struct bucket *g; 528 + unsigned i, stale; 529 + 530 + if 
(!KEY_PTRS(k) || 531 + bch_extent_invalid(bk, k)) 532 + return true; 533 + 534 + for (i = 0; i < KEY_PTRS(k); i++) 535 + if (!ptr_available(b->c, k, i)) 536 + return true; 537 + 538 + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) 539 + return false; 540 + 541 + for (i = 0; i < KEY_PTRS(k); i++) { 542 + g = PTR_BUCKET(b->c, k, i); 543 + stale = ptr_stale(b->c, k, i); 544 + 545 + btree_bug_on(stale > 96, b, 546 + "key too stale: %i, need_gc %u", 547 + stale, b->c->need_gc); 548 + 549 + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 550 + b, "stale dirty pointer"); 551 + 552 + if (stale) 553 + return true; 554 + 555 + if (expensive_debug_checks(b->c) && 556 + bch_extent_bad_expensive(b, k, i)) 557 + return true; 558 + } 559 + 560 + return false; 561 + } 562 + 563 + static uint64_t merge_chksums(struct bkey *l, struct bkey *r) 564 + { 565 + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 566 + ~((uint64_t)1 << 63); 567 + } 568 + 569 + static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) 570 + { 571 + struct btree *b = container_of(bk, struct btree, keys); 572 + unsigned i; 573 + 574 + if (key_merging_disabled(b->c)) 575 + return false; 576 + 577 + for (i = 0; i < KEY_PTRS(l); i++) 578 + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || 579 + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) 580 + return false; 581 + 582 + /* Keys with no pointers aren't restricted to one bucket and could 583 + * overflow KEY_SIZE 584 + */ 585 + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { 586 + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); 587 + SET_KEY_SIZE(l, USHRT_MAX); 588 + 589 + bch_cut_front(l, r); 590 + return false; 591 + } 592 + 593 + if (KEY_CSUM(l)) { 594 + if (KEY_CSUM(r)) 595 + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 596 + else 597 + SET_KEY_CSUM(l, 0); 598 + } 599 + 600 + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 601 + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); 602 + 603 + return true; 604 + 
} 605 + 606 + const struct btree_keys_ops bch_extent_keys_ops = { 607 + .sort_cmp = bch_extent_sort_cmp, 608 + .sort_fixup = bch_extent_sort_fixup, 609 + .insert_fixup = bch_extent_insert_fixup, 610 + .key_invalid = bch_extent_invalid, 611 + .key_bad = bch_extent_bad, 612 + .key_merge = bch_extent_merge, 613 + .key_to_text = bch_extent_to_text, 614 + .key_dump = bch_bkey_dump, 615 + .is_extents = true, 616 + };
+13
drivers/md/bcache/extents.h
··· 1 + #ifndef _BCACHE_EXTENTS_H 2 + #define _BCACHE_EXTENTS_H 3 + 4 + extern const struct btree_keys_ops bch_btree_keys_ops; 5 + extern const struct btree_keys_ops bch_extent_keys_ops; 6 + 7 + struct bkey; 8 + struct cache_set; 9 + 10 + void bch_extent_to_text(char *, size_t, const struct bkey *); 11 + bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); 12 + 13 + #endif /* _BCACHE_EXTENTS_H */
+45 -30
drivers/md/bcache/journal.c
··· 44 44 45 45 closure_init_stack(&cl); 46 46 47 - pr_debug("reading %llu", (uint64_t) bucket); 47 + pr_debug("reading %u", bucket_index); 48 48 49 49 while (offset < ca->sb.bucket_size) { 50 50 reread: left = ca->sb.bucket_size - offset; 51 - len = min_t(unsigned, left, PAGE_SECTORS * 8); 51 + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); 52 52 53 53 bio_reset(bio); 54 54 bio->bi_iter.bi_sector = bucket + offset; ··· 74 74 struct list_head *where; 75 75 size_t blocks, bytes = set_bytes(j); 76 76 77 - if (j->magic != jset_magic(&ca->sb)) 77 + if (j->magic != jset_magic(&ca->sb)) { 78 + pr_debug("%u: bad magic", bucket_index); 78 79 return ret; 80 + } 79 81 80 - if (bytes > left << 9) 82 + if (bytes > left << 9 || 83 + bytes > PAGE_SIZE << JSET_BITS) { 84 + pr_info("%u: too big, %zu bytes, offset %u", 85 + bucket_index, bytes, offset); 81 86 return ret; 87 + } 82 88 83 89 if (bytes > len << 9) 84 90 goto reread; 85 91 86 - if (j->csum != csum_set(j)) 92 + if (j->csum != csum_set(j)) { 93 + pr_info("%u: bad csum, %zu bytes, offset %u", 94 + bucket_index, bytes, offset); 87 95 return ret; 96 + } 88 97 89 - blocks = set_blocks(j, ca->set); 98 + blocks = set_blocks(j, block_bytes(ca->set)); 90 99 91 100 while (!list_empty(list)) { 92 101 i = list_first_entry(list, ··· 284 275 } 285 276 286 277 for (k = i->j.start; 287 - k < end(&i->j); 278 + k < bset_bkey_last(&i->j); 288 279 k = bkey_next(k)) { 289 280 unsigned j; 290 281 ··· 322 313 n, i->j.seq - 1, start, end); 323 314 324 315 for (k = i->j.start; 325 - k < end(&i->j); 316 + k < bset_bkey_last(&i->j); 326 317 k = bkey_next(k)) { 327 318 trace_bcache_journal_replay_key(k); 328 319 ··· 564 555 continue_at_nobarrier(cl, journal_write, system_wq); 565 556 } 566 557 558 + static void journal_write_unlock(struct closure *cl) 559 + { 560 + struct cache_set *c = container_of(cl, struct cache_set, journal.io); 561 + 562 + c->journal.io_in_flight = 0; 563 + spin_unlock(&c->journal.lock); 564 + } 565 + 567 566 static 
void journal_write_unlocked(struct closure *cl) 568 567 __releases(c->journal.lock) 569 568 { ··· 579 562 struct cache *ca; 580 563 struct journal_write *w = c->journal.cur; 581 564 struct bkey *k = &c->journal.key; 582 - unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; 565 + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * 566 + c->sb.block_size; 583 567 584 568 struct bio *bio; 585 569 struct bio_list list; 586 570 bio_list_init(&list); 587 571 588 572 if (!w->need_write) { 589 - /* 590 - * XXX: have to unlock closure before we unlock journal lock, 591 - * else we race with bch_journal(). But this way we race 592 - * against cache set unregister. Doh. 593 - */ 594 - set_closure_fn(cl, NULL, NULL); 595 - closure_sub(cl, CLOSURE_RUNNING + 1); 596 - spin_unlock(&c->journal.lock); 597 - return; 573 + closure_return_with_destructor(cl, journal_write_unlock); 598 574 } else if (journal_full(&c->journal)) { 599 575 journal_reclaim(c); 600 576 spin_unlock(&c->journal.lock); ··· 596 586 continue_at(cl, journal_write, system_wq); 597 587 } 598 588 599 - c->journal.blocks_free -= set_blocks(w->data, c); 589 + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); 600 590 601 591 w->data->btree_level = c->root->level; 602 592 ··· 663 653 664 654 w->need_write = true; 665 655 666 - if (closure_trylock(cl, &c->cl)) 667 - journal_write_unlocked(cl); 668 - else 656 + if (!c->journal.io_in_flight) { 657 + c->journal.io_in_flight = 1; 658 + closure_call(cl, journal_write_unlocked, NULL, &c->cl); 659 + } else { 669 660 spin_unlock(&c->journal.lock); 661 + } 670 662 } 671 663 672 664 static struct journal_write *journal_wait_for_write(struct cache_set *c, ··· 676 664 { 677 665 size_t sectors; 678 666 struct closure cl; 667 + bool wait = false; 679 668 680 669 closure_init_stack(&cl); 681 670 ··· 686 673 struct journal_write *w = c->journal.cur; 687 674 688 675 sectors = __set_blocks(w->data, w->data->keys + nkeys, 689 - c) * c->sb.block_size; 676 + 
block_bytes(c)) * c->sb.block_size; 690 677 691 678 if (sectors <= min_t(size_t, 692 679 c->journal.blocks_free * c->sb.block_size, 693 680 PAGE_SECTORS << JSET_BITS)) 694 681 return w; 695 682 696 - /* XXX: tracepoint */ 683 + if (wait) 684 + closure_wait(&c->journal.wait, &cl); 685 + 697 686 if (!journal_full(&c->journal)) { 698 - trace_bcache_journal_entry_full(c); 687 + if (wait) 688 + trace_bcache_journal_entry_full(c); 699 689 700 690 /* 701 691 * XXX: If we were inserting so many keys that they ··· 708 692 */ 709 693 BUG_ON(!w->data->keys); 710 694 711 - closure_wait(&w->wait, &cl); 712 695 journal_try_write(c); /* unlocks */ 713 696 } else { 714 - trace_bcache_journal_full(c); 697 + if (wait) 698 + trace_bcache_journal_full(c); 715 699 716 - closure_wait(&c->journal.wait, &cl); 717 700 journal_reclaim(c); 718 701 spin_unlock(&c->journal.lock); 719 702 ··· 721 706 722 707 closure_sync(&cl); 723 708 spin_lock(&c->journal.lock); 709 + wait = true; 724 710 } 725 711 } 726 712 ··· 752 736 753 737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 754 738 755 - memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 739 + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); 756 740 w->data->keys += bch_keylist_nkeys(keys); 757 741 758 742 ret = &fifo_back(&c->journal.pin); ··· 796 780 { 797 781 struct journal *j = &c->journal; 798 782 799 - closure_init_unlocked(&j->io); 800 783 spin_lock_init(&j->lock); 801 784 INIT_DELAYED_WORK(&j->work, journal_write_work); 802 785
+1
drivers/md/bcache/journal.h
··· 104 104 /* used when waiting because the journal was full */ 105 105 struct closure_waitlist wait; 106 106 struct closure io; 107 + int io_in_flight; 107 108 struct delayed_work work; 108 109 109 110 /* Number of blocks free in the bucket(s) we're currently writing to */
+1 -1
drivers/md/bcache/movinggc.c
··· 211 211 for_each_cache(ca, c, i) { 212 212 unsigned sectors_to_move = 0; 213 213 unsigned reserve_sectors = ca->sb.bucket_size * 214 - min(fifo_used(&ca->free), ca->free.size / 2); 214 + fifo_used(&ca->free[RESERVE_MOVINGGC]); 215 215 216 216 ca->heap.used = 0; 217 217
+49 -23
drivers/md/bcache/request.c
··· 254 254 closure_return(cl); 255 255 } 256 256 257 + static int bch_keylist_realloc(struct keylist *l, unsigned u64s, 258 + struct cache_set *c) 259 + { 260 + size_t oldsize = bch_keylist_nkeys(l); 261 + size_t newsize = oldsize + u64s; 262 + 263 + /* 264 + * The journalling code doesn't handle the case where the keys to insert 265 + * is bigger than an empty write: If we just return -ENOMEM here, 266 + * bio_insert() and bio_invalidate() will insert the keys created so far 267 + * and finish the rest when the keylist is empty. 268 + */ 269 + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) 270 + return -ENOMEM; 271 + 272 + return __bch_keylist_realloc(l, u64s); 273 + } 274 + 257 275 static void bch_data_invalidate(struct closure *cl) 258 276 { 259 277 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); ··· 284 266 unsigned sectors = min(bio_sectors(bio), 285 267 1U << (KEY_SIZE_BITS - 1)); 286 268 287 - if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 269 + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) 288 270 goto out; 289 271 290 272 bio->bi_iter.bi_sector += sectors; ··· 374 356 375 357 /* 1 for the device pointer and 1 for the chksum */ 376 358 if (bch_keylist_realloc(&op->insert_keys, 377 - 1 + (op->csum ? 1 : 0), 359 + 3 + (op->csum ? 
1 : 0), 378 360 op->c)) 379 361 continue_at(cl, bch_data_insert_keys, bcache_wq); 380 362 ··· 614 596 /* Stack frame for bio_complete */ 615 597 struct closure cl; 616 598 617 - struct bcache_device *d; 618 - 619 599 struct bbio bio; 620 600 struct bio *orig_bio; 621 601 struct bio *cache_miss; 602 + struct bcache_device *d; 622 603 623 604 unsigned insert_bio_sectors; 624 - 625 605 unsigned recoverable:1; 626 606 unsigned write:1; 627 607 unsigned read_dirty_data:1; ··· 645 629 646 630 if (error) 647 631 s->iop.error = error; 648 - else if (ptr_stale(s->iop.c, &b->key, 0)) { 632 + else if (!KEY_DIRTY(&b->key) && 633 + ptr_stale(s->iop.c, &b->key, 0)) { 649 634 atomic_long_inc(&s->iop.c->cache_read_races); 650 635 s->iop.error = -EINTR; 651 636 } ··· 727 710 { 728 711 struct search *s = container_of(cl, struct search, iop.cl); 729 712 struct bio *bio = &s->bio.bio; 713 + int ret; 730 714 731 - int ret = bch_btree_map_keys(&s->op, s->iop.c, 732 - &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), 733 - cache_lookup_fn, MAP_END_KEY); 715 + bch_btree_op_init(&s->op, -1); 716 + 717 + ret = bch_btree_map_keys(&s->op, s->iop.c, 718 + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), 719 + cache_lookup_fn, MAP_END_KEY); 734 720 if (ret == -EAGAIN) 735 721 continue_at(cl, cache_lookup, bcache_wq); 736 722 ··· 774 754 } 775 755 } 776 756 777 - static void do_bio_hook(struct search *s) 757 + static void do_bio_hook(struct search *s, struct bio *orig_bio) 778 758 { 779 759 struct bio *bio = &s->bio.bio; 780 760 781 761 bio_init(bio); 782 - __bio_clone_fast(bio, s->orig_bio); 762 + __bio_clone_fast(bio, orig_bio); 783 763 bio->bi_end_io = request_endio; 784 764 bio->bi_private = &s->cl; 785 765 ··· 798 778 mempool_free(s, s->d->c->search); 799 779 } 800 780 801 - static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 781 + static inline struct search *search_alloc(struct bio *bio, 782 + struct bcache_device *d) 802 783 { 803 784 struct search *s; 804 785 805 786 
s = mempool_alloc(d->c->search, GFP_NOIO); 806 - memset(s, 0, offsetof(struct search, iop.insert_keys)); 807 787 808 - __closure_init(&s->cl, NULL); 788 + closure_init(&s->cl, NULL); 789 + do_bio_hook(s, bio); 809 790 810 - s->iop.inode = d->id; 811 - s->iop.c = d->c; 812 - s->d = d; 813 - s->op.lock = -1; 814 - s->iop.write_point = hash_long((unsigned long) current, 16); 815 791 s->orig_bio = bio; 816 - s->write = (bio->bi_rw & REQ_WRITE) != 0; 817 - s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 792 + s->cache_miss = NULL; 793 + s->d = d; 818 794 s->recoverable = 1; 795 + s->write = (bio->bi_rw & REQ_WRITE) != 0; 796 + s->read_dirty_data = 0; 819 797 s->start_time = jiffies; 820 - do_bio_hook(s); 798 + 799 + s->iop.c = d->c; 800 + s->iop.bio = NULL; 801 + s->iop.inode = d->id; 802 + s->iop.write_point = hash_long((unsigned long) current, 16); 803 + s->iop.write_prio = 0; 804 + s->iop.error = 0; 805 + s->iop.flags = 0; 806 + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 821 807 822 808 return s; 823 809 } ··· 869 843 trace_bcache_read_retry(s->orig_bio); 870 844 871 845 s->iop.error = 0; 872 - do_bio_hook(s); 846 + do_bio_hook(s, s->orig_bio); 873 847 874 848 /* XXX: invalidate cache */ 875 849
+13 -8
drivers/md/bcache/request.h
··· 13 13 uint16_t write_prio; 14 14 short error; 15 15 16 - unsigned bypass:1; 17 - unsigned writeback:1; 18 - unsigned flush_journal:1; 19 - unsigned csum:1; 16 + union { 17 + uint16_t flags; 20 18 21 - unsigned replace:1; 22 - unsigned replace_collision:1; 19 + struct { 20 + unsigned bypass:1; 21 + unsigned writeback:1; 22 + unsigned flush_journal:1; 23 + unsigned csum:1; 23 24 24 - unsigned insert_data_done:1; 25 + unsigned replace:1; 26 + unsigned replace_collision:1; 25 27 26 - /* Anything past this point won't get zeroed in search_alloc() */ 28 + unsigned insert_data_done:1; 29 + }; 30 + }; 31 + 27 32 struct keylist insert_keys; 28 33 BKEY_PADDED(replace_key); 29 34 };
+68 -35
drivers/md/bcache/super.c
··· 9 9 #include "bcache.h" 10 10 #include "btree.h" 11 11 #include "debug.h" 12 + #include "extents.h" 12 13 #include "request.h" 13 14 #include "writeback.h" 14 15 ··· 226 225 struct cached_dev *dc = bio->bi_private; 227 226 /* XXX: error checking */ 228 227 229 - closure_put(&dc->sb_write.cl); 228 + closure_put(&dc->sb_write); 230 229 } 231 230 232 231 static void __write_super(struct cache_sb *sb, struct bio *bio) ··· 264 263 submit_bio(REQ_WRITE, bio); 265 264 } 266 265 266 + static void bch_write_bdev_super_unlock(struct closure *cl) 267 + { 268 + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); 269 + 270 + up(&dc->sb_write_mutex); 271 + } 272 + 267 273 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) 268 274 { 269 - struct closure *cl = &dc->sb_write.cl; 275 + struct closure *cl = &dc->sb_write; 270 276 struct bio *bio = &dc->sb_bio; 271 277 272 - closure_lock(&dc->sb_write, parent); 278 + down(&dc->sb_write_mutex); 279 + closure_init(cl, parent); 273 280 274 281 bio_reset(bio); 275 282 bio->bi_bdev = dc->bdev; ··· 287 278 closure_get(cl); 288 279 __write_super(&dc->sb, bio); 289 280 290 - closure_return(cl); 281 + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); 291 282 } 292 283 293 284 static void write_super_endio(struct bio *bio, int error) ··· 295 286 struct cache *ca = bio->bi_private; 296 287 297 288 bch_count_io_errors(ca, error, "writing superblock"); 298 - closure_put(&ca->set->sb_write.cl); 289 + closure_put(&ca->set->sb_write); 290 + } 291 + 292 + static void bcache_write_super_unlock(struct closure *cl) 293 + { 294 + struct cache_set *c = container_of(cl, struct cache_set, sb_write); 295 + 296 + up(&c->sb_write_mutex); 299 297 } 300 298 301 299 void bcache_write_super(struct cache_set *c) 302 300 { 303 - struct closure *cl = &c->sb_write.cl; 301 + struct closure *cl = &c->sb_write; 304 302 struct cache *ca; 305 303 unsigned i; 306 304 307 - closure_lock(&c->sb_write, &c->cl); 305 + 
down(&c->sb_write_mutex); 306 + closure_init(cl, &c->cl); 308 307 309 308 c->sb.seq++; 310 309 ··· 334 317 __write_super(&ca->sb, bio); 335 318 } 336 319 337 - closure_return(cl); 320 + closure_return_with_destructor(cl, bcache_write_super_unlock); 338 321 } 339 322 340 323 /* UUID io */ ··· 342 325 static void uuid_endio(struct bio *bio, int error) 343 326 { 344 327 struct closure *cl = bio->bi_private; 345 - struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); 328 + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); 346 329 347 330 cache_set_err_on(error, c, "accessing uuids"); 348 331 bch_bbio_free(bio, c); 349 332 closure_put(cl); 350 333 } 351 334 335 + static void uuid_io_unlock(struct closure *cl) 336 + { 337 + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); 338 + 339 + up(&c->uuid_write_mutex); 340 + } 341 + 352 342 static void uuid_io(struct cache_set *c, unsigned long rw, 353 343 struct bkey *k, struct closure *parent) 354 344 { 355 - struct closure *cl = &c->uuid_write.cl; 345 + struct closure *cl = &c->uuid_write; 356 346 struct uuid_entry *u; 357 347 unsigned i; 358 348 char buf[80]; 359 349 360 350 BUG_ON(!parent); 361 - closure_lock(&c->uuid_write, parent); 351 + down(&c->uuid_write_mutex); 352 + closure_init(cl, parent); 362 353 363 354 for (i = 0; i < KEY_PTRS(k); i++) { 364 355 struct bio *bio = bch_bbio_alloc(c); ··· 384 359 break; 385 360 } 386 361 387 - bch_bkey_to_text(buf, sizeof(buf), k); 362 + bch_extent_to_text(buf, sizeof(buf), k); 388 363 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? 
"wrote" : "read", buf); 389 364 390 365 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) ··· 393 368 u - c->uuids, u->uuid, u->label, 394 369 u->first_reg, u->last_reg, u->invalidated); 395 370 396 - closure_return(cl); 371 + closure_return_with_destructor(cl, uuid_io_unlock); 397 372 } 398 373 399 374 static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) 400 375 { 401 376 struct bkey *k = &j->uuid_bucket; 402 377 403 - if (bch_btree_ptr_invalid(c, k)) 378 + if (__bch_btree_ptr_invalid(c, k)) 404 379 return "bad uuid pointer"; 405 380 406 381 bkey_copy(&c->uuid_bucket, k); ··· 445 420 446 421 lockdep_assert_held(&bch_register_lock); 447 422 448 - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 423 + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) 449 424 return 1; 450 425 451 426 SET_KEY_SIZE(&k.key, c->sb.bucket_size); ··· 563 538 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 564 539 &ca->meta_sectors_written); 565 540 566 - pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 567 - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 541 + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 542 + // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 568 543 569 544 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 570 545 long bucket; ··· 583 558 p->magic = pset_magic(&ca->sb); 584 559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 585 560 586 - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 561 + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); 587 562 BUG_ON(bucket == -1); 588 563 589 564 mutex_unlock(&ca->set->bucket_lock); ··· 1123 1098 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); 1124 1099 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); 1125 1100 INIT_WORK(&dc->detach, cached_dev_detach_finish); 1126 - closure_init_unlocked(&dc->sb_write); 1101 + sema_init(&dc->sb_write_mutex, 1); 1127 1102 INIT_LIST_HEAD(&dc->io_lru); 1128 1103 
spin_lock_init(&dc->io_lock); 1129 1104 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); ··· 1134 1109 list_add(&io->lru, &dc->io_lru); 1135 1110 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1136 1111 } 1112 + 1113 + dc->disk.stripe_size = q->limits.io_opt >> 9; 1114 + 1115 + if (dc->disk.stripe_size) 1116 + dc->partial_stripes_expensive = 1117 + q->limits.raid_partial_stripes_expensive; 1137 1118 1138 1119 ret = bcache_device_init(&dc->disk, block_size, 1139 1120 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); ··· 1352 1321 if (ca) 1353 1322 kobject_put(&ca->kobj); 1354 1323 1324 + bch_bset_sort_state_free(&c->sort); 1355 1325 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1356 - free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); 1357 1326 1358 1327 if (c->bio_split) 1359 1328 bioset_free(c->bio_split); ··· 1478 1447 c->block_bits = ilog2(sb->block_size); 1479 1448 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1480 1449 1481 - c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; 1450 + c->btree_pages = bucket_pages(c); 1482 1451 if (c->btree_pages > BTREE_MAX_PAGES) 1483 1452 c->btree_pages = max_t(int, c->btree_pages / 4, 1484 1453 BTREE_MAX_PAGES); 1485 1454 1486 - c->sort_crit_factor = int_sqrt(c->btree_pages); 1487 - 1488 - closure_init_unlocked(&c->sb_write); 1455 + sema_init(&c->sb_write_mutex, 1); 1489 1456 mutex_init(&c->bucket_lock); 1490 1457 init_waitqueue_head(&c->try_wait); 1491 1458 init_waitqueue_head(&c->bucket_wait); 1492 - closure_init_unlocked(&c->uuid_write); 1493 - mutex_init(&c->sort_lock); 1459 + sema_init(&c->uuid_write_mutex, 1); 1494 1460 1495 - spin_lock_init(&c->sort_time.lock); 1496 1461 spin_lock_init(&c->btree_gc_time.lock); 1497 1462 spin_lock_init(&c->btree_split_time.lock); 1498 1463 spin_lock_init(&c->btree_read_time.lock); ··· 1516 1489 bucket_pages(c))) || 1517 1490 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1518 1491 !(c->bio_split = bioset_create(4, 
offsetof(struct bbio, bio))) || 1519 - !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || 1520 1492 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1521 1493 bch_journal_alloc(c) || 1522 1494 bch_btree_cache_alloc(c) || 1523 - bch_open_buckets_alloc(c)) 1495 + bch_open_buckets_alloc(c) || 1496 + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) 1524 1497 goto err; 1525 1498 1526 1499 c->congested_read_threshold_us = 2000; ··· 1576 1549 k = &j->btree_root; 1577 1550 1578 1551 err = "bad btree root"; 1579 - if (bch_btree_ptr_invalid(c, k)) 1552 + if (__bch_btree_ptr_invalid(c, k)) 1580 1553 goto err; 1581 1554 1582 1555 err = "error reading btree root"; ··· 1770 1743 void bch_cache_release(struct kobject *kobj) 1771 1744 { 1772 1745 struct cache *ca = container_of(kobj, struct cache, kobj); 1746 + unsigned i; 1773 1747 1774 1748 if (ca->set) 1775 1749 ca->set->cache[ca->sb.nr_this_dev] = NULL; ··· 1784 1756 free_heap(&ca->heap); 1785 1757 free_fifo(&ca->unused); 1786 1758 free_fifo(&ca->free_inc); 1787 - free_fifo(&ca->free); 1759 + 1760 + for (i = 0; i < RESERVE_NR; i++) 1761 + free_fifo(&ca->free[i]); 1788 1762 1789 1763 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1790 1764 put_page(ca->sb_bio.bi_io_vec[0].bv_page); ··· 1812 1782 ca->journal.bio.bi_max_vecs = 8; 1813 1783 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1814 1784 1815 - free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; 1816 - free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); 1785 + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; 1817 1786 1818 - if (!init_fifo(&ca->free, free, GFP_KERNEL) || 1787 + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || 1788 + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || 1789 + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || 1790 + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || 1819 1791 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1820 1792 !init_fifo(&ca->unused, free << 2, 
GFP_KERNEL) || 1821 1793 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || ··· 2062 2030 kobject_put(bcache_kobj); 2063 2031 if (bcache_wq) 2064 2032 destroy_workqueue(bcache_wq); 2065 - unregister_blkdev(bcache_major, "bcache"); 2033 + if (bcache_major) 2034 + unregister_blkdev(bcache_major, "bcache"); 2066 2035 unregister_reboot_notifier(&reboot); 2067 2036 } 2068 2037
+45 -34
drivers/md/bcache/sysfs.c
··· 102 102 rw_attribute(key_merging_disabled); 103 103 rw_attribute(gc_always_rewrite); 104 104 rw_attribute(expensive_debug_checks); 105 - rw_attribute(freelist_percent); 106 105 rw_attribute(cache_replacement_policy); 107 106 rw_attribute(btree_shrinker_disabled); 108 107 rw_attribute(copy_gc_enabled); ··· 400 401 }; 401 402 KTYPE(bch_flash_dev); 402 403 404 + struct bset_stats_op { 405 + struct btree_op op; 406 + size_t nodes; 407 + struct bset_stats stats; 408 + }; 409 + 410 + static int btree_bset_stats(struct btree_op *b_op, struct btree *b) 411 + { 412 + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); 413 + 414 + op->nodes++; 415 + bch_btree_keys_stats(&b->keys, &op->stats); 416 + 417 + return MAP_CONTINUE; 418 + } 419 + 420 + int bch_bset_print_stats(struct cache_set *c, char *buf) 421 + { 422 + struct bset_stats_op op; 423 + int ret; 424 + 425 + memset(&op, 0, sizeof(op)); 426 + bch_btree_op_init(&op.op, -1); 427 + 428 + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); 429 + if (ret < 0) 430 + return ret; 431 + 432 + return snprintf(buf, PAGE_SIZE, 433 + "btree nodes: %zu\n" 434 + "written sets: %zu\n" 435 + "unwritten sets: %zu\n" 436 + "written key bytes: %zu\n" 437 + "unwritten key bytes: %zu\n" 438 + "floats: %zu\n" 439 + "failed: %zu\n", 440 + op.nodes, 441 + op.stats.sets_written, op.stats.sets_unwritten, 442 + op.stats.bytes_written, op.stats.bytes_unwritten, 443 + op.stats.floats, op.stats.failed); 444 + } 445 + 403 446 SHOW(__bch_cache_set) 404 447 { 405 448 unsigned root_usage(struct cache_set *c) ··· 460 419 rw_lock(false, b, b->level); 461 420 } while (b != c->root); 462 421 463 - for_each_key_filter(b, k, &iter, bch_ptr_bad) 422 + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 464 423 bytes += bkey_bytes(k); 465 424 466 425 rw_unlock(false, b); ··· 475 434 476 435 mutex_lock(&c->bucket_lock); 477 436 list_for_each_entry(b, &c->btree_cache, list) 478 - ret += 1 << (b->page_order + PAGE_SHIFT); 
437 + ret += 1 << (b->keys.page_order + PAGE_SHIFT); 479 438 480 439 mutex_unlock(&c->bucket_lock); 481 440 return ret; ··· 532 491 533 492 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); 534 493 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); 535 - sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); 494 + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); 536 495 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); 537 496 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); 538 497 ··· 752 711 sysfs_print(io_errors, 753 712 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); 754 713 755 - sysfs_print(freelist_percent, ca->free.size * 100 / 756 - ((size_t) ca->sb.nbuckets)); 757 - 758 714 if (attr == &sysfs_cache_replacement_policy) 759 715 return bch_snprint_string_list(buf, PAGE_SIZE, 760 716 cache_replacement_policies, ··· 858 820 } 859 821 } 860 822 861 - if (attr == &sysfs_freelist_percent) { 862 - DECLARE_FIFO(long, free); 863 - long i; 864 - size_t p = strtoul_or_return(buf); 865 - 866 - p = clamp_t(size_t, 867 - ((size_t) ca->sb.nbuckets * p) / 100, 868 - roundup_pow_of_two(ca->sb.nbuckets) >> 9, 869 - ca->sb.nbuckets / 2); 870 - 871 - if (!init_fifo_exact(&free, p, GFP_KERNEL)) 872 - return -ENOMEM; 873 - 874 - mutex_lock(&ca->set->bucket_lock); 875 - 876 - fifo_move(&free, &ca->free); 877 - fifo_swap(&free, &ca->free); 878 - 879 - mutex_unlock(&ca->set->bucket_lock); 880 - 881 - while (fifo_pop(&free, i)) 882 - atomic_dec(&ca->buckets[i].pin); 883 - 884 - free_fifo(&free); 885 - } 886 - 887 823 if (attr == &sysfs_clear_stats) { 888 824 atomic_long_set(&ca->sectors_written, 0); 889 825 atomic_long_set(&ca->btree_sectors_written, 0); ··· 881 869 &sysfs_metadata_written, 882 870 &sysfs_io_errors, 883 871 &sysfs_clear_stats, 884 - &sysfs_freelist_percent, 885 872 &sysfs_cache_replacement_policy, 886 873 NULL 887 874 };
+8
drivers/md/bcache/util.h
··· 2 2 #ifndef _BCACHE_UTIL_H 3 3 #define _BCACHE_UTIL_H 4 4 5 + #include <linux/blkdev.h> 5 6 #include <linux/errno.h> 6 7 #include <linux/kernel.h> 7 8 #include <linux/llist.h> ··· 18 17 19 18 #ifdef CONFIG_BCACHE_DEBUG 20 19 20 + #define EBUG_ON(cond) BUG_ON(cond) 21 21 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 22 22 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 23 23 24 24 #else /* DEBUG */ 25 25 26 + #define EBUG_ON(cond) do { if (cond); } while (0) 26 27 #define atomic_dec_bug(v) atomic_dec(v) 27 28 #define atomic_inc_bug(v, i) atomic_inc(v) 28 29 ··· 393 390 }; 394 391 395 392 void bch_time_stats_update(struct time_stats *stats, uint64_t time); 393 + 394 + static inline unsigned local_clock_us(void) 395 + { 396 + return local_clock() >> 10; 397 + } 396 398 397 399 #define NSEC_PER_ns 1L 398 400 #define NSEC_PER_us NSEC_PER_USEC
+1
drivers/md/raid5.c
··· 6103 6103 blk_queue_io_min(mddev->queue, chunk_size); 6104 6104 blk_queue_io_opt(mddev->queue, chunk_size * 6105 6105 (conf->raid_disks - conf->max_degraded)); 6106 + mddev->queue->limits.raid_partial_stripes_expensive = 1; 6106 6107 /* 6107 6108 * We can only discard a whole stripe. It doesn't make sense to 6108 6109 * discard data disk but write parity disk
+1 -1
fs/buffer.c
··· 1312 1312 } 1313 1313 while (out < BH_LRU_SIZE) 1314 1314 bhs[out++] = NULL; 1315 - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); 1315 + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); 1316 1316 } 1317 1317 bh_lru_unlock(); 1318 1318
+1
include/linux/blkdev.h
··· 291 291 unsigned char discard_misaligned; 292 292 unsigned char cluster; 293 293 unsigned char discard_zeroes_data; 294 + unsigned char raid_partial_stripes_expensive; 294 295 }; 295 296 296 297 struct request_queue {
+5 -5
include/trace/events/bcache.h
··· 247 247 TP_fast_assign( 248 248 __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); 249 249 __entry->block = b->written; 250 - __entry->keys = b->sets[b->nsets].data->keys; 250 + __entry->keys = b->keys.set[b->keys.nsets].data->keys; 251 251 ), 252 252 253 253 TP_printk("bucket %zu", __entry->bucket) ··· 411 411 ), 412 412 413 413 TP_fast_assign( 414 - __entry->free = fifo_used(&ca->free); 414 + __entry->free = fifo_used(&ca->free[RESERVE_NONE]); 415 415 __entry->free_inc = fifo_used(&ca->free_inc); 416 416 __entry->free_inc_size = ca->free_inc.size; 417 417 __entry->unused = fifo_used(&ca->unused); ··· 422 422 ); 423 423 424 424 TRACE_EVENT(bcache_alloc_fail, 425 - TP_PROTO(struct cache *ca), 426 - TP_ARGS(ca), 425 + TP_PROTO(struct cache *ca, unsigned reserve), 426 + TP_ARGS(ca, reserve), 427 427 428 428 TP_STRUCT__entry( 429 429 __field(unsigned, free ) ··· 433 433 ), 434 434 435 435 TP_fast_assign( 436 - __entry->free = fifo_used(&ca->free); 436 + __entry->free = fifo_used(&ca->free[reserve]); 437 437 __entry->free_inc = fifo_used(&ca->free_inc); 438 438 __entry->unused = fifo_used(&ca->unused); 439 439 __entry->blocked = atomic_read(&ca->set->prio_blocked);
+2 -1
include/uapi/linux/bcache.h
··· 39 39 } 40 40 41 41 #define KEY_SIZE_BITS 16 42 + #define KEY_MAX_U64S 8 42 43 43 44 KEY_FIELD(KEY_PTRS, high, 60, 3) 44 45 KEY_FIELD(HEADER_SIZE, high, 58, 2) ··· 119 118 return (struct bkey *) (d + bkey_u64s(k)); 120 119 } 121 120 122 - static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) 121 + static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys) 123 122 { 124 123 __u64 *d = (void *) k; 125 124 return (struct bkey *) (d + nr_keys);
+2 -1
include/uapi/linux/fd.h
··· 185 185 * to clear media change status */ 186 186 FD_UNUSED_BIT, 187 187 FD_DISK_CHANGED_BIT, /* disk has been changed since last i/o */ 188 - FD_DISK_WRITABLE_BIT /* disk is writable */ 188 + FD_DISK_WRITABLE_BIT, /* disk is writable */ 189 + FD_OPEN_SHOULD_FAIL_BIT 189 190 }; 190 191 191 192 #define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params)