Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull second round of block driver updates from Jens Axboe:
"As mentioned in the original pull request, the bcache bits were pulled
because of their dependency on the immutable bio vecs. Kent re-did
this part and resubmitted it, so here's the 2nd round of (mostly)
driver updates for 3.13. It contains:

- The bcache work from Kent.

- Conversion of virtio-blk to blk-mq. This removes the bio and request
path, and substitutes with the blk-mq path instead. The end result is
almost 200 deleted lines. Patch is acked by Asias and Christoph, who
both did a bunch of testing.

- A removal of bootmem.h include from Grygorii Strashko, part of a
larger series of his killing the dependency on that header file.

- Removal of __cpuinit from blk-mq from Paul Gortmaker"

* 'for-linus' of git://git.kernel.dk/linux-block: (56 commits)
virtio_blk: blk-mq support
blk-mq: remove newly added instances of __cpuinit
bcache: defensively handle format strings
bcache: Bypass torture test
bcache: Delete some slower inline asm
bcache: Use ida for bcache block dev minor
bcache: Fix sysfs splat on shutdown with flash only devs
bcache: Better full stripe scanning
bcache: Have btree_split() insert into parent directly
bcache: Move spinlock into struct time_stats
bcache: Kill sequential_merge option
bcache: Kill bch_next_recurse_key()
bcache: Avoid deadlocking in garbage collection
bcache: Incremental gc
bcache: Add make_btree_freeing_key()
bcache: Add btree_node_write_sync()
bcache: PRECEDING_KEY()
bcache: bch_(btree|extent)_ptr_invalid()
bcache: Don't bother with bucket refcount for btree node allocations
bcache: Debug code improvements
...

+3274 -3485
-1
block/blk-ioc.c
··· 6 6 #include <linux/init.h> 7 7 #include <linux/bio.h> 8 8 #include <linux/blkdev.h> 9 - #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 10 9 #include <linux/slab.h> 11 10 12 11 #include "blk.h"
+4 -4
block/blk-mq-cpu.c
··· 13 13 static LIST_HEAD(blk_mq_cpu_notify_list); 14 14 static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); 15 15 16 - static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, 17 - unsigned long action, void *hcpu) 16 + static int blk_mq_main_cpu_notify(struct notifier_block *self, 17 + unsigned long action, void *hcpu) 18 18 { 19 19 unsigned int cpu = (unsigned long) hcpu; 20 20 struct blk_mq_cpu_notifier *notify; ··· 28 28 return NOTIFY_OK; 29 29 } 30 30 31 - static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, 32 - unsigned int cpu) 31 + static void blk_mq_cpu_notify(void *data, unsigned long action, 32 + unsigned int cpu) 33 33 { 34 34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 35 35 /*
+3 -3
block/blk-mq.c
··· 1444 1444 EXPORT_SYMBOL(blk_mq_free_queue); 1445 1445 1446 1446 /* Basically redo blk_mq_init_queue with queue frozen */ 1447 - static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) 1447 + static void blk_mq_queue_reinit(struct request_queue *q) 1448 1448 { 1449 1449 blk_mq_freeze_queue(q); 1450 1450 ··· 1461 1461 blk_mq_unfreeze_queue(q); 1462 1462 } 1463 1463 1464 - static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, 1465 - unsigned long action, void *hcpu) 1464 + static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 1465 + unsigned long action, void *hcpu) 1466 1466 { 1467 1467 struct request_queue *q; 1468 1468
+65 -257
drivers/block/virtio_blk.c
··· 11 11 #include <linux/string_helpers.h> 12 12 #include <scsi/scsi_cmnd.h> 13 13 #include <linux/idr.h> 14 + #include <linux/blk-mq.h> 15 + #include <linux/numa.h> 14 16 15 17 #define PART_BITS 4 16 - 17 - static bool use_bio; 18 - module_param(use_bio, bool, S_IRUGO); 19 18 20 19 static int major; 21 20 static DEFINE_IDA(vd_index_ida); ··· 25 26 { 26 27 struct virtio_device *vdev; 27 28 struct virtqueue *vq; 28 - wait_queue_head_t queue_wait; 29 + spinlock_t vq_lock; 29 30 30 31 /* The disk structure for the kernel. */ 31 32 struct gendisk *disk; 32 - 33 - mempool_t *pool; 34 33 35 34 /* Process context for config space updates */ 36 35 struct work_struct config_work; ··· 44 47 45 48 /* Ida index - used to track minor number allocations. */ 46 49 int index; 47 - 48 - /* Scatterlist: can be too big for stack. */ 49 - struct scatterlist sg[/*sg_elems*/]; 50 50 }; 51 51 52 52 struct virtblk_req 53 53 { 54 54 struct request *req; 55 - struct bio *bio; 56 55 struct virtio_blk_outhdr out_hdr; 57 56 struct virtio_scsi_inhdr in_hdr; 58 - struct work_struct work; 59 - struct virtio_blk *vblk; 60 - int flags; 61 57 u8 status; 62 58 struct scatterlist sg[]; 63 - }; 64 - 65 - enum { 66 - VBLK_IS_FLUSH = 1, 67 - VBLK_REQ_FLUSH = 2, 68 - VBLK_REQ_DATA = 4, 69 - VBLK_REQ_FUA = 8, 70 59 }; 71 60 72 61 static inline int virtblk_result(struct virtblk_req *vbr) ··· 65 82 default: 66 83 return -EIO; 67 84 } 68 - } 69 - 70 - static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, 71 - gfp_t gfp_mask) 72 - { 73 - struct virtblk_req *vbr; 74 - 75 - vbr = mempool_alloc(vblk->pool, gfp_mask); 76 - if (!vbr) 77 - return NULL; 78 - 79 - vbr->vblk = vblk; 80 - if (use_bio) 81 - sg_init_table(vbr->sg, vblk->sg_elems); 82 - 83 - return vbr; 84 85 } 85 86 86 87 static int __virtblk_add_req(struct virtqueue *vq, ··· 110 143 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); 111 144 } 112 145 113 - static void virtblk_add_req(struct virtblk_req *vbr, bool 
have_data) 114 - { 115 - struct virtio_blk *vblk = vbr->vblk; 116 - DEFINE_WAIT(wait); 117 - int ret; 118 - 119 - spin_lock_irq(vblk->disk->queue->queue_lock); 120 - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, 121 - have_data)) < 0)) { 122 - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, 123 - TASK_UNINTERRUPTIBLE); 124 - 125 - spin_unlock_irq(vblk->disk->queue->queue_lock); 126 - io_schedule(); 127 - spin_lock_irq(vblk->disk->queue->queue_lock); 128 - 129 - finish_wait(&vblk->queue_wait, &wait); 130 - } 131 - 132 - virtqueue_kick(vblk->vq); 133 - spin_unlock_irq(vblk->disk->queue->queue_lock); 134 - } 135 - 136 - static void virtblk_bio_send_flush(struct virtblk_req *vbr) 137 - { 138 - vbr->flags |= VBLK_IS_FLUSH; 139 - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 140 - vbr->out_hdr.sector = 0; 141 - vbr->out_hdr.ioprio = 0; 142 - 143 - virtblk_add_req(vbr, false); 144 - } 145 - 146 - static void virtblk_bio_send_data(struct virtblk_req *vbr) 147 - { 148 - struct virtio_blk *vblk = vbr->vblk; 149 - struct bio *bio = vbr->bio; 150 - bool have_data; 151 - 152 - vbr->flags &= ~VBLK_IS_FLUSH; 153 - vbr->out_hdr.type = 0; 154 - vbr->out_hdr.sector = bio->bi_sector; 155 - vbr->out_hdr.ioprio = bio_prio(bio); 156 - 157 - if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { 158 - have_data = true; 159 - if (bio->bi_rw & REQ_WRITE) 160 - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 161 - else 162 - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 163 - } else 164 - have_data = false; 165 - 166 - virtblk_add_req(vbr, have_data); 167 - } 168 - 169 - static void virtblk_bio_send_data_work(struct work_struct *work) 170 - { 171 - struct virtblk_req *vbr; 172 - 173 - vbr = container_of(work, struct virtblk_req, work); 174 - 175 - virtblk_bio_send_data(vbr); 176 - } 177 - 178 - static void virtblk_bio_send_flush_work(struct work_struct *work) 179 - { 180 - struct virtblk_req *vbr; 181 - 182 - vbr = container_of(work, struct virtblk_req, work); 183 - 184 - 
virtblk_bio_send_flush(vbr); 185 - } 186 - 187 146 static inline void virtblk_request_done(struct virtblk_req *vbr) 188 147 { 189 - struct virtio_blk *vblk = vbr->vblk; 190 148 struct request *req = vbr->req; 191 149 int error = virtblk_result(vbr); 192 150 ··· 123 231 req->errors = (error != 0); 124 232 } 125 233 126 - __blk_end_request_all(req, error); 127 - mempool_free(vbr, vblk->pool); 128 - } 129 - 130 - static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) 131 - { 132 - struct virtio_blk *vblk = vbr->vblk; 133 - 134 - if (vbr->flags & VBLK_REQ_DATA) { 135 - /* Send out the actual write data */ 136 - INIT_WORK(&vbr->work, virtblk_bio_send_data_work); 137 - queue_work(virtblk_wq, &vbr->work); 138 - } else { 139 - bio_endio(vbr->bio, virtblk_result(vbr)); 140 - mempool_free(vbr, vblk->pool); 141 - } 142 - } 143 - 144 - static inline void virtblk_bio_data_done(struct virtblk_req *vbr) 145 - { 146 - struct virtio_blk *vblk = vbr->vblk; 147 - 148 - if (unlikely(vbr->flags & VBLK_REQ_FUA)) { 149 - /* Send out a flush before end the bio */ 150 - vbr->flags &= ~VBLK_REQ_DATA; 151 - INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); 152 - queue_work(virtblk_wq, &vbr->work); 153 - } else { 154 - bio_endio(vbr->bio, virtblk_result(vbr)); 155 - mempool_free(vbr, vblk->pool); 156 - } 157 - } 158 - 159 - static inline void virtblk_bio_done(struct virtblk_req *vbr) 160 - { 161 - if (unlikely(vbr->flags & VBLK_IS_FLUSH)) 162 - virtblk_bio_flush_done(vbr); 163 - else 164 - virtblk_bio_data_done(vbr); 234 + blk_mq_end_io(req, error); 165 235 } 166 236 167 237 static void virtblk_done(struct virtqueue *vq) 168 238 { 169 239 struct virtio_blk *vblk = vq->vdev->priv; 170 - bool bio_done = false, req_done = false; 240 + bool req_done = false; 171 241 struct virtblk_req *vbr; 172 242 unsigned long flags; 173 243 unsigned int len; 174 244 175 - spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); 245 + spin_lock_irqsave(&vblk->vq_lock, flags); 176 246 do { 177 247 
virtqueue_disable_cb(vq); 178 248 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 179 - if (vbr->bio) { 180 - virtblk_bio_done(vbr); 181 - bio_done = true; 182 - } else { 183 - virtblk_request_done(vbr); 184 - req_done = true; 185 - } 249 + virtblk_request_done(vbr); 250 + req_done = true; 186 251 } 187 252 if (unlikely(virtqueue_is_broken(vq))) 188 253 break; 189 254 } while (!virtqueue_enable_cb(vq)); 255 + spin_unlock_irqrestore(&vblk->vq_lock, flags); 256 + 190 257 /* In case queue is stopped waiting for more buffers. */ 191 258 if (req_done) 192 - blk_start_queue(vblk->disk->queue); 193 - spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); 194 - 195 - if (bio_done) 196 - wake_up(&vblk->queue_wait); 259 + blk_mq_start_stopped_hw_queues(vblk->disk->queue); 197 260 } 198 261 199 - static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 200 - struct request *req) 262 + static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 201 263 { 264 + struct virtio_blk *vblk = hctx->queue->queuedata; 265 + struct virtblk_req *vbr = req->special; 266 + unsigned long flags; 202 267 unsigned int num; 203 - struct virtblk_req *vbr; 268 + const bool last = (req->cmd_flags & REQ_END) != 0; 204 269 205 - vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); 206 - if (!vbr) 207 - /* When another request finishes we'll try again. 
*/ 208 - return false; 270 + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); 209 271 210 272 vbr->req = req; 211 - vbr->bio = NULL; 212 273 if (req->cmd_flags & REQ_FLUSH) { 213 274 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 214 275 vbr->out_hdr.sector = 0; ··· 189 344 } 190 345 } 191 346 192 - num = blk_rq_map_sg(q, vbr->req, vblk->sg); 347 + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); 193 348 if (num) { 194 349 if (rq_data_dir(vbr->req) == WRITE) 195 350 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; ··· 197 352 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 198 353 } 199 354 200 - if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { 201 - mempool_free(vbr, vblk->pool); 202 - return false; 203 - } 204 - 205 - return true; 206 - } 207 - 208 - static void virtblk_request(struct request_queue *q) 209 - { 210 - struct virtio_blk *vblk = q->queuedata; 211 - struct request *req; 212 - unsigned int issued = 0; 213 - 214 - while ((req = blk_peek_request(q)) != NULL) { 215 - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); 216 - 217 - /* If this request fails, stop queue and wait for something to 218 - finish to restart it. 
*/ 219 - if (!do_req(q, vblk, req)) { 220 - blk_stop_queue(q); 221 - break; 222 - } 223 - blk_start_request(req); 224 - issued++; 225 - } 226 - 227 - if (issued) 355 + spin_lock_irqsave(&vblk->vq_lock, flags); 356 + if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) { 357 + spin_unlock_irqrestore(&vblk->vq_lock, flags); 358 + blk_mq_stop_hw_queue(hctx); 228 359 virtqueue_kick(vblk->vq); 229 - } 230 - 231 - static void virtblk_make_request(struct request_queue *q, struct bio *bio) 232 - { 233 - struct virtio_blk *vblk = q->queuedata; 234 - struct virtblk_req *vbr; 235 - 236 - BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); 237 - 238 - vbr = virtblk_alloc_req(vblk, GFP_NOIO); 239 - if (!vbr) { 240 - bio_endio(bio, -ENOMEM); 241 - return; 360 + return BLK_MQ_RQ_QUEUE_BUSY; 242 361 } 362 + spin_unlock_irqrestore(&vblk->vq_lock, flags); 243 363 244 - vbr->bio = bio; 245 - vbr->flags = 0; 246 - if (bio->bi_rw & REQ_FLUSH) 247 - vbr->flags |= VBLK_REQ_FLUSH; 248 - if (bio->bi_rw & REQ_FUA) 249 - vbr->flags |= VBLK_REQ_FUA; 250 - if (bio->bi_size) 251 - vbr->flags |= VBLK_REQ_DATA; 252 - 253 - if (unlikely(vbr->flags & VBLK_REQ_FLUSH)) 254 - virtblk_bio_send_flush(vbr); 255 - else 256 - virtblk_bio_send_data(vbr); 364 + if (last) 365 + virtqueue_kick(vblk->vq); 366 + return BLK_MQ_RQ_QUEUE_OK; 257 367 } 258 368 259 369 /* return id (s/n) string for *disk to *id_str ··· 473 673 __ATTR(cache_type, S_IRUGO|S_IWUSR, 474 674 virtblk_cache_type_show, virtblk_cache_type_store); 475 675 676 + static struct blk_mq_ops virtio_mq_ops = { 677 + .queue_rq = virtio_queue_rq, 678 + .map_queue = blk_mq_map_queue, 679 + .alloc_hctx = blk_mq_alloc_single_hw_queue, 680 + .free_hctx = blk_mq_free_single_hw_queue, 681 + }; 682 + 683 + static struct blk_mq_reg virtio_mq_reg = { 684 + .ops = &virtio_mq_ops, 685 + .nr_hw_queues = 1, 686 + .queue_depth = 64, 687 + .numa_node = NUMA_NO_NODE, 688 + .flags = BLK_MQ_F_SHOULD_MERGE, 689 + }; 690 + 691 + static void virtblk_init_vbr(void *data, 
struct blk_mq_hw_ctx *hctx, 692 + struct request *rq, unsigned int nr) 693 + { 694 + struct virtio_blk *vblk = data; 695 + struct virtblk_req *vbr = rq->special; 696 + 697 + sg_init_table(vbr->sg, vblk->sg_elems); 698 + } 699 + 476 700 static int virtblk_probe(struct virtio_device *vdev) 477 701 { 478 702 struct virtio_blk *vblk; 479 703 struct request_queue *q; 480 704 int err, index; 481 - int pool_size; 482 705 483 706 u64 cap; 484 707 u32 v, blk_size, sg_elems, opt_io_size; ··· 525 702 526 703 /* We need an extra sg elements at head and tail. */ 527 704 sg_elems += 2; 528 - vdev->priv = vblk = kmalloc(sizeof(*vblk) + 529 - sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL); 705 + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 530 706 if (!vblk) { 531 707 err = -ENOMEM; 532 708 goto out_free_index; 533 709 } 534 710 535 - init_waitqueue_head(&vblk->queue_wait); 536 711 vblk->vdev = vdev; 537 712 vblk->sg_elems = sg_elems; 538 - sg_init_table(vblk->sg, vblk->sg_elems); 539 713 mutex_init(&vblk->config_lock); 540 714 541 715 INIT_WORK(&vblk->config_work, virtblk_config_changed_work); ··· 541 721 err = init_vq(vblk); 542 722 if (err) 543 723 goto out_free_vblk; 544 - 545 - pool_size = sizeof(struct virtblk_req); 546 - if (use_bio) 547 - pool_size += sizeof(struct scatterlist) * sg_elems; 548 - vblk->pool = mempool_create_kmalloc_pool(1, pool_size); 549 - if (!vblk->pool) { 550 - err = -ENOMEM; 551 - goto out_free_vq; 552 - } 724 + spin_lock_init(&vblk->vq_lock); 553 725 554 726 /* FIXME: How many partitions? How long is a piece of string? 
*/ 555 727 vblk->disk = alloc_disk(1 << PART_BITS); 556 728 if (!vblk->disk) { 557 729 err = -ENOMEM; 558 - goto out_mempool; 730 + goto out_free_vq; 559 731 } 560 732 561 - q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL); 733 + virtio_mq_reg.cmd_size = 734 + sizeof(struct virtblk_req) + 735 + sizeof(struct scatterlist) * sg_elems; 736 + 737 + q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); 562 738 if (!q) { 563 739 err = -ENOMEM; 564 740 goto out_put_disk; 565 741 } 566 742 567 - if (use_bio) 568 - blk_queue_make_request(q, virtblk_make_request); 743 + blk_mq_init_commands(q, virtblk_init_vbr, vblk); 744 + 569 745 q->queuedata = vblk; 570 746 571 747 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); ··· 664 848 blk_cleanup_queue(vblk->disk->queue); 665 849 out_put_disk: 666 850 put_disk(vblk->disk); 667 - out_mempool: 668 - mempool_destroy(vblk->pool); 669 851 out_free_vq: 670 852 vdev->config->del_vqs(vdev); 671 853 out_free_vblk: ··· 695 881 696 882 refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); 697 883 put_disk(vblk->disk); 698 - mempool_destroy(vblk->pool); 699 884 vdev->config->del_vqs(vdev); 700 885 kfree(vblk); 701 886 ··· 718 905 719 906 flush_work(&vblk->config_work); 720 907 721 - spin_lock_irq(vblk->disk->queue->queue_lock); 722 - blk_stop_queue(vblk->disk->queue); 723 - spin_unlock_irq(vblk->disk->queue->queue_lock); 724 - blk_sync_queue(vblk->disk->queue); 908 + blk_mq_stop_hw_queues(vblk->disk->queue); 725 909 726 910 vdev->config->del_vqs(vdev); 727 911 return 0; ··· 731 921 732 922 vblk->config_enable = true; 733 923 ret = init_vq(vdev->priv); 734 - if (!ret) { 735 - spin_lock_irq(vblk->disk->queue->queue_lock); 736 - blk_start_queue(vblk->disk->queue); 737 - spin_unlock_irq(vblk->disk->queue->queue_lock); 738 - } 924 + if (!ret) 925 + blk_mq_start_stopped_hw_queues(vblk->disk->queue); 926 + 739 927 return ret; 740 928 } 741 929 #endif
+2 -9
drivers/md/bcache/Kconfig
··· 13 13 ---help--- 14 14 Don't select this option unless you're a developer 15 15 16 - Enables extra debugging tools (primarily a fuzz tester) 17 - 18 - config BCACHE_EDEBUG 19 - bool "Extended runtime checks" 20 - depends on BCACHE 21 - ---help--- 22 - Don't select this option unless you're a developer 23 - 24 - Enables extra runtime checks which significantly affect performance 16 + Enables extra debugging tools, allows expensive runtime checks to be 17 + turned on. 25 18 26 19 config BCACHE_CLOSURES_DEBUG 27 20 bool "Debug closures"
+241 -142
drivers/md/bcache/alloc.c
··· 63 63 #include "bcache.h" 64 64 #include "btree.h" 65 65 66 + #include <linux/blkdev.h> 66 67 #include <linux/freezer.h> 67 68 #include <linux/kthread.h> 68 69 #include <linux/random.h> 69 70 #include <trace/events/bcache.h> 70 - 71 - #define MAX_IN_FLIGHT_DISCARDS 8U 72 71 73 72 /* Bucket heap / gen */ 74 73 ··· 118 119 } 119 120 120 121 mutex_unlock(&c->bucket_lock); 121 - } 122 - 123 - /* Discard/TRIM */ 124 - 125 - struct discard { 126 - struct list_head list; 127 - struct work_struct work; 128 - struct cache *ca; 129 - long bucket; 130 - 131 - struct bio bio; 132 - struct bio_vec bv; 133 - }; 134 - 135 - static void discard_finish(struct work_struct *w) 136 - { 137 - struct discard *d = container_of(w, struct discard, work); 138 - struct cache *ca = d->ca; 139 - char buf[BDEVNAME_SIZE]; 140 - 141 - if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { 142 - pr_notice("discard error on %s, disabling", 143 - bdevname(ca->bdev, buf)); 144 - d->ca->discard = 0; 145 - } 146 - 147 - mutex_lock(&ca->set->bucket_lock); 148 - 149 - fifo_push(&ca->free, d->bucket); 150 - list_add(&d->list, &ca->discards); 151 - atomic_dec(&ca->discards_in_flight); 152 - 153 - mutex_unlock(&ca->set->bucket_lock); 154 - 155 - closure_wake_up(&ca->set->bucket_wait); 156 - wake_up_process(ca->alloc_thread); 157 - 158 - closure_put(&ca->set->cl); 159 - } 160 - 161 - static void discard_endio(struct bio *bio, int error) 162 - { 163 - struct discard *d = container_of(bio, struct discard, bio); 164 - schedule_work(&d->work); 165 - } 166 - 167 - static void do_discard(struct cache *ca, long bucket) 168 - { 169 - struct discard *d = list_first_entry(&ca->discards, 170 - struct discard, list); 171 - 172 - list_del(&d->list); 173 - d->bucket = bucket; 174 - 175 - atomic_inc(&ca->discards_in_flight); 176 - closure_get(&ca->set->cl); 177 - 178 - bio_init(&d->bio); 179 - 180 - d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); 181 - d->bio.bi_bdev = ca->bdev; 182 - d->bio.bi_rw = 
REQ_WRITE|REQ_DISCARD; 183 - d->bio.bi_max_vecs = 1; 184 - d->bio.bi_io_vec = d->bio.bi_inline_vecs; 185 - d->bio.bi_size = bucket_bytes(ca); 186 - d->bio.bi_end_io = discard_endio; 187 - bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 188 - 189 - submit_bio(0, &d->bio); 190 122 } 191 123 192 124 /* Allocation */ ··· 210 280 * multiple times when it can't do anything 211 281 */ 212 282 ca->invalidate_needs_gc = 1; 213 - bch_queue_gc(ca->set); 283 + wake_up_gc(ca->set); 214 284 return; 215 285 } 216 286 ··· 235 305 236 306 if (++checked >= ca->sb.nbuckets) { 237 307 ca->invalidate_needs_gc = 1; 238 - bch_queue_gc(ca->set); 308 + wake_up_gc(ca->set); 239 309 return; 240 310 } 241 311 } ··· 260 330 261 331 if (++checked >= ca->sb.nbuckets / 2) { 262 332 ca->invalidate_needs_gc = 1; 263 - bch_queue_gc(ca->set); 333 + wake_up_gc(ca->set); 264 334 return; 265 335 } 266 336 } ··· 328 398 else 329 399 break; 330 400 331 - allocator_wait(ca, (int) fifo_free(&ca->free) > 332 - atomic_read(&ca->discards_in_flight)); 333 - 334 401 if (ca->discard) { 335 - allocator_wait(ca, !list_empty(&ca->discards)); 336 - do_discard(ca, bucket); 337 - } else { 338 - fifo_push(&ca->free, bucket); 339 - closure_wake_up(&ca->set->bucket_wait); 402 + mutex_unlock(&ca->set->bucket_lock); 403 + blkdev_issue_discard(ca->bdev, 404 + bucket_to_sector(ca->set, bucket), 405 + ca->sb.block_size, GFP_KERNEL, 0); 406 + mutex_lock(&ca->set->bucket_lock); 340 407 } 408 + 409 + allocator_wait(ca, !fifo_full(&ca->free)); 410 + 411 + fifo_push(&ca->free, bucket); 412 + wake_up(&ca->set->bucket_wait); 341 413 } 342 414 343 415 /* ··· 365 433 } 366 434 } 367 435 368 - long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) 436 + long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 369 437 { 370 - long r = -1; 371 - again: 438 + DEFINE_WAIT(w); 439 + struct bucket *b; 440 + long r; 441 + 442 + /* fastpath */ 443 + if (fifo_used(&ca->free) > 
ca->watermark[watermark]) { 444 + fifo_pop(&ca->free, r); 445 + goto out; 446 + } 447 + 448 + if (!wait) 449 + return -1; 450 + 451 + while (1) { 452 + if (fifo_used(&ca->free) > ca->watermark[watermark]) { 453 + fifo_pop(&ca->free, r); 454 + break; 455 + } 456 + 457 + prepare_to_wait(&ca->set->bucket_wait, &w, 458 + TASK_UNINTERRUPTIBLE); 459 + 460 + mutex_unlock(&ca->set->bucket_lock); 461 + schedule(); 462 + mutex_lock(&ca->set->bucket_lock); 463 + } 464 + 465 + finish_wait(&ca->set->bucket_wait, &w); 466 + out: 372 467 wake_up_process(ca->alloc_thread); 373 468 374 - if (fifo_used(&ca->free) > ca->watermark[watermark] && 375 - fifo_pop(&ca->free, r)) { 376 - struct bucket *b = ca->buckets + r; 377 - #ifdef CONFIG_BCACHE_EDEBUG 469 + if (expensive_debug_checks(ca->set)) { 378 470 size_t iter; 379 471 long i; 380 472 ··· 411 455 BUG_ON(i == r); 412 456 fifo_for_each(i, &ca->unused, iter) 413 457 BUG_ON(i == r); 414 - #endif 415 - BUG_ON(atomic_read(&b->pin) != 1); 416 - 417 - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 418 - 419 - if (watermark <= WATERMARK_METADATA) { 420 - SET_GC_MARK(b, GC_MARK_METADATA); 421 - b->prio = BTREE_PRIO; 422 - } else { 423 - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 424 - b->prio = INITIAL_PRIO; 425 - } 426 - 427 - return r; 428 458 } 429 459 430 - trace_bcache_alloc_fail(ca); 460 + b = ca->buckets + r; 431 461 432 - if (cl) { 433 - closure_wait(&ca->set->bucket_wait, cl); 462 + BUG_ON(atomic_read(&b->pin) != 1); 434 463 435 - if (closure_blocking(cl)) { 436 - mutex_unlock(&ca->set->bucket_lock); 437 - closure_sync(cl); 438 - mutex_lock(&ca->set->bucket_lock); 439 - goto again; 440 - } 464 + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 465 + 466 + if (watermark <= WATERMARK_METADATA) { 467 + SET_GC_MARK(b, GC_MARK_METADATA); 468 + b->prio = BTREE_PRIO; 469 + } else { 470 + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 471 + b->prio = INITIAL_PRIO; 441 472 } 442 473 443 - return -1; 474 + return r; 444 475 } 445 476 446 477 void 
bch_bucket_free(struct cache_set *c, struct bkey *k) ··· 444 501 } 445 502 446 503 int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 447 - struct bkey *k, int n, struct closure *cl) 504 + struct bkey *k, int n, bool wait) 448 505 { 449 506 int i; 450 507 ··· 457 514 458 515 for (i = 0; i < n; i++) { 459 516 struct cache *ca = c->cache_by_alloc[i]; 460 - long b = bch_bucket_alloc(ca, watermark, cl); 517 + long b = bch_bucket_alloc(ca, watermark, wait); 461 518 462 519 if (b == -1) 463 520 goto err; ··· 472 529 return 0; 473 530 err: 474 531 bch_bucket_free(c, k); 475 - __bkey_put(c, k); 532 + bkey_put(c, k); 476 533 return -1; 477 534 } 478 535 479 536 int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 480 - struct bkey *k, int n, struct closure *cl) 537 + struct bkey *k, int n, bool wait) 481 538 { 482 539 int ret; 483 540 mutex_lock(&c->bucket_lock); 484 - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); 541 + ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 485 542 mutex_unlock(&c->bucket_lock); 486 543 return ret; 487 544 } 488 545 546 + /* Sector allocator */ 547 + 548 + struct open_bucket { 549 + struct list_head list; 550 + unsigned last_write_point; 551 + unsigned sectors_free; 552 + BKEY_PADDED(key); 553 + }; 554 + 555 + /* 556 + * We keep multiple buckets open for writes, and try to segregate different 557 + * write streams for better cache utilization: first we look for a bucket where 558 + * the last write to it was sequential with the current write, and failing that 559 + * we look for a bucket that was last used by the same task. 560 + * 561 + * The ideas is if you've got multiple tasks pulling data into the cache at the 562 + * same time, you'll get better cache utilization if you try to segregate their 563 + * data and preserve locality. 564 + * 565 + * For example, say you've starting Firefox at the same time you're copying a 566 + * bunch of files. 
Firefox will likely end up being fairly hot and stay in the 567 + * cache awhile, but the data you copied might not be; if you wrote all that 568 + * data to the same buckets it'd get invalidated at the same time. 569 + * 570 + * Both of those tasks will be doing fairly random IO so we can't rely on 571 + * detecting sequential IO to segregate their data, but going off of the task 572 + * should be a sane heuristic. 573 + */ 574 + static struct open_bucket *pick_data_bucket(struct cache_set *c, 575 + const struct bkey *search, 576 + unsigned write_point, 577 + struct bkey *alloc) 578 + { 579 + struct open_bucket *ret, *ret_task = NULL; 580 + 581 + list_for_each_entry_reverse(ret, &c->data_buckets, list) 582 + if (!bkey_cmp(&ret->key, search)) 583 + goto found; 584 + else if (ret->last_write_point == write_point) 585 + ret_task = ret; 586 + 587 + ret = ret_task ?: list_first_entry(&c->data_buckets, 588 + struct open_bucket, list); 589 + found: 590 + if (!ret->sectors_free && KEY_PTRS(alloc)) { 591 + ret->sectors_free = c->sb.bucket_size; 592 + bkey_copy(&ret->key, alloc); 593 + bkey_init(alloc); 594 + } 595 + 596 + if (!ret->sectors_free) 597 + ret = NULL; 598 + 599 + return ret; 600 + } 601 + 602 + /* 603 + * Allocates some space in the cache to write to, and k to point to the newly 604 + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 605 + * end of the newly allocated space). 606 + * 607 + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 608 + * sectors were actually allocated. 609 + * 610 + * If s->writeback is true, will not fail. 611 + */ 612 + bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, 613 + unsigned write_point, unsigned write_prio, bool wait) 614 + { 615 + struct open_bucket *b; 616 + BKEY_PADDED(key) alloc; 617 + unsigned i; 618 + 619 + /* 620 + * We might have to allocate a new bucket, which we can't do with a 621 + * spinlock held. 
So if we have to allocate, we drop the lock, allocate 622 + * and then retry. KEY_PTRS() indicates whether alloc points to 623 + * allocated bucket(s). 624 + */ 625 + 626 + bkey_init(&alloc.key); 627 + spin_lock(&c->data_bucket_lock); 628 + 629 + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 630 + unsigned watermark = write_prio 631 + ? WATERMARK_MOVINGGC 632 + : WATERMARK_NONE; 633 + 634 + spin_unlock(&c->data_bucket_lock); 635 + 636 + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) 637 + return false; 638 + 639 + spin_lock(&c->data_bucket_lock); 640 + } 641 + 642 + /* 643 + * If we had to allocate, we might race and not need to allocate the 644 + * second time we call find_data_bucket(). If we allocated a bucket but 645 + * didn't use it, drop the refcount bch_bucket_alloc_set() took: 646 + */ 647 + if (KEY_PTRS(&alloc.key)) 648 + bkey_put(c, &alloc.key); 649 + 650 + for (i = 0; i < KEY_PTRS(&b->key); i++) 651 + EBUG_ON(ptr_stale(c, &b->key, i)); 652 + 653 + /* Set up the pointer to the space we're allocating: */ 654 + 655 + for (i = 0; i < KEY_PTRS(&b->key); i++) 656 + k->ptr[i] = b->key.ptr[i]; 657 + 658 + sectors = min(sectors, b->sectors_free); 659 + 660 + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 661 + SET_KEY_SIZE(k, sectors); 662 + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 663 + 664 + /* 665 + * Move b to the end of the lru, and keep track of what this bucket was 666 + * last used for: 667 + */ 668 + list_move_tail(&b->list, &c->data_buckets); 669 + bkey_copy_key(&b->key, k); 670 + b->last_write_point = write_point; 671 + 672 + b->sectors_free -= sectors; 673 + 674 + for (i = 0; i < KEY_PTRS(&b->key); i++) { 675 + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 676 + 677 + atomic_long_add(sectors, 678 + &PTR_CACHE(c, &b->key, i)->sectors_written); 679 + } 680 + 681 + if (b->sectors_free < c->sb.block_size) 682 + b->sectors_free = 0; 683 + 684 + /* 685 + * k takes refcounts on the buckets it points to until it's 
inserted 686 + * into the btree, but if we're done with this bucket we just transfer 687 + * get_data_bucket()'s refcount. 688 + */ 689 + if (b->sectors_free) 690 + for (i = 0; i < KEY_PTRS(&b->key); i++) 691 + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 692 + 693 + spin_unlock(&c->data_bucket_lock); 694 + return true; 695 + } 696 + 489 697 /* Init */ 698 + 699 + void bch_open_buckets_free(struct cache_set *c) 700 + { 701 + struct open_bucket *b; 702 + 703 + while (!list_empty(&c->data_buckets)) { 704 + b = list_first_entry(&c->data_buckets, 705 + struct open_bucket, list); 706 + list_del(&b->list); 707 + kfree(b); 708 + } 709 + } 710 + 711 + int bch_open_buckets_alloc(struct cache_set *c) 712 + { 713 + int i; 714 + 715 + spin_lock_init(&c->data_bucket_lock); 716 + 717 + for (i = 0; i < 6; i++) { 718 + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 719 + if (!b) 720 + return -ENOMEM; 721 + 722 + list_add(&b->list, &c->data_buckets); 723 + } 724 + 725 + return 0; 726 + } 490 727 491 728 int bch_cache_allocator_start(struct cache *ca) 492 729 { ··· 679 556 return 0; 680 557 } 681 558 682 - void bch_cache_allocator_exit(struct cache *ca) 683 - { 684 - struct discard *d; 685 - 686 - while (!list_empty(&ca->discards)) { 687 - d = list_first_entry(&ca->discards, struct discard, list); 688 - cancel_work_sync(&d->work); 689 - list_del(&d->list); 690 - kfree(d); 691 - } 692 - } 693 - 694 559 int bch_cache_allocator_init(struct cache *ca) 695 560 { 696 - unsigned i; 697 - 698 561 /* 699 562 * Reserve: 700 563 * Prio/gen writes first ··· 697 588 698 589 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 699 590 ca->watermark[WATERMARK_MOVINGGC]; 700 - 701 - for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { 702 - struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); 703 - if (!d) 704 - return -ENOMEM; 705 - 706 - d->ca = ca; 707 - INIT_WORK(&d->work, discard_finish); 708 - list_add(&d->list, &ca->discards); 709 - } 710 591 711 592 return 0; 712 593 }
+33 -294
drivers/md/bcache/bcache.h
··· 177 177 178 178 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 179 179 180 + #include <linux/bcache.h> 180 181 #include <linux/bio.h> 181 182 #include <linux/kobject.h> 182 183 #include <linux/list.h> ··· 211 210 #define GC_MARK_METADATA 2 212 211 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); 213 212 214 - struct bkey { 215 - uint64_t high; 216 - uint64_t low; 217 - uint64_t ptr[]; 218 - }; 219 - 220 - /* Enough for a key with 6 pointers */ 221 - #define BKEY_PAD 8 222 - 223 - #define BKEY_PADDED(key) \ 224 - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } 225 - 226 - /* Version 0: Cache device 227 - * Version 1: Backing device 228 - * Version 2: Seed pointer into btree node checksum 229 - * Version 3: Cache device with new UUID format 230 - * Version 4: Backing device with data offset 231 - */ 232 - #define BCACHE_SB_VERSION_CDEV 0 233 - #define BCACHE_SB_VERSION_BDEV 1 234 - #define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 235 - #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 236 - #define BCACHE_SB_MAX_VERSION 4 237 - 238 - #define SB_SECTOR 8 239 - #define SB_SIZE 4096 240 - #define SB_LABEL_SIZE 32 241 - #define SB_JOURNAL_BUCKETS 256U 242 - /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ 243 - #define MAX_CACHES_PER_SET 8 244 - 245 - #define BDEV_DATA_START_DEFAULT 16 /* sectors */ 246 - 247 - struct cache_sb { 248 - uint64_t csum; 249 - uint64_t offset; /* sector where this sb was written */ 250 - uint64_t version; 251 - 252 - uint8_t magic[16]; 253 - 254 - uint8_t uuid[16]; 255 - union { 256 - uint8_t set_uuid[16]; 257 - uint64_t set_magic; 258 - }; 259 - uint8_t label[SB_LABEL_SIZE]; 260 - 261 - uint64_t flags; 262 - uint64_t seq; 263 - uint64_t pad[8]; 264 - 265 - union { 266 - struct { 267 - /* Cache devices */ 268 - uint64_t nbuckets; /* device size */ 269 - 270 - uint16_t block_size; /* sectors */ 271 - uint16_t bucket_size; /* sectors */ 272 - 273 - uint16_t nr_in_set; 274 - uint16_t nr_this_dev; 275 - }; 276 - 
struct { 277 - /* Backing devices */ 278 - uint64_t data_offset; 279 - 280 - /* 281 - * block_size from the cache device section is still used by 282 - * backing devices, so don't add anything here until we fix 283 - * things to not need it for backing devices anymore 284 - */ 285 - }; 286 - }; 287 - 288 - uint32_t last_mount; /* time_t */ 289 - 290 - uint16_t first_bucket; 291 - union { 292 - uint16_t njournal_buckets; 293 - uint16_t keys; 294 - }; 295 - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ 296 - }; 297 - 298 - BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); 299 - BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); 300 - BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); 301 - #define CACHE_REPLACEMENT_LRU 0U 302 - #define CACHE_REPLACEMENT_FIFO 1U 303 - #define CACHE_REPLACEMENT_RANDOM 2U 304 - 305 - BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); 306 - #define CACHE_MODE_WRITETHROUGH 0U 307 - #define CACHE_MODE_WRITEBACK 1U 308 - #define CACHE_MODE_WRITEAROUND 2U 309 - #define CACHE_MODE_NONE 3U 310 - BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); 311 - #define BDEV_STATE_NONE 0U 312 - #define BDEV_STATE_CLEAN 1U 313 - #define BDEV_STATE_DIRTY 2U 314 - #define BDEV_STATE_STALE 3U 315 - 316 - /* Version 1: Seed pointer into btree node checksum 317 - */ 318 - #define BCACHE_BSET_VERSION 1 319 - 320 - /* 321 - * This is the on disk format for btree nodes - a btree node on disk is a list 322 - * of these; within each set the keys are sorted 323 - */ 324 - struct bset { 325 - uint64_t csum; 326 - uint64_t magic; 327 - uint64_t seq; 328 - uint32_t version; 329 - uint32_t keys; 330 - 331 - union { 332 - struct bkey start[0]; 333 - uint64_t d[0]; 334 - }; 335 - }; 336 - 337 - /* 338 - * On disk format for priorities and gens - see super.c near prio_write() for 339 - * more. 
340 - */ 341 - struct prio_set { 342 - uint64_t csum; 343 - uint64_t magic; 344 - uint64_t seq; 345 - uint32_t version; 346 - uint32_t pad; 347 - 348 - uint64_t next_bucket; 349 - 350 - struct bucket_disk { 351 - uint16_t prio; 352 - uint8_t gen; 353 - } __attribute((packed)) data[]; 354 - }; 355 - 356 - struct uuid_entry { 357 - union { 358 - struct { 359 - uint8_t uuid[16]; 360 - uint8_t label[32]; 361 - uint32_t first_reg; 362 - uint32_t last_reg; 363 - uint32_t invalidated; 364 - 365 - uint32_t flags; 366 - /* Size of flash only volumes */ 367 - uint64_t sectors; 368 - }; 369 - 370 - uint8_t pad[128]; 371 - }; 372 - }; 373 - 374 - BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); 375 - 376 213 #include "journal.h" 377 214 #include "stats.h" 378 215 struct search; ··· 222 383 BKEY_PADDED(key); 223 384 void *private; 224 385 }; 225 - 226 - typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); 227 386 228 387 struct keybuf { 229 388 struct bkey last_scanned; ··· 237 400 238 401 struct rb_root keys; 239 402 240 - #define KEYBUF_NR 100 403 + #define KEYBUF_NR 500 241 404 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); 242 405 }; 243 406 ··· 266 429 267 430 struct gendisk *disk; 268 431 269 - /* If nonzero, we're closing */ 270 - atomic_t closing; 432 + unsigned long flags; 433 + #define BCACHE_DEV_CLOSING 0 434 + #define BCACHE_DEV_DETACHING 1 435 + #define BCACHE_DEV_UNLINK_DONE 2 271 436 272 - /* If nonzero, we're detaching/unregistering from cache set */ 273 - atomic_t detaching; 274 - int flush_done; 275 - 276 - uint64_t nr_stripes; 277 - unsigned stripe_size_bits; 437 + unsigned nr_stripes; 438 + unsigned stripe_size; 278 439 atomic_t *stripe_sectors_dirty; 440 + unsigned long *full_dirty_stripes; 279 441 280 442 unsigned long sectors_dirty_last; 281 443 long sectors_dirty_derivative; ··· 345 509 346 510 /* Limit number of writeback bios in flight */ 347 511 struct semaphore in_flight; 348 - struct closure_with_timer writeback; 
512 + struct task_struct *writeback_thread; 349 513 350 514 struct keybuf writeback_keys; 351 515 ··· 363 527 unsigned sequential_cutoff; 364 528 unsigned readahead; 365 529 366 - unsigned sequential_merge:1; 367 530 unsigned verify:1; 531 + unsigned bypass_torture_test:1; 368 532 369 533 unsigned partial_stripes_expensive:1; 370 534 unsigned writeback_metadata:1; ··· 456 620 457 621 bool discard; /* Get rid of? */ 458 622 459 - /* 460 - * We preallocate structs for issuing discards to buckets, and keep them 461 - * on this list when they're not in use; do_discard() issues discards 462 - * whenever there's work to do and is called by free_some_buckets() and 463 - * when a discard finishes. 464 - */ 465 - atomic_t discards_in_flight; 466 - struct list_head discards; 467 - 468 623 struct journal_device journal; 469 624 470 625 /* The rest of this all shows up in sysfs */ ··· 476 649 477 650 size_t nkeys; 478 651 uint64_t data; /* sectors */ 479 - uint64_t dirty; /* sectors */ 480 652 unsigned in_use; /* percent */ 481 653 }; 482 654 ··· 570 744 * basically a lock for this that we can wait on asynchronously. The 571 745 * btree_root() macro releases the lock when it returns. 572 746 */ 573 - struct closure *try_harder; 574 - struct closure_waitlist try_wait; 747 + struct task_struct *try_harder; 748 + wait_queue_head_t try_wait; 575 749 uint64_t try_harder_start; 576 750 577 751 /* ··· 585 759 * written. 
586 760 */ 587 761 atomic_t prio_blocked; 588 - struct closure_waitlist bucket_wait; 762 + wait_queue_head_t bucket_wait; 589 763 590 764 /* 591 765 * For any bio we don't skip we subtract the number of sectors from ··· 608 782 struct gc_stat gc_stats; 609 783 size_t nbuckets; 610 784 611 - struct closure_with_waitlist gc; 785 + struct task_struct *gc_thread; 612 786 /* Where in the btree gc currently is */ 613 787 struct bkey gc_done; 614 788 ··· 621 795 /* Counts how many sectors bio_insert has added to the cache */ 622 796 atomic_t sectors_to_gc; 623 797 624 - struct closure moving_gc; 625 - struct closure_waitlist moving_gc_wait; 798 + wait_queue_head_t moving_gc_wait; 626 799 struct keybuf moving_gc_keys; 627 800 /* Number of moving GC bios in flight */ 628 - atomic_t in_flight; 801 + struct semaphore moving_in_flight; 629 802 630 803 struct btree *root; 631 804 ··· 666 841 unsigned congested_read_threshold_us; 667 842 unsigned congested_write_threshold_us; 668 843 669 - spinlock_t sort_time_lock; 670 844 struct time_stats sort_time; 671 845 struct time_stats btree_gc_time; 672 846 struct time_stats btree_split_time; 673 - spinlock_t btree_read_time_lock; 674 847 struct time_stats btree_read_time; 675 848 struct time_stats try_harder_time; 676 849 677 850 atomic_long_t cache_read_races; 678 851 atomic_long_t writeback_keys_done; 679 852 atomic_long_t writeback_keys_failed; 853 + 854 + enum { 855 + ON_ERROR_UNREGISTER, 856 + ON_ERROR_PANIC, 857 + } on_error; 680 858 unsigned error_limit; 681 859 unsigned error_decay; 860 + 682 861 unsigned short journal_delay_ms; 683 862 unsigned verify:1; 684 863 unsigned key_merging_disabled:1; 864 + unsigned expensive_debug_checks:1; 685 865 unsigned gc_always_rewrite:1; 686 866 unsigned shrinker_disabled:1; 687 867 unsigned copy_gc_enabled:1; ··· 694 864 #define BUCKET_HASH_BITS 12 695 865 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; 696 866 }; 697 - 698 - static inline bool key_merging_disabled(struct cache_set 
*c) 699 - { 700 - #ifdef CONFIG_BCACHE_DEBUG 701 - return c->key_merging_disabled; 702 - #else 703 - return 0; 704 - #endif 705 - } 706 - 707 - static inline bool SB_IS_BDEV(const struct cache_sb *sb) 708 - { 709 - return sb->version == BCACHE_SB_VERSION_BDEV 710 - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; 711 - } 712 867 713 868 struct bbio { 714 869 unsigned submit_time_us; ··· 748 933 #define prio_buckets(c) \ 749 934 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) 750 935 751 - #define JSET_MAGIC 0x245235c1a3625032ULL 752 - #define PSET_MAGIC 0x6750e15f87337f91ULL 753 - #define BSET_MAGIC 0x90135c78b99e07f5ULL 754 - 755 - #define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) 756 - #define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) 757 - #define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) 758 - 759 - /* Bkey fields: all units are in sectors */ 760 - 761 - #define KEY_FIELD(name, field, offset, size) \ 762 - BITMASK(name, struct bkey, field, offset, size) 763 - 764 - #define PTR_FIELD(name, offset, size) \ 765 - static inline uint64_t name(const struct bkey *k, unsigned i) \ 766 - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ 767 - \ 768 - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ 769 - { \ 770 - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ 771 - k->ptr[i] |= v << offset; \ 772 - } 773 - 774 - KEY_FIELD(KEY_PTRS, high, 60, 3) 775 - KEY_FIELD(HEADER_SIZE, high, 58, 2) 776 - KEY_FIELD(KEY_CSUM, high, 56, 2) 777 - KEY_FIELD(KEY_PINNED, high, 55, 1) 778 - KEY_FIELD(KEY_DIRTY, high, 36, 1) 779 - 780 - KEY_FIELD(KEY_SIZE, high, 20, 16) 781 - KEY_FIELD(KEY_INODE, high, 0, 20) 782 - 783 - /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ 784 - 785 - static inline uint64_t KEY_OFFSET(const struct bkey *k) 786 - { 787 - return k->low; 788 - } 789 - 790 - static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) 791 - { 792 - k->low = v; 793 - } 794 - 795 - 
PTR_FIELD(PTR_DEV, 51, 12) 796 - PTR_FIELD(PTR_OFFSET, 8, 43) 797 - PTR_FIELD(PTR_GEN, 0, 8) 798 - 799 - #define PTR_CHECK_DEV ((1 << 12) - 1) 800 - 801 - #define PTR(gen, offset, dev) \ 802 - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) 803 - 804 936 static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) 805 937 { 806 938 return s >> c->bucket_bits; ··· 786 1024 787 1025 /* Btree key macros */ 788 1026 789 - /* 790 - * The high bit being set is a relic from when we used it to do binary 791 - * searches - it told you where a key started. It's not used anymore, 792 - * and can probably be safely dropped. 793 - */ 794 - #define KEY(dev, sector, len) \ 795 - ((struct bkey) { \ 796 - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ 797 - .low = (sector) \ 798 - }) 799 - 800 1027 static inline void bkey_init(struct bkey *k) 801 1028 { 802 - *k = KEY(0, 0, 0); 1029 + *k = ZERO_KEY; 803 1030 } 804 - 805 - #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) 806 - #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) 807 - #define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) 808 - #define ZERO_KEY KEY(0, 0, 0) 809 1031 810 1032 /* 811 1033 * This is used for various on disk data structures - cache_sb, prio_set, bset, ··· 839 1093 #define for_each_bucket(b, ca) \ 840 1094 for (b = (ca)->buckets + (ca)->sb.first_bucket; \ 841 1095 b < (ca)->buckets + (ca)->sb.nbuckets; b++) 842 - 843 - static inline void __bkey_put(struct cache_set *c, struct bkey *k) 844 - { 845 - unsigned i; 846 - 847 - for (i = 0; i < KEY_PTRS(k); i++) 848 - atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 849 - } 850 1096 851 1097 static inline void cached_dev_put(struct cached_dev *dc) 852 1098 { ··· 911 1173 void bch_rescale_priorities(struct cache_set *, int); 912 1174 bool bch_bucket_add_unused(struct cache *, struct bucket *); 913 1175 914 - long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 1176 + long bch_bucket_alloc(struct cache *, 
unsigned, bool); 915 1177 void bch_bucket_free(struct cache_set *, struct bkey *); 916 1178 917 1179 int __bch_bucket_alloc_set(struct cache_set *, unsigned, 918 - struct bkey *, int, struct closure *); 1180 + struct bkey *, int, bool); 919 1181 int bch_bucket_alloc_set(struct cache_set *, unsigned, 920 - struct bkey *, int, struct closure *); 1182 + struct bkey *, int, bool); 1183 + bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, 1184 + unsigned, unsigned, bool); 921 1185 922 1186 __printf(2, 3) 923 1187 bool bch_cache_set_error(struct cache_set *, const char *, ...); ··· 927 1187 void bch_prio_write(struct cache *); 928 1188 void bch_write_bdev_super(struct cached_dev *, struct closure *); 929 1189 930 - extern struct workqueue_struct *bcache_wq, *bch_gc_wq; 1190 + extern struct workqueue_struct *bcache_wq; 931 1191 extern const char * const bch_cache_modes[]; 932 1192 extern struct mutex bch_register_lock; 933 1193 extern struct list_head bch_cache_sets; ··· 960 1220 void bch_btree_cache_free(struct cache_set *); 961 1221 int bch_btree_cache_alloc(struct cache_set *); 962 1222 void bch_moving_init_cache_set(struct cache_set *); 1223 + int bch_open_buckets_alloc(struct cache_set *); 1224 + void bch_open_buckets_free(struct cache_set *); 963 1225 964 1226 int bch_cache_allocator_start(struct cache *ca); 965 - void bch_cache_allocator_exit(struct cache *ca); 966 1227 int bch_cache_allocator_init(struct cache *ca); 967 1228 968 1229 void bch_debug_exit(void); 969 1230 int bch_debug_init(struct kobject *); 970 - void bch_writeback_exit(void); 971 - int bch_writeback_init(void); 972 1231 void bch_request_exit(void); 973 1232 int bch_request_init(void); 974 1233 void bch_btree_exit(void);
+146 -143
drivers/md/bcache/bset.c
··· 14 14 15 15 /* Keylists */ 16 16 17 - void bch_keylist_copy(struct keylist *dest, struct keylist *src) 18 - { 19 - *dest = *src; 20 - 21 - if (src->list == src->d) { 22 - size_t n = (uint64_t *) src->top - src->d; 23 - dest->top = (struct bkey *) &dest->d[n]; 24 - dest->list = dest->d; 25 - } 26 - } 27 - 28 17 int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 29 18 { 30 - unsigned oldsize = (uint64_t *) l->top - l->list; 31 - unsigned newsize = oldsize + 2 + nptrs; 32 - uint64_t *new; 19 + size_t oldsize = bch_keylist_nkeys(l); 20 + size_t newsize = oldsize + 2 + nptrs; 21 + uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; 22 + uint64_t *new_keys; 33 23 34 24 /* The journalling code doesn't handle the case where the keys to insert 35 25 * is bigger than an empty write: If we just return -ENOMEM here, ··· 35 45 roundup_pow_of_two(oldsize) == newsize) 36 46 return 0; 37 47 38 - new = krealloc(l->list == l->d ? NULL : l->list, 39 - sizeof(uint64_t) * newsize, GFP_NOIO); 48 + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); 40 49 41 - if (!new) 50 + if (!new_keys) 42 51 return -ENOMEM; 43 52 44 - if (l->list == l->d) 45 - memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); 53 + if (!old_keys) 54 + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); 46 55 47 - l->list = new; 48 - l->top = (struct bkey *) (&l->list[oldsize]); 56 + l->keys_p = new_keys; 57 + l->top_p = new_keys + oldsize; 49 58 50 59 return 0; 51 60 } 52 61 53 62 struct bkey *bch_keylist_pop(struct keylist *l) 54 63 { 55 - struct bkey *k = l->bottom; 64 + struct bkey *k = l->keys; 56 65 57 66 if (k == l->top) 58 67 return NULL; ··· 62 73 return l->top = k; 63 74 } 64 75 76 + void bch_keylist_pop_front(struct keylist *l) 77 + { 78 + l->top_p -= bkey_u64s(l->keys); 79 + 80 + memmove(l->keys, 81 + bkey_next(l->keys), 82 + bch_keylist_bytes(l)); 83 + } 84 + 65 85 /* Pointer validation */ 66 86 67 - bool 
__bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) 87 + static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) 68 88 { 69 89 unsigned i; 70 - char buf[80]; 71 - 72 - if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 73 - goto bad; 74 - 75 - if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) 76 - goto bad; 77 - 78 - if (!KEY_SIZE(k)) 79 - return true; 80 90 81 91 for (i = 0; i < KEY_PTRS(k); i++) 82 92 if (ptr_available(c, k, i)) { ··· 86 98 if (KEY_SIZE(k) + r > c->sb.bucket_size || 87 99 bucket < ca->sb.first_bucket || 88 100 bucket >= ca->sb.nbuckets) 89 - goto bad; 101 + return true; 90 102 } 103 + 104 + return false; 105 + } 106 + 107 + bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) 108 + { 109 + char buf[80]; 110 + 111 + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) 112 + goto bad; 113 + 114 + if (__ptr_invalid(c, k)) 115 + goto bad; 91 116 92 117 return false; 93 118 bad: 94 119 bch_bkey_to_text(buf, sizeof(buf), k); 95 - cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); 120 + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); 121 + return true; 122 + } 123 + 124 + bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) 125 + { 126 + char buf[80]; 127 + 128 + if (!KEY_SIZE(k)) 129 + return true; 130 + 131 + if (KEY_SIZE(k) > KEY_OFFSET(k)) 132 + goto bad; 133 + 134 + if (__ptr_invalid(c, k)) 135 + goto bad; 136 + 137 + return false; 138 + bad: 139 + bch_bkey_to_text(buf, sizeof(buf), k); 140 + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); 141 + return true; 142 + } 143 + 144 + static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, 145 + unsigned ptr) 146 + { 147 + struct bucket *g = PTR_BUCKET(b->c, k, ptr); 148 + char buf[80]; 149 + 150 + if (mutex_trylock(&b->c->bucket_lock)) { 151 + if (b->level) { 152 + if (KEY_DIRTY(k) || 153 + g->prio != BTREE_PRIO || 154 + (b->c->gc_mark_valid && 155 + GC_MARK(g) != 
GC_MARK_METADATA)) 156 + goto err; 157 + 158 + } else { 159 + if (g->prio == BTREE_PRIO) 160 + goto err; 161 + 162 + if (KEY_DIRTY(k) && 163 + b->c->gc_mark_valid && 164 + GC_MARK(g) != GC_MARK_DIRTY) 165 + goto err; 166 + } 167 + mutex_unlock(&b->c->bucket_lock); 168 + } 169 + 170 + return false; 171 + err: 172 + mutex_unlock(&b->c->bucket_lock); 173 + bch_bkey_to_text(buf, sizeof(buf), k); 174 + btree_bug(b, 175 + "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 176 + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), 177 + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 96 178 return true; 97 179 } 98 180 ··· 176 118 bch_ptr_invalid(b, k)) 177 119 return true; 178 120 179 - if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) 180 - return true; 121 + for (i = 0; i < KEY_PTRS(k); i++) { 122 + if (!ptr_available(b->c, k, i)) 123 + return true; 181 124 182 - for (i = 0; i < KEY_PTRS(k); i++) 183 - if (ptr_available(b->c, k, i)) { 184 - g = PTR_BUCKET(b->c, k, i); 185 - stale = ptr_stale(b->c, k, i); 125 + g = PTR_BUCKET(b->c, k, i); 126 + stale = ptr_stale(b->c, k, i); 186 127 187 - btree_bug_on(stale > 96, b, 188 - "key too stale: %i, need_gc %u", 189 - stale, b->c->need_gc); 128 + btree_bug_on(stale > 96, b, 129 + "key too stale: %i, need_gc %u", 130 + stale, b->c->need_gc); 190 131 191 - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 192 - b, "stale dirty pointer"); 132 + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 133 + b, "stale dirty pointer"); 193 134 194 - if (stale) 195 - return true; 135 + if (stale) 136 + return true; 196 137 197 - #ifdef CONFIG_BCACHE_EDEBUG 198 - if (!mutex_trylock(&b->c->bucket_lock)) 199 - continue; 200 - 201 - if (b->level) { 202 - if (KEY_DIRTY(k) || 203 - g->prio != BTREE_PRIO || 204 - (b->c->gc_mark_valid && 205 - GC_MARK(g) != GC_MARK_METADATA)) 206 - goto bug; 207 - 208 - } else { 209 - if (g->prio == BTREE_PRIO) 210 - goto bug; 211 - 212 - if (KEY_DIRTY(k) && 213 - 
b->c->gc_mark_valid && 214 - GC_MARK(g) != GC_MARK_DIRTY) 215 - goto bug; 216 - } 217 - mutex_unlock(&b->c->bucket_lock); 218 - #endif 219 - } 138 + if (expensive_debug_checks(b->c) && 139 + ptr_bad_expensive_checks(b, k, i)) 140 + return true; 141 + } 220 142 221 143 return false; 222 - #ifdef CONFIG_BCACHE_EDEBUG 223 - bug: 224 - mutex_unlock(&b->c->bucket_lock); 225 - 226 - { 227 - char buf[80]; 228 - 229 - bch_bkey_to_text(buf, sizeof(buf), k); 230 - btree_bug(b, 231 - "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 232 - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 233 - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 234 - } 235 - return true; 236 - #endif 237 144 } 238 145 239 146 /* Key/pointer manipulation */ ··· 481 458 482 459 static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) 483 460 { 484 - #ifdef CONFIG_X86_64 485 - asm("shrd %[shift],%[high],%[low]" 486 - : [low] "+Rm" (low) 487 - : [high] "R" (high), 488 - [shift] "ci" (shift) 489 - : "cc"); 490 - #else 491 461 low >>= shift; 492 462 low |= (high << 1) << (63U - shift); 493 - #endif 494 463 return low; 495 464 } 496 465 ··· 701 686 } else 702 687 get_random_bytes(&i->seq, sizeof(uint64_t)); 703 688 704 - i->magic = bset_magic(b->c); 689 + i->magic = bset_magic(&b->c->sb); 705 690 i->version = 0; 706 691 i->keys = 0; 707 692 ··· 839 824 } else 840 825 i = bset_search_write_set(b, t, search); 841 826 842 - #ifdef CONFIG_BCACHE_EDEBUG 843 - BUG_ON(bset_written(b, t) && 844 - i.l != t->data->start && 845 - bkey_cmp(tree_to_prev_bkey(t, 846 - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 847 - search) > 0); 827 + if (expensive_debug_checks(b->c)) { 828 + BUG_ON(bset_written(b, t) && 829 + i.l != t->data->start && 830 + bkey_cmp(tree_to_prev_bkey(t, 831 + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 832 + search) > 0); 848 833 849 - BUG_ON(i.r != end(t->data) && 850 - bkey_cmp(i.r, search) <= 0); 851 - #endif 834 + 
BUG_ON(i.r != end(t->data) && 835 + bkey_cmp(i.r, search) <= 0); 836 + } 852 837 853 838 while (likely(i.l != i.r) && 854 839 bkey_cmp(i.l, search) <= 0) ··· 859 844 860 845 /* Btree iterator */ 861 846 847 + /* 848 + * Returns true if l > r - unless l == r, in which case returns true if l is 849 + * older than r. 850 + * 851 + * Necessary for btree_sort_fixup() - if there are multiple keys that compare 852 + * equal in different sets, we have to process them newest to oldest. 853 + */ 862 854 static inline bool btree_iter_cmp(struct btree_iter_set l, 863 855 struct btree_iter_set r) 864 856 { ··· 889 867 } 890 868 891 869 struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 892 - struct bkey *search, struct bset_tree *start) 870 + struct bkey *search, struct bset_tree *start) 893 871 { 894 872 struct bkey *ret = NULL; 895 873 iter->size = ARRAY_SIZE(iter->data); 896 874 iter->used = 0; 875 + 876 + #ifdef CONFIG_BCACHE_DEBUG 877 + iter->b = b; 878 + #endif 897 879 898 880 for (; start <= &b->sets[b->nsets]; start++) { 899 881 ret = bch_bset_search(b, start, search); ··· 913 887 struct bkey *ret = NULL; 914 888 915 889 if (!btree_iter_end(iter)) { 890 + bch_btree_iter_next_check(iter); 891 + 916 892 ret = iter->data->k; 917 893 iter->data->k = bkey_next(iter->data->k); 918 894 ··· 942 914 } while (ret && fn(b, ret)); 943 915 944 916 return ret; 945 - } 946 - 947 - struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) 948 - { 949 - struct btree_iter iter; 950 - 951 - bch_btree_iter_init(b, &iter, search); 952 - return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 953 917 } 954 918 955 919 /* Mergesort */ ··· 1018 998 out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; 1019 999 1020 1000 pr_debug("sorted %i keys", out->keys); 1021 - bch_check_key_order(b, out); 1022 1001 } 1023 1002 1024 1003 static void __btree_sort(struct btree *b, struct btree_iter *iter, ··· 1048 1029 * memcpy() 1049 1030 */ 1050 1031 1051 - out->magic = bset_magic(b->c); 1032 + out->magic = bset_magic(&b->c->sb); 1052 1033 out->seq = b->sets[0].data->seq; 1053 1034 out->version = b->sets[0].data->version; 1054 1035 swap(out, b->sets[0].data); ··· 1069 1050 if (b->written) 1070 1051 bset_build_written_tree(b); 1071 1052 1072 - if (!start) { 1073 - spin_lock(&b->c->sort_time_lock); 1053 + if (!start) 1074 1054 bch_time_stats_update(&b->c->sort_time, start_time); 1075 - spin_unlock(&b->c->sort_time_lock); 1076 - } 1077 1055 } 1078 1056 1079 1057 void bch_btree_sort_partial(struct btree *b, unsigned start) 1080 1058 { 1081 - size_t oldsize = 0, order = b->page_order, keys = 0; 1059 + size_t order = b->page_order, keys = 0; 1082 1060 struct btree_iter iter; 1061 + int oldsize = bch_count_data(b); 1062 + 1083 1063 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1084 1064 1085 1065 BUG_ON(b->sets[b->nsets].data == write_block(b) && 1086 1066 (b->sets[b->nsets].size || b->nsets)); 1087 1067 1088 - if (b->written) 1089 - oldsize = bch_count_data(b); 1090 1068 1091 1069 if (start) { 1092 1070 unsigned i; ··· 1099 1083 1100 1084 __btree_sort(b, &iter, start, order, false); 1101 1085 1102 - EBUG_ON(b->written && bch_count_data(b) != oldsize); 1086 + EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1103 1087 } 1104 1088 1105 1089 void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) ··· 1117 1101 1118 1102 btree_mergesort(b, new->sets->data, &iter, false, true); 1119 1103 1120 - spin_lock(&b->c->sort_time_lock); 1121 1104 bch_time_stats_update(&b->c->sort_time, start_time); 1122 - spin_unlock(&b->c->sort_time_lock); 1123 1105 1124 1106 bkey_copy_key(&new->key, &b->key); 1125 
1107 new->sets->size = 0; ··· 1162 1148 /* Sysfs stuff */ 1163 1149 1164 1150 struct bset_stats { 1151 + struct btree_op op; 1165 1152 size_t nodes; 1166 1153 size_t sets_written, sets_unwritten; 1167 1154 size_t bytes_written, bytes_unwritten; 1168 1155 size_t floats, failed; 1169 1156 }; 1170 1157 1171 - static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, 1172 - struct bset_stats *stats) 1158 + static int btree_bset_stats(struct btree_op *op, struct btree *b) 1173 1159 { 1174 - struct bkey *k; 1160 + struct bset_stats *stats = container_of(op, struct bset_stats, op); 1175 1161 unsigned i; 1176 1162 1177 1163 stats->nodes++; ··· 1196 1182 } 1197 1183 } 1198 1184 1199 - if (b->level) { 1200 - struct btree_iter iter; 1201 - 1202 - for_each_key_filter(b, k, &iter, bch_ptr_bad) { 1203 - int ret = btree(bset_stats, k, b, op, stats); 1204 - if (ret) 1205 - return ret; 1206 - } 1207 - } 1208 - 1209 - return 0; 1185 + return MAP_CONTINUE; 1210 1186 } 1211 1187 1212 1188 int bch_bset_print_stats(struct cache_set *c, char *buf) 1213 1189 { 1214 - struct btree_op op; 1215 1190 struct bset_stats t; 1216 1191 int ret; 1217 1192 1218 - bch_btree_op_init_stack(&op); 1219 1193 memset(&t, 0, sizeof(struct bset_stats)); 1194 + bch_btree_op_init(&t.op, -1); 1220 1195 1221 - ret = btree_root(bset_stats, c, &op, &t); 1222 - if (ret) 1196 + ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); 1197 + if (ret < 0) 1223 1198 return ret; 1224 1199 1225 1200 return snprintf(buf, PAGE_SIZE,
+51 -42
drivers/md/bcache/bset.h
··· 148 148 149 149 struct btree_iter { 150 150 size_t size, used; 151 + #ifdef CONFIG_BCACHE_DEBUG 152 + struct btree *b; 153 + #endif 151 154 struct btree_iter_set { 152 155 struct bkey *k, *end; 153 156 } data[MAX_BSETS]; ··· 196 193 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 197 194 } 198 195 199 - static inline size_t bkey_u64s(const struct bkey *k) 200 - { 201 - BUG_ON(KEY_CSUM(k) > 1); 202 - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); 203 - } 204 - 205 - static inline size_t bkey_bytes(const struct bkey *k) 206 - { 207 - return bkey_u64s(k) * sizeof(uint64_t); 208 - } 209 - 210 - static inline void bkey_copy(struct bkey *dest, const struct bkey *src) 211 - { 212 - memcpy(dest, src, bkey_bytes(src)); 213 - } 214 - 215 - static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) 216 - { 217 - if (!src) 218 - src = &KEY(0, 0, 0); 219 - 220 - SET_KEY_INODE(dest, KEY_INODE(src)); 221 - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); 222 - } 223 - 224 - static inline struct bkey *bkey_next(const struct bkey *k) 225 - { 226 - uint64_t *d = (void *) k; 227 - return (struct bkey *) (d + bkey_u64s(k)); 228 - } 229 - 230 196 /* Keylists */ 231 197 232 198 struct keylist { 233 - struct bkey *top; 234 199 union { 235 - uint64_t *list; 236 - struct bkey *bottom; 200 + struct bkey *keys; 201 + uint64_t *keys_p; 202 + }; 203 + union { 204 + struct bkey *top; 205 + uint64_t *top_p; 237 206 }; 238 207 239 208 /* Enough room for btree_split's keys without realloc */ 240 209 #define KEYLIST_INLINE 16 241 - uint64_t d[KEYLIST_INLINE]; 210 + uint64_t inline_keys[KEYLIST_INLINE]; 242 211 }; 243 212 244 213 static inline void bch_keylist_init(struct keylist *l) 245 214 { 246 - l->top = (void *) (l->list = l->d); 215 + l->top_p = l->keys_p = l->inline_keys; 247 216 } 248 217 249 218 static inline void bch_keylist_push(struct keylist *l) ··· 231 256 232 257 static inline bool bch_keylist_empty(struct keylist *l) 233 258 { 234 - return l->top == (void *) l->list; 
259 + return l->top == l->keys; 260 + } 261 + 262 + static inline void bch_keylist_reset(struct keylist *l) 263 + { 264 + l->top = l->keys; 235 265 } 236 266 237 267 static inline void bch_keylist_free(struct keylist *l) 238 268 { 239 - if (l->list != l->d) 240 - kfree(l->list); 269 + if (l->keys_p != l->inline_keys) 270 + kfree(l->keys_p); 241 271 } 242 272 243 - void bch_keylist_copy(struct keylist *, struct keylist *); 273 + static inline size_t bch_keylist_nkeys(struct keylist *l) 274 + { 275 + return l->top_p - l->keys_p; 276 + } 277 + 278 + static inline size_t bch_keylist_bytes(struct keylist *l) 279 + { 280 + return bch_keylist_nkeys(l) * sizeof(uint64_t); 281 + } 282 + 244 283 struct bkey *bch_keylist_pop(struct keylist *); 284 + void bch_keylist_pop_front(struct keylist *); 245 285 int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 246 286 247 287 void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, ··· 277 287 } 278 288 279 289 const char *bch_ptr_status(struct cache_set *, const struct bkey *); 280 - bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); 290 + bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); 291 + bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); 292 + 281 293 bool bch_ptr_bad(struct btree *, const struct bkey *); 282 294 283 295 static inline uint8_t gen_after(uint8_t a, uint8_t b) ··· 303 311 304 312 typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); 305 313 306 - struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); 307 314 struct bkey *bch_btree_iter_next(struct btree_iter *); 308 315 struct bkey *bch_btree_iter_next_filter(struct btree_iter *, 309 316 struct btree *, ptr_filter_fn); ··· 352 361 struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 353 362 const struct bkey *); 354 363 364 + /* 365 + * Returns the first key that is strictly greater than search 366 + */ 355 367 static inline struct bkey 
*bch_bset_search(struct btree *b, struct bset_tree *t, 356 368 const struct bkey *search) 357 369 { 358 370 return search ? __bch_bset_search(b, t, search) : t->data->start; 359 371 } 372 + 373 + #define PRECEDING_KEY(_k) \ 374 + ({ \ 375 + struct bkey *_ret = NULL; \ 376 + \ 377 + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ 378 + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ 379 + \ 380 + if (!_ret->low) \ 381 + _ret->high--; \ 382 + _ret->low--; \ 383 + } \ 384 + \ 385 + _ret; \ 386 + }) 360 387 361 388 bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); 362 389 void bch_btree_sort_lazy(struct btree *);
+768 -680
drivers/md/bcache/btree.c
··· 23 23 #include "bcache.h" 24 24 #include "btree.h" 25 25 #include "debug.h" 26 - #include "request.h" 27 26 #include "writeback.h" 28 27 29 28 #include <linux/slab.h> 30 29 #include <linux/bitops.h> 30 + #include <linux/freezer.h> 31 31 #include <linux/hash.h> 32 + #include <linux/kthread.h> 32 33 #include <linux/prefetch.h> 33 34 #include <linux/random.h> 34 35 #include <linux/rcupdate.h> ··· 89 88 * Test module load/unload 90 89 */ 91 90 92 - static const char * const op_types[] = { 93 - "insert", "replace" 91 + enum { 92 + BTREE_INSERT_STATUS_INSERT, 93 + BTREE_INSERT_STATUS_BACK_MERGE, 94 + BTREE_INSERT_STATUS_OVERWROTE, 95 + BTREE_INSERT_STATUS_FRONT_MERGE, 94 96 }; 95 - 96 - static const char *op_type(struct btree_op *op) 97 - { 98 - return op_types[op->type]; 99 - } 100 97 101 98 #define MAX_NEED_GC 64 102 99 #define MAX_SAVE_PRIO 72 ··· 104 105 #define PTR_HASH(c, k) \ 105 106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 106 107 107 - struct workqueue_struct *bch_gc_wq; 108 108 static struct workqueue_struct *btree_io_wq; 109 109 110 - void bch_btree_op_init_stack(struct btree_op *op) 110 + static inline bool should_split(struct btree *b) 111 111 { 112 - memset(op, 0, sizeof(struct btree_op)); 113 - closure_init_stack(&op->cl); 114 - op->lock = -1; 115 - bch_keylist_init(&op->keys); 112 + struct bset *i = write_block(b); 113 + return b->written >= btree_blocks(b) || 114 + (b->written + __set_blocks(i, i->keys + 15, b->c) 115 + > btree_blocks(b)); 116 116 } 117 + 118 + #define insert_lock(s, b) ((b)->level <= (s)->lock) 119 + 120 + /* 121 + * These macros are for recursing down the btree - they handle the details of 122 + * locking and looking up nodes in the cache for you. They're best treated as 123 + * mere syntax when reading code that uses them. 124 + * 125 + * op->lock determines whether we take a read or a write lock at a given depth. 126 + * If you've got a read lock and find that you need a write lock (i.e. 
you're 127 + * going to have to split), set op->lock and return -EINTR; btree_root() will 128 + * call you again and you'll have the correct lock. 129 + */ 130 + 131 + /** 132 + * btree - recurse down the btree on a specified key 133 + * @fn: function to call, which will be passed the child node 134 + * @key: key to recurse on 135 + * @b: parent btree node 136 + * @op: pointer to struct btree_op 137 + */ 138 + #define btree(fn, key, b, op, ...) \ 139 + ({ \ 140 + int _r, l = (b)->level - 1; \ 141 + bool _w = l <= (op)->lock; \ 142 + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ 143 + if (!IS_ERR(_child)) { \ 144 + _child->parent = (b); \ 145 + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ 146 + rw_unlock(_w, _child); \ 147 + } else \ 148 + _r = PTR_ERR(_child); \ 149 + _r; \ 150 + }) 151 + 152 + /** 153 + * btree_root - call a function on the root of the btree 154 + * @fn: function to call, which will be passed the child node 155 + * @c: cache set 156 + * @op: pointer to struct btree_op 157 + */ 158 + #define btree_root(fn, c, op, ...) 
\ 159 + ({ \ 160 + int _r = -EINTR; \ 161 + do { \ 162 + struct btree *_b = (c)->root; \ 163 + bool _w = insert_lock(op, _b); \ 164 + rw_lock(_w, _b, _b->level); \ 165 + if (_b == (c)->root && \ 166 + _w == insert_lock(op, _b)) { \ 167 + _b->parent = NULL; \ 168 + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 169 + } \ 170 + rw_unlock(_w, _b); \ 171 + bch_cannibalize_unlock(c); \ 172 + if (_r == -ENOSPC) { \ 173 + wait_event((c)->try_wait, \ 174 + !(c)->try_harder); \ 175 + _r = -EINTR; \ 176 + } \ 177 + } while (_r == -EINTR); \ 178 + \ 179 + _r; \ 180 + }) 117 181 118 182 /* Btree key manipulation */ 119 183 120 - static void bkey_put(struct cache_set *c, struct bkey *k, int level) 184 + void bkey_put(struct cache_set *c, struct bkey *k) 121 185 { 122 - if ((level && KEY_OFFSET(k)) || !level) 123 - __bkey_put(c, k); 186 + unsigned i; 187 + 188 + for (i = 0; i < KEY_PTRS(k); i++) 189 + if (ptr_available(c, k, i)) 190 + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 124 191 } 125 192 126 193 /* Btree IO */ ··· 210 145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 211 146 iter->used = 0; 212 147 148 + #ifdef CONFIG_BCACHE_DEBUG 149 + iter->b = b; 150 + #endif 151 + 213 152 if (!i->seq) 214 153 goto err; 215 154 ··· 229 160 goto err; 230 161 231 162 err = "bad magic"; 232 - if (i->magic != bset_magic(b->c)) 163 + if (i->magic != bset_magic(&b->c->sb)) 233 164 goto err; 234 165 235 166 err = "bad checksum"; ··· 317 248 goto err; 318 249 319 250 bch_btree_node_read_done(b); 320 - 321 - spin_lock(&b->c->btree_read_time_lock); 322 251 bch_time_stats_update(&b->c->btree_read_time, start_time); 323 - spin_unlock(&b->c->btree_read_time_lock); 324 252 325 253 return; 326 254 err: ··· 393 327 b->bio = bch_bbio_alloc(b->c); 394 328 395 329 b->bio->bi_end_io = btree_node_write_endio; 396 - b->bio->bi_private = &b->io.cl; 330 + b->bio->bi_private = cl; 397 331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 398 332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 
399 333 bch_bio_map(b->bio, i); ··· 449 383 BUG_ON(b->written >= btree_blocks(b)); 450 384 BUG_ON(b->written && !i->keys); 451 385 BUG_ON(b->sets->data->seq != i->seq); 452 - bch_check_key_order(b, i); 386 + bch_check_keys(b, "writing"); 453 387 454 388 cancel_delayed_work(&b->work); 455 389 ··· 471 405 bch_bset_init_next(b); 472 406 } 473 407 408 + static void bch_btree_node_write_sync(struct btree *b) 409 + { 410 + struct closure cl; 411 + 412 + closure_init_stack(&cl); 413 + bch_btree_node_write(b, &cl); 414 + closure_sync(&cl); 415 + } 416 + 474 417 static void btree_node_write_work(struct work_struct *w) 475 418 { 476 419 struct btree *b = container_of(to_delayed_work(w), struct btree, work); ··· 491 416 rw_unlock(true, b); 492 417 } 493 418 494 - static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 419 + static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 495 420 { 496 421 struct bset *i = b->sets[b->nsets].data; 497 422 struct btree_write *w = btree_current_write(b); ··· 504 429 505 430 set_btree_node_dirty(b); 506 431 507 - if (op && op->journal) { 432 + if (journal_ref) { 508 433 if (w->journal && 509 - journal_pin_cmp(b->c, w, op)) { 434 + journal_pin_cmp(b->c, w->journal, journal_ref)) { 510 435 atomic_dec_bug(w->journal); 511 436 w->journal = NULL; 512 437 } 513 438 514 439 if (!w->journal) { 515 - w->journal = op->journal; 440 + w->journal = journal_ref; 516 441 atomic_inc(w->journal); 517 442 } 518 443 } ··· 641 566 return b; 642 567 } 643 568 644 - static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 569 + static int mca_reap(struct btree *b, unsigned min_order, bool flush) 645 570 { 571 + struct closure cl; 572 + 573 + closure_init_stack(&cl); 646 574 lockdep_assert_held(&b->c->bucket_lock); 647 575 648 576 if (!down_write_trylock(&b->lock)) 649 577 return -ENOMEM; 650 578 651 - if (b->page_order < min_order) { 579 + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 580 + 581 + if 
(b->page_order < min_order || 582 + (!flush && 583 + (btree_node_dirty(b) || 584 + atomic_read(&b->io.cl.remaining) != -1))) { 652 585 rw_unlock(true, b); 653 586 return -ENOMEM; 654 587 } 655 588 656 - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 589 + if (btree_node_dirty(b)) 590 + bch_btree_node_write_sync(b); 657 591 658 - if (cl && btree_node_dirty(b)) 659 - bch_btree_node_write(b, NULL); 660 - 661 - if (cl) 662 - closure_wait_event_async(&b->io.wait, cl, 663 - atomic_read(&b->io.cl.remaining) == -1); 664 - 665 - if (btree_node_dirty(b) || 666 - !closure_is_unlocked(&b->io.cl) || 667 - work_pending(&b->work.work)) { 668 - rw_unlock(true, b); 669 - return -EAGAIN; 670 - } 592 + /* wait for any in flight btree write */ 593 + closure_wait_event(&b->io.wait, &cl, 594 + atomic_read(&b->io.cl.remaining) == -1); 671 595 672 596 return 0; 673 597 } ··· 707 633 break; 708 634 709 635 if (++i > 3 && 710 - !mca_reap(b, NULL, 0)) { 636 + !mca_reap(b, 0, false)) { 711 637 mca_data_free(b); 712 638 rw_unlock(true, b); 713 639 freed++; ··· 726 652 list_rotate_left(&c->btree_cache); 727 653 728 654 if (!b->accessed && 729 - !mca_reap(b, NULL, 0)) { 655 + !mca_reap(b, 0, false)) { 730 656 mca_bucket_free(b); 731 657 mca_data_free(b); 732 658 rw_unlock(true, b); ··· 797 723 { 798 724 unsigned i; 799 725 800 - /* XXX: doesn't check for errors */ 801 - 802 - closure_init_unlocked(&c->gc); 803 - 804 726 for (i = 0; i < mca_reserve(c); i++) 805 - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 727 + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) 728 + return -ENOMEM; 806 729 807 730 list_splice_init(&c->btree_cache, 808 731 &c->btree_cache_freeable); ··· 846 775 return b; 847 776 } 848 777 849 - static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 850 - int level, struct closure *cl) 778 + static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) 851 779 { 852 - int ret = -ENOMEM; 853 - struct btree *i; 780 + struct btree *b; 854 781 855 782 
trace_bcache_btree_cache_cannibalize(c); 856 783 857 - if (!cl) 858 - return ERR_PTR(-ENOMEM); 784 + if (!c->try_harder) { 785 + c->try_harder = current; 786 + c->try_harder_start = local_clock(); 787 + } else if (c->try_harder != current) 788 + return ERR_PTR(-ENOSPC); 859 789 860 - /* 861 - * Trying to free up some memory - i.e. reuse some btree nodes - may 862 - * require initiating IO to flush the dirty part of the node. If we're 863 - * running under generic_make_request(), that IO will never finish and 864 - * we would deadlock. Returning -EAGAIN causes the cache lookup code to 865 - * punt to workqueue and retry. 866 - */ 867 - if (current->bio_list) 868 - return ERR_PTR(-EAGAIN); 790 + list_for_each_entry_reverse(b, &c->btree_cache, list) 791 + if (!mca_reap(b, btree_order(k), false)) 792 + return b; 869 793 870 - if (c->try_harder && c->try_harder != cl) { 871 - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); 872 - return ERR_PTR(-EAGAIN); 873 - } 794 + list_for_each_entry_reverse(b, &c->btree_cache, list) 795 + if (!mca_reap(b, btree_order(k), true)) 796 + return b; 874 797 875 - c->try_harder = cl; 876 - c->try_harder_start = local_clock(); 877 - retry: 878 - list_for_each_entry_reverse(i, &c->btree_cache, list) { 879 - int r = mca_reap(i, cl, btree_order(k)); 880 - if (!r) 881 - return i; 882 - if (r != -ENOMEM) 883 - ret = r; 884 - } 885 - 886 - if (ret == -EAGAIN && 887 - closure_blocking(cl)) { 888 - mutex_unlock(&c->bucket_lock); 889 - closure_sync(cl); 890 - mutex_lock(&c->bucket_lock); 891 - goto retry; 892 - } 893 - 894 - return ERR_PTR(ret); 798 + return ERR_PTR(-ENOMEM); 895 799 } 896 800 897 801 /* ··· 875 829 * cannibalize_bucket() will take. This means every time we unlock the root of 876 830 * the btree, we need to release this lock if we have it held. 
877 831 */ 878 - void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 832 + static void bch_cannibalize_unlock(struct cache_set *c) 879 833 { 880 - if (c->try_harder == cl) { 834 + if (c->try_harder == current) { 881 835 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 882 836 c->try_harder = NULL; 883 - __closure_wake_up(&c->try_wait); 837 + wake_up(&c->try_wait); 884 838 } 885 839 } 886 840 887 - static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 888 - int level, struct closure *cl) 841 + static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) 889 842 { 890 843 struct btree *b; 844 + 845 + BUG_ON(current->bio_list); 891 846 892 847 lockdep_assert_held(&c->bucket_lock); 893 848 ··· 899 852 * the list. Check if there's any freed nodes there: 900 853 */ 901 854 list_for_each_entry(b, &c->btree_cache_freeable, list) 902 - if (!mca_reap(b, NULL, btree_order(k))) 855 + if (!mca_reap(b, btree_order(k), false)) 903 856 goto out; 904 857 905 858 /* We never free struct btree itself, just the memory that holds the on 906 859 * disk node. Check the freed list before allocating a new one: 907 860 */ 908 861 list_for_each_entry(b, &c->btree_cache_freed, list) 909 - if (!mca_reap(b, NULL, 0)) { 862 + if (!mca_reap(b, 0, false)) { 910 863 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 911 864 if (!b->sets[0].data) 912 865 goto err; ··· 931 884 932 885 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 933 886 b->level = level; 887 + b->parent = (void *) ~0UL; 934 888 935 889 mca_reinit(b); 936 890 ··· 940 892 if (b) 941 893 rw_unlock(true, b); 942 894 943 - b = mca_cannibalize(c, k, level, cl); 895 + b = mca_cannibalize(c, k); 944 896 if (!IS_ERR(b)) 945 897 goto out; 946 898 ··· 951 903 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 952 904 * in from disk if necessary. 
953 905 * 954 - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 955 - * if that closure is in non blocking mode, will return -EAGAIN. 906 + * If IO is necessary and running under generic_make_request, returns -EAGAIN. 956 907 * 957 908 * The btree node will have either a read or a write lock held, depending on 958 909 * level and op->lock. 959 910 */ 960 911 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 961 - int level, struct btree_op *op) 912 + int level, bool write) 962 913 { 963 914 int i = 0; 964 - bool write = level <= op->lock; 965 915 struct btree *b; 966 916 967 917 BUG_ON(level < 0); ··· 971 925 return ERR_PTR(-EAGAIN); 972 926 973 927 mutex_lock(&c->bucket_lock); 974 - b = mca_alloc(c, k, level, &op->cl); 928 + b = mca_alloc(c, k, level); 975 929 mutex_unlock(&c->bucket_lock); 976 930 977 931 if (!b) ··· 1017 971 struct btree *b; 1018 972 1019 973 mutex_lock(&c->bucket_lock); 1020 - b = mca_alloc(c, k, level, NULL); 974 + b = mca_alloc(c, k, level); 1021 975 mutex_unlock(&c->bucket_lock); 1022 976 1023 977 if (!IS_ERR_OR_NULL(b)) { ··· 1028 982 1029 983 /* Btree alloc */ 1030 984 1031 - static void btree_node_free(struct btree *b, struct btree_op *op) 985 + static void btree_node_free(struct btree *b) 1032 986 { 1033 987 unsigned i; 1034 988 1035 989 trace_bcache_btree_node_free(b); 1036 990 1037 - /* 1038 - * The BUG_ON() in btree_node_get() implies that we must have a write 1039 - * lock on parent to free or even invalidate a node 1040 - */ 1041 - BUG_ON(op->lock <= b->level); 1042 991 BUG_ON(b == b->c->root); 1043 992 1044 993 if (btree_node_dirty(b)) ··· 1056 1015 mutex_unlock(&b->c->bucket_lock); 1057 1016 } 1058 1017 1059 - struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1060 - struct closure *cl) 1018 + struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) 1061 1019 { 1062 1020 BKEY_PADDED(key) k; 1063 1021 struct btree *b = ERR_PTR(-EAGAIN); 1064 
1022 1065 1023 mutex_lock(&c->bucket_lock); 1066 1024 retry: 1067 - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1025 + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1068 1026 goto err; 1069 1027 1028 + bkey_put(c, &k.key); 1070 1029 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1071 1030 1072 - b = mca_alloc(c, &k.key, level, cl); 1031 + b = mca_alloc(c, &k.key, level); 1073 1032 if (IS_ERR(b)) 1074 1033 goto err_free; 1075 1034 1076 1035 if (!b) { 1077 1036 cache_bug(c, 1078 1037 "Tried to allocate bucket that was in btree cache"); 1079 - __bkey_put(c, &k.key); 1080 1038 goto retry; 1081 1039 } 1082 1040 ··· 1088 1048 return b; 1089 1049 err_free: 1090 1050 bch_bucket_free(c, &k.key); 1091 - __bkey_put(c, &k.key); 1092 1051 err: 1093 1052 mutex_unlock(&c->bucket_lock); 1094 1053 ··· 1095 1056 return b; 1096 1057 } 1097 1058 1098 - static struct btree *btree_node_alloc_replacement(struct btree *b, 1099 - struct closure *cl) 1059 + static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1100 1060 { 1101 - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1061 + struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1102 1062 if (!IS_ERR_OR_NULL(n)) 1103 1063 bch_btree_sort_into(b, n); 1104 1064 1105 1065 return n; 1066 + } 1067 + 1068 + static void make_btree_freeing_key(struct btree *b, struct bkey *k) 1069 + { 1070 + unsigned i; 1071 + 1072 + bkey_copy(k, &b->key); 1073 + bkey_copy_key(k, &ZERO_KEY); 1074 + 1075 + for (i = 0; i < KEY_PTRS(k); i++) { 1076 + uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; 1077 + 1078 + SET_PTR_GEN(k, i, g); 1079 + } 1080 + 1081 + atomic_inc(&b->c->prio_blocked); 1106 1082 } 1107 1083 1108 1084 /* Garbage collection */ ··· 1173 1119 1174 1120 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1175 1121 1176 - static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1177 - struct gc_stat *gc) 1122 + static bool 
btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 1178 1123 { 1179 1124 uint8_t stale = 0; 1180 - unsigned last_dev = -1; 1181 - struct bcache_device *d = NULL; 1125 + unsigned keys = 0, good_keys = 0; 1182 1126 struct bkey *k; 1183 1127 struct btree_iter iter; 1184 1128 struct bset_tree *t; ··· 1184 1132 gc->nodes++; 1185 1133 1186 1134 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1187 - if (last_dev != KEY_INODE(k)) { 1188 - last_dev = KEY_INODE(k); 1189 - 1190 - d = KEY_INODE(k) < b->c->nr_uuids 1191 - ? b->c->devices[last_dev] 1192 - : NULL; 1193 - } 1194 - 1195 1135 stale = max(stale, btree_mark_key(b, k)); 1136 + keys++; 1196 1137 1197 1138 if (bch_ptr_bad(b, k)) 1198 1139 continue; 1199 1140 1200 - *keys += bkey_u64s(k); 1201 - 1202 1141 gc->key_bytes += bkey_u64s(k); 1203 1142 gc->nkeys++; 1143 + good_keys++; 1204 1144 1205 1145 gc->data += KEY_SIZE(k); 1206 - if (KEY_DIRTY(k)) 1207 - gc->dirty += KEY_SIZE(k); 1208 1146 } 1209 1147 1210 1148 for (t = b->sets; t <= &b->sets[b->nsets]; t++) ··· 1203 1161 bkey_cmp(&b->key, &t->end) < 0, 1204 1162 b, "found short btree key in gc"); 1205 1163 1206 - return stale; 1164 + if (b->c->gc_always_rewrite) 1165 + return true; 1166 + 1167 + if (stale > 10) 1168 + return true; 1169 + 1170 + if ((keys - good_keys) * 2 > keys) 1171 + return true; 1172 + 1173 + return false; 1207 1174 } 1208 1175 1209 - static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, 1210 - struct btree_op *op) 1211 - { 1212 - /* 1213 - * We block priorities from being written for the duration of garbage 1214 - * collection, so we can't sleep in btree_alloc() -> 1215 - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it 1216 - * our closure. 
1217 - */ 1218 - struct btree *n = btree_node_alloc_replacement(b, NULL); 1219 - 1220 - if (!IS_ERR_OR_NULL(n)) { 1221 - swap(b, n); 1222 - __bkey_put(b->c, &b->key); 1223 - 1224 - memcpy(k->ptr, b->key.ptr, 1225 - sizeof(uint64_t) * KEY_PTRS(&b->key)); 1226 - 1227 - btree_node_free(n, op); 1228 - up_write(&n->lock); 1229 - } 1230 - 1231 - return b; 1232 - } 1233 - 1234 - /* 1235 - * Leaving this at 2 until we've got incremental garbage collection done; it 1236 - * could be higher (and has been tested with 4) except that garbage collection 1237 - * could take much longer, adversely affecting latency. 1238 - */ 1239 - #define GC_MERGE_NODES 2U 1176 + #define GC_MERGE_NODES 4U 1240 1177 1241 1178 struct gc_merge_info { 1242 1179 struct btree *b; 1243 - struct bkey *k; 1244 1180 unsigned keys; 1245 1181 }; 1246 1182 1247 - static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1248 - struct gc_stat *gc, struct gc_merge_info *r) 1249 - { 1250 - unsigned nodes = 0, keys = 0, blocks; 1251 - int i; 1183 + static int bch_btree_insert_node(struct btree *, struct btree_op *, 1184 + struct keylist *, atomic_t *, struct bkey *); 1252 1185 1253 - while (nodes < GC_MERGE_NODES && r[nodes].b) 1186 + static int btree_gc_coalesce(struct btree *b, struct btree_op *op, 1187 + struct keylist *keylist, struct gc_stat *gc, 1188 + struct gc_merge_info *r) 1189 + { 1190 + unsigned i, nodes = 0, keys = 0, blocks; 1191 + struct btree *new_nodes[GC_MERGE_NODES]; 1192 + struct closure cl; 1193 + struct bkey *k; 1194 + 1195 + memset(new_nodes, 0, sizeof(new_nodes)); 1196 + closure_init_stack(&cl); 1197 + 1198 + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 1254 1199 keys += r[nodes++].keys; 1255 1200 1256 1201 blocks = btree_default_blocks(b->c) * 2 / 3; 1257 1202 1258 1203 if (nodes < 2 || 1259 1204 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1260 - return; 1205 + return 0; 1261 1206 1262 - for (i = nodes - 1; i >= 0; --i) { 1263 - if 
(r[i].b->written) 1264 - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); 1265 - 1266 - if (r[i].b->written) 1267 - return; 1207 + for (i = 0; i < nodes; i++) { 1208 + new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); 1209 + if (IS_ERR_OR_NULL(new_nodes[i])) 1210 + goto out_nocoalesce; 1268 1211 } 1269 1212 1270 1213 for (i = nodes - 1; i > 0; --i) { 1271 - struct bset *n1 = r[i].b->sets->data; 1272 - struct bset *n2 = r[i - 1].b->sets->data; 1214 + struct bset *n1 = new_nodes[i]->sets->data; 1215 + struct bset *n2 = new_nodes[i - 1]->sets->data; 1273 1216 struct bkey *k, *last = NULL; 1274 1217 1275 1218 keys = 0; 1276 1219 1277 - if (i == 1) { 1278 - /* 1279 - * Last node we're not getting rid of - we're getting 1280 - * rid of the node at r[0]. Have to try and fit all of 1281 - * the remaining keys into this node; we can't ensure 1282 - * they will always fit due to rounding and variable 1283 - * length keys (shouldn't be possible in practice, 1284 - * though) 1285 - */ 1286 - if (__set_blocks(n1, n1->keys + r->keys, 1287 - b->c) > btree_blocks(r[i].b)) 1288 - return; 1289 - 1290 - keys = n2->keys; 1291 - last = &r->b->key; 1292 - } else 1220 + if (i > 1) { 1293 1221 for (k = n2->start; 1294 1222 k < end(n2); 1295 1223 k = bkey_next(k)) { ··· 1270 1258 last = k; 1271 1259 keys += bkey_u64s(k); 1272 1260 } 1261 + } else { 1262 + /* 1263 + * Last node we're not getting rid of - we're getting 1264 + * rid of the node at r[0]. 
Have to try and fit all of 1265 + * the remaining keys into this node; we can't ensure 1266 + * they will always fit due to rounding and variable 1267 + * length keys (shouldn't be possible in practice, 1268 + * though) 1269 + */ 1270 + if (__set_blocks(n1, n1->keys + n2->keys, 1271 + b->c) > btree_blocks(new_nodes[i])) 1272 + goto out_nocoalesce; 1273 + 1274 + keys = n2->keys; 1275 + /* Take the key of the node we're getting rid of */ 1276 + last = &r->b->key; 1277 + } 1273 1278 1274 1279 BUG_ON(__set_blocks(n1, n1->keys + keys, 1275 - b->c) > btree_blocks(r[i].b)); 1280 + b->c) > btree_blocks(new_nodes[i])); 1276 1281 1277 - if (last) { 1278 - bkey_copy_key(&r[i].b->key, last); 1279 - bkey_copy_key(r[i].k, last); 1280 - } 1282 + if (last) 1283 + bkey_copy_key(&new_nodes[i]->key, last); 1281 1284 1282 1285 memcpy(end(n1), 1283 1286 n2->start, 1284 1287 (void *) node(n2, keys) - (void *) n2->start); 1285 1288 1286 1289 n1->keys += keys; 1290 + r[i].keys = n1->keys; 1287 1291 1288 1292 memmove(n2->start, 1289 1293 node(n2, keys), ··· 1307 1279 1308 1280 n2->keys -= keys; 1309 1281 1310 - r[i].keys = n1->keys; 1311 - r[i - 1].keys = n2->keys; 1282 + if (bch_keylist_realloc(keylist, 1283 + KEY_PTRS(&new_nodes[i]->key), b->c)) 1284 + goto out_nocoalesce; 1285 + 1286 + bch_btree_node_write(new_nodes[i], &cl); 1287 + bch_keylist_add(keylist, &new_nodes[i]->key); 1312 1288 } 1313 1289 1314 - btree_node_free(r->b, op); 1315 - up_write(&r->b->lock); 1290 + for (i = 0; i < nodes; i++) { 1291 + if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1292 + goto out_nocoalesce; 1293 + 1294 + make_btree_freeing_key(r[i].b, keylist->top); 1295 + bch_keylist_push(keylist); 1296 + } 1297 + 1298 + /* We emptied out this node */ 1299 + BUG_ON(new_nodes[0]->sets->data->keys); 1300 + btree_node_free(new_nodes[0]); 1301 + rw_unlock(true, new_nodes[0]); 1302 + 1303 + closure_sync(&cl); 1304 + 1305 + for (i = 0; i < nodes; i++) { 1306 + btree_node_free(r[i].b); 1307 + 
rw_unlock(true, r[i].b); 1308 + 1309 + r[i].b = new_nodes[i]; 1310 + } 1311 + 1312 + bch_btree_insert_node(b, op, keylist, NULL, NULL); 1313 + BUG_ON(!bch_keylist_empty(keylist)); 1314 + 1315 + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); 1316 + r[nodes - 1].b = ERR_PTR(-EINTR); 1316 1317 1317 1318 trace_bcache_btree_gc_coalesce(nodes); 1318 - 1319 1319 gc->nodes--; 1320 - nodes--; 1321 1320 1322 - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1323 - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1321 + /* Invalidated our iterator */ 1322 + return -EINTR; 1323 + 1324 + out_nocoalesce: 1325 + closure_sync(&cl); 1326 + 1327 + while ((k = bch_keylist_pop(keylist))) 1328 + if (!bkey_cmp(k, &ZERO_KEY)) 1329 + atomic_dec(&b->c->prio_blocked); 1330 + 1331 + for (i = 0; i < nodes; i++) 1332 + if (!IS_ERR_OR_NULL(new_nodes[i])) { 1333 + btree_node_free(new_nodes[i]); 1334 + rw_unlock(true, new_nodes[i]); 1335 + } 1336 + return 0; 1337 + } 1338 + 1339 + static unsigned btree_gc_count_keys(struct btree *b) 1340 + { 1341 + struct bkey *k; 1342 + struct btree_iter iter; 1343 + unsigned ret = 0; 1344 + 1345 + for_each_key_filter(b, k, &iter, bch_ptr_bad) 1346 + ret += bkey_u64s(k); 1347 + 1348 + return ret; 1324 1349 } 1325 1350 1326 1351 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1327 1352 struct closure *writes, struct gc_stat *gc) 1328 1353 { 1329 - void write(struct btree *r) 1330 - { 1331 - if (!r->written) 1332 - bch_btree_node_write(r, &op->cl); 1333 - else if (btree_node_dirty(r)) 1334 - bch_btree_node_write(r, writes); 1335 - 1336 - up_write(&r->lock); 1337 - } 1338 - 1339 - int ret = 0, stale; 1340 1354 unsigned i; 1355 + int ret = 0; 1356 + bool should_rewrite; 1357 + struct btree *n; 1358 + struct bkey *k; 1359 + struct keylist keys; 1360 + struct btree_iter iter; 1341 1361 struct gc_merge_info r[GC_MERGE_NODES]; 1362 + struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1342 1363 1343 - memset(r, 0, sizeof(r)); 1364 + 
bch_keylist_init(&keys); 1365 + bch_btree_iter_init(b, &iter, &b->c->gc_done); 1344 1366 1345 - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1346 - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1367 + for (i = 0; i < GC_MERGE_NODES; i++) 1368 + r[i].b = ERR_PTR(-EINTR); 1347 1369 1348 - if (IS_ERR(r->b)) { 1349 - ret = PTR_ERR(r->b); 1350 - break; 1370 + while (1) { 1371 + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1372 + if (k) { 1373 + r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1374 + if (IS_ERR(r->b)) { 1375 + ret = PTR_ERR(r->b); 1376 + break; 1377 + } 1378 + 1379 + r->keys = btree_gc_count_keys(r->b); 1380 + 1381 + ret = btree_gc_coalesce(b, op, &keys, gc, r); 1382 + if (ret) 1383 + break; 1351 1384 } 1352 1385 1353 - r->keys = 0; 1354 - stale = btree_gc_mark_node(r->b, &r->keys, gc); 1355 - 1356 - if (!b->written && 1357 - (r->b->level || stale > 10 || 1358 - b->c->gc_always_rewrite)) 1359 - r->b = btree_gc_alloc(r->b, r->k, op); 1360 - 1361 - if (r->b->level) 1362 - ret = btree_gc_recurse(r->b, op, writes, gc); 1363 - 1364 - if (ret) { 1365 - write(r->b); 1386 + if (!last->b) 1366 1387 break; 1388 + 1389 + if (!IS_ERR(last->b)) { 1390 + should_rewrite = btree_gc_mark_node(last->b, gc); 1391 + if (should_rewrite) { 1392 + n = btree_node_alloc_replacement(last->b, 1393 + false); 1394 + 1395 + if (!IS_ERR_OR_NULL(n)) { 1396 + bch_btree_node_write_sync(n); 1397 + bch_keylist_add(&keys, &n->key); 1398 + 1399 + make_btree_freeing_key(last->b, 1400 + keys.top); 1401 + bch_keylist_push(&keys); 1402 + 1403 + btree_node_free(last->b); 1404 + 1405 + bch_btree_insert_node(b, op, &keys, 1406 + NULL, NULL); 1407 + BUG_ON(!bch_keylist_empty(&keys)); 1408 + 1409 + rw_unlock(true, last->b); 1410 + last->b = n; 1411 + 1412 + /* Invalidated our iterator */ 1413 + ret = -EINTR; 1414 + break; 1415 + } 1416 + } 1417 + 1418 + if (last->b->level) { 1419 + ret = btree_gc_recurse(last->b, op, writes, gc); 1420 + if (ret) 1421 + break; 
1422 + } 1423 + 1424 + bkey_copy_key(&b->c->gc_done, &last->b->key); 1425 + 1426 + /* 1427 + * Must flush leaf nodes before gc ends, since replace 1428 + * operations aren't journalled 1429 + */ 1430 + if (btree_node_dirty(last->b)) 1431 + bch_btree_node_write(last->b, writes); 1432 + rw_unlock(true, last->b); 1367 1433 } 1368 1434 1369 - bkey_copy_key(&b->c->gc_done, r->k); 1435 + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); 1436 + r->b = NULL; 1370 1437 1371 - if (!b->written) 1372 - btree_gc_coalesce(b, op, gc, r); 1373 - 1374 - if (r[GC_MERGE_NODES - 1].b) 1375 - write(r[GC_MERGE_NODES - 1].b); 1376 - 1377 - memmove(&r[1], &r[0], 1378 - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1379 - 1380 - /* When we've got incremental GC working, we'll want to do 1381 - * if (should_resched()) 1382 - * return -EAGAIN; 1383 - */ 1384 - cond_resched(); 1385 - #if 0 1386 1438 if (need_resched()) { 1387 1439 ret = -EAGAIN; 1388 1440 break; 1389 1441 } 1390 - #endif 1391 1442 } 1392 1443 1393 - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1394 - write(r[i].b); 1444 + for (i = 0; i < GC_MERGE_NODES; i++) 1445 + if (!IS_ERR_OR_NULL(r[i].b)) { 1446 + if (btree_node_dirty(r[i].b)) 1447 + bch_btree_node_write(r[i].b, writes); 1448 + rw_unlock(true, r[i].b); 1449 + } 1395 1450 1396 - /* Might have freed some children, must remove their keys */ 1397 - if (!b->written) 1398 - bch_btree_sort(b); 1451 + bch_keylist_free(&keys); 1399 1452 1400 1453 return ret; 1401 1454 } ··· 1485 1376 struct closure *writes, struct gc_stat *gc) 1486 1377 { 1487 1378 struct btree *n = NULL; 1488 - unsigned keys = 0; 1489 - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1379 + int ret = 0; 1380 + bool should_rewrite; 1490 1381 1491 - if (b->level || stale > 10) 1492 - n = btree_node_alloc_replacement(b, NULL); 1382 + should_rewrite = btree_gc_mark_node(b, gc); 1383 + if (should_rewrite) { 1384 + n = btree_node_alloc_replacement(b, false); 1493 1385 1494 - if 
(!IS_ERR_OR_NULL(n)) 1495 - swap(b, n); 1386 + if (!IS_ERR_OR_NULL(n)) { 1387 + bch_btree_node_write_sync(n); 1388 + bch_btree_set_root(n); 1389 + btree_node_free(b); 1390 + rw_unlock(true, n); 1496 1391 1497 - if (b->level) 1392 + return -EINTR; 1393 + } 1394 + } 1395 + 1396 + if (b->level) { 1498 1397 ret = btree_gc_recurse(b, op, writes, gc); 1499 - 1500 - if (!b->written || btree_node_dirty(b)) { 1501 - bch_btree_node_write(b, n ? &op->cl : NULL); 1398 + if (ret) 1399 + return ret; 1502 1400 } 1503 1401 1504 - if (!IS_ERR_OR_NULL(n)) { 1505 - closure_sync(&op->cl); 1506 - bch_btree_set_root(b); 1507 - btree_node_free(n, op); 1508 - rw_unlock(true, b); 1509 - } 1402 + bkey_copy_key(&b->c->gc_done, &b->key); 1510 1403 1511 1404 return ret; 1512 1405 } ··· 1590 1479 return available; 1591 1480 } 1592 1481 1593 - static void bch_btree_gc(struct closure *cl) 1482 + static void bch_btree_gc(struct cache_set *c) 1594 1483 { 1595 - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 1596 1484 int ret; 1597 1485 unsigned long available; 1598 1486 struct gc_stat stats; ··· 1603 1493 1604 1494 memset(&stats, 0, sizeof(struct gc_stat)); 1605 1495 closure_init_stack(&writes); 1606 - bch_btree_op_init_stack(&op); 1607 - op.lock = SHRT_MAX; 1496 + bch_btree_op_init(&op, SHRT_MAX); 1608 1497 1609 1498 btree_gc_start(c); 1610 1499 1611 - atomic_inc(&c->prio_blocked); 1500 + do { 1501 + ret = btree_root(gc_root, c, &op, &writes, &stats); 1502 + closure_sync(&writes); 1612 1503 1613 - ret = btree_root(gc_root, c, &op, &writes, &stats); 1614 - closure_sync(&op.cl); 1615 - closure_sync(&writes); 1616 - 1617 - if (ret) { 1618 - pr_warn("gc failed!"); 1619 - continue_at(cl, bch_btree_gc, bch_gc_wq); 1620 - } 1621 - 1622 - /* Possibly wait for new UUIDs or whatever to hit disk */ 1623 - bch_journal_meta(c, &op.cl); 1624 - closure_sync(&op.cl); 1504 + if (ret && ret != -EAGAIN) 1505 + pr_warn("gc failed!"); 1506 + } while (ret); 1625 1507 1626 1508 available = 
bch_btree_gc_finish(c); 1627 - 1628 - atomic_dec(&c->prio_blocked); 1629 1509 wake_up_allocators(c); 1630 1510 1631 1511 bch_time_stats_update(&c->btree_gc_time, start_time); 1632 1512 1633 1513 stats.key_bytes *= sizeof(uint64_t); 1634 - stats.dirty <<= 9; 1635 1514 stats.data <<= 9; 1636 1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1637 1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1638 1517 1639 1518 trace_bcache_gc_end(c); 1640 1519 1641 - continue_at(cl, bch_moving_gc, bch_gc_wq); 1520 + bch_moving_gc(c); 1642 1521 } 1643 1522 1644 - void bch_queue_gc(struct cache_set *c) 1523 + static int bch_gc_thread(void *arg) 1645 1524 { 1646 - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1525 + struct cache_set *c = arg; 1526 + struct cache *ca; 1527 + unsigned i; 1528 + 1529 + while (1) { 1530 + again: 1531 + bch_btree_gc(c); 1532 + 1533 + set_current_state(TASK_INTERRUPTIBLE); 1534 + if (kthread_should_stop()) 1535 + break; 1536 + 1537 + mutex_lock(&c->bucket_lock); 1538 + 1539 + for_each_cache(ca, c, i) 1540 + if (ca->invalidate_needs_gc) { 1541 + mutex_unlock(&c->bucket_lock); 1542 + set_current_state(TASK_RUNNING); 1543 + goto again; 1544 + } 1545 + 1546 + mutex_unlock(&c->bucket_lock); 1547 + 1548 + try_to_freeze(); 1549 + schedule(); 1550 + } 1551 + 1552 + return 0; 1553 + } 1554 + 1555 + int bch_gc_thread_start(struct cache_set *c) 1556 + { 1557 + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); 1558 + if (IS_ERR(c->gc_thread)) 1559 + return PTR_ERR(c->gc_thread); 1560 + 1561 + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); 1562 + return 0; 1647 1563 } 1648 1564 1649 1565 /* Initial partial gc */ ··· 1677 1541 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1678 1542 unsigned long **seen) 1679 1543 { 1680 - int ret; 1544 + int ret = 0; 1681 1545 unsigned i; 1682 - struct bkey *k; 1546 + struct bkey *k, *p = NULL; 1683 1547 struct bucket *g; 1684 1548 struct btree_iter 
iter; 1685 1549 ··· 1706 1570 } 1707 1571 1708 1572 if (b->level) { 1709 - k = bch_next_recurse_key(b, &ZERO_KEY); 1573 + bch_btree_iter_init(b, &iter, NULL); 1710 1574 1711 - while (k) { 1712 - struct bkey *p = bch_next_recurse_key(b, k); 1575 + do { 1576 + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1577 + if (k) 1578 + btree_node_prefetch(b->c, k, b->level - 1); 1579 + 1713 1580 if (p) 1714 - btree_node_prefetch(b->c, p, b->level - 1); 1581 + ret = btree(check_recurse, p, b, op, seen); 1715 1582 1716 - ret = btree(check_recurse, k, b, op, seen); 1717 - if (ret) 1718 - return ret; 1719 - 1720 - k = p; 1721 - } 1583 + p = k; 1584 + } while (p && !ret); 1722 1585 } 1723 1586 1724 1587 return 0; 1725 1588 } 1726 1589 1727 - int bch_btree_check(struct cache_set *c, struct btree_op *op) 1590 + int bch_btree_check(struct cache_set *c) 1728 1591 { 1729 1592 int ret = -ENOMEM; 1730 1593 unsigned i; 1731 1594 unsigned long *seen[MAX_CACHES_PER_SET]; 1595 + struct btree_op op; 1732 1596 1733 1597 memset(seen, 0, sizeof(seen)); 1598 + bch_btree_op_init(&op, SHRT_MAX); 1734 1599 1735 1600 for (i = 0; c->cache[i]; i++) { 1736 1601 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); ··· 1743 1606 memset(seen[i], 0xFF, n); 1744 1607 } 1745 1608 1746 - ret = btree_root(check_recurse, c, op, seen); 1609 + ret = btree_root(check_recurse, c, &op, seen); 1747 1610 err: 1748 1611 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1749 1612 kfree(seen[i]); ··· 1765 1628 bch_bset_fix_lookup_table(b, where); 1766 1629 } 1767 1630 1768 - static bool fix_overlapping_extents(struct btree *b, 1769 - struct bkey *insert, 1631 + static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, 1770 1632 struct btree_iter *iter, 1771 - struct btree_op *op) 1633 + struct bkey *replace_key) 1772 1634 { 1773 1635 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1774 1636 { ··· 1795 1659 * We might overlap with 0 size extents; we can't skip these 1796 1660 * because if 
they're in the set we're inserting to we have to 1797 1661 * adjust them so they don't overlap with the key we're 1798 - * inserting. But we don't want to check them for BTREE_REPLACE 1662 + * inserting. But we don't want to check them for replace 1799 1663 * operations. 1800 1664 */ 1801 1665 1802 - if (op->type == BTREE_REPLACE && 1803 - KEY_SIZE(k)) { 1666 + if (replace_key && KEY_SIZE(k)) { 1804 1667 /* 1805 1668 * k might have been split since we inserted/found the 1806 1669 * key we're replacing 1807 1670 */ 1808 1671 unsigned i; 1809 1672 uint64_t offset = KEY_START(k) - 1810 - KEY_START(&op->replace); 1673 + KEY_START(replace_key); 1811 1674 1812 1675 /* But it must be a subset of the replace key */ 1813 - if (KEY_START(k) < KEY_START(&op->replace) || 1814 - KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1676 + if (KEY_START(k) < KEY_START(replace_key) || 1677 + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) 1815 1678 goto check_failed; 1816 1679 1817 1680 /* We didn't find a key that we were supposed to */ 1818 1681 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1819 1682 goto check_failed; 1820 1683 1821 - if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1684 + if (KEY_PTRS(replace_key) != KEY_PTRS(k)) 1822 1685 goto check_failed; 1823 1686 1824 1687 /* skip past gen */ 1825 1688 offset <<= 8; 1826 1689 1827 - BUG_ON(!KEY_PTRS(&op->replace)); 1690 + BUG_ON(!KEY_PTRS(replace_key)); 1828 1691 1829 - for (i = 0; i < KEY_PTRS(&op->replace); i++) 1830 - if (k->ptr[i] != op->replace.ptr[i] + offset) 1692 + for (i = 0; i < KEY_PTRS(replace_key); i++) 1693 + if (k->ptr[i] != replace_key->ptr[i] + offset) 1831 1694 goto check_failed; 1832 1695 1833 1696 sectors_found = KEY_OFFSET(k) - KEY_START(insert); ··· 1877 1742 if (bkey_cmp(insert, k) < 0) { 1878 1743 bch_cut_front(insert, k); 1879 1744 } else { 1745 + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) 1746 + old_offset = KEY_START(insert); 1747 + 1880 1748 if (bkey_written(b, k) && 1881 1749 
bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1882 1750 /* ··· 1897 1759 } 1898 1760 1899 1761 check_failed: 1900 - if (op->type == BTREE_REPLACE) { 1762 + if (replace_key) { 1901 1763 if (!sectors_found) { 1902 - op->insert_collision = true; 1903 1764 return true; 1904 1765 } else if (sectors_found < KEY_SIZE(insert)) { 1905 1766 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - ··· 1911 1774 } 1912 1775 1913 1776 static bool btree_insert_key(struct btree *b, struct btree_op *op, 1914 - struct bkey *k) 1777 + struct bkey *k, struct bkey *replace_key) 1915 1778 { 1916 1779 struct bset *i = b->sets[b->nsets].data; 1917 1780 struct bkey *m, *prev; ··· 1923 1786 1924 1787 if (!b->level) { 1925 1788 struct btree_iter iter; 1926 - struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); 1927 1789 1928 1790 /* 1929 1791 * bset_search() returns the first key that is strictly greater 1930 1792 * than the search key - but for back merging, we want to find 1931 - * the first key that is greater than or equal to KEY_START(k) - 1932 - * unless KEY_START(k) is 0. 1793 + * the previous key. 
1933 1794 */ 1934 - if (KEY_OFFSET(&search)) 1935 - SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); 1936 - 1937 1795 prev = NULL; 1938 - m = bch_btree_iter_init(b, &iter, &search); 1796 + m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); 1939 1797 1940 - if (fix_overlapping_extents(b, k, &iter, op)) 1798 + if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1799 + op->insert_collision = true; 1941 1800 return false; 1801 + } 1802 + 1803 + if (KEY_DIRTY(k)) 1804 + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1805 + KEY_START(k), KEY_SIZE(k)); 1942 1806 1943 1807 while (m != end(i) && 1944 1808 bkey_cmp(k, &START_KEY(m)) > 0) ··· 1963 1825 if (m != end(i) && 1964 1826 bch_bkey_try_merge(b, k, m)) 1965 1827 goto copy; 1966 - } else 1828 + } else { 1829 + BUG_ON(replace_key); 1967 1830 m = bch_bset_search(b, &b->sets[b->nsets], k); 1831 + } 1968 1832 1969 1833 insert: shift_keys(b, m, k); 1970 1834 copy: bkey_copy(m, k); 1971 1835 merged: 1972 - if (KEY_DIRTY(k)) 1973 - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1974 - KEY_START(k), KEY_SIZE(k)); 1975 - 1976 - bch_check_keys(b, "%u for %s", status, op_type(op)); 1836 + bch_check_keys(b, "%u for %s", status, 1837 + replace_key ? 
"replace" : "insert"); 1977 1838 1978 1839 if (b->level && !KEY_OFFSET(k)) 1979 1840 btree_current_write(b)->prio_blocked++; 1980 1841 1981 - trace_bcache_btree_insert_key(b, k, op->type, status); 1842 + trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); 1982 1843 1983 1844 return true; 1984 1845 } 1985 1846 1986 - static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1847 + static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1848 + struct keylist *insert_keys, 1849 + struct bkey *replace_key) 1987 1850 { 1988 1851 bool ret = false; 1989 - struct bkey *k; 1990 - unsigned oldsize = bch_count_data(b); 1852 + int oldsize = bch_count_data(b); 1991 1853 1992 - while ((k = bch_keylist_pop(&op->keys))) { 1993 - bkey_put(b->c, k, b->level); 1994 - ret |= btree_insert_key(b, op, k); 1854 + while (!bch_keylist_empty(insert_keys)) { 1855 + struct bset *i = write_block(b); 1856 + struct bkey *k = insert_keys->keys; 1857 + 1858 + if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1859 + > btree_blocks(b)) 1860 + break; 1861 + 1862 + if (bkey_cmp(k, &b->key) <= 0) { 1863 + if (!b->level) 1864 + bkey_put(b->c, k); 1865 + 1866 + ret |= btree_insert_key(b, op, k, replace_key); 1867 + bch_keylist_pop_front(insert_keys); 1868 + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1869 + BKEY_PADDED(key) temp; 1870 + bkey_copy(&temp.key, insert_keys->keys); 1871 + 1872 + bch_cut_back(&b->key, &temp.key); 1873 + bch_cut_front(&b->key, insert_keys->keys); 1874 + 1875 + ret |= btree_insert_key(b, op, &temp.key, replace_key); 1876 + break; 1877 + } else { 1878 + break; 1879 + } 1995 1880 } 1881 + 1882 + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1996 1883 1997 1884 BUG_ON(bch_count_data(b) < oldsize); 1998 1885 return ret; 1999 1886 } 2000 1887 2001 - bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 2002 - struct bio *bio) 1888 + static int btree_split(struct btree *b, struct btree_op *op, 1889 + 
struct keylist *insert_keys, 1890 + struct bkey *replace_key) 2003 1891 { 2004 - bool ret = false; 2005 - uint64_t btree_ptr = b->key.ptr[0]; 2006 - unsigned long seq = b->seq; 2007 - BKEY_PADDED(k) tmp; 2008 - 2009 - rw_unlock(false, b); 2010 - rw_lock(true, b, b->level); 2011 - 2012 - if (b->key.ptr[0] != btree_ptr || 2013 - b->seq != seq + 1 || 2014 - should_split(b)) 2015 - goto out; 2016 - 2017 - op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); 2018 - 2019 - SET_KEY_PTRS(&op->replace, 1); 2020 - get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 2021 - 2022 - SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 2023 - 2024 - bkey_copy(&tmp.k, &op->replace); 2025 - 2026 - BUG_ON(op->type != BTREE_INSERT); 2027 - BUG_ON(!btree_insert_key(b, op, &tmp.k)); 2028 - ret = true; 2029 - out: 2030 - downgrade_write(&b->lock); 2031 - return ret; 2032 - } 2033 - 2034 - static int btree_split(struct btree *b, struct btree_op *op) 2035 - { 2036 - bool split, root = b == b->c->root; 1892 + bool split; 2037 1893 struct btree *n1, *n2 = NULL, *n3 = NULL; 2038 1894 uint64_t start_time = local_clock(); 1895 + struct closure cl; 1896 + struct keylist parent_keys; 2039 1897 2040 - if (b->level) 2041 - set_closure_blocking(&op->cl); 1898 + closure_init_stack(&cl); 1899 + bch_keylist_init(&parent_keys); 2042 1900 2043 - n1 = btree_node_alloc_replacement(b, &op->cl); 1901 + n1 = btree_node_alloc_replacement(b, true); 2044 1902 if (IS_ERR(n1)) 2045 1903 goto err; 2046 1904 ··· 2047 1913 2048 1914 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 2049 1915 2050 - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1916 + n2 = bch_btree_node_alloc(b->c, b->level, true); 2051 1917 if (IS_ERR(n2)) 2052 1918 goto err_free1; 2053 1919 2054 - if (root) { 2055 - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 1920 + if (!b->parent) { 1921 + n3 = bch_btree_node_alloc(b->c, b->level + 1, true); 2056 1922 if (IS_ERR(n3)) 2057 1923 goto err_free2; 2058 1924 } 
2059 1925 2060 - bch_btree_insert_keys(n1, op); 1926 + bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2061 1927 2062 - /* Has to be a linear search because we don't have an auxiliary 1928 + /* 1929 + * Has to be a linear search because we don't have an auxiliary 2063 1930 * search tree yet 2064 1931 */ 2065 1932 ··· 2079 1944 2080 1945 bkey_copy_key(&n2->key, &b->key); 2081 1946 2082 - bch_keylist_add(&op->keys, &n2->key); 2083 - bch_btree_node_write(n2, &op->cl); 1947 + bch_keylist_add(&parent_keys, &n2->key); 1948 + bch_btree_node_write(n2, &cl); 2084 1949 rw_unlock(true, n2); 2085 1950 } else { 2086 1951 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 2087 1952 2088 - bch_btree_insert_keys(n1, op); 1953 + bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2089 1954 } 2090 1955 2091 - bch_keylist_add(&op->keys, &n1->key); 2092 - bch_btree_node_write(n1, &op->cl); 1956 + bch_keylist_add(&parent_keys, &n1->key); 1957 + bch_btree_node_write(n1, &cl); 2093 1958 2094 1959 if (n3) { 1960 + /* Depth increases, make a new root */ 2095 1961 bkey_copy_key(&n3->key, &MAX_KEY); 2096 - bch_btree_insert_keys(n3, op); 2097 - bch_btree_node_write(n3, &op->cl); 1962 + bch_btree_insert_keys(n3, op, &parent_keys, NULL); 1963 + bch_btree_node_write(n3, &cl); 2098 1964 2099 - closure_sync(&op->cl); 1965 + closure_sync(&cl); 2100 1966 bch_btree_set_root(n3); 2101 1967 rw_unlock(true, n3); 2102 - } else if (root) { 2103 - op->keys.top = op->keys.bottom; 2104 - closure_sync(&op->cl); 1968 + 1969 + btree_node_free(b); 1970 + } else if (!b->parent) { 1971 + /* Root filled up but didn't need to be split */ 1972 + closure_sync(&cl); 2105 1973 bch_btree_set_root(n1); 1974 + 1975 + btree_node_free(b); 2106 1976 } else { 2107 - unsigned i; 1977 + /* Split a non root node */ 1978 + closure_sync(&cl); 1979 + make_btree_freeing_key(b, parent_keys.top); 1980 + bch_keylist_push(&parent_keys); 2108 1981 2109 - bkey_copy(op->keys.top, &b->key); 2110 - 
bkey_copy_key(op->keys.top, &ZERO_KEY); 1982 + btree_node_free(b); 2111 1983 2112 - for (i = 0; i < KEY_PTRS(&b->key); i++) { 2113 - uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 2114 - 2115 - SET_PTR_GEN(op->keys.top, i, g); 2116 - } 2117 - 2118 - bch_keylist_push(&op->keys); 2119 - closure_sync(&op->cl); 2120 - atomic_inc(&b->c->prio_blocked); 1984 + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); 1985 + BUG_ON(!bch_keylist_empty(&parent_keys)); 2121 1986 } 2122 1987 2123 1988 rw_unlock(true, n1); 2124 - btree_node_free(b, op); 2125 1989 2126 1990 bch_time_stats_update(&b->c->btree_split_time, start_time); 2127 1991 2128 1992 return 0; 2129 1993 err_free2: 2130 - __bkey_put(n2->c, &n2->key); 2131 - btree_node_free(n2, op); 1994 + btree_node_free(n2); 2132 1995 rw_unlock(true, n2); 2133 1996 err_free1: 2134 - __bkey_put(n1->c, &n1->key); 2135 - btree_node_free(n1, op); 1997 + btree_node_free(n1); 2136 1998 rw_unlock(true, n1); 2137 1999 err: 2138 2000 if (n3 == ERR_PTR(-EAGAIN) || ··· 2141 2009 return -ENOMEM; 2142 2010 } 2143 2011 2144 - static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2145 - struct keylist *stack_keys) 2012 + static int bch_btree_insert_node(struct btree *b, struct btree_op *op, 2013 + struct keylist *insert_keys, 2014 + atomic_t *journal_ref, 2015 + struct bkey *replace_key) 2146 2016 { 2147 - if (b->level) { 2148 - int ret; 2149 - struct bkey *insert = op->keys.bottom; 2150 - struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); 2017 + BUG_ON(b->level && replace_key); 2151 2018 2152 - if (!k) { 2153 - btree_bug(b, "no key to recurse on at level %i/%i", 2154 - b->level, b->c->root->level); 2155 - 2156 - op->keys.top = op->keys.bottom; 2157 - return -EIO; 2019 + if (should_split(b)) { 2020 + if (current->bio_list) { 2021 + op->lock = b->c->root->level + 1; 2022 + return -EAGAIN; 2023 + } else if (op->lock <= b->c->root->level) { 2024 + op->lock = b->c->root->level + 1; 2025 + return -EINTR; 
2026 + } else { 2027 + /* Invalidated all iterators */ 2028 + return btree_split(b, op, insert_keys, replace_key) ?: 2029 + -EINTR; 2158 2030 } 2159 - 2160 - if (bkey_cmp(insert, k) > 0) { 2161 - unsigned i; 2162 - 2163 - if (op->type == BTREE_REPLACE) { 2164 - __bkey_put(b->c, insert); 2165 - op->keys.top = op->keys.bottom; 2166 - op->insert_collision = true; 2167 - return 0; 2168 - } 2169 - 2170 - for (i = 0; i < KEY_PTRS(insert); i++) 2171 - atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2172 - 2173 - bkey_copy(stack_keys->top, insert); 2174 - 2175 - bch_cut_back(k, insert); 2176 - bch_cut_front(k, stack_keys->top); 2177 - 2178 - bch_keylist_push(stack_keys); 2179 - } 2180 - 2181 - ret = btree(insert_recurse, k, b, op, stack_keys); 2182 - if (ret) 2183 - return ret; 2184 - } 2185 - 2186 - if (!bch_keylist_empty(&op->keys)) { 2187 - if (should_split(b)) { 2188 - if (op->lock <= b->c->root->level) { 2189 - BUG_ON(b->level); 2190 - op->lock = b->c->root->level + 1; 2191 - return -EINTR; 2192 - } 2193 - return btree_split(b, op); 2194 - } 2195 - 2031 + } else { 2196 2032 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2197 2033 2198 - if (bch_btree_insert_keys(b, op)) { 2034 + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2199 2035 if (!b->level) 2200 - bch_btree_leaf_dirty(b, op); 2036 + bch_btree_leaf_dirty(b, journal_ref); 2201 2037 else 2202 - bch_btree_node_write(b, &op->cl); 2038 + bch_btree_node_write_sync(b); 2203 2039 } 2204 - } 2205 2040 2206 - return 0; 2041 + return 0; 2042 + } 2207 2043 } 2208 2044 2209 - int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2045 + int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 2046 + struct bkey *check_key) 2210 2047 { 2211 - int ret = 0; 2212 - struct keylist stack_keys; 2048 + int ret = -EINTR; 2049 + uint64_t btree_ptr = b->key.ptr[0]; 2050 + unsigned long seq = b->seq; 2051 + struct keylist insert; 2052 + bool upgrade = op->lock == -1; 2213 2053 2214 - /* 2215 - * 
Don't want to block with the btree locked unless we have to, 2216 - * otherwise we get deadlocks with try_harder and between split/gc 2217 - */ 2218 - clear_closure_blocking(&op->cl); 2054 + bch_keylist_init(&insert); 2219 2055 2220 - BUG_ON(bch_keylist_empty(&op->keys)); 2221 - bch_keylist_copy(&stack_keys, &op->keys); 2222 - bch_keylist_init(&op->keys); 2056 + if (upgrade) { 2057 + rw_unlock(false, b); 2058 + rw_lock(true, b, b->level); 2223 2059 2224 - while (!bch_keylist_empty(&stack_keys) || 2225 - !bch_keylist_empty(&op->keys)) { 2226 - if (bch_keylist_empty(&op->keys)) { 2227 - bch_keylist_add(&op->keys, 2228 - bch_keylist_pop(&stack_keys)); 2229 - op->lock = 0; 2230 - } 2231 - 2232 - ret = btree_root(insert_recurse, c, op, &stack_keys); 2233 - 2234 - if (ret == -EAGAIN) { 2235 - ret = 0; 2236 - closure_sync(&op->cl); 2237 - } else if (ret) { 2238 - struct bkey *k; 2239 - 2240 - pr_err("error %i trying to insert key for %s", 2241 - ret, op_type(op)); 2242 - 2243 - while ((k = bch_keylist_pop(&stack_keys) ?: 2244 - bch_keylist_pop(&op->keys))) 2245 - bkey_put(c, k, 0); 2246 - } 2060 + if (b->key.ptr[0] != btree_ptr || 2061 + b->seq != seq + 1) 2062 + goto out; 2247 2063 } 2248 2064 2249 - bch_keylist_free(&stack_keys); 2065 + SET_KEY_PTRS(check_key, 1); 2066 + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); 2250 2067 2251 - if (op->journal) 2252 - atomic_dec_bug(op->journal); 2253 - op->journal = NULL; 2068 + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); 2069 + 2070 + bch_keylist_add(&insert, check_key); 2071 + 2072 + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); 2073 + 2074 + BUG_ON(!ret && !bch_keylist_empty(&insert)); 2075 + out: 2076 + if (upgrade) 2077 + downgrade_write(&b->lock); 2078 + return ret; 2079 + } 2080 + 2081 + struct btree_insert_op { 2082 + struct btree_op op; 2083 + struct keylist *keys; 2084 + atomic_t *journal_ref; 2085 + struct bkey *replace_key; 2086 + }; 2087 + 2088 + int btree_insert_fn(struct btree_op *b_op, struct 
btree *b) 2089 + { 2090 + struct btree_insert_op *op = container_of(b_op, 2091 + struct btree_insert_op, op); 2092 + 2093 + int ret = bch_btree_insert_node(b, &op->op, op->keys, 2094 + op->journal_ref, op->replace_key); 2095 + if (ret && !bch_keylist_empty(op->keys)) 2096 + return ret; 2097 + else 2098 + return MAP_DONE; 2099 + } 2100 + 2101 + int bch_btree_insert(struct cache_set *c, struct keylist *keys, 2102 + atomic_t *journal_ref, struct bkey *replace_key) 2103 + { 2104 + struct btree_insert_op op; 2105 + int ret = 0; 2106 + 2107 + BUG_ON(current->bio_list); 2108 + BUG_ON(bch_keylist_empty(keys)); 2109 + 2110 + bch_btree_op_init(&op.op, 0); 2111 + op.keys = keys; 2112 + op.journal_ref = journal_ref; 2113 + op.replace_key = replace_key; 2114 + 2115 + while (!ret && !bch_keylist_empty(keys)) { 2116 + op.op.lock = 0; 2117 + ret = bch_btree_map_leaf_nodes(&op.op, c, 2118 + &START_KEY(keys->keys), 2119 + btree_insert_fn); 2120 + } 2121 + 2122 + if (ret) { 2123 + struct bkey *k; 2124 + 2125 + pr_err("error %i", ret); 2126 + 2127 + while ((k = bch_keylist_pop(keys))) 2128 + bkey_put(c, k); 2129 + } else if (op.op.insert_collision) 2130 + ret = -ESRCH; 2131 + 2254 2132 return ret; 2255 2133 } 2256 2134 ··· 2283 2141 mutex_unlock(&b->c->bucket_lock); 2284 2142 2285 2143 b->c->root = b; 2286 - __bkey_put(b->c, &b->key); 2287 2144 2288 2145 bch_journal_meta(b->c, &cl); 2289 2146 closure_sync(&cl); 2290 2147 } 2291 2148 2292 - /* Cache lookup */ 2149 + /* Map across nodes or keys */ 2293 2150 2294 - static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2295 - struct bkey *k) 2151 + static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, 2152 + struct bkey *from, 2153 + btree_map_nodes_fn *fn, int flags) 2296 2154 { 2297 - struct search *s = container_of(op, struct search, op); 2298 - struct bio *bio = &s->bio.bio; 2299 - int ret = 0; 2155 + int ret = MAP_CONTINUE; 2300 2156 2301 - while (!ret && 2302 - !op->lookup_done) { 2303 - 
unsigned sectors = INT_MAX; 2157 + if (b->level) { 2158 + struct bkey *k; 2159 + struct btree_iter iter; 2304 2160 2305 - if (KEY_INODE(k) == op->inode) { 2306 - if (KEY_START(k) <= bio->bi_sector) 2307 - break; 2161 + bch_btree_iter_init(b, &iter, from); 2308 2162 2309 - sectors = min_t(uint64_t, sectors, 2310 - KEY_START(k) - bio->bi_sector); 2163 + while ((k = bch_btree_iter_next_filter(&iter, b, 2164 + bch_ptr_bad))) { 2165 + ret = btree(map_nodes_recurse, k, b, 2166 + op, from, fn, flags); 2167 + from = NULL; 2168 + 2169 + if (ret != MAP_CONTINUE) 2170 + return ret; 2311 2171 } 2312 - 2313 - ret = s->d->cache_miss(b, s, bio, sectors); 2314 2172 } 2173 + 2174 + if (!b->level || flags == MAP_ALL_NODES) 2175 + ret = fn(op, b); 2315 2176 2316 2177 return ret; 2317 2178 } 2318 2179 2319 - /* 2320 - * Read from a single key, handling the initial cache miss if the key starts in 2321 - * the middle of the bio 2322 - */ 2323 - static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, 2324 - struct bkey *k) 2180 + int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, 2181 + struct bkey *from, btree_map_nodes_fn *fn, int flags) 2325 2182 { 2326 - struct search *s = container_of(op, struct search, op); 2327 - struct bio *bio = &s->bio.bio; 2328 - unsigned ptr; 2329 - struct bio *n; 2330 - 2331 - int ret = submit_partial_cache_miss(b, op, k); 2332 - if (ret || op->lookup_done) 2333 - return ret; 2334 - 2335 - /* XXX: figure out best pointer - for multiple cache devices */ 2336 - ptr = 0; 2337 - 2338 - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 2339 - 2340 - while (!op->lookup_done && 2341 - KEY_INODE(k) == op->inode && 2342 - bio->bi_sector < KEY_OFFSET(k)) { 2343 - struct bkey *bio_key; 2344 - sector_t sector = PTR_OFFSET(k, ptr) + 2345 - (bio->bi_sector - KEY_START(k)); 2346 - unsigned sectors = min_t(uint64_t, INT_MAX, 2347 - KEY_OFFSET(k) - bio->bi_sector); 2348 - 2349 - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2350 
- if (n == bio) 2351 - op->lookup_done = true; 2352 - 2353 - bio_key = &container_of(n, struct bbio, bio)->key; 2354 - 2355 - /* 2356 - * The bucket we're reading from might be reused while our bio 2357 - * is in flight, and we could then end up reading the wrong 2358 - * data. 2359 - * 2360 - * We guard against this by checking (in cache_read_endio()) if 2361 - * the pointer is stale again; if so, we treat it as an error 2362 - * and reread from the backing device (but we don't pass that 2363 - * error up anywhere). 2364 - */ 2365 - 2366 - bch_bkey_copy_single_ptr(bio_key, k, ptr); 2367 - SET_PTR_OFFSET(bio_key, 0, sector); 2368 - 2369 - n->bi_end_io = bch_cache_read_endio; 2370 - n->bi_private = &s->cl; 2371 - 2372 - __bch_submit_bbio(n, b->c); 2373 - } 2374 - 2375 - return 0; 2183 + return btree_root(map_nodes_recurse, c, op, from, fn, flags); 2376 2184 } 2377 2185 2378 - int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2186 + static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, 2187 + struct bkey *from, btree_map_keys_fn *fn, 2188 + int flags) 2379 2189 { 2380 - struct search *s = container_of(op, struct search, op); 2381 - struct bio *bio = &s->bio.bio; 2382 - 2383 - int ret = 0; 2190 + int ret = MAP_CONTINUE; 2384 2191 struct bkey *k; 2385 2192 struct btree_iter iter; 2386 - bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2387 2193 2388 - do { 2389 - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2390 - if (!k) { 2391 - /* 2392 - * b->key would be exactly what we want, except that 2393 - * pointers to btree nodes have nonzero size - we 2394 - * wouldn't go far enough 2395 - */ 2194 + bch_btree_iter_init(b, &iter, from); 2396 2195 2397 - ret = submit_partial_cache_miss(b, op, 2398 - &KEY(KEY_INODE(&b->key), 2399 - KEY_OFFSET(&b->key), 0)); 2400 - break; 2401 - } 2196 + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2197 + ret = !b->level 2198 + ? 
fn(op, b, k) 2199 + : btree(map_keys_recurse, k, b, op, from, fn, flags); 2200 + from = NULL; 2402 2201 2403 - ret = b->level 2404 - ? btree(search_recurse, k, b, op) 2405 - : submit_partial_cache_hit(b, op, k); 2406 - } while (!ret && 2407 - !op->lookup_done); 2202 + if (ret != MAP_CONTINUE) 2203 + return ret; 2204 + } 2205 + 2206 + if (!b->level && (flags & MAP_END_KEY)) 2207 + ret = fn(op, b, &KEY(KEY_INODE(&b->key), 2208 + KEY_OFFSET(&b->key), 0)); 2408 2209 2409 2210 return ret; 2211 + } 2212 + 2213 + int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, 2214 + struct bkey *from, btree_map_keys_fn *fn, int flags) 2215 + { 2216 + return btree_root(map_keys_recurse, c, op, from, fn, flags); 2410 2217 } 2411 2218 2412 2219 /* Keybuf code */ ··· 2376 2285 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2377 2286 } 2378 2287 2379 - static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2380 - struct keybuf *buf, struct bkey *end, 2381 - keybuf_pred_fn *pred) 2288 + struct refill { 2289 + struct btree_op op; 2290 + unsigned nr_found; 2291 + struct keybuf *buf; 2292 + struct bkey *end; 2293 + keybuf_pred_fn *pred; 2294 + }; 2295 + 2296 + static int refill_keybuf_fn(struct btree_op *op, struct btree *b, 2297 + struct bkey *k) 2382 2298 { 2383 - struct btree_iter iter; 2384 - bch_btree_iter_init(b, &iter, &buf->last_scanned); 2299 + struct refill *refill = container_of(op, struct refill, op); 2300 + struct keybuf *buf = refill->buf; 2301 + int ret = MAP_CONTINUE; 2385 2302 2386 - while (!array_freelist_empty(&buf->freelist)) { 2387 - struct bkey *k = bch_btree_iter_next_filter(&iter, b, 2388 - bch_ptr_bad); 2389 - 2390 - if (!b->level) { 2391 - if (!k) { 2392 - buf->last_scanned = b->key; 2393 - break; 2394 - } 2395 - 2396 - buf->last_scanned = *k; 2397 - if (bkey_cmp(&buf->last_scanned, end) >= 0) 2398 - break; 2399 - 2400 - if (pred(buf, k)) { 2401 - struct keybuf_key *w; 2402 - 2403 - spin_lock(&buf->lock); 2404 - 2405 - w = 
array_alloc(&buf->freelist); 2406 - 2407 - w->private = NULL; 2408 - bkey_copy(&w->key, k); 2409 - 2410 - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2411 - array_free(&buf->freelist, w); 2412 - 2413 - spin_unlock(&buf->lock); 2414 - } 2415 - } else { 2416 - if (!k) 2417 - break; 2418 - 2419 - btree(refill_keybuf, k, b, op, buf, end, pred); 2420 - /* 2421 - * Might get an error here, but can't really do anything 2422 - * and it'll get logged elsewhere. Just read what we 2423 - * can. 2424 - */ 2425 - 2426 - if (bkey_cmp(&buf->last_scanned, end) >= 0) 2427 - break; 2428 - 2429 - cond_resched(); 2430 - } 2303 + if (bkey_cmp(k, refill->end) >= 0) { 2304 + ret = MAP_DONE; 2305 + goto out; 2431 2306 } 2432 2307 2433 - return 0; 2308 + if (!KEY_SIZE(k)) /* end key */ 2309 + goto out; 2310 + 2311 + if (refill->pred(buf, k)) { 2312 + struct keybuf_key *w; 2313 + 2314 + spin_lock(&buf->lock); 2315 + 2316 + w = array_alloc(&buf->freelist); 2317 + if (!w) { 2318 + spin_unlock(&buf->lock); 2319 + return MAP_DONE; 2320 + } 2321 + 2322 + w->private = NULL; 2323 + bkey_copy(&w->key, k); 2324 + 2325 + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2326 + array_free(&buf->freelist, w); 2327 + else 2328 + refill->nr_found++; 2329 + 2330 + if (array_freelist_empty(&buf->freelist)) 2331 + ret = MAP_DONE; 2332 + 2333 + spin_unlock(&buf->lock); 2334 + } 2335 + out: 2336 + buf->last_scanned = *k; 2337 + return ret; 2434 2338 } 2435 2339 2436 2340 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2437 2341 struct bkey *end, keybuf_pred_fn *pred) 2438 2342 { 2439 2343 struct bkey start = buf->last_scanned; 2440 - struct btree_op op; 2441 - bch_btree_op_init_stack(&op); 2344 + struct refill refill; 2442 2345 2443 2346 cond_resched(); 2444 2347 2445 - btree_root(refill_keybuf, c, &op, buf, end, pred); 2446 - closure_sync(&op.cl); 2348 + bch_btree_op_init(&refill.op, -1); 2349 + refill.nr_found = 0; 2350 + refill.buf = buf; 2351 + refill.end = end; 2352 + refill.pred = 
pred; 2447 2353 2448 - pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2449 - RB_EMPTY_ROOT(&buf->keys) ? "no" : 2450 - array_freelist_empty(&buf->freelist) ? "some" : "a few", 2451 - KEY_INODE(&start), KEY_OFFSET(&start), 2452 - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); 2354 + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, 2355 + refill_keybuf_fn, MAP_END_KEY); 2356 + 2357 + trace_bcache_keyscan(refill.nr_found, 2358 + KEY_INODE(&start), KEY_OFFSET(&start), 2359 + KEY_INODE(&buf->last_scanned), 2360 + KEY_OFFSET(&buf->last_scanned)); 2453 2361 2454 2362 spin_lock(&buf->lock); 2455 2363 ··· 2526 2436 } 2527 2437 2528 2438 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2529 - struct keybuf *buf, 2530 - struct bkey *end, 2531 - keybuf_pred_fn *pred) 2439 + struct keybuf *buf, 2440 + struct bkey *end, 2441 + keybuf_pred_fn *pred) 2532 2442 { 2533 2443 struct keybuf_key *ret; 2534 2444 ··· 2561 2471 { 2562 2472 if (btree_io_wq) 2563 2473 destroy_workqueue(btree_io_wq); 2564 - if (bch_gc_wq) 2565 - destroy_workqueue(bch_gc_wq); 2566 2474 } 2567 2475 2568 2476 int __init bch_btree_init(void) 2569 2477 { 2570 - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2571 - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2478 + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); 2479 + if (!btree_io_wq) 2572 2480 return -ENOMEM; 2573 2481 2574 2482 return 0;
+67 -132
drivers/md/bcache/btree.h
··· 125 125 unsigned long seq; 126 126 struct rw_semaphore lock; 127 127 struct cache_set *c; 128 + struct btree *parent; 128 129 129 130 unsigned long flags; 130 131 uint16_t written; /* would be nice to kill */ ··· 201 200 202 201 static inline void set_gc_sectors(struct cache_set *c) 203 202 { 204 - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); 205 - } 206 - 207 - static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) 208 - { 209 - return __bch_ptr_invalid(b->c, b->level, k); 203 + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 210 204 } 211 205 212 206 static inline struct bkey *bch_btree_iter_init(struct btree *b, ··· 210 214 { 211 215 return __bch_btree_iter_init(b, iter, search, b->sets); 212 216 } 217 + 218 + static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) 219 + { 220 + if (b->level) 221 + return bch_btree_ptr_invalid(b->c, k); 222 + else 223 + return bch_extent_ptr_invalid(b->c, k); 224 + } 225 + 226 + void bkey_put(struct cache_set *c, struct bkey *k); 213 227 214 228 /* Looping macros */ 215 229 ··· 240 234 /* Recursing down the btree */ 241 235 242 236 struct btree_op { 243 - struct closure cl; 244 - struct cache_set *c; 245 - 246 - /* Journal entry we have a refcount on */ 247 - atomic_t *journal; 248 - 249 - /* Bio to be inserted into the cache */ 250 - struct bio *cache_bio; 251 - 252 - unsigned inode; 253 - 254 - uint16_t write_prio; 255 - 256 237 /* Btree level at which we start taking write locks */ 257 238 short lock; 258 239 259 - /* Btree insertion type */ 260 - enum { 261 - BTREE_INSERT, 262 - BTREE_REPLACE 263 - } type:8; 264 - 265 - unsigned csum:1; 266 - unsigned skip:1; 267 - unsigned flush_journal:1; 268 - 269 - unsigned insert_data_done:1; 270 - unsigned lookup_done:1; 271 240 unsigned insert_collision:1; 272 - 273 - /* Anything after this point won't get zeroed in do_bio_hook() */ 274 - 275 - /* Keys to be inserted */ 276 - struct keylist keys; 277 
- BKEY_PADDED(replace); 278 241 }; 279 242 280 - enum { 281 - BTREE_INSERT_STATUS_INSERT, 282 - BTREE_INSERT_STATUS_BACK_MERGE, 283 - BTREE_INSERT_STATUS_OVERWROTE, 284 - BTREE_INSERT_STATUS_FRONT_MERGE, 285 - }; 286 - 287 - void bch_btree_op_init_stack(struct btree_op *); 243 + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 244 + { 245 + memset(op, 0, sizeof(struct btree_op)); 246 + op->lock = write_lock_level; 247 + } 288 248 289 249 static inline void rw_lock(bool w, struct btree *b, int level) 290 250 { ··· 262 290 263 291 static inline void rw_unlock(bool w, struct btree *b) 264 292 { 265 - #ifdef CONFIG_BCACHE_EDEBUG 266 - unsigned i; 267 - 268 - if (w && b->key.ptr[0]) 269 - for (i = 0; i <= b->nsets; i++) 270 - bch_check_key_order(b, b->sets[i].data); 271 - #endif 272 - 273 293 if (w) 274 294 b->seq++; 275 295 (w ? up_write : up_read)(&b->lock); 276 296 } 277 297 278 - #define insert_lock(s, b) ((b)->level <= (s)->lock) 279 - 280 - /* 281 - * These macros are for recursing down the btree - they handle the details of 282 - * locking and looking up nodes in the cache for you. They're best treated as 283 - * mere syntax when reading code that uses them. 284 - * 285 - * op->lock determines whether we take a read or a write lock at a given depth. 286 - * If you've got a read lock and find that you need a write lock (i.e. you're 287 - * going to have to split), set op->lock and return -EINTR; btree_root() will 288 - * call you again and you'll have the correct lock. 289 - */ 290 - 291 - /** 292 - * btree - recurse down the btree on a specified key 293 - * @fn: function to call, which will be passed the child node 294 - * @key: key to recurse on 295 - * @b: parent btree node 296 - * @op: pointer to struct btree_op 297 - */ 298 - #define btree(fn, key, b, op, ...) 
\ 299 - ({ \ 300 - int _r, l = (b)->level - 1; \ 301 - bool _w = l <= (op)->lock; \ 302 - struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ 303 - if (!IS_ERR(_b)) { \ 304 - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 305 - rw_unlock(_w, _b); \ 306 - } else \ 307 - _r = PTR_ERR(_b); \ 308 - _r; \ 309 - }) 310 - 311 - /** 312 - * btree_root - call a function on the root of the btree 313 - * @fn: function to call, which will be passed the child node 314 - * @c: cache set 315 - * @op: pointer to struct btree_op 316 - */ 317 - #define btree_root(fn, c, op, ...) \ 318 - ({ \ 319 - int _r = -EINTR; \ 320 - do { \ 321 - struct btree *_b = (c)->root; \ 322 - bool _w = insert_lock(op, _b); \ 323 - rw_lock(_w, _b, _b->level); \ 324 - if (_b == (c)->root && \ 325 - _w == insert_lock(op, _b)) \ 326 - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 327 - rw_unlock(_w, _b); \ 328 - bch_cannibalize_unlock(c, &(op)->cl); \ 329 - } while (_r == -EINTR); \ 330 - \ 331 - _r; \ 332 - }) 333 - 334 - static inline bool should_split(struct btree *b) 335 - { 336 - struct bset *i = write_block(b); 337 - return b->written >= btree_blocks(b) || 338 - (i->seq == b->sets[0].data->seq && 339 - b->written + __set_blocks(i, i->keys + 15, b->c) 340 - > btree_blocks(b)); 341 - } 342 - 343 298 void bch_btree_node_read(struct btree *); 344 299 void bch_btree_node_write(struct btree *, struct closure *); 345 300 346 - void bch_cannibalize_unlock(struct cache_set *, struct closure *); 347 301 void bch_btree_set_root(struct btree *); 348 - struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); 349 - struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, 350 - int, struct btree_op *); 302 + struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); 303 + struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); 351 304 352 - bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 353 - struct bio *); 354 - int 
bch_btree_insert(struct btree_op *, struct cache_set *); 305 + int bch_btree_insert_check_key(struct btree *, struct btree_op *, 306 + struct bkey *); 307 + int bch_btree_insert(struct cache_set *, struct keylist *, 308 + atomic_t *, struct bkey *); 355 309 356 - int bch_btree_search_recurse(struct btree *, struct btree_op *); 357 - 358 - void bch_queue_gc(struct cache_set *); 310 + int bch_gc_thread_start(struct cache_set *); 359 311 size_t bch_btree_gc_finish(struct cache_set *); 360 - void bch_moving_gc(struct closure *); 361 - int bch_btree_check(struct cache_set *, struct btree_op *); 312 + void bch_moving_gc(struct cache_set *); 313 + int bch_btree_check(struct cache_set *); 362 314 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 363 315 316 + static inline void wake_up_gc(struct cache_set *c) 317 + { 318 + if (c->gc_thread) 319 + wake_up_process(c->gc_thread); 320 + } 321 + 322 + #define MAP_DONE 0 323 + #define MAP_CONTINUE 1 324 + 325 + #define MAP_ALL_NODES 0 326 + #define MAP_LEAF_NODES 1 327 + 328 + #define MAP_END_KEY 1 329 + 330 + typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); 331 + int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, 332 + struct bkey *, btree_map_nodes_fn *, int); 333 + 334 + static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, 335 + struct bkey *from, btree_map_nodes_fn *fn) 336 + { 337 + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); 338 + } 339 + 340 + static inline int bch_btree_map_leaf_nodes(struct btree_op *op, 341 + struct cache_set *c, 342 + struct bkey *from, 343 + btree_map_nodes_fn *fn) 344 + { 345 + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); 346 + } 347 + 348 + typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, 349 + struct bkey *); 350 + int bch_btree_map_keys(struct btree_op *, struct cache_set *, 351 + struct bkey *, btree_map_keys_fn *, int); 352 + 353 + typedef bool 
(keybuf_pred_fn)(struct keybuf *, struct bkey *); 354 + 364 355 void bch_keybuf_init(struct keybuf *); 365 - void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, 366 - keybuf_pred_fn *); 356 + void bch_refill_keybuf(struct cache_set *, struct keybuf *, 357 + struct bkey *, keybuf_pred_fn *); 367 358 bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 368 359 struct bkey *); 369 360 void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
+16 -87
drivers/md/bcache/closure.c
··· 11 11 12 12 #include "closure.h" 13 13 14 - void closure_queue(struct closure *cl) 15 - { 16 - struct workqueue_struct *wq = cl->wq; 17 - if (wq) { 18 - INIT_WORK(&cl->work, cl->work.func); 19 - BUG_ON(!queue_work(wq, &cl->work)); 20 - } else 21 - cl->fn(cl); 22 - } 23 - EXPORT_SYMBOL_GPL(closure_queue); 24 - 25 14 #define CL_FIELD(type, field) \ 26 15 case TYPE_ ## type: \ 27 16 return &container_of(cl, struct type, cl)->field ··· 19 30 { 20 31 switch (cl->type) { 21 32 CL_FIELD(closure_with_waitlist, wait); 22 - CL_FIELD(closure_with_waitlist_and_timer, wait); 23 - default: 24 - return NULL; 25 - } 26 - } 27 - 28 - static struct timer_list *closure_timer(struct closure *cl) 29 - { 30 - switch (cl->type) { 31 - CL_FIELD(closure_with_timer, timer); 32 - CL_FIELD(closure_with_waitlist_and_timer, timer); 33 33 default: 34 34 return NULL; 35 35 } ··· 29 51 int r = flags & CLOSURE_REMAINING_MASK; 30 52 31 53 BUG_ON(flags & CLOSURE_GUARD_MASK); 32 - BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); 54 + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); 33 55 34 56 /* Must deliver precisely one wakeup */ 35 57 if (r == 1 && (flags & CLOSURE_SLEEPING)) ··· 37 59 38 60 if (!r) { 39 61 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { 40 - /* CLOSURE_BLOCKING might be set - clear it */ 41 62 atomic_set(&cl->remaining, 42 63 CLOSURE_REMAINING_INITIALIZER); 43 64 closure_queue(cl); ··· 67 90 { 68 91 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); 69 92 } 70 - EXPORT_SYMBOL_GPL(closure_sub); 93 + EXPORT_SYMBOL(closure_sub); 71 94 72 95 void closure_put(struct closure *cl) 73 96 { 74 97 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 75 98 } 76 - EXPORT_SYMBOL_GPL(closure_put); 99 + EXPORT_SYMBOL(closure_put); 77 100 78 101 static void set_waiting(struct closure *cl, unsigned long f) 79 102 { ··· 110 133 closure_sub(cl, CLOSURE_WAITING + 1); 111 134 } 112 135 } 113 - EXPORT_SYMBOL_GPL(__closure_wake_up); 136 + 
EXPORT_SYMBOL(__closure_wake_up); 114 137 115 138 bool closure_wait(struct closure_waitlist *list, struct closure *cl) 116 139 { ··· 123 146 124 147 return true; 125 148 } 126 - EXPORT_SYMBOL_GPL(closure_wait); 149 + EXPORT_SYMBOL(closure_wait); 127 150 128 151 /** 129 152 * closure_sync() - sleep until a closure a closure has nothing left to wait on ··· 146 169 147 170 __closure_end_sleep(cl); 148 171 } 149 - EXPORT_SYMBOL_GPL(closure_sync); 172 + EXPORT_SYMBOL(closure_sync); 150 173 151 174 /** 152 175 * closure_trylock() - try to acquire the closure, without waiting ··· 160 183 CLOSURE_REMAINING_INITIALIZER) != -1) 161 184 return false; 162 185 163 - closure_set_ret_ip(cl); 164 - 165 186 smp_mb(); 187 + 166 188 cl->parent = parent; 167 189 if (parent) 168 190 closure_get(parent); 169 191 192 + closure_set_ret_ip(cl); 170 193 closure_debug_create(cl); 171 194 return true; 172 195 } 173 - EXPORT_SYMBOL_GPL(closure_trylock); 196 + EXPORT_SYMBOL(closure_trylock); 174 197 175 198 void __closure_lock(struct closure *cl, struct closure *parent, 176 199 struct closure_waitlist *wait_list) ··· 182 205 if (closure_trylock(cl, parent)) 183 206 return; 184 207 185 - closure_wait_event_sync(wait_list, &wait, 186 - atomic_read(&cl->remaining) == -1); 208 + closure_wait_event(wait_list, &wait, 209 + atomic_read(&cl->remaining) == -1); 187 210 } 188 211 } 189 - EXPORT_SYMBOL_GPL(__closure_lock); 190 - 191 - static void closure_delay_timer_fn(unsigned long data) 192 - { 193 - struct closure *cl = (struct closure *) data; 194 - closure_sub(cl, CLOSURE_TIMER + 1); 195 - } 196 - 197 - void do_closure_timer_init(struct closure *cl) 198 - { 199 - struct timer_list *timer = closure_timer(cl); 200 - 201 - init_timer(timer); 202 - timer->data = (unsigned long) cl; 203 - timer->function = closure_delay_timer_fn; 204 - } 205 - EXPORT_SYMBOL_GPL(do_closure_timer_init); 206 - 207 - bool __closure_delay(struct closure *cl, unsigned long delay, 208 - struct timer_list *timer) 209 - { 210 - if 
(atomic_read(&cl->remaining) & CLOSURE_TIMER) 211 - return false; 212 - 213 - BUG_ON(timer_pending(timer)); 214 - 215 - timer->expires = jiffies + delay; 216 - 217 - atomic_add(CLOSURE_TIMER + 1, &cl->remaining); 218 - add_timer(timer); 219 - return true; 220 - } 221 - EXPORT_SYMBOL_GPL(__closure_delay); 222 - 223 - void __closure_flush(struct closure *cl, struct timer_list *timer) 224 - { 225 - if (del_timer(timer)) 226 - closure_sub(cl, CLOSURE_TIMER + 1); 227 - } 228 - EXPORT_SYMBOL_GPL(__closure_flush); 229 - 230 - void __closure_flush_sync(struct closure *cl, struct timer_list *timer) 231 - { 232 - if (del_timer_sync(timer)) 233 - closure_sub(cl, CLOSURE_TIMER + 1); 234 - } 235 - EXPORT_SYMBOL_GPL(__closure_flush_sync); 212 + EXPORT_SYMBOL(__closure_lock); 236 213 237 214 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 238 215 ··· 204 273 list_add(&cl->all, &closure_list); 205 274 spin_unlock_irqrestore(&closure_list_lock, flags); 206 275 } 207 - EXPORT_SYMBOL_GPL(closure_debug_create); 276 + EXPORT_SYMBOL(closure_debug_create); 208 277 209 278 void closure_debug_destroy(struct closure *cl) 210 279 { ··· 217 286 list_del(&cl->all); 218 287 spin_unlock_irqrestore(&closure_list_lock, flags); 219 288 } 220 - EXPORT_SYMBOL_GPL(closure_debug_destroy); 289 + EXPORT_SYMBOL(closure_debug_destroy); 221 290 222 291 static struct dentry *debug; 223 292 ··· 235 304 cl, (void *) cl->ip, cl->fn, cl->parent, 236 305 r & CLOSURE_REMAINING_MASK); 237 306 238 - seq_printf(f, "%s%s%s%s%s%s\n", 307 + seq_printf(f, "%s%s%s%s\n", 239 308 test_bit(WORK_STRUCT_PENDING, 240 309 work_data_bits(&cl->work)) ? "Q" : "", 241 310 r & CLOSURE_RUNNING ? "R" : "", 242 - r & CLOSURE_BLOCKING ? "B" : "", 243 311 r & CLOSURE_STACK ? "S" : "", 244 - r & CLOSURE_SLEEPING ? "Sl" : "", 245 - r & CLOSURE_TIMER ? "T" : ""); 312 + r & CLOSURE_SLEEPING ? "Sl" : ""); 246 313 247 314 if (r & CLOSURE_WAITING) 248 315 seq_printf(f, " W %pF\n",
+22 -161
drivers/md/bcache/closure.h
··· 155 155 * delayed_work embeds a work item and a timer_list. The important thing is, use 156 156 * it exactly like you would a regular closure and closure_put() will magically 157 157 * handle everything for you. 158 - * 159 - * We've got closures that embed timers, too. They're called, appropriately 160 - * enough: 161 - * struct closure_with_timer; 162 - * 163 - * This gives you access to closure_delay(). It takes a refcount for a specified 164 - * number of jiffies - you could then call closure_sync() (for a slightly 165 - * convoluted version of msleep()) or continue_at() - which gives you the same 166 - * effect as using a delayed work item, except you can reuse the work_struct 167 - * already embedded in struct closure. 168 - * 169 - * Lastly, there's struct closure_with_waitlist_and_timer. It does what you 170 - * probably expect, if you happen to need the features of both. (You don't 171 - * really want to know how all this is implemented, but if I've done my job 172 - * right you shouldn't have to care). 173 158 */ 174 159 175 160 struct closure; ··· 167 182 enum closure_type { 168 183 TYPE_closure = 0, 169 184 TYPE_closure_with_waitlist = 1, 170 - TYPE_closure_with_timer = 2, 171 - TYPE_closure_with_waitlist_and_timer = 3, 172 - MAX_CLOSURE_TYPE = 3, 185 + MAX_CLOSURE_TYPE = 1, 173 186 }; 174 187 175 188 enum closure_state { 176 189 /* 177 - * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of 178 - * waiting asynchronously 179 - * 180 190 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 181 191 * the thread that owns the closure, and cleared by the thread that's 182 192 * waking up the closure. ··· 179 199 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep 180 200 * - indicates that cl->task is valid and closure_put() may wake it up. 181 201 * Only set or cleared by the thread that owns the closure. 
182 - * 183 - * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure 184 - * has an outstanding timer. Must be set by the thread that owns the 185 - * closure, and cleared by the timer function when the timer goes off. 186 202 * 187 203 * The rest are for debugging and don't affect behaviour: 188 204 * ··· 194 218 * closure with this flag set 195 219 */ 196 220 197 - CLOSURE_BITS_START = (1 << 19), 198 - CLOSURE_DESTRUCTOR = (1 << 19), 199 - CLOSURE_BLOCKING = (1 << 21), 200 - CLOSURE_WAITING = (1 << 23), 201 - CLOSURE_SLEEPING = (1 << 25), 202 - CLOSURE_TIMER = (1 << 27), 221 + CLOSURE_BITS_START = (1 << 23), 222 + CLOSURE_DESTRUCTOR = (1 << 23), 223 + CLOSURE_WAITING = (1 << 25), 224 + CLOSURE_SLEEPING = (1 << 27), 203 225 CLOSURE_RUNNING = (1 << 29), 204 226 CLOSURE_STACK = (1 << 31), 205 227 }; 206 228 207 229 #define CLOSURE_GUARD_MASK \ 208 - ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ 209 - CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) 230 + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ 231 + CLOSURE_RUNNING|CLOSURE_STACK) << 1) 210 232 211 233 #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) 212 234 #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) ··· 242 268 struct closure_waitlist wait; 243 269 }; 244 270 245 - struct closure_with_timer { 246 - struct closure cl; 247 - struct timer_list timer; 248 - }; 249 - 250 - struct closure_with_waitlist_and_timer { 251 - struct closure cl; 252 - struct closure_waitlist wait; 253 - struct timer_list timer; 254 - }; 255 - 256 271 extern unsigned invalid_closure_type(void); 257 272 258 273 #define __CLOSURE_TYPE(cl, _t) \ ··· 252 289 ( \ 253 290 __CLOSURE_TYPE(cl, closure) \ 254 291 __CLOSURE_TYPE(cl, closure_with_waitlist) \ 255 - __CLOSURE_TYPE(cl, closure_with_timer) \ 256 - __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ 257 292 invalid_closure_type() \ 258 293 ) 259 294 260 295 void closure_sub(struct closure *cl, int v); 261 296 
void closure_put(struct closure *cl); 262 - void closure_queue(struct closure *cl); 263 297 void __closure_wake_up(struct closure_waitlist *list); 264 298 bool closure_wait(struct closure_waitlist *list, struct closure *cl); 265 299 void closure_sync(struct closure *cl); ··· 264 304 bool closure_trylock(struct closure *cl, struct closure *parent); 265 305 void __closure_lock(struct closure *cl, struct closure *parent, 266 306 struct closure_waitlist *wait_list); 267 - 268 - void do_closure_timer_init(struct closure *cl); 269 - bool __closure_delay(struct closure *cl, unsigned long delay, 270 - struct timer_list *timer); 271 - void __closure_flush(struct closure *cl, struct timer_list *timer); 272 - void __closure_flush_sync(struct closure *cl, struct timer_list *timer); 273 307 274 308 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG 275 309 ··· 308 354 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 309 355 } 310 356 311 - static inline bool closure_is_stopped(struct closure *cl) 312 - { 313 - return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); 314 - } 315 - 316 357 static inline bool closure_is_unlocked(struct closure *cl) 317 358 { 318 359 return atomic_read(&cl->remaining) == -1; ··· 316 367 static inline void do_closure_init(struct closure *cl, struct closure *parent, 317 368 bool running) 318 369 { 319 - switch (cl->type) { 320 - case TYPE_closure_with_timer: 321 - case TYPE_closure_with_waitlist_and_timer: 322 - do_closure_timer_init(cl); 323 - default: 324 - break; 325 - } 326 - 327 370 cl->parent = parent; 328 371 if (parent) 329 372 closure_get(parent); ··· 370 429 static inline void closure_init_stack(struct closure *cl) 371 430 { 372 431 memset(cl, 0, sizeof(struct closure)); 373 - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| 374 - CLOSURE_BLOCKING|CLOSURE_STACK); 432 + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 375 433 } 376 434 377 435 /** ··· 401 461 #define closure_lock(cl, parent) \ 402 462 
__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) 403 463 404 - /** 405 - * closure_delay() - delay some number of jiffies 406 - * @cl: the closure that will sleep 407 - * @delay: the delay in jiffies 408 - * 409 - * Takes a refcount on @cl which will be released after @delay jiffies; this may 410 - * be used to have a function run after a delay with continue_at(), or 411 - * closure_sync() may be used for a convoluted version of msleep(). 412 - */ 413 - #define closure_delay(cl, delay) \ 414 - __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) 415 - 416 - #define closure_flush(cl) \ 417 - __closure_flush(__to_internal_closure(cl), &(cl)->timer) 418 - 419 - #define closure_flush_sync(cl) \ 420 - __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) 421 - 422 464 static inline void __closure_end_sleep(struct closure *cl) 423 465 { 424 466 __set_current_state(TASK_RUNNING); ··· 417 495 418 496 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) 419 497 atomic_add(CLOSURE_SLEEPING, &cl->remaining); 420 - } 421 - 422 - /** 423 - * closure_blocking() - returns true if the closure is in blocking mode. 424 - * 425 - * If a closure is in blocking mode, closure_wait_event() will sleep until the 426 - * condition is true instead of waiting asynchronously. 427 - */ 428 - static inline bool closure_blocking(struct closure *cl) 429 - { 430 - return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; 431 - } 432 - 433 - /** 434 - * set_closure_blocking() - put a closure in blocking mode. 435 - * 436 - * If a closure is in blocking mode, closure_wait_event() will sleep until the 437 - * condition is true instead of waiting asynchronously. 438 - * 439 - * Not thread safe - can only be called by the thread running the closure. 
440 - */ 441 - static inline void set_closure_blocking(struct closure *cl) 442 - { 443 - if (!closure_blocking(cl)) 444 - atomic_add(CLOSURE_BLOCKING, &cl->remaining); 445 - } 446 - 447 - /* 448 - * Not thread safe - can only be called by the thread running the closure. 449 - */ 450 - static inline void clear_closure_blocking(struct closure *cl) 451 - { 452 - if (closure_blocking(cl)) 453 - atomic_sub(CLOSURE_BLOCKING, &cl->remaining); 454 498 } 455 499 456 500 /** ··· 449 561 * refcount on our closure. If this was a stack allocated closure, that would be 450 562 * bad. 451 563 */ 452 - #define __closure_wait_event(list, cl, condition, _block) \ 564 + #define closure_wait_event(list, cl, condition) \ 453 565 ({ \ 454 - bool block = _block; \ 455 566 typeof(condition) ret; \ 456 567 \ 457 568 while (1) { \ 458 569 ret = (condition); \ 459 570 if (ret) { \ 460 571 __closure_wake_up(list); \ 461 - if (block) \ 462 - closure_sync(cl); \ 463 - \ 572 + closure_sync(cl); \ 464 573 break; \ 465 574 } \ 466 575 \ 467 - if (block) \ 468 - __closure_start_sleep(cl); \ 576 + __closure_start_sleep(cl); \ 469 577 \ 470 - if (!closure_wait(list, cl)) { \ 471 - if (!block) \ 472 - break; \ 473 - \ 578 + if (!closure_wait(list, cl)) \ 474 579 schedule(); \ 475 - } \ 476 580 } \ 477 581 \ 478 582 ret; \ 479 583 }) 480 584 481 - /** 482 - * closure_wait_event() - wait on a condition, synchronously or asynchronously. 483 - * @list: the wait list to wait on 484 - * @cl: the closure that is doing the waiting 485 - * @condition: a C expression for the event to wait for 486 - * 487 - * If the closure is in blocking mode, sleeps until the @condition evaluates to 488 - * true - exactly like wait_event(). 489 - * 490 - * If the closure is not in blocking mode, waits asynchronously; if the 491 - * condition is currently false the @cl is put onto @list and returns. 
@list 492 - * owns a refcount on @cl; closure_sync() or continue_at() may be used later to 493 - * wait for another thread to wake up @list, which drops the refcount on @cl. 494 - * 495 - * Returns the value of @condition; @cl will be on @list iff @condition was 496 - * false. 497 - * 498 - * closure_wake_up(@list) must be called after changing any variable that could 499 - * cause @condition to become true. 500 - */ 501 - #define closure_wait_event(list, cl, condition) \ 502 - __closure_wait_event(list, cl, condition, closure_blocking(cl)) 503 - 504 - #define closure_wait_event_async(list, cl, condition) \ 505 - __closure_wait_event(list, cl, condition, false) 506 - 507 - #define closure_wait_event_sync(list, cl, condition) \ 508 - __closure_wait_event(list, cl, condition, true) 585 + static inline void closure_queue(struct closure *cl) 586 + { 587 + struct workqueue_struct *wq = cl->wq; 588 + if (wq) { 589 + INIT_WORK(&cl->work, cl->work.func); 590 + BUG_ON(!queue_work(wq, &cl->work)); 591 + } else 592 + cl->fn(cl); 593 + } 509 594 510 595 static inline void set_closure_fn(struct closure *cl, closure_fn *fn, 511 596 struct workqueue_struct *wq) ··· 503 642 #define continue_at_nobarrier(_cl, _fn, _wq) \ 504 643 do { \ 505 644 set_closure_fn(_cl, _fn, _wq); \ 506 - closure_queue(cl); \ 645 + closure_queue(_cl); \ 507 646 return; \ 508 647 } while (0) 509 648
+70 -111
drivers/md/bcache/debug.c
··· 8 8 #include "bcache.h" 9 9 #include "btree.h" 10 10 #include "debug.h" 11 - #include "request.h" 12 11 13 12 #include <linux/console.h> 14 13 #include <linux/debugfs.h> ··· 76 77 return out - buf; 77 78 } 78 79 79 - int bch_btree_to_text(char *buf, size_t size, const struct btree *b) 80 - { 81 - return scnprintf(buf, size, "%zu level %i/%i", 82 - PTR_BUCKET_NR(b->c, &b->key, 0), 83 - b->level, b->c->root ? b->c->root->level : -1); 84 - } 85 - 86 - #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87 - 88 - static bool skipped_backwards(struct btree *b, struct bkey *k) 89 - { 90 - return bkey_cmp(k, (!b->level) 91 - ? &START_KEY(bkey_next(k)) 92 - : bkey_next(k)) > 0; 93 - } 80 + #ifdef CONFIG_BCACHE_DEBUG 94 81 95 82 static void dump_bset(struct btree *b, struct bset *i) 96 83 { 97 - struct bkey *k; 84 + struct bkey *k, *next; 98 85 unsigned j; 99 86 char buf[80]; 100 87 101 - for (k = i->start; k < end(i); k = bkey_next(k)) { 88 + for (k = i->start; k < end(i); k = next) { 89 + next = bkey_next(k); 90 + 102 91 bch_bkey_to_text(buf, sizeof(buf), k); 103 92 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 93 (uint64_t *) k - i->d, i->keys, buf); ··· 102 115 103 116 printk(" %s\n", bch_ptr_status(b->c, k)); 104 117 105 - if (bkey_next(k) < end(i) && 106 - skipped_backwards(b, k)) 118 + if (next < end(i) && 119 + bkey_cmp(k, !b->level ? 
&START_KEY(next) : next) > 0) 107 120 printk(KERN_ERR "Key skipped backwards\n"); 108 121 } 109 122 } 110 123 111 - #endif 124 + static void bch_dump_bucket(struct btree *b) 125 + { 126 + unsigned i; 112 127 113 - #ifdef CONFIG_BCACHE_DEBUG 128 + console_lock(); 129 + for (i = 0; i <= b->nsets; i++) 130 + dump_bset(b, b->sets[i].data); 131 + console_unlock(); 132 + } 114 133 115 134 void bch_btree_verify(struct btree *b, struct bset *new) 116 135 { ··· 169 176 mutex_unlock(&b->c->verify_lock); 170 177 } 171 178 172 - static void data_verify_endio(struct bio *bio, int error) 173 - { 174 - struct closure *cl = bio->bi_private; 175 - closure_put(cl); 176 - } 177 - 178 - void bch_data_verify(struct search *s) 179 + void bch_data_verify(struct cached_dev *dc, struct bio *bio) 179 180 { 180 181 char name[BDEVNAME_SIZE]; 181 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 182 - struct closure *cl = &s->cl; 183 182 struct bio *check; 184 183 struct bio_vec *bv; 185 184 int i; 186 185 187 - if (!s->unaligned_bvec) 188 - bio_for_each_segment(bv, s->orig_bio, i) 189 - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 190 - 191 - check = bio_clone(s->orig_bio, GFP_NOIO); 186 + check = bio_clone(bio, GFP_NOIO); 192 187 if (!check) 193 188 return; 194 189 195 190 if (bio_alloc_pages(check, GFP_NOIO)) 196 191 goto out_put; 197 192 198 - check->bi_rw = READ_SYNC; 199 - check->bi_private = cl; 200 - check->bi_end_io = data_verify_endio; 193 + submit_bio_wait(READ_SYNC, check); 201 194 202 - closure_bio_submit(check, cl, &dc->disk); 203 - closure_sync(cl); 195 + bio_for_each_segment(bv, bio, i) { 196 + void *p1 = kmap_atomic(bv->bv_page); 197 + void *p2 = page_address(check->bi_io_vec[i].bv_page); 204 198 205 - bio_for_each_segment(bv, s->orig_bio, i) { 206 - void *p1 = kmap(bv->bv_page); 207 - void *p2 = kmap(check->bi_io_vec[i].bv_page); 199 + cache_set_err_on(memcmp(p1 + bv->bv_offset, 200 + p2 + bv->bv_offset, 201 + bv->bv_len), 202 + dc->disk.c, 203 + "verify 
failed at dev %s sector %llu", 204 + bdevname(dc->bdev, name), 205 + (uint64_t) bio->bi_sector); 208 206 209 - if (memcmp(p1 + bv->bv_offset, 210 - p2 + bv->bv_offset, 211 - bv->bv_len)) 212 - printk(KERN_ERR 213 - "bcache (%s): verify failed at sector %llu\n", 214 - bdevname(dc->bdev, name), 215 - (uint64_t) s->orig_bio->bi_sector); 216 - 217 - kunmap(bv->bv_page); 218 - kunmap(check->bi_io_vec[i].bv_page); 207 + kunmap_atomic(p1); 219 208 } 220 209 221 - __bio_for_each_segment(bv, check, i, 0) 210 + bio_for_each_segment_all(bv, check, i) 222 211 __free_page(bv->bv_page); 223 212 out_put: 224 213 bio_put(check); 225 214 } 226 215 227 - #endif 228 - 229 - #ifdef CONFIG_BCACHE_EDEBUG 230 - 231 - unsigned bch_count_data(struct btree *b) 216 + int __bch_count_data(struct btree *b) 232 217 { 233 218 unsigned ret = 0; 234 219 struct btree_iter iter; ··· 218 247 return ret; 219 248 } 220 249 221 - static void vdump_bucket_and_panic(struct btree *b, const char *fmt, 222 - va_list args) 223 - { 224 - unsigned i; 225 - char buf[80]; 226 - 227 - console_lock(); 228 - 229 - for (i = 0; i <= b->nsets; i++) 230 - dump_bset(b, b->sets[i].data); 231 - 232 - vprintk(fmt, args); 233 - 234 - console_unlock(); 235 - 236 - bch_btree_to_text(buf, sizeof(buf), b); 237 - panic("at %s\n", buf); 238 - } 239 - 240 - void bch_check_key_order_msg(struct btree *b, struct bset *i, 241 - const char *fmt, ...) 242 - { 243 - struct bkey *k; 244 - 245 - if (!i->keys) 246 - return; 247 - 248 - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) 249 - if (skipped_backwards(b, k)) { 250 - va_list args; 251 - va_start(args, fmt); 252 - 253 - vdump_bucket_and_panic(b, fmt, args); 254 - va_end(args); 255 - } 256 - } 257 - 258 - void bch_check_keys(struct btree *b, const char *fmt, ...) 250 + void __bch_check_keys(struct btree *b, const char *fmt, ...) 
259 251 { 260 252 va_list args; 261 253 struct bkey *k, *p = NULL; 262 254 struct btree_iter iter; 263 - 264 - if (b->level) 265 - return; 255 + const char *err; 266 256 267 257 for_each_key(b, k, &iter) { 268 - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { 269 - printk(KERN_ERR "Keys out of order:\n"); 270 - goto bug; 271 - } 258 + if (!b->level) { 259 + err = "Keys out of order"; 260 + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) 261 + goto bug; 272 262 273 - if (bch_ptr_invalid(b, k)) 274 - continue; 263 + if (bch_ptr_invalid(b, k)) 264 + continue; 275 265 276 - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { 277 - printk(KERN_ERR "Overlapping keys:\n"); 278 - goto bug; 266 + err = "Overlapping keys"; 267 + if (p && bkey_cmp(p, &START_KEY(k)) > 0) 268 + goto bug; 269 + } else { 270 + if (bch_ptr_bad(b, k)) 271 + continue; 272 + 273 + err = "Duplicate keys"; 274 + if (p && !bkey_cmp(p, k)) 275 + goto bug; 279 276 } 280 277 p = k; 281 278 } 279 + 280 + err = "Key larger than btree node key"; 281 + if (p && bkey_cmp(p, &b->key) > 0) 282 + goto bug; 283 + 282 284 return; 283 285 bug: 286 + bch_dump_bucket(b); 287 + 284 288 va_start(args, fmt); 285 - vdump_bucket_and_panic(b, fmt, args); 289 + vprintk(fmt, args); 286 290 va_end(args); 291 + 292 + panic("bcache error: %s:\n", err); 293 + } 294 + 295 + void bch_btree_iter_next_check(struct btree_iter *iter) 296 + { 297 + struct bkey *k = iter->data->k, *next = bkey_next(k); 298 + 299 + if (next < iter->data->end && 300 + bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { 301 + bch_dump_bucket(iter->b); 302 + panic("Key skipped backwards\n"); 303 + } 287 304 } 288 305 289 306 #endif
+27 -23
drivers/md/bcache/debug.h
··· 4 4 /* Btree/bkey debug printing */ 5 5 6 6 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 7 - int bch_btree_to_text(char *buf, size_t size, const struct btree *b); 8 - 9 - #ifdef CONFIG_BCACHE_EDEBUG 10 - 11 - unsigned bch_count_data(struct btree *); 12 - void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); 13 - void bch_check_keys(struct btree *, const char *, ...); 14 - 15 - #define bch_check_key_order(b, i) \ 16 - bch_check_key_order_msg(b, i, "keys out of order") 17 - #define EBUG_ON(cond) BUG_ON(cond) 18 - 19 - #else /* EDEBUG */ 20 - 21 - #define bch_count_data(b) 0 22 - #define bch_check_key_order(b, i) do {} while (0) 23 - #define bch_check_key_order_msg(b, i, ...) do {} while (0) 24 - #define bch_check_keys(b, ...) do {} while (0) 25 - #define EBUG_ON(cond) do {} while (0) 26 - 27 - #endif 28 7 29 8 #ifdef CONFIG_BCACHE_DEBUG 30 9 31 10 void bch_btree_verify(struct btree *, struct bset *); 32 - void bch_data_verify(struct search *); 11 + void bch_data_verify(struct cached_dev *, struct bio *); 12 + int __bch_count_data(struct btree *); 13 + void __bch_check_keys(struct btree *, const char *, ...); 14 + void bch_btree_iter_next_check(struct btree_iter *); 15 + 16 + #define EBUG_ON(cond) BUG_ON(cond) 17 + #define expensive_debug_checks(c) ((c)->expensive_debug_checks) 18 + #define key_merging_disabled(c) ((c)->key_merging_disabled) 19 + #define bypass_torture_test(d) ((d)->bypass_torture_test) 33 20 34 21 #else /* DEBUG */ 35 22 36 23 static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 37 - static inline void bch_data_verify(struct search *s) {}; 24 + static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 25 + static inline int __bch_count_data(struct btree *b) { return -1; } 26 + static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) 
{} 27 + static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} 28 + 29 + #define EBUG_ON(cond) do { if (cond); } while (0) 30 + #define expensive_debug_checks(c) 0 31 + #define key_merging_disabled(c) 0 32 + #define bypass_torture_test(d) 0 38 33 39 34 #endif 35 + 36 + #define bch_count_data(b) \ 37 + (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) 38 + 39 + #define bch_check_keys(b, ...) \ 40 + do { \ 41 + if (expensive_debug_checks((b)->c)) \ 42 + __bch_check_keys(b, __VA_ARGS__); \ 43 + } while (0) 40 44 41 45 #ifdef CONFIG_DEBUG_FS 42 46 void bch_debug_init_cache_set(struct cache_set *);
+147 -154
drivers/md/bcache/journal.c
··· 7 7 #include "bcache.h" 8 8 #include "btree.h" 9 9 #include "debug.h" 10 - #include "request.h" 11 10 12 11 #include <trace/events/bcache.h> 13 12 ··· 30 31 } 31 32 32 33 static int journal_read_bucket(struct cache *ca, struct list_head *list, 33 - struct btree_op *op, unsigned bucket_index) 34 + unsigned bucket_index) 34 35 { 35 36 struct journal_device *ja = &ca->journal; 36 37 struct bio *bio = &ja->bio; 37 38 38 39 struct journal_replay *i; 39 40 struct jset *j, *data = ca->set->journal.w[0].data; 41 + struct closure cl; 40 42 unsigned len, left, offset = 0; 41 43 int ret = 0; 42 44 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); 45 + 46 + closure_init_stack(&cl); 43 47 44 48 pr_debug("reading %llu", (uint64_t) bucket); 45 49 ··· 57 55 bio->bi_size = len << 9; 58 56 59 57 bio->bi_end_io = journal_read_endio; 60 - bio->bi_private = &op->cl; 58 + bio->bi_private = &cl; 61 59 bch_bio_map(bio, data); 62 60 63 - closure_bio_submit(bio, &op->cl, ca); 64 - closure_sync(&op->cl); 61 + closure_bio_submit(bio, &cl, ca); 62 + closure_sync(&cl); 65 63 66 64 /* This function could be simpler now since we no longer write 67 65 * journal entries that overlap bucket boundaries; this means ··· 74 72 struct list_head *where; 75 73 size_t blocks, bytes = set_bytes(j); 76 74 77 - if (j->magic != jset_magic(ca->set)) 75 + if (j->magic != jset_magic(&ca->sb)) 78 76 return ret; 79 77 80 78 if (bytes > left << 9) ··· 131 129 return ret; 132 130 } 133 131 134 - int bch_journal_read(struct cache_set *c, struct list_head *list, 135 - struct btree_op *op) 132 + int bch_journal_read(struct cache_set *c, struct list_head *list) 136 133 { 137 134 #define read_bucket(b) \ 138 135 ({ \ 139 - int ret = journal_read_bucket(ca, list, op, b); \ 136 + int ret = journal_read_bucket(ca, list, b); \ 140 137 __set_bit(b, bitmap); \ 141 138 if (ret < 0) \ 142 139 return ret; \ ··· 293 292 } 294 293 } 295 294 296 - int bch_journal_replay(struct cache_set *s, struct list_head 
*list, 297 - struct btree_op *op) 295 + int bch_journal_replay(struct cache_set *s, struct list_head *list) 298 296 { 299 297 int ret = 0, keys = 0, entries = 0; 300 298 struct bkey *k; ··· 301 301 list_entry(list->prev, struct journal_replay, list); 302 302 303 303 uint64_t start = i->j.last_seq, end = i->j.seq, n = start; 304 + struct keylist keylist; 305 + 306 + bch_keylist_init(&keylist); 304 307 305 308 list_for_each_entry(i, list, list) { 306 309 BUG_ON(i->pin && atomic_read(i->pin) != 1); 307 310 308 - if (n != i->j.seq) 309 - pr_err( 310 - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", 311 - n, i->j.seq - 1, start, end); 311 + cache_set_err_on(n != i->j.seq, s, 312 + "bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", 313 + n, i->j.seq - 1, start, end); 312 314 313 315 for (k = i->j.start; 314 316 k < end(&i->j); 315 317 k = bkey_next(k)) { 316 318 trace_bcache_journal_replay_key(k); 317 319 318 - bkey_copy(op->keys.top, k); 319 - bch_keylist_push(&op->keys); 320 + bkey_copy(keylist.top, k); 321 + bch_keylist_push(&keylist); 320 322 321 - op->journal = i->pin; 322 - atomic_inc(op->journal); 323 - 324 - ret = bch_btree_insert(op, s); 323 + ret = bch_btree_insert(s, &keylist, i->pin, NULL); 325 324 if (ret) 326 325 goto err; 327 326 328 - BUG_ON(!bch_keylist_empty(&op->keys)); 327 + BUG_ON(!bch_keylist_empty(&keylist)); 329 328 keys++; 330 329 331 330 cond_resched(); ··· 338 339 339 340 pr_info("journal replay done, %i keys in %i entries, seq %llu", 340 341 keys, entries, end); 341 - 342 + err: 342 343 while (!list_empty(list)) { 343 344 i = list_first_entry(list, struct journal_replay, list); 344 345 list_del(&i->list); 345 346 kfree(i); 346 347 } 347 - err: 348 - closure_sync(&op->cl); 348 + 349 349 return ret; 350 350 } 351 351 ··· 356 358 * Try to find the btree node with that references the oldest journal 357 359 * entry, best is our current candidate and is locked if non NULL: 358 360 */ 359 - struct btree *b, *best = 
NULL; 360 - unsigned iter; 361 + struct btree *b, *best; 362 + unsigned i; 363 + retry: 364 + best = NULL; 361 365 362 - for_each_cached_btree(b, c, iter) { 363 - if (!down_write_trylock(&b->lock)) 364 - continue; 365 - 366 - if (!btree_node_dirty(b) || 367 - !btree_current_write(b)->journal) { 368 - rw_unlock(true, b); 369 - continue; 366 + for_each_cached_btree(b, c, i) 367 + if (btree_current_write(b)->journal) { 368 + if (!best) 369 + best = b; 370 + else if (journal_pin_cmp(c, 371 + btree_current_write(best)->journal, 372 + btree_current_write(b)->journal)) { 373 + best = b; 374 + } 370 375 } 371 376 372 - if (!best) 373 - best = b; 374 - else if (journal_pin_cmp(c, 375 - btree_current_write(best), 376 - btree_current_write(b))) { 377 - rw_unlock(true, best); 378 - best = b; 379 - } else 377 + b = best; 378 + if (b) { 379 + rw_lock(true, b, b->level); 380 + 381 + if (!btree_current_write(b)->journal) { 380 382 rw_unlock(true, b); 383 + /* We raced */ 384 + goto retry; 385 + } 386 + 387 + bch_btree_node_write(b, NULL); 388 + rw_unlock(true, b); 381 389 } 382 - 383 - if (best) 384 - goto out; 385 - 386 - /* We can't find the best btree node, just pick the first */ 387 - list_for_each_entry(b, &c->btree_cache, list) 388 - if (!b->level && btree_node_dirty(b)) { 389 - best = b; 390 - rw_lock(true, best, best->level); 391 - goto found; 392 - } 393 - 394 - out: 395 - if (!best) 396 - return; 397 - found: 398 - if (btree_node_dirty(best)) 399 - bch_btree_node_write(best, NULL); 400 - rw_unlock(true, best); 401 390 } 402 391 403 392 #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) ··· 480 495 do_journal_discard(ca); 481 496 482 497 if (c->journal.blocks_free) 483 - return; 498 + goto out; 484 499 485 500 /* 486 501 * Allocate: ··· 506 521 507 522 if (n) 508 523 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; 509 - 524 + out: 510 525 if (!journal_full(&c->journal)) 511 526 __closure_wake_up(&c->journal.wait); 512 527 } ··· 539 554 struct 
journal_write *w = bio->bi_private; 540 555 541 556 cache_set_err_on(error, w->c, "journal io error"); 542 - closure_put(&w->c->journal.io.cl); 557 + closure_put(&w->c->journal.io); 543 558 } 544 559 545 560 static void journal_write(struct closure *); 546 561 547 562 static void journal_write_done(struct closure *cl) 548 563 { 549 - struct journal *j = container_of(cl, struct journal, io.cl); 550 - struct cache_set *c = container_of(j, struct cache_set, journal); 551 - 564 + struct journal *j = container_of(cl, struct journal, io); 552 565 struct journal_write *w = (j->cur == j->w) 553 566 ? &j->w[1] 554 567 : &j->w[0]; 555 568 556 569 __closure_wake_up(&w->wait); 557 - 558 - if (c->journal_delay_ms) 559 - closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); 560 - 561 - continue_at(cl, journal_write, system_wq); 570 + continue_at_nobarrier(cl, journal_write, system_wq); 562 571 } 563 572 564 573 static void journal_write_unlocked(struct closure *cl) 565 574 __releases(c->journal.lock) 566 575 { 567 - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 576 + struct cache_set *c = container_of(cl, struct cache_set, journal.io); 568 577 struct cache *ca; 569 578 struct journal_write *w = c->journal.cur; 570 579 struct bkey *k = &c->journal.key; ··· 596 617 for_each_cache(ca, c, i) 597 618 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; 598 619 599 - w->data->magic = jset_magic(c); 620 + w->data->magic = jset_magic(&c->sb); 600 621 w->data->version = BCACHE_JSET_VERSION; 601 622 w->data->last_seq = last_seq(&c->journal); 602 623 w->data->csum = csum_set(w->data); ··· 639 660 640 661 static void journal_write(struct closure *cl) 641 662 { 642 - struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 663 + struct cache_set *c = container_of(cl, struct cache_set, journal.io); 643 664 644 665 spin_lock(&c->journal.lock); 645 666 journal_write_unlocked(cl); 646 667 } 647 668 648 - static void 
__journal_try_write(struct cache_set *c, bool noflush) 669 + static void journal_try_write(struct cache_set *c) 649 670 __releases(c->journal.lock) 650 671 { 651 - struct closure *cl = &c->journal.io.cl; 672 + struct closure *cl = &c->journal.io; 673 + struct journal_write *w = c->journal.cur; 652 674 653 - if (!closure_trylock(cl, &c->cl)) 654 - spin_unlock(&c->journal.lock); 655 - else if (noflush && journal_full(&c->journal)) { 656 - spin_unlock(&c->journal.lock); 657 - continue_at(cl, journal_write, system_wq); 658 - } else 675 + w->need_write = true; 676 + 677 + if (closure_trylock(cl, &c->cl)) 659 678 journal_write_unlocked(cl); 679 + else 680 + spin_unlock(&c->journal.lock); 660 681 } 661 682 662 - #define journal_try_write(c) __journal_try_write(c, false) 663 - 664 - void bch_journal_meta(struct cache_set *c, struct closure *cl) 683 + static struct journal_write *journal_wait_for_write(struct cache_set *c, 684 + unsigned nkeys) 665 685 { 666 - struct journal_write *w; 686 + size_t sectors; 687 + struct closure cl; 667 688 668 - if (CACHE_SYNC(&c->sb)) { 689 + closure_init_stack(&cl); 690 + 691 + spin_lock(&c->journal.lock); 692 + 693 + while (1) { 694 + struct journal_write *w = c->journal.cur; 695 + 696 + sectors = __set_blocks(w->data, w->data->keys + nkeys, 697 + c) * c->sb.block_size; 698 + 699 + if (sectors <= min_t(size_t, 700 + c->journal.blocks_free * c->sb.block_size, 701 + PAGE_SECTORS << JSET_BITS)) 702 + return w; 703 + 704 + /* XXX: tracepoint */ 705 + if (!journal_full(&c->journal)) { 706 + trace_bcache_journal_entry_full(c); 707 + 708 + /* 709 + * XXX: If we were inserting so many keys that they 710 + * won't fit in an _empty_ journal write, we'll 711 + * deadlock. For now, handle this in 712 + * bch_keylist_realloc() - but something to think about. 
713 + */ 714 + BUG_ON(!w->data->keys); 715 + 716 + closure_wait(&w->wait, &cl); 717 + journal_try_write(c); /* unlocks */ 718 + } else { 719 + trace_bcache_journal_full(c); 720 + 721 + closure_wait(&c->journal.wait, &cl); 722 + journal_reclaim(c); 723 + spin_unlock(&c->journal.lock); 724 + 725 + btree_flush_write(c); 726 + } 727 + 728 + closure_sync(&cl); 669 729 spin_lock(&c->journal.lock); 670 - 671 - w = c->journal.cur; 672 - w->need_write = true; 673 - 674 - if (cl) 675 - BUG_ON(!closure_wait(&w->wait, cl)); 676 - 677 - closure_flush(&c->journal.io); 678 - __journal_try_write(c, true); 679 730 } 731 + } 732 + 733 + static void journal_write_work(struct work_struct *work) 734 + { 735 + struct cache_set *c = container_of(to_delayed_work(work), 736 + struct cache_set, 737 + journal.work); 738 + spin_lock(&c->journal.lock); 739 + journal_try_write(c); 680 740 } 681 741 682 742 /* ··· 724 706 * bch_journal() hands those same keys off to btree_insert_async() 725 707 */ 726 708 727 - void bch_journal(struct closure *cl) 709 + atomic_t *bch_journal(struct cache_set *c, 710 + struct keylist *keys, 711 + struct closure *parent) 728 712 { 729 - struct btree_op *op = container_of(cl, struct btree_op, cl); 730 - struct cache_set *c = op->c; 731 713 struct journal_write *w; 732 - size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; 714 + atomic_t *ret; 733 715 734 - if (op->type != BTREE_INSERT || 735 - !CACHE_SYNC(&c->sb)) 736 - goto out; 716 + if (!CACHE_SYNC(&c->sb)) 717 + return NULL; 737 718 738 - /* 739 - * If we're looping because we errored, might already be waiting on 740 - * another journal write: 741 - */ 742 - while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) 743 - closure_sync(cl->parent); 719 + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 744 720 745 - spin_lock(&c->journal.lock); 721 + memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 722 + w->data->keys += bch_keylist_nkeys(keys); 746 723 747 - if 
(journal_full(&c->journal)) { 748 - trace_bcache_journal_full(c); 724 + ret = &fifo_back(&c->journal.pin); 725 + atomic_inc(ret); 749 726 750 - closure_wait(&c->journal.wait, cl); 751 - 752 - journal_reclaim(c); 753 - spin_unlock(&c->journal.lock); 754 - 755 - btree_flush_write(c); 756 - continue_at(cl, bch_journal, bcache_wq); 757 - } 758 - 759 - w = c->journal.cur; 760 - w->need_write = true; 761 - b = __set_blocks(w->data, w->data->keys + n, c); 762 - 763 - if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || 764 - b > c->journal.blocks_free) { 765 - trace_bcache_journal_entry_full(c); 766 - 767 - /* 768 - * XXX: If we were inserting so many keys that they won't fit in 769 - * an _empty_ journal write, we'll deadlock. For now, handle 770 - * this in bch_keylist_realloc() - but something to think about. 771 - */ 772 - BUG_ON(!w->data->keys); 773 - 774 - BUG_ON(!closure_wait(&w->wait, cl)); 775 - 776 - closure_flush(&c->journal.io); 777 - 727 + if (parent) { 728 + closure_wait(&w->wait, parent); 778 729 journal_try_write(c); 779 - continue_at(cl, bch_journal, bcache_wq); 730 + } else if (!w->need_write) { 731 + schedule_delayed_work(&c->journal.work, 732 + msecs_to_jiffies(c->journal_delay_ms)); 733 + spin_unlock(&c->journal.lock); 734 + } else { 735 + spin_unlock(&c->journal.lock); 780 736 } 781 737 782 - memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); 783 - w->data->keys += n; 784 738 785 - op->journal = &fifo_back(&c->journal.pin); 786 - atomic_inc(op->journal); 739 + return ret; 740 + } 787 741 788 - if (op->flush_journal) { 789 - closure_flush(&c->journal.io); 790 - closure_wait(&w->wait, cl->parent); 791 - } 742 + void bch_journal_meta(struct cache_set *c, struct closure *cl) 743 + { 744 + struct keylist keys; 745 + atomic_t *ref; 792 746 793 - journal_try_write(c); 794 - out: 795 - bch_btree_insert_async(cl); 747 + bch_keylist_init(&keys); 748 + 749 + ref = bch_journal(c, &keys, cl); 750 + if (ref) 751 + atomic_dec_bug(ref); 796 752 } 797 753 
798 754 void bch_journal_free(struct cache_set *c) ··· 782 790 783 791 closure_init_unlocked(&j->io); 784 792 spin_lock_init(&j->lock); 793 + INIT_DELAYED_WORK(&j->work, journal_write_work); 785 794 786 795 c->journal_delay_ms = 100; 787 796
+7 -45
drivers/md/bcache/journal.h
··· 75 75 * nodes that are pinning the oldest journal entries first. 76 76 */ 77 77 78 - #define BCACHE_JSET_VERSION_UUIDv1 1 79 - /* Always latest UUID format */ 80 - #define BCACHE_JSET_VERSION_UUID 1 81 - #define BCACHE_JSET_VERSION 1 82 - 83 - /* 84 - * On disk format for a journal entry: 85 - * seq is monotonically increasing; every journal entry has its own unique 86 - * sequence number. 87 - * 88 - * last_seq is the oldest journal entry that still has keys the btree hasn't 89 - * flushed to disk yet. 90 - * 91 - * version is for on disk format changes. 92 - */ 93 - struct jset { 94 - uint64_t csum; 95 - uint64_t magic; 96 - uint64_t seq; 97 - uint32_t version; 98 - uint32_t keys; 99 - 100 - uint64_t last_seq; 101 - 102 - BKEY_PADDED(uuid_bucket); 103 - BKEY_PADDED(btree_root); 104 - uint16_t btree_level; 105 - uint16_t pad[3]; 106 - 107 - uint64_t prio_bucket[MAX_CACHES_PER_SET]; 108 - 109 - union { 110 - struct bkey start[0]; 111 - uint64_t d[0]; 112 - }; 113 - }; 114 - 115 78 /* 116 79 * Only used for holding the journal entries we read in btree_journal_read() 117 80 * during cache_registration ··· 103 140 spinlock_t lock; 104 141 /* used when waiting because the journal was full */ 105 142 struct closure_waitlist wait; 106 - struct closure_with_timer io; 143 + struct closure io; 144 + struct delayed_work work; 107 145 108 146 /* Number of blocks free in the bucket(s) we're currently writing to */ 109 147 unsigned blocks_free; ··· 152 188 }; 153 189 154 190 #define journal_pin_cmp(c, l, r) \ 155 - (fifo_idx(&(c)->journal.pin, (l)->journal) > \ 156 - fifo_idx(&(c)->journal.pin, (r)->journal)) 191 + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) 157 192 158 193 #define JOURNAL_PIN 20000 159 194 ··· 162 199 struct closure; 163 200 struct cache_set; 164 201 struct btree_op; 202 + struct keylist; 165 203 166 - void bch_journal(struct closure *); 204 + atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); 167 205 
void bch_journal_next(struct journal *); 168 206 void bch_journal_mark(struct cache_set *, struct list_head *); 169 207 void bch_journal_meta(struct cache_set *, struct closure *); 170 - int bch_journal_read(struct cache_set *, struct list_head *, 171 - struct btree_op *); 172 - int bch_journal_replay(struct cache_set *, struct list_head *, 173 - struct btree_op *); 208 + int bch_journal_read(struct cache_set *, struct list_head *); 209 + int bch_journal_replay(struct cache_set *, struct list_head *); 174 210 175 211 void bch_journal_free(struct cache_set *); 176 212 int bch_journal_alloc(struct cache_set *);
+39 -48
drivers/md/bcache/movinggc.c
··· 12 12 #include <trace/events/bcache.h> 13 13 14 14 struct moving_io { 15 + struct closure cl; 15 16 struct keybuf_key *w; 16 - struct search s; 17 + struct data_insert_op op; 17 18 struct bbio bio; 18 19 }; 19 20 ··· 39 38 40 39 static void moving_io_destructor(struct closure *cl) 41 40 { 42 - struct moving_io *io = container_of(cl, struct moving_io, s.cl); 41 + struct moving_io *io = container_of(cl, struct moving_io, cl); 43 42 kfree(io); 44 43 } 45 44 46 45 static void write_moving_finish(struct closure *cl) 47 46 { 48 - struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 + struct moving_io *io = container_of(cl, struct moving_io, cl); 49 48 struct bio *bio = &io->bio.bio; 50 49 struct bio_vec *bv; 51 50 int i; ··· 53 52 bio_for_each_segment_all(bv, bio, i) 54 53 __free_page(bv->bv_page); 55 54 56 - if (io->s.op.insert_collision) 55 + if (io->op.replace_collision) 57 56 trace_bcache_gc_copy_collision(&io->w->key); 58 57 59 - bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); 60 59 61 - atomic_dec_bug(&io->s.op.c->in_flight); 62 - closure_wake_up(&io->s.op.c->moving_gc_wait); 60 + up(&io->op.c->moving_in_flight); 63 61 64 62 closure_return_with_destructor(cl, moving_io_destructor); 65 63 } ··· 66 66 static void read_moving_endio(struct bio *bio, int error) 67 67 { 68 68 struct moving_io *io = container_of(bio->bi_private, 69 - struct moving_io, s.cl); 69 + struct moving_io, cl); 70 70 71 71 if (error) 72 - io->s.error = error; 72 + io->op.error = error; 73 73 74 - bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); 74 + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); 75 75 } 76 76 77 77 static void moving_init(struct moving_io *io) ··· 85 85 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 86 86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 87 87 PAGE_SECTORS); 88 - bio->bi_private = &io->s.cl; 88 + bio->bi_private = &io->cl; 89 89 bio->bi_io_vec = 
bio->bi_inline_vecs; 90 90 bch_bio_map(bio, NULL); 91 91 } 92 92 93 93 static void write_moving(struct closure *cl) 94 94 { 95 - struct search *s = container_of(cl, struct search, cl); 96 - struct moving_io *io = container_of(s, struct moving_io, s); 95 + struct moving_io *io = container_of(cl, struct moving_io, cl); 96 + struct data_insert_op *op = &io->op; 97 97 98 - if (!s->error) { 98 + if (!op->error) { 99 99 moving_init(io); 100 100 101 - io->bio.bio.bi_sector = KEY_START(&io->w->key); 102 - s->op.lock = -1; 103 - s->op.write_prio = 1; 104 - s->op.cache_bio = &io->bio.bio; 101 + io->bio.bio.bi_sector = KEY_START(&io->w->key); 102 + op->write_prio = 1; 103 + op->bio = &io->bio.bio; 105 104 106 - s->writeback = KEY_DIRTY(&io->w->key); 107 - s->op.csum = KEY_CSUM(&io->w->key); 105 + op->writeback = KEY_DIRTY(&io->w->key); 106 + op->csum = KEY_CSUM(&io->w->key); 108 107 109 - s->op.type = BTREE_REPLACE; 110 - bkey_copy(&s->op.replace, &io->w->key); 108 + bkey_copy(&op->replace_key, &io->w->key); 109 + op->replace = true; 111 110 112 - closure_init(&s->op.cl, cl); 113 - bch_insert_data(&s->op.cl); 111 + closure_call(&op->cl, bch_data_insert, NULL, cl); 114 112 } 115 113 116 - continue_at(cl, write_moving_finish, NULL); 114 + continue_at(cl, write_moving_finish, system_wq); 117 115 } 118 116 119 117 static void read_moving_submit(struct closure *cl) 120 118 { 121 - struct search *s = container_of(cl, struct search, cl); 122 - struct moving_io *io = container_of(s, struct moving_io, s); 119 + struct moving_io *io = container_of(cl, struct moving_io, cl); 123 120 struct bio *bio = &io->bio.bio; 124 121 125 - bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 122 + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); 126 123 127 - continue_at(cl, write_moving, bch_gc_wq); 124 + continue_at(cl, write_moving, system_wq); 128 125 } 129 126 130 - static void read_moving(struct closure *cl) 127 + static void read_moving(struct cache_set *c) 131 128 { 132 - struct cache_set *c = 
container_of(cl, struct cache_set, moving_gc); 133 129 struct keybuf_key *w; 134 130 struct moving_io *io; 135 131 struct bio *bio; 132 + struct closure cl; 133 + 134 + closure_init_stack(&cl); 136 135 137 136 /* XXX: if we error, background writeback could stall indefinitely */ 138 137 ··· 149 150 150 151 w->private = io; 151 152 io->w = w; 152 - io->s.op.inode = KEY_INODE(&w->key); 153 - io->s.op.c = c; 153 + io->op.inode = KEY_INODE(&w->key); 154 + io->op.c = c; 154 155 155 156 moving_init(io); 156 157 bio = &io->bio.bio; ··· 163 164 164 165 trace_bcache_gc_copy(&w->key); 165 166 166 - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 - 168 - if (atomic_inc_return(&c->in_flight) >= 64) { 169 - closure_wait_event(&c->moving_gc_wait, cl, 170 - atomic_read(&c->in_flight) < 64); 171 - continue_at(cl, read_moving, bch_gc_wq); 172 - } 167 + down(&c->moving_in_flight); 168 + closure_call(&io->cl, read_moving_submit, NULL, &cl); 173 169 } 174 170 175 171 if (0) { ··· 174 180 bch_keybuf_del(&c->moving_gc_keys, w); 175 181 } 176 182 177 - closure_return(cl); 183 + closure_sync(&cl); 178 184 } 179 185 180 186 static bool bucket_cmp(struct bucket *l, struct bucket *r) ··· 187 193 return GC_SECTORS_USED(heap_peek(&ca->heap)); 188 194 } 189 195 190 - void bch_moving_gc(struct closure *cl) 196 + void bch_moving_gc(struct cache_set *c) 191 197 { 192 - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 193 198 struct cache *ca; 194 199 struct bucket *b; 195 200 unsigned i; 196 201 197 202 if (!c->copy_gc_enabled) 198 - closure_return(cl); 203 + return; 199 204 200 205 mutex_lock(&c->bucket_lock); 201 206 ··· 235 242 236 243 c->moving_gc_keys.last_scanned = ZERO_KEY; 237 244 238 - closure_init(&c->moving_gc, cl); 239 - read_moving(&c->moving_gc); 240 - 241 - closure_return(cl); 245 + read_moving(c); 242 246 } 243 247 244 248 void bch_moving_init_cache_set(struct cache_set *c) 245 249 { 246 250 bch_keybuf_init(&c->moving_gc_keys); 251 + 
sema_init(&c->moving_in_flight, 64); 247 252 }
+707 -743
drivers/md/bcache/request.c
··· 25 25 26 26 struct kmem_cache *bch_search_cache; 27 27 28 - static void check_should_skip(struct cached_dev *, struct search *); 28 + static void bch_data_insert_start(struct closure *); 29 29 30 30 /* Cgroup interface */ 31 31 ··· 213 213 214 214 /* Insert data into cache */ 215 215 216 - static void bio_invalidate(struct closure *cl) 216 + static void bch_data_insert_keys(struct closure *cl) 217 217 { 218 - struct btree_op *op = container_of(cl, struct btree_op, cl); 219 - struct bio *bio = op->cache_bio; 218 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 219 + atomic_t *journal_ref = NULL; 220 + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; 221 + int ret; 222 + 223 + /* 224 + * If we're looping, might already be waiting on 225 + * another journal write - can't wait on more than one journal write at 226 + * a time 227 + * 228 + * XXX: this looks wrong 229 + */ 230 + #if 0 231 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) 232 + closure_sync(&s->cl); 233 + #endif 234 + 235 + if (!op->replace) 236 + journal_ref = bch_journal(op->c, &op->insert_keys, 237 + op->flush_journal ? 
cl : NULL); 238 + 239 + ret = bch_btree_insert(op->c, &op->insert_keys, 240 + journal_ref, replace_key); 241 + if (ret == -ESRCH) { 242 + op->replace_collision = true; 243 + } else if (ret) { 244 + op->error = -ENOMEM; 245 + op->insert_data_done = true; 246 + } 247 + 248 + if (journal_ref) 249 + atomic_dec_bug(journal_ref); 250 + 251 + if (!op->insert_data_done) 252 + continue_at(cl, bch_data_insert_start, bcache_wq); 253 + 254 + bch_keylist_free(&op->insert_keys); 255 + closure_return(cl); 256 + } 257 + 258 + static void bch_data_invalidate(struct closure *cl) 259 + { 260 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 261 + struct bio *bio = op->bio; 220 262 221 263 pr_debug("invalidating %i sectors from %llu", 222 264 bio_sectors(bio), (uint64_t) bio->bi_sector); 223 265 224 266 while (bio_sectors(bio)) { 225 - unsigned len = min(bio_sectors(bio), 1U << 14); 267 + unsigned sectors = min(bio_sectors(bio), 268 + 1U << (KEY_SIZE_BITS - 1)); 226 269 227 - if (bch_keylist_realloc(&op->keys, 0, op->c)) 270 + if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 228 271 goto out; 229 272 230 - bio->bi_sector += len; 231 - bio->bi_size -= len << 9; 273 + bio->bi_sector += sectors; 274 + bio->bi_size -= sectors << 9; 232 275 233 - bch_keylist_add(&op->keys, 234 - &KEY(op->inode, bio->bi_sector, len)); 276 + bch_keylist_add(&op->insert_keys, 277 + &KEY(op->inode, bio->bi_sector, sectors)); 235 278 } 236 279 237 280 op->insert_data_done = true; 238 281 bio_put(bio); 239 282 out: 240 - continue_at(cl, bch_journal, bcache_wq); 283 + continue_at(cl, bch_data_insert_keys, bcache_wq); 241 284 } 242 285 243 - struct open_bucket { 244 - struct list_head list; 245 - struct task_struct *last; 246 - unsigned sectors_free; 247 - BKEY_PADDED(key); 248 - }; 249 - 250 - void bch_open_buckets_free(struct cache_set *c) 286 + static void bch_data_insert_error(struct closure *cl) 251 287 { 252 - struct open_bucket *b; 253 - 254 - while 
(!list_empty(&c->data_buckets)) { 255 - b = list_first_entry(&c->data_buckets, 256 - struct open_bucket, list); 257 - list_del(&b->list); 258 - kfree(b); 259 - } 260 - } 261 - 262 - int bch_open_buckets_alloc(struct cache_set *c) 263 - { 264 - int i; 265 - 266 - spin_lock_init(&c->data_bucket_lock); 267 - 268 - for (i = 0; i < 6; i++) { 269 - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 270 - if (!b) 271 - return -ENOMEM; 272 - 273 - list_add(&b->list, &c->data_buckets); 274 - } 275 - 276 - return 0; 277 - } 278 - 279 - /* 280 - * We keep multiple buckets open for writes, and try to segregate different 281 - * write streams for better cache utilization: first we look for a bucket where 282 - * the last write to it was sequential with the current write, and failing that 283 - * we look for a bucket that was last used by the same task. 284 - * 285 - * The ideas is if you've got multiple tasks pulling data into the cache at the 286 - * same time, you'll get better cache utilization if you try to segregate their 287 - * data and preserve locality. 288 - * 289 - * For example, say you've starting Firefox at the same time you're copying a 290 - * bunch of files. Firefox will likely end up being fairly hot and stay in the 291 - * cache awhile, but the data you copied might not be; if you wrote all that 292 - * data to the same buckets it'd get invalidated at the same time. 293 - * 294 - * Both of those tasks will be doing fairly random IO so we can't rely on 295 - * detecting sequential IO to segregate their data, but going off of the task 296 - * should be a sane heuristic. 
297 - */ 298 - static struct open_bucket *pick_data_bucket(struct cache_set *c, 299 - const struct bkey *search, 300 - struct task_struct *task, 301 - struct bkey *alloc) 302 - { 303 - struct open_bucket *ret, *ret_task = NULL; 304 - 305 - list_for_each_entry_reverse(ret, &c->data_buckets, list) 306 - if (!bkey_cmp(&ret->key, search)) 307 - goto found; 308 - else if (ret->last == task) 309 - ret_task = ret; 310 - 311 - ret = ret_task ?: list_first_entry(&c->data_buckets, 312 - struct open_bucket, list); 313 - found: 314 - if (!ret->sectors_free && KEY_PTRS(alloc)) { 315 - ret->sectors_free = c->sb.bucket_size; 316 - bkey_copy(&ret->key, alloc); 317 - bkey_init(alloc); 318 - } 319 - 320 - if (!ret->sectors_free) 321 - ret = NULL; 322 - 323 - return ret; 324 - } 325 - 326 - /* 327 - * Allocates some space in the cache to write to, and k to point to the newly 328 - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 329 - * end of the newly allocated space). 330 - * 331 - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 332 - * sectors were actually allocated. 333 - * 334 - * If s->writeback is true, will not fail. 335 - */ 336 - static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, 337 - struct search *s) 338 - { 339 - struct cache_set *c = s->op.c; 340 - struct open_bucket *b; 341 - BKEY_PADDED(key) alloc; 342 - struct closure cl, *w = NULL; 343 - unsigned i; 344 - 345 - if (s->writeback) { 346 - closure_init_stack(&cl); 347 - w = &cl; 348 - } 349 - 350 - /* 351 - * We might have to allocate a new bucket, which we can't do with a 352 - * spinlock held. So if we have to allocate, we drop the lock, allocate 353 - * and then retry. KEY_PTRS() indicates whether alloc points to 354 - * allocated bucket(s). 355 - */ 356 - 357 - bkey_init(&alloc.key); 358 - spin_lock(&c->data_bucket_lock); 359 - 360 - while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { 361 - unsigned watermark = s->op.write_prio 362 - ? 
WATERMARK_MOVINGGC 363 - : WATERMARK_NONE; 364 - 365 - spin_unlock(&c->data_bucket_lock); 366 - 367 - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) 368 - return false; 369 - 370 - spin_lock(&c->data_bucket_lock); 371 - } 372 - 373 - /* 374 - * If we had to allocate, we might race and not need to allocate the 375 - * second time we call find_data_bucket(). If we allocated a bucket but 376 - * didn't use it, drop the refcount bch_bucket_alloc_set() took: 377 - */ 378 - if (KEY_PTRS(&alloc.key)) 379 - __bkey_put(c, &alloc.key); 380 - 381 - for (i = 0; i < KEY_PTRS(&b->key); i++) 382 - EBUG_ON(ptr_stale(c, &b->key, i)); 383 - 384 - /* Set up the pointer to the space we're allocating: */ 385 - 386 - for (i = 0; i < KEY_PTRS(&b->key); i++) 387 - k->ptr[i] = b->key.ptr[i]; 388 - 389 - sectors = min(sectors, b->sectors_free); 390 - 391 - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 392 - SET_KEY_SIZE(k, sectors); 393 - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 394 - 395 - /* 396 - * Move b to the end of the lru, and keep track of what this bucket was 397 - * last used for: 398 - */ 399 - list_move_tail(&b->list, &c->data_buckets); 400 - bkey_copy_key(&b->key, k); 401 - b->last = s->task; 402 - 403 - b->sectors_free -= sectors; 404 - 405 - for (i = 0; i < KEY_PTRS(&b->key); i++) { 406 - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 407 - 408 - atomic_long_add(sectors, 409 - &PTR_CACHE(c, &b->key, i)->sectors_written); 410 - } 411 - 412 - if (b->sectors_free < c->sb.block_size) 413 - b->sectors_free = 0; 414 - 415 - /* 416 - * k takes refcounts on the buckets it points to until it's inserted 417 - * into the btree, but if we're done with this bucket we just transfer 418 - * get_data_bucket()'s refcount. 
419 - */ 420 - if (b->sectors_free) 421 - for (i = 0; i < KEY_PTRS(&b->key); i++) 422 - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 423 - 424 - spin_unlock(&c->data_bucket_lock); 425 - return true; 426 - } 427 - 428 - static void bch_insert_data_error(struct closure *cl) 429 - { 430 - struct btree_op *op = container_of(cl, struct btree_op, cl); 288 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 431 289 432 290 /* 433 291 * Our data write just errored, which means we've got a bunch of keys to ··· 296 438 * from the keys we'll accomplish just that. 297 439 */ 298 440 299 - struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; 441 + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; 300 442 301 - while (src != op->keys.top) { 443 + while (src != op->insert_keys.top) { 302 444 struct bkey *n = bkey_next(src); 303 445 304 446 SET_KEY_PTRS(src, 0); 305 - bkey_copy(dst, src); 447 + memmove(dst, src, bkey_bytes(src)); 306 448 307 449 dst = bkey_next(dst); 308 450 src = n; 309 451 } 310 452 311 - op->keys.top = dst; 453 + op->insert_keys.top = dst; 312 454 313 - bch_journal(cl); 455 + bch_data_insert_keys(cl); 314 456 } 315 457 316 - static void bch_insert_data_endio(struct bio *bio, int error) 458 + static void bch_data_insert_endio(struct bio *bio, int error) 317 459 { 318 460 struct closure *cl = bio->bi_private; 319 - struct btree_op *op = container_of(cl, struct btree_op, cl); 320 - struct search *s = container_of(op, struct search, op); 461 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 321 462 322 463 if (error) { 323 464 /* TODO: We could try to recover from this. 
*/ 324 - if (s->writeback) 325 - s->error = error; 326 - else if (s->write) 327 - set_closure_fn(cl, bch_insert_data_error, bcache_wq); 465 + if (op->writeback) 466 + op->error = error; 467 + else if (!op->replace) 468 + set_closure_fn(cl, bch_data_insert_error, bcache_wq); 328 469 else 329 470 set_closure_fn(cl, NULL, NULL); 330 471 } ··· 331 474 bch_bbio_endio(op->c, bio, error, "writing data to cache"); 332 475 } 333 476 334 - static void bch_insert_data_loop(struct closure *cl) 477 + static void bch_data_insert_start(struct closure *cl) 335 478 { 336 - struct btree_op *op = container_of(cl, struct btree_op, cl); 337 - struct search *s = container_of(op, struct search, op); 338 - struct bio *bio = op->cache_bio, *n; 479 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 480 + struct bio *bio = op->bio, *n; 339 481 340 - if (op->skip) 341 - return bio_invalidate(cl); 482 + if (op->bypass) 483 + return bch_data_invalidate(cl); 342 484 343 485 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 344 486 set_gc_sectors(op->c); 345 - bch_queue_gc(op->c); 487 + wake_up_gc(op->c); 346 488 } 347 489 348 490 /* ··· 353 497 do { 354 498 unsigned i; 355 499 struct bkey *k; 356 - struct bio_set *split = s->d 357 - ? s->d->bio_split : op->c->bio_split; 500 + struct bio_set *split = op->c->bio_split; 358 501 359 502 /* 1 for the device pointer and 1 for the chksum */ 360 - if (bch_keylist_realloc(&op->keys, 503 + if (bch_keylist_realloc(&op->insert_keys, 361 504 1 + (op->csum ? 
1 : 0), 362 505 op->c)) 363 - continue_at(cl, bch_journal, bcache_wq); 506 + continue_at(cl, bch_data_insert_keys, bcache_wq); 364 507 365 - k = op->keys.top; 508 + k = op->insert_keys.top; 366 509 bkey_init(k); 367 510 SET_KEY_INODE(k, op->inode); 368 511 SET_KEY_OFFSET(k, bio->bi_sector); 369 512 370 - if (!bch_alloc_sectors(k, bio_sectors(bio), s)) 513 + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), 514 + op->write_point, op->write_prio, 515 + op->writeback)) 371 516 goto err; 372 517 373 518 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 374 519 375 - n->bi_end_io = bch_insert_data_endio; 520 + n->bi_end_io = bch_data_insert_endio; 376 521 n->bi_private = cl; 377 522 378 - if (s->writeback) { 523 + if (op->writeback) { 379 524 SET_KEY_DIRTY(k, true); 380 525 381 526 for (i = 0; i < KEY_PTRS(k); i++) ··· 389 532 bio_csum(n, k); 390 533 391 534 trace_bcache_cache_insert(k); 392 - bch_keylist_push(&op->keys); 535 + bch_keylist_push(&op->insert_keys); 393 536 394 537 n->bi_rw |= REQ_WRITE; 395 538 bch_submit_bbio(n, op->c, k, 0); 396 539 } while (n != bio); 397 540 398 541 op->insert_data_done = true; 399 - continue_at(cl, bch_journal, bcache_wq); 542 + continue_at(cl, bch_data_insert_keys, bcache_wq); 400 543 err: 401 544 /* bch_alloc_sectors() blocks if s->writeback = true */ 402 - BUG_ON(s->writeback); 545 + BUG_ON(op->writeback); 403 546 404 547 /* 405 548 * But if it's not a writeback write we'd rather just bail out if ··· 407 550 * we might be starving btree writes for gc or something. 408 551 */ 409 552 410 - if (s->write) { 553 + if (!op->replace) { 411 554 /* 412 555 * Writethrough write: We can't complete the write until we've 413 556 * updated the index. But we don't want to delay the write while 414 557 * we wait for buckets to be freed up, so just invalidate the 415 558 * rest of the write. 
416 559 */ 417 - op->skip = true; 418 - return bio_invalidate(cl); 560 + op->bypass = true; 561 + return bch_data_invalidate(cl); 419 562 } else { 420 563 /* 421 564 * From a cache miss, we can just insert the keys for the data ··· 424 567 op->insert_data_done = true; 425 568 bio_put(bio); 426 569 427 - if (!bch_keylist_empty(&op->keys)) 428 - continue_at(cl, bch_journal, bcache_wq); 570 + if (!bch_keylist_empty(&op->insert_keys)) 571 + continue_at(cl, bch_data_insert_keys, bcache_wq); 429 572 else 430 573 closure_return(cl); 431 574 } 432 575 } 433 576 434 577 /** 435 - * bch_insert_data - stick some data in the cache 578 + * bch_data_insert - stick some data in the cache 436 579 * 437 580 * This is the starting point for any data to end up in a cache device; it could 438 581 * be from a normal write, or a writeback write, or a write to a flash only ··· 444 587 * data is written it calls bch_journal, and after the keys have been added to 445 588 * the next journal write they're inserted into the btree. 446 589 * 447 - * It inserts the data in op->cache_bio; bi_sector is used for the key offset, 590 + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, 448 591 * and op->inode is used for the key inode. 449 592 * 450 - * If op->skip is true, instead of inserting the data it invalidates the region 451 - * of the cache represented by op->cache_bio and op->inode. 593 + * If s->bypass is true, instead of inserting the data it invalidates the 594 + * region of the cache represented by s->cache_bio and op->inode. 
452 595 */ 453 - void bch_insert_data(struct closure *cl) 596 + void bch_data_insert(struct closure *cl) 454 597 { 455 - struct btree_op *op = container_of(cl, struct btree_op, cl); 598 + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 456 599 457 - bch_keylist_init(&op->keys); 458 - bio_get(op->cache_bio); 459 - bch_insert_data_loop(cl); 600 + trace_bcache_write(op->bio, op->writeback, op->bypass); 601 + 602 + bch_keylist_init(&op->insert_keys); 603 + bio_get(op->bio); 604 + bch_data_insert_start(cl); 460 605 } 461 606 462 - void bch_btree_insert_async(struct closure *cl) 463 - { 464 - struct btree_op *op = container_of(cl, struct btree_op, cl); 465 - struct search *s = container_of(op, struct search, op); 466 - 467 - if (bch_btree_insert(op, op->c)) { 468 - s->error = -ENOMEM; 469 - op->insert_data_done = true; 470 - } 471 - 472 - if (op->insert_data_done) { 473 - bch_keylist_free(&op->keys); 474 - closure_return(cl); 475 - } else 476 - continue_at(cl, bch_insert_data_loop, bcache_wq); 477 - } 478 - 479 - /* Common code for the make_request functions */ 480 - 481 - static void request_endio(struct bio *bio, int error) 482 - { 483 - struct closure *cl = bio->bi_private; 484 - 485 - if (error) { 486 - struct search *s = container_of(cl, struct search, cl); 487 - s->error = error; 488 - /* Only cache read errors are recoverable */ 489 - s->recoverable = false; 490 - } 491 - 492 - bio_put(bio); 493 - closure_put(cl); 494 - } 495 - 496 - void bch_cache_read_endio(struct bio *bio, int error) 497 - { 498 - struct bbio *b = container_of(bio, struct bbio, bio); 499 - struct closure *cl = bio->bi_private; 500 - struct search *s = container_of(cl, struct search, cl); 501 - 502 - /* 503 - * If the bucket was reused while our bio was in flight, we might have 504 - * read the wrong data. Set s->error but not error so it doesn't get 505 - * counted against the cache device, but we'll still reread the data 506 - * from the backing device. 
507 - */ 508 - 509 - if (error) 510 - s->error = error; 511 - else if (ptr_stale(s->op.c, &b->key, 0)) { 512 - atomic_long_inc(&s->op.c->cache_read_races); 513 - s->error = -EINTR; 514 - } 515 - 516 - bch_bbio_endio(s->op.c, bio, error, "reading from cache"); 517 - } 518 - 519 - static void bio_complete(struct search *s) 520 - { 521 - if (s->orig_bio) { 522 - int cpu, rw = bio_data_dir(s->orig_bio); 523 - unsigned long duration = jiffies - s->start_time; 524 - 525 - cpu = part_stat_lock(); 526 - part_round_stats(cpu, &s->d->disk->part0); 527 - part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 528 - part_stat_unlock(); 529 - 530 - trace_bcache_request_end(s, s->orig_bio); 531 - bio_endio(s->orig_bio, s->error); 532 - s->orig_bio = NULL; 533 - } 534 - } 535 - 536 - static void do_bio_hook(struct search *s) 537 - { 538 - struct bio *bio = &s->bio.bio; 539 - memcpy(bio, s->orig_bio, sizeof(struct bio)); 540 - 541 - bio->bi_end_io = request_endio; 542 - bio->bi_private = &s->cl; 543 - atomic_set(&bio->bi_cnt, 3); 544 - } 545 - 546 - static void search_free(struct closure *cl) 547 - { 548 - struct search *s = container_of(cl, struct search, cl); 549 - bio_complete(s); 550 - 551 - if (s->op.cache_bio) 552 - bio_put(s->op.cache_bio); 553 - 554 - if (s->unaligned_bvec) 555 - mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 556 - 557 - closure_debug_destroy(cl); 558 - mempool_free(s, s->d->c->search); 559 - } 560 - 561 - static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 562 - { 563 - struct bio_vec *bv; 564 - struct search *s = mempool_alloc(d->c->search, GFP_NOIO); 565 - memset(s, 0, offsetof(struct search, op.keys)); 566 - 567 - __closure_init(&s->cl, NULL); 568 - 569 - s->op.inode = d->id; 570 - s->op.c = d->c; 571 - s->d = d; 572 - s->op.lock = -1; 573 - s->task = current; 574 - s->orig_bio = bio; 575 - s->write = (bio->bi_rw & REQ_WRITE) != 0; 576 - s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 577 - 
s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; 578 - s->recoverable = 1; 579 - s->start_time = jiffies; 580 - do_bio_hook(s); 581 - 582 - if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 583 - bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 584 - memcpy(bv, bio_iovec(bio), 585 - sizeof(struct bio_vec) * bio_segments(bio)); 586 - 587 - s->bio.bio.bi_io_vec = bv; 588 - s->unaligned_bvec = 1; 589 - } 590 - 591 - return s; 592 - } 593 - 594 - static void btree_read_async(struct closure *cl) 595 - { 596 - struct btree_op *op = container_of(cl, struct btree_op, cl); 597 - 598 - int ret = btree_root(search_recurse, op->c, op); 599 - 600 - if (ret == -EAGAIN) 601 - continue_at(cl, btree_read_async, bcache_wq); 602 - 603 - closure_return(cl); 604 - } 605 - 606 - /* Cached devices */ 607 - 608 - static void cached_dev_bio_complete(struct closure *cl) 609 - { 610 - struct search *s = container_of(cl, struct search, cl); 611 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 612 - 613 - search_free(cl); 614 - cached_dev_put(dc); 615 - } 616 - 617 - /* Process reads */ 618 - 619 - static void cached_dev_read_complete(struct closure *cl) 620 - { 621 - struct search *s = container_of(cl, struct search, cl); 622 - 623 - if (s->op.insert_collision) 624 - bch_mark_cache_miss_collision(s); 625 - 626 - if (s->op.cache_bio) { 627 - int i; 628 - struct bio_vec *bv; 629 - 630 - __bio_for_each_segment(bv, s->op.cache_bio, i, 0) 631 - __free_page(bv->bv_page); 632 - } 633 - 634 - cached_dev_bio_complete(cl); 635 - } 636 - 637 - static void request_read_error(struct closure *cl) 638 - { 639 - struct search *s = container_of(cl, struct search, cl); 640 - struct bio_vec *bv; 641 - int i; 642 - 643 - if (s->recoverable) { 644 - /* Retry from the backing device: */ 645 - trace_bcache_read_retry(s->orig_bio); 646 - 647 - s->error = 0; 648 - bv = s->bio.bio.bi_io_vec; 649 - do_bio_hook(s); 650 - s->bio.bio.bi_io_vec = bv; 651 - 652 - if (!s->unaligned_bvec) 653 - 
bio_for_each_segment(bv, s->orig_bio, i) 654 - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 655 - else 656 - memcpy(s->bio.bio.bi_io_vec, 657 - bio_iovec(s->orig_bio), 658 - sizeof(struct bio_vec) * 659 - bio_segments(s->orig_bio)); 660 - 661 - /* XXX: invalidate cache */ 662 - 663 - closure_bio_submit(&s->bio.bio, &s->cl, s->d); 664 - } 665 - 666 - continue_at(cl, cached_dev_read_complete, NULL); 667 - } 668 - 669 - static void request_read_done(struct closure *cl) 670 - { 671 - struct search *s = container_of(cl, struct search, cl); 672 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 673 - 674 - /* 675 - * s->cache_bio != NULL implies that we had a cache miss; cache_bio now 676 - * contains data ready to be inserted into the cache. 677 - * 678 - * First, we copy the data we just read from cache_bio's bounce buffers 679 - * to the buffers the original bio pointed to: 680 - */ 681 - 682 - if (s->op.cache_bio) { 683 - bio_reset(s->op.cache_bio); 684 - s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 685 - s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 686 - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 687 - bch_bio_map(s->op.cache_bio, NULL); 688 - 689 - bio_copy_data(s->cache_miss, s->op.cache_bio); 690 - 691 - bio_put(s->cache_miss); 692 - s->cache_miss = NULL; 693 - } 694 - 695 - if (verify(dc, &s->bio.bio) && s->recoverable) 696 - bch_data_verify(s); 697 - 698 - bio_complete(s); 699 - 700 - if (s->op.cache_bio && 701 - !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { 702 - s->op.type = BTREE_REPLACE; 703 - closure_call(&s->op.cl, bch_insert_data, NULL, cl); 704 - } 705 - 706 - continue_at(cl, cached_dev_read_complete, NULL); 707 - } 708 - 709 - static void request_read_done_bh(struct closure *cl) 710 - { 711 - struct search *s = container_of(cl, struct search, cl); 712 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 713 - 714 - bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 715 - 
trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); 716 - 717 - if (s->error) 718 - continue_at_nobarrier(cl, request_read_error, bcache_wq); 719 - else if (s->op.cache_bio || verify(dc, &s->bio.bio)) 720 - continue_at_nobarrier(cl, request_read_done, bcache_wq); 721 - else 722 - continue_at_nobarrier(cl, cached_dev_read_complete, NULL); 723 - } 724 - 725 - static int cached_dev_cache_miss(struct btree *b, struct search *s, 726 - struct bio *bio, unsigned sectors) 727 - { 728 - int ret = 0; 729 - unsigned reada; 730 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 731 - struct bio *miss; 732 - 733 - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 734 - if (miss == bio) 735 - s->op.lookup_done = true; 736 - 737 - miss->bi_end_io = request_endio; 738 - miss->bi_private = &s->cl; 739 - 740 - if (s->cache_miss || s->op.skip) 741 - goto out_submit; 742 - 743 - if (miss != bio || 744 - (bio->bi_rw & REQ_RAHEAD) || 745 - (bio->bi_rw & REQ_META) || 746 - s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) 747 - reada = 0; 748 - else { 749 - reada = min(dc->readahead >> 9, 750 - sectors - bio_sectors(miss)); 751 - 752 - if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) 753 - reada = bdev_sectors(miss->bi_bdev) - 754 - bio_end_sector(miss); 755 - } 756 - 757 - s->cache_bio_sectors = bio_sectors(miss) + reada; 758 - s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, 759 - DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), 760 - dc->disk.bio_split); 761 - 762 - if (!s->op.cache_bio) 763 - goto out_submit; 764 - 765 - s->op.cache_bio->bi_sector = miss->bi_sector; 766 - s->op.cache_bio->bi_bdev = miss->bi_bdev; 767 - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 768 - 769 - s->op.cache_bio->bi_end_io = request_endio; 770 - s->op.cache_bio->bi_private = &s->cl; 771 - 772 - /* btree_search_recurse()'s btree iterator is no good anymore */ 773 - ret = -EINTR; 774 - if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) 
775 - goto out_put; 776 - 777 - bch_bio_map(s->op.cache_bio, NULL); 778 - if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 779 - goto out_put; 780 - 781 - s->cache_miss = miss; 782 - bio_get(s->op.cache_bio); 783 - 784 - closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 785 - 786 - return ret; 787 - out_put: 788 - bio_put(s->op.cache_bio); 789 - s->op.cache_bio = NULL; 790 - out_submit: 791 - closure_bio_submit(miss, &s->cl, s->d); 792 - return ret; 793 - } 794 - 795 - static void request_read(struct cached_dev *dc, struct search *s) 796 - { 797 - struct closure *cl = &s->cl; 798 - 799 - check_should_skip(dc, s); 800 - closure_call(&s->op.cl, btree_read_async, NULL, cl); 801 - 802 - continue_at(cl, request_read_done_bh, NULL); 803 - } 804 - 805 - /* Process writes */ 806 - 807 - static void cached_dev_write_complete(struct closure *cl) 808 - { 809 - struct search *s = container_of(cl, struct search, cl); 810 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 811 - 812 - up_read_non_owner(&dc->writeback_lock); 813 - cached_dev_bio_complete(cl); 814 - } 815 - 816 - static void request_write(struct cached_dev *dc, struct search *s) 817 - { 818 - struct closure *cl = &s->cl; 819 - struct bio *bio = &s->bio.bio; 820 - struct bkey start, end; 821 - start = KEY(dc->disk.id, bio->bi_sector, 0); 822 - end = KEY(dc->disk.id, bio_end_sector(bio), 0); 823 - 824 - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 825 - 826 - check_should_skip(dc, s); 827 - down_read_non_owner(&dc->writeback_lock); 828 - 829 - if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 830 - s->op.skip = false; 831 - s->writeback = true; 832 - } 833 - 834 - if (bio->bi_rw & REQ_DISCARD) 835 - goto skip; 836 - 837 - if (should_writeback(dc, s->orig_bio, 838 - cache_mode(dc, bio), 839 - s->op.skip)) { 840 - s->op.skip = false; 841 - s->writeback = true; 842 - } 843 - 844 - if (s->op.skip) 845 - goto skip; 846 - 847 - 
trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); 848 - 849 - if (!s->writeback) { 850 - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 851 - dc->disk.bio_split); 852 - 853 - closure_bio_submit(bio, cl, s->d); 854 - } else { 855 - bch_writeback_add(dc); 856 - s->op.cache_bio = bio; 857 - 858 - if (bio->bi_rw & REQ_FLUSH) { 859 - /* Also need to send a flush to the backing device */ 860 - struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, 861 - dc->disk.bio_split); 862 - 863 - flush->bi_rw = WRITE_FLUSH; 864 - flush->bi_bdev = bio->bi_bdev; 865 - flush->bi_end_io = request_endio; 866 - flush->bi_private = cl; 867 - 868 - closure_bio_submit(flush, cl, s->d); 869 - } 870 - } 871 - out: 872 - closure_call(&s->op.cl, bch_insert_data, NULL, cl); 873 - continue_at(cl, cached_dev_write_complete, NULL); 874 - skip: 875 - s->op.skip = true; 876 - s->op.cache_bio = s->orig_bio; 877 - bio_get(s->op.cache_bio); 878 - 879 - if ((bio->bi_rw & REQ_DISCARD) && 880 - !blk_queue_discard(bdev_get_queue(dc->bdev))) 881 - goto out; 882 - 883 - closure_bio_submit(bio, cl, s->d); 884 - goto out; 885 - } 886 - 887 - static void request_nodata(struct cached_dev *dc, struct search *s) 888 - { 889 - struct closure *cl = &s->cl; 890 - struct bio *bio = &s->bio.bio; 891 - 892 - if (bio->bi_rw & REQ_DISCARD) { 893 - request_write(dc, s); 894 - return; 895 - } 896 - 897 - if (s->op.flush_journal) 898 - bch_journal_meta(s->op.c, cl); 899 - 900 - closure_bio_submit(bio, cl, s->d); 901 - 902 - continue_at(cl, cached_dev_bio_complete, NULL); 903 - } 904 - 905 - /* Cached devices - read & write stuff */ 607 + /* Congested? 
*/ 906 608 907 609 unsigned bch_get_congested(struct cache_set *c) 908 610 { ··· 504 1088 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; 505 1089 } 506 1090 507 - static void check_should_skip(struct cached_dev *dc, struct search *s) 1091 + static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) 508 1092 { 509 - struct cache_set *c = s->op.c; 510 - struct bio *bio = &s->bio.bio; 1093 + struct cache_set *c = dc->disk.c; 511 1094 unsigned mode = cache_mode(dc, bio); 512 1095 unsigned sectors, congested = bch_get_congested(c); 1096 + struct task_struct *task = current; 1097 + struct io *i; 513 1098 514 - if (atomic_read(&dc->disk.detaching) || 1099 + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || 515 1100 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 516 1101 (bio->bi_rw & REQ_DISCARD)) 517 1102 goto skip; ··· 522 1105 (bio->bi_rw & REQ_WRITE))) 523 1106 goto skip; 524 1107 525 - if (bio->bi_sector & (c->sb.block_size - 1) || 1108 + if (bio->bi_sector & (c->sb.block_size - 1) || 526 1109 bio_sectors(bio) & (c->sb.block_size - 1)) { 527 1110 pr_debug("skipping unaligned io"); 528 1111 goto skip; 1112 + } 1113 + 1114 + if (bypass_torture_test(dc)) { 1115 + if ((get_random_int() & 3) == 3) 1116 + goto skip; 1117 + else 1118 + goto rescale; 529 1119 } 530 1120 531 1121 if (!congested && !dc->sequential_cutoff) ··· 544 1120 (bio->bi_rw & REQ_SYNC)) 545 1121 goto rescale; 546 1122 547 - if (dc->sequential_merge) { 548 - struct io *i; 1123 + spin_lock(&dc->io_lock); 549 1124 550 - spin_lock(&dc->io_lock); 1125 + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 1126 + if (i->last == bio->bi_sector && 1127 + time_before(jiffies, i->jiffies)) 1128 + goto found; 551 1129 552 - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 553 - if (i->last == bio->bi_sector && 554 - time_before(jiffies, i->jiffies)) 555 - goto found; 1130 + i = list_first_entry(&dc->io_lru, struct io, lru); 556 1131 557 - i = list_first_entry(&dc->io_lru, struct io, 
lru); 558 - 559 - add_sequential(s->task); 560 - i->sequential = 0; 1132 + add_sequential(task); 1133 + i->sequential = 0; 561 1134 found: 562 - if (i->sequential + bio->bi_size > i->sequential) 563 - i->sequential += bio->bi_size; 1135 + if (i->sequential + bio->bi_size > i->sequential) 1136 + i->sequential += bio->bi_size; 564 1137 565 - i->last = bio_end_sector(bio); 566 - i->jiffies = jiffies + msecs_to_jiffies(5000); 567 - s->task->sequential_io = i->sequential; 1138 + i->last = bio_end_sector(bio); 1139 + i->jiffies = jiffies + msecs_to_jiffies(5000); 1140 + task->sequential_io = i->sequential; 568 1141 569 - hlist_del(&i->hash); 570 - hlist_add_head(&i->hash, iohash(dc, i->last)); 571 - list_move_tail(&i->lru, &dc->io_lru); 1142 + hlist_del(&i->hash); 1143 + hlist_add_head(&i->hash, iohash(dc, i->last)); 1144 + list_move_tail(&i->lru, &dc->io_lru); 572 1145 573 - spin_unlock(&dc->io_lock); 574 - } else { 575 - s->task->sequential_io = bio->bi_size; 1146 + spin_unlock(&dc->io_lock); 576 1147 577 - add_sequential(s->task); 578 - } 579 - 580 - sectors = max(s->task->sequential_io, 581 - s->task->sequential_io_avg) >> 9; 1148 + sectors = max(task->sequential_io, 1149 + task->sequential_io_avg) >> 9; 582 1150 583 1151 if (dc->sequential_cutoff && 584 1152 sectors >= dc->sequential_cutoff >> 9) { 585 - trace_bcache_bypass_sequential(s->orig_bio); 1153 + trace_bcache_bypass_sequential(bio); 586 1154 goto skip; 587 1155 } 588 1156 589 1157 if (congested && sectors >= congested) { 590 - trace_bcache_bypass_congested(s->orig_bio); 1158 + trace_bcache_bypass_congested(bio); 591 1159 goto skip; 592 1160 } 593 1161 594 1162 rescale: 595 1163 bch_rescale_priorities(c, bio_sectors(bio)); 596 - return; 1164 + return false; 597 1165 skip: 598 - bch_mark_sectors_bypassed(s, bio_sectors(bio)); 599 - s->op.skip = true; 1166 + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); 1167 + return true; 600 1168 } 1169 + 1170 + /* Cache lookup */ 1171 + 1172 + struct search { 1173 + 
/* Stack frame for bio_complete */ 1174 + struct closure cl; 1175 + 1176 + struct bcache_device *d; 1177 + 1178 + struct bbio bio; 1179 + struct bio *orig_bio; 1180 + struct bio *cache_miss; 1181 + 1182 + unsigned insert_bio_sectors; 1183 + 1184 + unsigned recoverable:1; 1185 + unsigned unaligned_bvec:1; 1186 + unsigned write:1; 1187 + unsigned read_dirty_data:1; 1188 + 1189 + unsigned long start_time; 1190 + 1191 + struct btree_op op; 1192 + struct data_insert_op iop; 1193 + }; 1194 + 1195 + static void bch_cache_read_endio(struct bio *bio, int error) 1196 + { 1197 + struct bbio *b = container_of(bio, struct bbio, bio); 1198 + struct closure *cl = bio->bi_private; 1199 + struct search *s = container_of(cl, struct search, cl); 1200 + 1201 + /* 1202 + * If the bucket was reused while our bio was in flight, we might have 1203 + * read the wrong data. Set s->error but not error so it doesn't get 1204 + * counted against the cache device, but we'll still reread the data 1205 + * from the backing device. 
1206 + */ 1207 + 1208 + if (error) 1209 + s->iop.error = error; 1210 + else if (ptr_stale(s->iop.c, &b->key, 0)) { 1211 + atomic_long_inc(&s->iop.c->cache_read_races); 1212 + s->iop.error = -EINTR; 1213 + } 1214 + 1215 + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); 1216 + } 1217 + 1218 + /* 1219 + * Read from a single key, handling the initial cache miss if the key starts in 1220 + * the middle of the bio 1221 + */ 1222 + static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) 1223 + { 1224 + struct search *s = container_of(op, struct search, op); 1225 + struct bio *n, *bio = &s->bio.bio; 1226 + struct bkey *bio_key; 1227 + unsigned ptr; 1228 + 1229 + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) 1230 + return MAP_CONTINUE; 1231 + 1232 + if (KEY_INODE(k) != s->iop.inode || 1233 + KEY_START(k) > bio->bi_sector) { 1234 + unsigned bio_sectors = bio_sectors(bio); 1235 + unsigned sectors = KEY_INODE(k) == s->iop.inode 1236 + ? min_t(uint64_t, INT_MAX, 1237 + KEY_START(k) - bio->bi_sector) 1238 + : INT_MAX; 1239 + 1240 + int ret = s->d->cache_miss(b, s, bio, sectors); 1241 + if (ret != MAP_CONTINUE) 1242 + return ret; 1243 + 1244 + /* if this was a complete miss we shouldn't get here */ 1245 + BUG_ON(bio_sectors <= sectors); 1246 + } 1247 + 1248 + if (!KEY_SIZE(k)) 1249 + return MAP_CONTINUE; 1250 + 1251 + /* XXX: figure out best pointer - for multiple cache devices */ 1252 + ptr = 0; 1253 + 1254 + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 1255 + 1256 + if (KEY_DIRTY(k)) 1257 + s->read_dirty_data = true; 1258 + 1259 + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, 1260 + KEY_OFFSET(k) - bio->bi_sector), 1261 + GFP_NOIO, s->d->bio_split); 1262 + 1263 + bio_key = &container_of(n, struct bbio, bio)->key; 1264 + bch_bkey_copy_single_ptr(bio_key, k, ptr); 1265 + 1266 + bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); 1267 + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); 1268 + 1269 + 
n->bi_end_io = bch_cache_read_endio; 1270 + n->bi_private = &s->cl; 1271 + 1272 + /* 1273 + * The bucket we're reading from might be reused while our bio 1274 + * is in flight, and we could then end up reading the wrong 1275 + * data. 1276 + * 1277 + * We guard against this by checking (in cache_read_endio()) if 1278 + * the pointer is stale again; if so, we treat it as an error 1279 + * and reread from the backing device (but we don't pass that 1280 + * error up anywhere). 1281 + */ 1282 + 1283 + __bch_submit_bbio(n, b->c); 1284 + return n == bio ? MAP_DONE : MAP_CONTINUE; 1285 + } 1286 + 1287 + static void cache_lookup(struct closure *cl) 1288 + { 1289 + struct search *s = container_of(cl, struct search, iop.cl); 1290 + struct bio *bio = &s->bio.bio; 1291 + 1292 + int ret = bch_btree_map_keys(&s->op, s->iop.c, 1293 + &KEY(s->iop.inode, bio->bi_sector, 0), 1294 + cache_lookup_fn, MAP_END_KEY); 1295 + if (ret == -EAGAIN) 1296 + continue_at(cl, cache_lookup, bcache_wq); 1297 + 1298 + closure_return(cl); 1299 + } 1300 + 1301 + /* Common code for the make_request functions */ 1302 + 1303 + static void request_endio(struct bio *bio, int error) 1304 + { 1305 + struct closure *cl = bio->bi_private; 1306 + 1307 + if (error) { 1308 + struct search *s = container_of(cl, struct search, cl); 1309 + s->iop.error = error; 1310 + /* Only cache read errors are recoverable */ 1311 + s->recoverable = false; 1312 + } 1313 + 1314 + bio_put(bio); 1315 + closure_put(cl); 1316 + } 1317 + 1318 + static void bio_complete(struct search *s) 1319 + { 1320 + if (s->orig_bio) { 1321 + int cpu, rw = bio_data_dir(s->orig_bio); 1322 + unsigned long duration = jiffies - s->start_time; 1323 + 1324 + cpu = part_stat_lock(); 1325 + part_round_stats(cpu, &s->d->disk->part0); 1326 + part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 1327 + part_stat_unlock(); 1328 + 1329 + trace_bcache_request_end(s->d, s->orig_bio); 1330 + bio_endio(s->orig_bio, s->iop.error); 1331 + s->orig_bio = NULL; 1332 
+ } 1333 + } 1334 + 1335 + static void do_bio_hook(struct search *s) 1336 + { 1337 + struct bio *bio = &s->bio.bio; 1338 + memcpy(bio, s->orig_bio, sizeof(struct bio)); 1339 + 1340 + bio->bi_end_io = request_endio; 1341 + bio->bi_private = &s->cl; 1342 + atomic_set(&bio->bi_cnt, 3); 1343 + } 1344 + 1345 + static void search_free(struct closure *cl) 1346 + { 1347 + struct search *s = container_of(cl, struct search, cl); 1348 + bio_complete(s); 1349 + 1350 + if (s->iop.bio) 1351 + bio_put(s->iop.bio); 1352 + 1353 + if (s->unaligned_bvec) 1354 + mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 1355 + 1356 + closure_debug_destroy(cl); 1357 + mempool_free(s, s->d->c->search); 1358 + } 1359 + 1360 + static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 1361 + { 1362 + struct search *s; 1363 + struct bio_vec *bv; 1364 + 1365 + s = mempool_alloc(d->c->search, GFP_NOIO); 1366 + memset(s, 0, offsetof(struct search, iop.insert_keys)); 1367 + 1368 + __closure_init(&s->cl, NULL); 1369 + 1370 + s->iop.inode = d->id; 1371 + s->iop.c = d->c; 1372 + s->d = d; 1373 + s->op.lock = -1; 1374 + s->iop.write_point = hash_long((unsigned long) current, 16); 1375 + s->orig_bio = bio; 1376 + s->write = (bio->bi_rw & REQ_WRITE) != 0; 1377 + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 1378 + s->recoverable = 1; 1379 + s->start_time = jiffies; 1380 + do_bio_hook(s); 1381 + 1382 + if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 1383 + bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 1384 + memcpy(bv, bio_iovec(bio), 1385 + sizeof(struct bio_vec) * bio_segments(bio)); 1386 + 1387 + s->bio.bio.bi_io_vec = bv; 1388 + s->unaligned_bvec = 1; 1389 + } 1390 + 1391 + return s; 1392 + } 1393 + 1394 + /* Cached devices */ 1395 + 1396 + static void cached_dev_bio_complete(struct closure *cl) 1397 + { 1398 + struct search *s = container_of(cl, struct search, cl); 1399 + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 1400 + 1401 + 
search_free(cl); 1402 + cached_dev_put(dc); 1403 + } 1404 + 1405 + /* Process reads */ 1406 + 1407 + static void cached_dev_cache_miss_done(struct closure *cl) 1408 + { 1409 + struct search *s = container_of(cl, struct search, cl); 1410 + 1411 + if (s->iop.replace_collision) 1412 + bch_mark_cache_miss_collision(s->iop.c, s->d); 1413 + 1414 + if (s->iop.bio) { 1415 + int i; 1416 + struct bio_vec *bv; 1417 + 1418 + bio_for_each_segment_all(bv, s->iop.bio, i) 1419 + __free_page(bv->bv_page); 1420 + } 1421 + 1422 + cached_dev_bio_complete(cl); 1423 + } 1424 + 1425 + static void cached_dev_read_error(struct closure *cl) 1426 + { 1427 + struct search *s = container_of(cl, struct search, cl); 1428 + struct bio *bio = &s->bio.bio; 1429 + struct bio_vec *bv; 1430 + int i; 1431 + 1432 + if (s->recoverable) { 1433 + /* Retry from the backing device: */ 1434 + trace_bcache_read_retry(s->orig_bio); 1435 + 1436 + s->iop.error = 0; 1437 + bv = s->bio.bio.bi_io_vec; 1438 + do_bio_hook(s); 1439 + s->bio.bio.bi_io_vec = bv; 1440 + 1441 + if (!s->unaligned_bvec) 1442 + bio_for_each_segment(bv, s->orig_bio, i) 1443 + bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 1444 + else 1445 + memcpy(s->bio.bio.bi_io_vec, 1446 + bio_iovec(s->orig_bio), 1447 + sizeof(struct bio_vec) * 1448 + bio_segments(s->orig_bio)); 1449 + 1450 + /* XXX: invalidate cache */ 1451 + 1452 + closure_bio_submit(bio, cl, s->d); 1453 + } 1454 + 1455 + continue_at(cl, cached_dev_cache_miss_done, NULL); 1456 + } 1457 + 1458 + static void cached_dev_read_done(struct closure *cl) 1459 + { 1460 + struct search *s = container_of(cl, struct search, cl); 1461 + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 1462 + 1463 + /* 1464 + * We had a cache miss; cache_bio now contains data ready to be inserted 1465 + * into the cache. 
1466 + * 1467 + * First, we copy the data we just read from cache_bio's bounce buffers 1468 + * to the buffers the original bio pointed to: 1469 + */ 1470 + 1471 + if (s->iop.bio) { 1472 + bio_reset(s->iop.bio); 1473 + s->iop.bio->bi_sector = s->cache_miss->bi_sector; 1474 + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; 1475 + s->iop.bio->bi_size = s->insert_bio_sectors << 9; 1476 + bch_bio_map(s->iop.bio, NULL); 1477 + 1478 + bio_copy_data(s->cache_miss, s->iop.bio); 1479 + 1480 + bio_put(s->cache_miss); 1481 + s->cache_miss = NULL; 1482 + } 1483 + 1484 + if (verify(dc, &s->bio.bio) && s->recoverable && 1485 + !s->unaligned_bvec && !s->read_dirty_data) 1486 + bch_data_verify(dc, s->orig_bio); 1487 + 1488 + bio_complete(s); 1489 + 1490 + if (s->iop.bio && 1491 + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { 1492 + BUG_ON(!s->iop.replace); 1493 + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); 1494 + } 1495 + 1496 + continue_at(cl, cached_dev_cache_miss_done, NULL); 1497 + } 1498 + 1499 + static void cached_dev_read_done_bh(struct closure *cl) 1500 + { 1501 + struct search *s = container_of(cl, struct search, cl); 1502 + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 1503 + 1504 + bch_mark_cache_accounting(s->iop.c, s->d, 1505 + !s->cache_miss, s->iop.bypass); 1506 + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); 1507 + 1508 + if (s->iop.error) 1509 + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); 1510 + else if (s->iop.bio || verify(dc, &s->bio.bio)) 1511 + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); 1512 + else 1513 + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); 1514 + } 1515 + 1516 + static int cached_dev_cache_miss(struct btree *b, struct search *s, 1517 + struct bio *bio, unsigned sectors) 1518 + { 1519 + int ret = MAP_CONTINUE; 1520 + unsigned reada = 0; 1521 + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 1522 + struct bio *miss, *cache_bio; 1523 
+ 1524 + if (s->cache_miss || s->iop.bypass) { 1525 + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 1526 + ret = miss == bio ? MAP_DONE : MAP_CONTINUE; 1527 + goto out_submit; 1528 + } 1529 + 1530 + if (!(bio->bi_rw & REQ_RAHEAD) && 1531 + !(bio->bi_rw & REQ_META) && 1532 + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) 1533 + reada = min_t(sector_t, dc->readahead >> 9, 1534 + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); 1535 + 1536 + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); 1537 + 1538 + s->iop.replace_key = KEY(s->iop.inode, 1539 + bio->bi_sector + s->insert_bio_sectors, 1540 + s->insert_bio_sectors); 1541 + 1542 + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); 1543 + if (ret) 1544 + return ret; 1545 + 1546 + s->iop.replace = true; 1547 + 1548 + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 1549 + 1550 + /* btree_search_recurse()'s btree iterator is no good anymore */ 1551 + ret = miss == bio ? MAP_DONE : -EINTR; 1552 + 1553 + cache_bio = bio_alloc_bioset(GFP_NOWAIT, 1554 + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), 1555 + dc->disk.bio_split); 1556 + if (!cache_bio) 1557 + goto out_submit; 1558 + 1559 + cache_bio->bi_sector = miss->bi_sector; 1560 + cache_bio->bi_bdev = miss->bi_bdev; 1561 + cache_bio->bi_size = s->insert_bio_sectors << 9; 1562 + 1563 + cache_bio->bi_end_io = request_endio; 1564 + cache_bio->bi_private = &s->cl; 1565 + 1566 + bch_bio_map(cache_bio, NULL); 1567 + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) 1568 + goto out_put; 1569 + 1570 + if (reada) 1571 + bch_mark_cache_readahead(s->iop.c, s->d); 1572 + 1573 + s->cache_miss = miss; 1574 + s->iop.bio = cache_bio; 1575 + bio_get(cache_bio); 1576 + closure_bio_submit(cache_bio, &s->cl, s->d); 1577 + 1578 + return ret; 1579 + out_put: 1580 + bio_put(cache_bio); 1581 + out_submit: 1582 + miss->bi_end_io = request_endio; 1583 + miss->bi_private = &s->cl; 1584 + closure_bio_submit(miss, &s->cl, s->d); 
1585 + return ret; 1586 + } 1587 + 1588 + static void cached_dev_read(struct cached_dev *dc, struct search *s) 1589 + { 1590 + struct closure *cl = &s->cl; 1591 + 1592 + closure_call(&s->iop.cl, cache_lookup, NULL, cl); 1593 + continue_at(cl, cached_dev_read_done_bh, NULL); 1594 + } 1595 + 1596 + /* Process writes */ 1597 + 1598 + static void cached_dev_write_complete(struct closure *cl) 1599 + { 1600 + struct search *s = container_of(cl, struct search, cl); 1601 + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 1602 + 1603 + up_read_non_owner(&dc->writeback_lock); 1604 + cached_dev_bio_complete(cl); 1605 + } 1606 + 1607 + static void cached_dev_write(struct cached_dev *dc, struct search *s) 1608 + { 1609 + struct closure *cl = &s->cl; 1610 + struct bio *bio = &s->bio.bio; 1611 + struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); 1612 + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); 1613 + 1614 + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); 1615 + 1616 + down_read_non_owner(&dc->writeback_lock); 1617 + if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 1618 + /* 1619 + * We overlap with some dirty data undergoing background 1620 + * writeback, force this write to writeback 1621 + */ 1622 + s->iop.bypass = false; 1623 + s->iop.writeback = true; 1624 + } 1625 + 1626 + /* 1627 + * Discards aren't _required_ to do anything, so skipping if 1628 + * check_overlapping returned true is ok 1629 + * 1630 + * But check_overlapping drops dirty keys for which io hasn't started, 1631 + * so we still want to call it. 
1632 + */ 1633 + if (bio->bi_rw & REQ_DISCARD) 1634 + s->iop.bypass = true; 1635 + 1636 + if (should_writeback(dc, s->orig_bio, 1637 + cache_mode(dc, bio), 1638 + s->iop.bypass)) { 1639 + s->iop.bypass = false; 1640 + s->iop.writeback = true; 1641 + } 1642 + 1643 + if (s->iop.bypass) { 1644 + s->iop.bio = s->orig_bio; 1645 + bio_get(s->iop.bio); 1646 + 1647 + if (!(bio->bi_rw & REQ_DISCARD) || 1648 + blk_queue_discard(bdev_get_queue(dc->bdev))) 1649 + closure_bio_submit(bio, cl, s->d); 1650 + } else if (s->iop.writeback) { 1651 + bch_writeback_add(dc); 1652 + s->iop.bio = bio; 1653 + 1654 + if (bio->bi_rw & REQ_FLUSH) { 1655 + /* Also need to send a flush to the backing device */ 1656 + struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, 1657 + dc->disk.bio_split); 1658 + 1659 + flush->bi_rw = WRITE_FLUSH; 1660 + flush->bi_bdev = bio->bi_bdev; 1661 + flush->bi_end_io = request_endio; 1662 + flush->bi_private = cl; 1663 + 1664 + closure_bio_submit(flush, cl, s->d); 1665 + } 1666 + } else { 1667 + s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, 1668 + dc->disk.bio_split); 1669 + 1670 + closure_bio_submit(bio, cl, s->d); 1671 + } 1672 + 1673 + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); 1674 + continue_at(cl, cached_dev_write_complete, NULL); 1675 + } 1676 + 1677 + static void cached_dev_nodata(struct closure *cl) 1678 + { 1679 + struct search *s = container_of(cl, struct search, cl); 1680 + struct bio *bio = &s->bio.bio; 1681 + 1682 + if (s->iop.flush_journal) 1683 + bch_journal_meta(s->iop.c, cl); 1684 + 1685 + /* If it's a flush, we send the flush to the backing device too */ 1686 + closure_bio_submit(bio, cl, s->d); 1687 + 1688 + continue_at(cl, cached_dev_bio_complete, NULL); 1689 + } 1690 + 1691 + /* Cached devices - read & write stuff */ 601 1692 602 1693 static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 603 1694 { ··· 1131 1192 1132 1193 if (cached_dev_get(dc)) { 1133 1194 s = search_alloc(bio, d); 1134 - 
trace_bcache_request_start(s, bio); 1195 + trace_bcache_request_start(s->d, bio); 1135 1196 1136 - if (!bio_has_data(bio)) 1137 - request_nodata(dc, s); 1138 - else if (rw) 1139 - request_write(dc, s); 1140 - else 1141 - request_read(dc, s); 1197 + if (!bio->bi_size) { 1198 + /* 1199 + * can't call bch_journal_meta from under 1200 + * generic_make_request 1201 + */ 1202 + continue_at_nobarrier(&s->cl, 1203 + cached_dev_nodata, 1204 + bcache_wq); 1205 + } else { 1206 + s->iop.bypass = check_should_bypass(dc, bio); 1207 + 1208 + if (rw) 1209 + cached_dev_write(dc, s); 1210 + else 1211 + cached_dev_read(dc, s); 1212 + } 1142 1213 } else { 1143 1214 if ((bio->bi_rw & REQ_DISCARD) && 1144 1215 !blk_queue_discard(bdev_get_queue(dc->bdev))) ··· 1223 1274 bio_advance(bio, min(sectors << 9, bio->bi_size)); 1224 1275 1225 1276 if (!bio->bi_size) 1226 - s->op.lookup_done = true; 1277 + return MAP_DONE; 1227 1278 1228 - return 0; 1279 + return MAP_CONTINUE; 1280 + } 1281 + 1282 + static void flash_dev_nodata(struct closure *cl) 1283 + { 1284 + struct search *s = container_of(cl, struct search, cl); 1285 + 1286 + if (s->iop.flush_journal) 1287 + bch_journal_meta(s->iop.c, cl); 1288 + 1289 + continue_at(cl, search_free, NULL); 1229 1290 } 1230 1291 1231 1292 static void flash_dev_make_request(struct request_queue *q, struct bio *bio) ··· 1254 1295 cl = &s->cl; 1255 1296 bio = &s->bio.bio; 1256 1297 1257 - trace_bcache_request_start(s, bio); 1298 + trace_bcache_request_start(s->d, bio); 1258 1299 1259 - if (bio_has_data(bio) && !rw) { 1260 - closure_call(&s->op.cl, btree_read_async, NULL, cl); 1261 - } else if (bio_has_data(bio) || s->op.skip) { 1262 - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1300 + if (!bio->bi_size) { 1301 + /* 1302 + * can't call bch_journal_meta from under 1303 + * generic_make_request 1304 + */ 1305 + continue_at_nobarrier(&s->cl, 1306 + flash_dev_nodata, 1307 + bcache_wq); 1308 + } else if (rw) { 1309 + 
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, 1263 1310 &KEY(d->id, bio->bi_sector, 0), 1264 1311 &KEY(d->id, bio_end_sector(bio), 0)); 1265 1312 1266 - s->writeback = true; 1267 - s->op.cache_bio = bio; 1313 + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; 1314 + s->iop.writeback = true; 1315 + s->iop.bio = bio; 1268 1316 1269 - closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1317 + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); 1270 1318 } else { 1271 - /* No data - probably a cache flush */ 1272 - if (s->op.flush_journal) 1273 - bch_journal_meta(s->op.c, cl); 1319 + closure_call(&s->iop.cl, cache_lookup, NULL, cl); 1274 1320 } 1275 1321 1276 1322 continue_at(cl, search_free, NULL);
+20 -27
drivers/md/bcache/request.h
··· 3 3 4 4 #include <linux/cgroup.h> 5 5 6 - struct search { 7 - /* Stack frame for bio_complete */ 6 + struct data_insert_op { 8 7 struct closure cl; 8 + struct cache_set *c; 9 + struct bio *bio; 9 10 10 - struct bcache_device *d; 11 - struct task_struct *task; 12 - 13 - struct bbio bio; 14 - struct bio *orig_bio; 15 - struct bio *cache_miss; 16 - unsigned cache_bio_sectors; 17 - 18 - unsigned recoverable:1; 19 - unsigned unaligned_bvec:1; 20 - 21 - unsigned write:1; 22 - unsigned writeback:1; 23 - 24 - /* IO error returned to s->bio */ 11 + unsigned inode; 12 + uint16_t write_point; 13 + uint16_t write_prio; 25 14 short error; 26 - unsigned long start_time; 27 15 28 - /* Anything past op->keys won't get zeroed in do_bio_hook */ 29 - struct btree_op op; 16 + unsigned bypass:1; 17 + unsigned writeback:1; 18 + unsigned flush_journal:1; 19 + unsigned csum:1; 20 + 21 + unsigned replace:1; 22 + unsigned replace_collision:1; 23 + 24 + unsigned insert_data_done:1; 25 + 26 + /* Anything past this point won't get zeroed in search_alloc() */ 27 + struct keylist insert_keys; 28 + BKEY_PADDED(replace_key); 30 29 }; 31 30 32 - void bch_cache_read_endio(struct bio *, int); 33 31 unsigned bch_get_congested(struct cache_set *); 34 - void bch_insert_data(struct closure *cl); 35 - void bch_btree_insert_async(struct closure *); 36 - void bch_cache_read_endio(struct bio *, int); 37 - 38 - void bch_open_buckets_free(struct cache_set *); 39 - int bch_open_buckets_alloc(struct cache_set *); 32 + void bch_data_insert(struct closure *cl); 40 33 41 34 void bch_cached_dev_request_init(struct cached_dev *dc); 42 35 void bch_flash_dev_request_init(struct bcache_device *d);
+13 -13
drivers/md/bcache/stats.c
··· 7 7 #include "bcache.h" 8 8 #include "stats.h" 9 9 #include "btree.h" 10 - #include "request.h" 11 10 #include "sysfs.h" 12 11 13 12 /* ··· 195 196 atomic_inc(&stats->cache_bypass_misses); 196 197 } 197 198 198 - void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) 199 + void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, 200 + bool hit, bool bypass) 199 201 { 200 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 202 + struct cached_dev *dc = container_of(d, struct cached_dev, disk); 201 203 mark_cache_stats(&dc->accounting.collector, hit, bypass); 202 - mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); 204 + mark_cache_stats(&c->accounting.collector, hit, bypass); 203 205 #ifdef CONFIG_CGROUP_BCACHE 204 206 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); 205 207 #endif 206 208 } 207 209 208 - void bch_mark_cache_readahead(struct search *s) 210 + void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) 209 211 { 210 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 212 + struct cached_dev *dc = container_of(d, struct cached_dev, disk); 211 213 atomic_inc(&dc->accounting.collector.cache_readaheads); 212 - atomic_inc(&s->op.c->accounting.collector.cache_readaheads); 214 + atomic_inc(&c->accounting.collector.cache_readaheads); 213 215 } 214 216 215 - void bch_mark_cache_miss_collision(struct search *s) 217 + void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) 216 218 { 217 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 219 + struct cached_dev *dc = container_of(d, struct cached_dev, disk); 218 220 atomic_inc(&dc->accounting.collector.cache_miss_collisions); 219 - atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); 221 + atomic_inc(&c->accounting.collector.cache_miss_collisions); 220 222 } 221 223 222 - void bch_mark_sectors_bypassed(struct search *s, int 
sectors) 224 + void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, 225 + int sectors) 223 226 { 224 - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 225 227 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); 226 - atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); 228 + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); 227 229 } 228 230 229 231 void bch_cache_accounting_init(struct cache_accounting *acc,
+8 -5
drivers/md/bcache/stats.h
··· 38 38 struct cache_stats day; 39 39 }; 40 40 41 - struct search; 41 + struct cache_set; 42 + struct cached_dev; 43 + struct bcache_device; 42 44 43 45 void bch_cache_accounting_init(struct cache_accounting *acc, 44 46 struct closure *parent); ··· 52 50 53 51 void bch_cache_accounting_destroy(struct cache_accounting *acc); 54 52 55 - void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); 56 - void bch_mark_cache_readahead(struct search *s); 57 - void bch_mark_cache_miss_collision(struct search *s); 58 - void bch_mark_sectors_bypassed(struct search *s, int sectors); 53 + void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, 54 + bool, bool); 55 + void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); 56 + void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); 57 + void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); 59 58 60 59 #endif /* _BCACHE_STATS_H_ */
+105 -85
drivers/md/bcache/super.c
··· 16 16 #include <linux/buffer_head.h> 17 17 #include <linux/debugfs.h> 18 18 #include <linux/genhd.h> 19 + #include <linux/idr.h> 19 20 #include <linux/kthread.h> 20 21 #include <linux/module.h> 21 22 #include <linux/random.h> ··· 46 45 NULL 47 46 }; 48 47 49 - struct uuid_entry_v0 { 50 - uint8_t uuid[16]; 51 - uint8_t label[32]; 52 - uint32_t first_reg; 53 - uint32_t last_reg; 54 - uint32_t invalidated; 55 - uint32_t pad; 56 - }; 57 - 58 48 static struct kobject *bcache_kobj; 59 49 struct mutex bch_register_lock; 60 50 LIST_HEAD(bch_cache_sets); 61 51 static LIST_HEAD(uncached_devices); 62 52 63 - static int bcache_major, bcache_minor; 53 + static int bcache_major; 54 + static DEFINE_IDA(bcache_minor); 64 55 static wait_queue_head_t unregister_wait; 65 56 struct workqueue_struct *bcache_wq; 66 57 ··· 375 382 { 376 383 struct bkey *k = &j->uuid_bucket; 377 384 378 - if (__bch_ptr_invalid(c, 1, k)) 385 + if (bch_btree_ptr_invalid(c, k)) 379 386 return "bad uuid pointer"; 380 387 381 388 bkey_copy(&c->uuid_bucket, k); ··· 420 427 421 428 lockdep_assert_held(&bch_register_lock); 422 429 423 - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) 430 + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 424 431 return 1; 425 432 426 433 SET_KEY_SIZE(&k.key, c->sb.bucket_size); ··· 428 435 closure_sync(&cl); 429 436 430 437 bkey_copy(&c->uuid_bucket, &k.key); 431 - __bkey_put(c, &k.key); 438 + bkey_put(c, &k.key); 432 439 return 0; 433 440 } 434 441 ··· 555 562 } 556 563 557 564 p->next_bucket = ca->prio_buckets[i + 1]; 558 - p->magic = pset_magic(ca); 565 + p->magic = pset_magic(&ca->sb); 559 566 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 560 567 561 - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); 568 + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 562 569 BUG_ON(bucket == -1); 563 570 564 571 mutex_unlock(&ca->set->bucket_lock); ··· 606 613 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) 607 614 
pr_warn("bad csum reading priorities"); 608 615 609 - if (p->magic != pset_magic(ca)) 616 + if (p->magic != pset_magic(&ca->sb)) 610 617 pr_warn("bad magic reading priorities"); 611 618 612 619 bucket = p->next_bucket; ··· 623 630 static int open_dev(struct block_device *b, fmode_t mode) 624 631 { 625 632 struct bcache_device *d = b->bd_disk->private_data; 626 - if (atomic_read(&d->closing)) 633 + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) 627 634 return -ENXIO; 628 635 629 636 closure_get(&d->cl); ··· 652 659 653 660 void bcache_device_stop(struct bcache_device *d) 654 661 { 655 - if (!atomic_xchg(&d->closing, 1)) 662 + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) 656 663 closure_queue(&d->cl); 657 664 } 658 665 659 666 static void bcache_device_unlink(struct bcache_device *d) 660 667 { 661 - unsigned i; 662 - struct cache *ca; 668 + lockdep_assert_held(&bch_register_lock); 663 669 664 - sysfs_remove_link(&d->c->kobj, d->name); 665 - sysfs_remove_link(&d->kobj, "cache"); 670 + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { 671 + unsigned i; 672 + struct cache *ca; 666 673 667 - for_each_cache(ca, d->c, i) 668 - bd_unlink_disk_holder(ca->bdev, d->disk); 674 + sysfs_remove_link(&d->c->kobj, d->name); 675 + sysfs_remove_link(&d->kobj, "cache"); 676 + 677 + for_each_cache(ca, d->c, i) 678 + bd_unlink_disk_holder(ca->bdev, d->disk); 679 + } 669 680 } 670 681 671 682 static void bcache_device_link(struct bcache_device *d, struct cache_set *c, ··· 693 696 { 694 697 lockdep_assert_held(&bch_register_lock); 695 698 696 - if (atomic_read(&d->detaching)) { 699 + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { 697 700 struct uuid_entry *u = d->c->uuids + d->id; 698 701 699 702 SET_UUID_FLASH_ONLY(u, 0); 700 703 memcpy(u->uuid, invalid_uuid, 16); 701 704 u->invalidated = cpu_to_le32(get_seconds()); 702 705 bch_uuid_write(d->c); 703 - 704 - atomic_set(&d->detaching, 0); 705 706 } 706 707 707 - if (!d->flush_done) 708 - bcache_device_unlink(d); 
708 + bcache_device_unlink(d); 709 709 710 710 d->c->devices[d->id] = NULL; 711 711 closure_put(&d->c->caching); ··· 733 739 del_gendisk(d->disk); 734 740 if (d->disk && d->disk->queue) 735 741 blk_cleanup_queue(d->disk->queue); 736 - if (d->disk) 742 + if (d->disk) { 743 + ida_simple_remove(&bcache_minor, d->disk->first_minor); 737 744 put_disk(d->disk); 745 + } 738 746 739 747 bio_split_pool_free(&d->bio_split_hook); 740 748 if (d->unaligned_bvec) 741 749 mempool_destroy(d->unaligned_bvec); 742 750 if (d->bio_split) 743 751 bioset_free(d->bio_split); 752 + if (is_vmalloc_addr(d->full_dirty_stripes)) 753 + vfree(d->full_dirty_stripes); 754 + else 755 + kfree(d->full_dirty_stripes); 744 756 if (is_vmalloc_addr(d->stripe_sectors_dirty)) 745 757 vfree(d->stripe_sectors_dirty); 746 758 else ··· 760 760 { 761 761 struct request_queue *q; 762 762 size_t n; 763 + int minor; 763 764 764 - if (!d->stripe_size_bits) 765 - d->stripe_size_bits = 31; 765 + if (!d->stripe_size) 766 + d->stripe_size = 1 << 31; 766 767 767 - d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> 768 - d->stripe_size_bits; 768 + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); 769 769 770 - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) 770 + if (!d->nr_stripes || 771 + d->nr_stripes > INT_MAX || 772 + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { 773 + pr_err("nr_stripes too large"); 771 774 return -ENOMEM; 775 + } 772 776 773 777 n = d->nr_stripes * sizeof(atomic_t); 774 778 d->stripe_sectors_dirty = n < PAGE_SIZE << 6 ··· 781 777 if (!d->stripe_sectors_dirty) 782 778 return -ENOMEM; 783 779 780 + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); 781 + d->full_dirty_stripes = n < PAGE_SIZE << 6 782 + ? 
kzalloc(n, GFP_KERNEL) 783 + : vzalloc(n); 784 + if (!d->full_dirty_stripes) 785 + return -ENOMEM; 786 + 787 + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); 788 + if (minor < 0) 789 + return minor; 790 + 784 791 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 785 792 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 786 793 sizeof(struct bio_vec) * BIO_MAX_PAGES)) || 787 794 bio_split_pool_init(&d->bio_split_hook) || 788 - !(d->disk = alloc_disk(1)) || 789 - !(q = blk_alloc_queue(GFP_KERNEL))) 795 + !(d->disk = alloc_disk(1))) { 796 + ida_simple_remove(&bcache_minor, minor); 790 797 return -ENOMEM; 798 + } 791 799 792 800 set_capacity(d->disk, sectors); 793 - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 801 + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); 794 802 795 803 d->disk->major = bcache_major; 796 - d->disk->first_minor = bcache_minor++; 804 + d->disk->first_minor = minor; 797 805 d->disk->fops = &bcache_ops; 798 806 d->disk->private_data = d; 807 + 808 + q = blk_alloc_queue(GFP_KERNEL); 809 + if (!q) 810 + return -ENOMEM; 799 811 800 812 blk_queue_make_request(q, NULL); 801 813 d->disk->queue = q; ··· 894 874 struct closure cl; 895 875 closure_init_stack(&cl); 896 876 897 - BUG_ON(!atomic_read(&dc->disk.detaching)); 877 + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); 898 878 BUG_ON(atomic_read(&dc->count)); 899 879 900 880 mutex_lock(&bch_register_lock); ··· 908 888 bcache_device_detach(&dc->disk); 909 889 list_move(&dc->list, &uncached_devices); 910 890 891 + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); 892 + 911 893 mutex_unlock(&bch_register_lock); 912 894 913 895 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); ··· 922 900 { 923 901 lockdep_assert_held(&bch_register_lock); 924 902 925 - if (atomic_read(&dc->disk.closing)) 903 + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) 926 904 return; 927 905 928 - if 
(atomic_xchg(&dc->disk.detaching, 1)) 906 + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) 929 907 return; 930 908 931 909 /* ··· 1052 1030 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1053 1031 1054 1032 cancel_delayed_work_sync(&dc->writeback_rate_update); 1033 + kthread_stop(dc->writeback_thread); 1055 1034 1056 1035 mutex_lock(&bch_register_lock); 1057 1036 ··· 1081 1058 struct bcache_device *d = &dc->disk; 1082 1059 1083 1060 mutex_lock(&bch_register_lock); 1084 - d->flush_done = 1; 1085 - 1086 - if (d->c) 1087 - bcache_device_unlink(d); 1088 - 1061 + bcache_device_unlink(d); 1089 1062 mutex_unlock(&bch_register_lock); 1090 1063 1091 1064 bch_cache_accounting_destroy(&dc->accounting); ··· 1107 1088 spin_lock_init(&dc->io_lock); 1108 1089 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1109 1090 1110 - dc->sequential_merge = true; 1111 1091 dc->sequential_cutoff = 4 << 20; 1112 1092 1113 1093 for (io = dc->io; io < dc->io + RECENT_IO; io++) { ··· 1278 1260 { 1279 1261 va_list args; 1280 1262 1281 - if (test_bit(CACHE_SET_STOPPING, &c->flags)) 1263 + if (c->on_error != ON_ERROR_PANIC && 1264 + test_bit(CACHE_SET_STOPPING, &c->flags)) 1282 1265 return false; 1283 1266 1284 1267 /* XXX: we can be called from atomic context ··· 1293 1274 va_end(args); 1294 1275 1295 1276 printk(", disabling caching\n"); 1277 + 1278 + if (c->on_error == ON_ERROR_PANIC) 1279 + panic("panic forced after error\n"); 1296 1280 1297 1281 bch_cache_set_unregister(c); 1298 1282 return true; ··· 1360 1338 1361 1339 kobject_put(&c->internal); 1362 1340 kobject_del(&c->kobj); 1341 + 1342 + if (c->gc_thread) 1343 + kthread_stop(c->gc_thread); 1363 1344 1364 1345 if (!IS_ERR_OR_NULL(c->root)) 1365 1346 list_add(&c->root->list, &c->btree_cache); ··· 1458 1433 1459 1434 c->sort_crit_factor = int_sqrt(c->btree_pages); 1460 1435 1461 - mutex_init(&c->bucket_lock); 1462 - mutex_init(&c->sort_lock); 1463 - spin_lock_init(&c->sort_time_lock); 1464 1436 
closure_init_unlocked(&c->sb_write); 1437 + mutex_init(&c->bucket_lock); 1438 + init_waitqueue_head(&c->try_wait); 1439 + init_waitqueue_head(&c->bucket_wait); 1465 1440 closure_init_unlocked(&c->uuid_write); 1466 - spin_lock_init(&c->btree_read_time_lock); 1441 + mutex_init(&c->sort_lock); 1442 + 1443 + spin_lock_init(&c->sort_time.lock); 1444 + spin_lock_init(&c->btree_gc_time.lock); 1445 + spin_lock_init(&c->btree_split_time.lock); 1446 + spin_lock_init(&c->btree_read_time.lock); 1447 + spin_lock_init(&c->try_harder_time.lock); 1448 + 1467 1449 bch_moving_init_cache_set(c); 1468 1450 1469 1451 INIT_LIST_HEAD(&c->list); ··· 1515 1483 const char *err = "cannot allocate memory"; 1516 1484 struct cached_dev *dc, *t; 1517 1485 struct cache *ca; 1486 + struct closure cl; 1518 1487 unsigned i; 1519 1488 1520 - struct btree_op op; 1521 - bch_btree_op_init_stack(&op); 1522 - op.lock = SHRT_MAX; 1489 + closure_init_stack(&cl); 1523 1490 1524 1491 for_each_cache(ca, c, i) 1525 1492 c->nbuckets += ca->sb.nbuckets; ··· 1529 1498 struct jset *j; 1530 1499 1531 1500 err = "cannot allocate memory for journal"; 1532 - if (bch_journal_read(c, &journal, &op)) 1501 + if (bch_journal_read(c, &journal)) 1533 1502 goto err; 1534 1503 1535 1504 pr_debug("btree_journal_read() done"); ··· 1553 1522 k = &j->btree_root; 1554 1523 1555 1524 err = "bad btree root"; 1556 - if (__bch_ptr_invalid(c, j->btree_level + 1, k)) 1525 + if (bch_btree_ptr_invalid(c, k)) 1557 1526 goto err; 1558 1527 1559 1528 err = "error reading btree root"; 1560 - c->root = bch_btree_node_get(c, k, j->btree_level, &op); 1529 + c->root = bch_btree_node_get(c, k, j->btree_level, true); 1561 1530 if (IS_ERR_OR_NULL(c->root)) 1562 1531 goto err; 1563 1532 1564 1533 list_del_init(&c->root->list); 1565 1534 rw_unlock(true, c->root); 1566 1535 1567 - err = uuid_read(c, j, &op.cl); 1536 + err = uuid_read(c, j, &cl); 1568 1537 if (err) 1569 1538 goto err; 1570 1539 1571 1540 err = "error in recovery"; 1572 - if 
(bch_btree_check(c, &op)) 1541 + if (bch_btree_check(c)) 1573 1542 goto err; 1574 1543 1575 1544 bch_journal_mark(c, &journal); ··· 1601 1570 if (j->version < BCACHE_JSET_VERSION_UUID) 1602 1571 __uuid_write(c); 1603 1572 1604 - bch_journal_replay(c, &journal, &op); 1573 + bch_journal_replay(c, &journal); 1605 1574 } else { 1606 1575 pr_notice("invalidating existing data"); 1607 - /* Don't want invalidate_buckets() to queue a gc yet */ 1608 - closure_lock(&c->gc, NULL); 1609 1576 1610 1577 for_each_cache(ca, c, i) { 1611 1578 unsigned j; ··· 1629 1600 1630 1601 err = "cannot allocate new UUID bucket"; 1631 1602 if (__uuid_write(c)) 1632 - goto err_unlock_gc; 1603 + goto err; 1633 1604 1634 1605 err = "cannot allocate new btree root"; 1635 - c->root = bch_btree_node_alloc(c, 0, &op.cl); 1606 + c->root = bch_btree_node_alloc(c, 0, true); 1636 1607 if (IS_ERR_OR_NULL(c->root)) 1637 - goto err_unlock_gc; 1608 + goto err; 1638 1609 1639 1610 bkey_copy_key(&c->root->key, &MAX_KEY); 1640 - bch_btree_node_write(c->root, &op.cl); 1611 + bch_btree_node_write(c->root, &cl); 1641 1612 1642 1613 bch_btree_set_root(c->root); 1643 1614 rw_unlock(true, c->root); ··· 1650 1621 SET_CACHE_SYNC(&c->sb, true); 1651 1622 1652 1623 bch_journal_next(&c->journal); 1653 - bch_journal_meta(c, &op.cl); 1654 - 1655 - /* Unlock */ 1656 - closure_set_stopped(&c->gc.cl); 1657 - closure_put(&c->gc.cl); 1624 + bch_journal_meta(c, &cl); 1658 1625 } 1659 1626 1660 - closure_sync(&op.cl); 1627 + err = "error starting gc thread"; 1628 + if (bch_gc_thread_start(c)) 1629 + goto err; 1630 + 1631 + closure_sync(&cl); 1661 1632 c->sb.last_mount = get_seconds(); 1662 1633 bcache_write_super(c); 1663 1634 ··· 1667 1638 flash_devs_run(c); 1668 1639 1669 1640 return; 1670 - err_unlock_gc: 1671 - closure_set_stopped(&c->gc.cl); 1672 - closure_put(&c->gc.cl); 1673 1641 err: 1674 - closure_sync(&op.cl); 1642 + closure_sync(&cl); 1675 1643 /* XXX: test this, it's broken */ 1676 - bch_cache_set_error(c, err); 1644 + 
bch_cache_set_error(c, "%s", err); 1677 1645 } 1678 1646 1679 1647 static bool can_attach_cache(struct cache *ca, struct cache_set *c) ··· 1751 1725 if (ca->set) 1752 1726 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1753 1727 1754 - bch_cache_allocator_exit(ca); 1755 - 1756 1728 bio_split_pool_free(&ca->bio_split_hook); 1757 1729 1758 1730 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); ··· 1781 1757 1782 1758 __module_get(THIS_MODULE); 1783 1759 kobject_init(&ca->kobj, &bch_cache_ktype); 1784 - 1785 - INIT_LIST_HEAD(&ca->discards); 1786 1760 1787 1761 bio_init(&ca->journal.bio); 1788 1762 ca->journal.bio.bi_max_vecs = 8; ··· 2028 2006 static void bcache_exit(void) 2029 2007 { 2030 2008 bch_debug_exit(); 2031 - bch_writeback_exit(); 2032 2009 bch_request_exit(); 2033 2010 bch_btree_exit(); 2034 2011 if (bcache_kobj) ··· 2060 2039 sysfs_create_files(bcache_kobj, files) || 2061 2040 bch_btree_init() || 2062 2041 bch_request_init() || 2063 - bch_writeback_init() || 2064 2042 bch_debug_init(bcache_kobj)) 2065 2043 goto err; 2066 2044
+33 -9
drivers/md/bcache/sysfs.c
··· 21 21 NULL 22 22 }; 23 23 24 + static const char * const error_actions[] = { 25 + "unregister", 26 + "panic", 27 + NULL 28 + }; 29 + 24 30 write_attribute(attach); 25 31 write_attribute(detach); 26 32 write_attribute(unregister); ··· 72 66 rw_attribute(congested_write_threshold_us); 73 67 74 68 rw_attribute(sequential_cutoff); 75 - rw_attribute(sequential_merge); 76 69 rw_attribute(data_csum); 77 70 rw_attribute(cache_mode); 78 71 rw_attribute(writeback_metadata); ··· 95 90 rw_attribute(running); 96 91 rw_attribute(label); 97 92 rw_attribute(readahead); 93 + rw_attribute(errors); 98 94 rw_attribute(io_error_limit); 99 95 rw_attribute(io_error_halflife); 100 96 rw_attribute(verify); 97 + rw_attribute(bypass_torture_test); 101 98 rw_attribute(key_merging_disabled); 102 99 rw_attribute(gc_always_rewrite); 100 + rw_attribute(expensive_debug_checks); 103 101 rw_attribute(freelist_percent); 104 102 rw_attribute(cache_replacement_policy); 105 103 rw_attribute(btree_shrinker_disabled); ··· 124 116 125 117 sysfs_printf(data_csum, "%i", dc->disk.data_csum); 126 118 var_printf(verify, "%i"); 119 + var_printf(bypass_torture_test, "%i"); 127 120 var_printf(writeback_metadata, "%i"); 128 121 var_printf(writeback_running, "%i"); 129 122 var_print(writeback_delay); ··· 159 150 sysfs_hprint(dirty_data, 160 151 bcache_dev_sectors_dirty(&dc->disk) << 9); 161 152 162 - sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); 153 + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); 163 154 var_printf(partial_stripes_expensive, "%u"); 164 155 165 - var_printf(sequential_merge, "%i"); 166 156 var_hprint(sequential_cutoff); 167 157 var_hprint(readahead); 168 158 ··· 193 185 194 186 sysfs_strtoul(data_csum, dc->disk.data_csum); 195 187 d_strtoul(verify); 188 + d_strtoul(bypass_torture_test); 196 189 d_strtoul(writeback_metadata); 197 190 d_strtoul(writeback_running); 198 191 d_strtoul(writeback_delay); ··· 208 199 dc->writeback_rate_p_term_inverse, 1, INT_MAX); 209 200 
d_strtoul(writeback_rate_d_smooth); 210 201 211 - d_strtoul(sequential_merge); 212 202 d_strtoi_h(sequential_cutoff); 213 203 d_strtoi_h(readahead); 214 204 ··· 319 311 &sysfs_stripe_size, 320 312 &sysfs_partial_stripes_expensive, 321 313 &sysfs_sequential_cutoff, 322 - &sysfs_sequential_merge, 323 314 &sysfs_clear_stats, 324 315 &sysfs_running, 325 316 &sysfs_state, ··· 326 319 &sysfs_readahead, 327 320 #ifdef CONFIG_BCACHE_DEBUG 328 321 &sysfs_verify, 322 + &sysfs_bypass_torture_test, 329 323 #endif 330 324 NULL 331 325 }; ··· 374 366 } 375 367 376 368 if (attr == &sysfs_unregister) { 377 - atomic_set(&d->detaching, 1); 369 + set_bit(BCACHE_DEV_DETACHING, &d->flags); 378 370 bcache_device_stop(d); 379 371 } 380 372 ··· 489 481 490 482 sysfs_print(btree_used_percent, btree_used(c)); 491 483 sysfs_print(btree_nodes, c->gc_stats.nodes); 492 - sysfs_hprint(dirty_data, c->gc_stats.dirty); 493 484 sysfs_hprint(average_key_size, average_key_size(c)); 494 485 495 486 sysfs_print(cache_read_races, ··· 498 491 atomic_long_read(&c->writeback_keys_done)); 499 492 sysfs_print(writeback_keys_failed, 500 493 atomic_long_read(&c->writeback_keys_failed)); 494 + 495 + if (attr == &sysfs_errors) 496 + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, 497 + c->on_error); 501 498 502 499 /* See count_io_errors for why 88 */ 503 500 sysfs_print(io_error_halflife, c->error_decay * 88); ··· 517 506 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); 518 507 sysfs_printf(verify, "%i", c->verify); 519 508 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); 509 + sysfs_printf(expensive_debug_checks, 510 + "%i", c->expensive_debug_checks); 520 511 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); 521 512 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); 522 513 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ··· 568 555 } 569 556 570 557 if (attr == &sysfs_trigger_gc) 571 - bch_queue_gc(c); 558 + wake_up_gc(c); 572 
559 573 560 if (attr == &sysfs_prune_cache) { 574 561 struct shrink_control sc; ··· 582 569 sysfs_strtoul(congested_write_threshold_us, 583 570 c->congested_write_threshold_us); 584 571 572 + if (attr == &sysfs_errors) { 573 + ssize_t v = bch_read_string_list(buf, error_actions); 574 + 575 + if (v < 0) 576 + return v; 577 + 578 + c->on_error = v; 579 + } 580 + 585 581 if (attr == &sysfs_io_error_limit) 586 582 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; 587 583 ··· 601 579 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); 602 580 sysfs_strtoul(verify, c->verify); 603 581 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); 582 + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); 604 583 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); 605 584 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); 606 585 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); ··· 641 618 &sysfs_cache_available_percent, 642 619 643 620 &sysfs_average_key_size, 644 - &sysfs_dirty_data, 645 621 622 + &sysfs_errors, 646 623 &sysfs_io_error_limit, 647 624 &sysfs_io_error_halflife, 648 625 &sysfs_congested, ··· 676 653 #ifdef CONFIG_BCACHE_DEBUG 677 654 &sysfs_verify, 678 655 &sysfs_key_merging_disabled, 656 + &sysfs_expensive_debug_checks, 679 657 #endif 680 658 &sysfs_gc_always_rewrite, 681 659 &sysfs_btree_shrinker_disabled,
-1
drivers/md/bcache/trace.c
··· 1 1 #include "bcache.h" 2 2 #include "btree.h" 3 - #include "request.h" 4 3 5 4 #include <linux/blktrace_api.h> 6 5 #include <linux/module.h>
+9 -3
drivers/md/bcache/util.c
··· 168 168 169 169 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) 170 170 { 171 - uint64_t now = local_clock(); 172 - uint64_t duration = time_after64(now, start_time) 171 + uint64_t now, duration, last; 172 + 173 + spin_lock(&stats->lock); 174 + 175 + now = local_clock(); 176 + duration = time_after64(now, start_time) 173 177 ? now - start_time : 0; 174 - uint64_t last = time_after64(now, stats->last) 178 + last = time_after64(now, stats->last) 175 179 ? now - stats->last : 0; 176 180 177 181 stats->max_duration = max(stats->max_duration, duration); ··· 192 188 } 193 189 194 190 stats->last = now ?: 1; 191 + 192 + spin_unlock(&stats->lock); 195 193 } 196 194 197 195 /**
+3 -12
drivers/md/bcache/util.h
··· 15 15 16 16 struct closure; 17 17 18 - #ifdef CONFIG_BCACHE_EDEBUG 18 + #ifdef CONFIG_BCACHE_DEBUG 19 19 20 20 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 21 21 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 22 22 23 - #else /* EDEBUG */ 23 + #else /* DEBUG */ 24 24 25 25 #define atomic_dec_bug(v) atomic_dec(v) 26 26 #define atomic_inc_bug(v, i) atomic_inc(v) 27 27 28 28 #endif 29 - 30 - #define BITMASK(name, type, field, offset, size) \ 31 - static inline uint64_t name(const type *k) \ 32 - { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ 33 - \ 34 - static inline void SET_##name(type *k, uint64_t v) \ 35 - { \ 36 - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ 37 - k->field |= v << offset; \ 38 - } 39 29 40 30 #define DECLARE_HEAP(type, name) \ 41 31 struct { \ ··· 378 388 ssize_t bch_read_string_list(const char *buf, const char * const list[]); 379 389 380 390 struct time_stats { 391 + spinlock_t lock; 381 392 /* 382 393 * all fields are in nanoseconds, averages are ewmas stored left shifted 383 394 * by 8
+222 -231
drivers/md/bcache/writeback.c
··· 11 11 #include "debug.h" 12 12 #include "writeback.h" 13 13 14 + #include <linux/delay.h> 15 + #include <linux/freezer.h> 16 + #include <linux/kthread.h> 14 17 #include <trace/events/bcache.h> 15 - 16 - static struct workqueue_struct *dirty_wq; 17 - 18 - static void read_dirty(struct closure *); 19 - 20 - struct dirty_io { 21 - struct closure cl; 22 - struct cached_dev *dc; 23 - struct bio bio; 24 - }; 25 18 26 19 /* Rate limiting */ 27 20 ··· 65 72 dc->writeback_rate_derivative = derivative; 66 73 dc->writeback_rate_change = change; 67 74 dc->writeback_rate_target = target; 68 - 69 - schedule_delayed_work(&dc->writeback_rate_update, 70 - dc->writeback_rate_update_seconds * HZ); 71 75 } 72 76 73 77 static void update_writeback_rate(struct work_struct *work) ··· 80 90 __update_writeback_rate(dc); 81 91 82 92 up_read(&dc->writeback_lock); 93 + 94 + schedule_delayed_work(&dc->writeback_rate_update, 95 + dc->writeback_rate_update_seconds * HZ); 83 96 } 84 97 85 98 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 86 99 { 87 100 uint64_t ret; 88 101 89 - if (atomic_read(&dc->disk.detaching) || 102 + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || 90 103 !dc->writeback_percent) 91 104 return 0; 92 105 ··· 98 105 return min_t(uint64_t, ret, HZ); 99 106 } 100 107 101 - /* Background writeback */ 102 - 103 - static bool dirty_pred(struct keybuf *buf, struct bkey *k) 104 - { 105 - return KEY_DIRTY(k); 106 - } 107 - 108 - static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) 109 - { 110 - uint64_t stripe; 111 - unsigned nr_sectors = KEY_SIZE(k); 112 - struct cached_dev *dc = container_of(buf, struct cached_dev, 113 - writeback_keys); 114 - unsigned stripe_size = 1 << dc->disk.stripe_size_bits; 115 - 116 - if (!KEY_DIRTY(k)) 117 - return false; 118 - 119 - stripe = KEY_START(k) >> dc->disk.stripe_size_bits; 120 - while (1) { 121 - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != 122 - stripe_size) 123 - return false; 
124 - 125 - if (nr_sectors <= stripe_size) 126 - return true; 127 - 128 - nr_sectors -= stripe_size; 129 - stripe++; 130 - } 131 - } 108 + struct dirty_io { 109 + struct closure cl; 110 + struct cached_dev *dc; 111 + struct bio bio; 112 + }; 132 113 133 114 static void dirty_init(struct keybuf_key *w) 134 115 { ··· 119 152 bio->bi_io_vec = bio->bi_inline_vecs; 120 153 bch_bio_map(bio, NULL); 121 154 } 122 - 123 - static void refill_dirty(struct closure *cl) 124 - { 125 - struct cached_dev *dc = container_of(cl, struct cached_dev, 126 - writeback.cl); 127 - struct keybuf *buf = &dc->writeback_keys; 128 - bool searched_from_start = false; 129 - struct bkey end = MAX_KEY; 130 - SET_KEY_INODE(&end, dc->disk.id); 131 - 132 - if (!atomic_read(&dc->disk.detaching) && 133 - !dc->writeback_running) 134 - closure_return(cl); 135 - 136 - down_write(&dc->writeback_lock); 137 - 138 - if (!atomic_read(&dc->has_dirty)) { 139 - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); 140 - bch_write_bdev_super(dc, NULL); 141 - 142 - up_write(&dc->writeback_lock); 143 - closure_return(cl); 144 - } 145 - 146 - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 147 - buf->last_scanned = KEY(dc->disk.id, 0, 0); 148 - searched_from_start = true; 149 - } 150 - 151 - if (dc->partial_stripes_expensive) { 152 - uint64_t i; 153 - 154 - for (i = 0; i < dc->disk.nr_stripes; i++) 155 - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == 156 - 1 << dc->disk.stripe_size_bits) 157 - goto full_stripes; 158 - 159 - goto normal_refill; 160 - full_stripes: 161 - bch_refill_keybuf(dc->disk.c, buf, &end, 162 - dirty_full_stripe_pred); 163 - } else { 164 - normal_refill: 165 - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 166 - } 167 - 168 - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 169 - /* Searched the entire btree - delay awhile */ 170 - 171 - if (RB_EMPTY_ROOT(&buf->keys)) { 172 - atomic_set(&dc->has_dirty, 0); 173 - cached_dev_put(dc); 174 - } 175 - 176 - if 
(!atomic_read(&dc->disk.detaching)) 177 - closure_delay(&dc->writeback, dc->writeback_delay * HZ); 178 - } 179 - 180 - up_write(&dc->writeback_lock); 181 - 182 - bch_ratelimit_reset(&dc->writeback_rate); 183 - 184 - /* Punt to workqueue only so we don't recurse and blow the stack */ 185 - continue_at(cl, read_dirty, dirty_wq); 186 - } 187 - 188 - void bch_writeback_queue(struct cached_dev *dc) 189 - { 190 - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { 191 - if (!atomic_read(&dc->disk.detaching)) 192 - closure_delay(&dc->writeback, dc->writeback_delay * HZ); 193 - 194 - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); 195 - } 196 - } 197 - 198 - void bch_writeback_add(struct cached_dev *dc) 199 - { 200 - if (!atomic_read(&dc->has_dirty) && 201 - !atomic_xchg(&dc->has_dirty, 1)) { 202 - atomic_inc(&dc->count); 203 - 204 - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { 205 - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); 206 - /* XXX: should do this synchronously */ 207 - bch_write_bdev_super(dc, NULL); 208 - } 209 - 210 - bch_writeback_queue(dc); 211 - 212 - if (dc->writeback_percent) 213 - schedule_delayed_work(&dc->writeback_rate_update, 214 - dc->writeback_rate_update_seconds * HZ); 215 - } 216 - } 217 - 218 - void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, 219 - uint64_t offset, int nr_sectors) 220 - { 221 - struct bcache_device *d = c->devices[inode]; 222 - unsigned stripe_size, stripe_offset; 223 - uint64_t stripe; 224 - 225 - if (!d) 226 - return; 227 - 228 - stripe_size = 1 << d->stripe_size_bits; 229 - stripe = offset >> d->stripe_size_bits; 230 - stripe_offset = offset & (stripe_size - 1); 231 - 232 - while (nr_sectors) { 233 - int s = min_t(unsigned, abs(nr_sectors), 234 - stripe_size - stripe_offset); 235 - 236 - if (nr_sectors < 0) 237 - s = -s; 238 - 239 - atomic_add(s, d->stripe_sectors_dirty + stripe); 240 - nr_sectors -= s; 241 - stripe_offset = 0; 242 - stripe++; 243 - } 244 - } 245 - 246 - /* Background 
writeback - IO loop */ 247 155 248 156 static void dirty_io_destructor(struct closure *cl) 249 157 { ··· 139 297 140 298 /* This is kind of a dumb way of signalling errors. */ 141 299 if (KEY_DIRTY(&w->key)) { 300 + int ret; 142 301 unsigned i; 143 - struct btree_op op; 144 - bch_btree_op_init_stack(&op); 302 + struct keylist keys; 145 303 146 - op.type = BTREE_REPLACE; 147 - bkey_copy(&op.replace, &w->key); 304 + bch_keylist_init(&keys); 148 305 149 - SET_KEY_DIRTY(&w->key, false); 150 - bch_keylist_add(&op.keys, &w->key); 306 + bkey_copy(keys.top, &w->key); 307 + SET_KEY_DIRTY(keys.top, false); 308 + bch_keylist_push(&keys); 151 309 152 310 for (i = 0; i < KEY_PTRS(&w->key); i++) 153 311 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 154 312 155 - bch_btree_insert(&op, dc->disk.c); 156 - closure_sync(&op.cl); 313 + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); 157 314 158 - if (op.insert_collision) 315 + if (ret) 159 316 trace_bcache_writeback_collision(&w->key); 160 317 161 - atomic_long_inc(op.insert_collision 318 + atomic_long_inc(ret 162 319 ? &dc->disk.c->writeback_keys_failed 163 320 : &dc->disk.c->writeback_keys_done); 164 321 } ··· 215 374 continue_at(cl, write_dirty, system_wq); 216 375 } 217 376 218 - static void read_dirty(struct closure *cl) 377 + static void read_dirty(struct cached_dev *dc) 219 378 { 220 - struct cached_dev *dc = container_of(cl, struct cached_dev, 221 - writeback.cl); 222 - unsigned delay = writeback_delay(dc, 0); 379 + unsigned delay = 0; 223 380 struct keybuf_key *w; 224 381 struct dirty_io *io; 382 + struct closure cl; 383 + 384 + closure_init_stack(&cl); 225 385 226 386 /* 227 387 * XXX: if we error, background writeback just spins. Should use some 228 388 * mempools. 
229 389 */ 230 390 231 - while (1) { 391 + while (!kthread_should_stop()) { 392 + try_to_freeze(); 393 + 232 394 w = bch_keybuf_next(&dc->writeback_keys); 233 395 if (!w) 234 396 break; 235 397 236 398 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 237 399 238 - if (delay > 0 && 239 - (KEY_START(&w->key) != dc->last_read || 240 - jiffies_to_msecs(delay) > 50)) 241 - delay = schedule_timeout_uninterruptible(delay); 400 + if (KEY_START(&w->key) != dc->last_read || 401 + jiffies_to_msecs(delay) > 50) 402 + while (!kthread_should_stop() && delay) 403 + delay = schedule_timeout_interruptible(delay); 242 404 243 405 dc->last_read = KEY_OFFSET(&w->key); 244 406 ··· 267 423 trace_bcache_writeback(&w->key); 268 424 269 425 down(&dc->in_flight); 270 - closure_call(&io->cl, read_dirty_submit, NULL, cl); 426 + closure_call(&io->cl, read_dirty_submit, NULL, &cl); 271 427 272 428 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 273 429 } ··· 283 439 * Wait for outstanding writeback IOs to finish (and keybuf slots to be 284 440 * freed) before refilling again 285 441 */ 286 - continue_at(cl, refill_dirty, dirty_wq); 442 + closure_sync(&cl); 287 443 } 288 444 289 - /* Init */ 445 + /* Scan for dirty data */ 290 446 291 - static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, 292 - struct cached_dev *dc) 447 + void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, 448 + uint64_t offset, int nr_sectors) 293 449 { 294 - struct bkey *k; 295 - struct btree_iter iter; 450 + struct bcache_device *d = c->devices[inode]; 451 + unsigned stripe_offset, stripe, sectors_dirty; 296 452 297 - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); 298 - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) 299 - if (!b->level) { 300 - if (KEY_INODE(k) > dc->disk.id) 301 - break; 453 + if (!d) 454 + return; 302 455 303 - if (KEY_DIRTY(k)) 304 - bcache_dev_sectors_dirty_add(b->c, dc->disk.id, 305 - KEY_START(k), 306 - KEY_SIZE(k)); 307 - } else { 308 - 
btree(sectors_dirty_init, k, b, op, dc); 309 - if (KEY_INODE(k) > dc->disk.id) 310 - break; 456 + stripe = offset_to_stripe(d, offset); 457 + stripe_offset = offset & (d->stripe_size - 1); 311 458 312 - cond_resched(); 459 + while (nr_sectors) { 460 + int s = min_t(unsigned, abs(nr_sectors), 461 + d->stripe_size - stripe_offset); 462 + 463 + if (nr_sectors < 0) 464 + s = -s; 465 + 466 + if (stripe >= d->nr_stripes) 467 + return; 468 + 469 + sectors_dirty = atomic_add_return(s, 470 + d->stripe_sectors_dirty + stripe); 471 + if (sectors_dirty == d->stripe_size) 472 + set_bit(stripe, d->full_dirty_stripes); 473 + else 474 + clear_bit(stripe, d->full_dirty_stripes); 475 + 476 + nr_sectors -= s; 477 + stripe_offset = 0; 478 + stripe++; 479 + } 480 + } 481 + 482 + static bool dirty_pred(struct keybuf *buf, struct bkey *k) 483 + { 484 + return KEY_DIRTY(k); 485 + } 486 + 487 + static void refill_full_stripes(struct cached_dev *dc) 488 + { 489 + struct keybuf *buf = &dc->writeback_keys; 490 + unsigned start_stripe, stripe, next_stripe; 491 + bool wrapped = false; 492 + 493 + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); 494 + 495 + if (stripe >= dc->disk.nr_stripes) 496 + stripe = 0; 497 + 498 + start_stripe = stripe; 499 + 500 + while (1) { 501 + stripe = find_next_bit(dc->disk.full_dirty_stripes, 502 + dc->disk.nr_stripes, stripe); 503 + 504 + if (stripe == dc->disk.nr_stripes) 505 + goto next; 506 + 507 + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, 508 + dc->disk.nr_stripes, stripe); 509 + 510 + buf->last_scanned = KEY(dc->disk.id, 511 + stripe * dc->disk.stripe_size, 0); 512 + 513 + bch_refill_keybuf(dc->disk.c, buf, 514 + &KEY(dc->disk.id, 515 + next_stripe * dc->disk.stripe_size, 0), 516 + dirty_pred); 517 + 518 + if (array_freelist_empty(&buf->freelist)) 519 + return; 520 + 521 + stripe = next_stripe; 522 + next: 523 + if (wrapped && stripe > start_stripe) 524 + return; 525 + 526 + if (stripe == dc->disk.nr_stripes) { 527 + 
stripe = 0; 528 + wrapped = true; 313 529 } 530 + } 531 + } 532 + 533 + static bool refill_dirty(struct cached_dev *dc) 534 + { 535 + struct keybuf *buf = &dc->writeback_keys; 536 + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); 537 + bool searched_from_start = false; 538 + 539 + if (dc->partial_stripes_expensive) { 540 + refill_full_stripes(dc); 541 + if (array_freelist_empty(&buf->freelist)) 542 + return false; 543 + } 544 + 545 + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 546 + buf->last_scanned = KEY(dc->disk.id, 0, 0); 547 + searched_from_start = true; 548 + } 549 + 550 + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 551 + 552 + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; 553 + } 554 + 555 + static int bch_writeback_thread(void *arg) 556 + { 557 + struct cached_dev *dc = arg; 558 + bool searched_full_index; 559 + 560 + while (!kthread_should_stop()) { 561 + down_write(&dc->writeback_lock); 562 + if (!atomic_read(&dc->has_dirty) || 563 + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && 564 + !dc->writeback_running)) { 565 + up_write(&dc->writeback_lock); 566 + set_current_state(TASK_INTERRUPTIBLE); 567 + 568 + if (kthread_should_stop()) 569 + return 0; 570 + 571 + try_to_freeze(); 572 + schedule(); 573 + continue; 574 + } 575 + 576 + searched_full_index = refill_dirty(dc); 577 + 578 + if (searched_full_index && 579 + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { 580 + atomic_set(&dc->has_dirty, 0); 581 + cached_dev_put(dc); 582 + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); 583 + bch_write_bdev_super(dc, NULL); 584 + } 585 + 586 + up_write(&dc->writeback_lock); 587 + 588 + bch_ratelimit_reset(&dc->writeback_rate); 589 + read_dirty(dc); 590 + 591 + if (searched_full_index) { 592 + unsigned delay = dc->writeback_delay * HZ; 593 + 594 + while (delay && 595 + !kthread_should_stop() && 596 + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) 597 + delay = schedule_timeout_interruptible(delay); 598 + } 599 + } 314 600 315 
601 return 0; 316 602 } 317 603 318 - void bch_sectors_dirty_init(struct cached_dev *dc) 319 - { 320 - struct btree_op op; 604 + /* Init */ 321 605 322 - bch_btree_op_init_stack(&op); 323 - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); 606 + struct sectors_dirty_init { 607 + struct btree_op op; 608 + unsigned inode; 609 + }; 610 + 611 + static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, 612 + struct bkey *k) 613 + { 614 + struct sectors_dirty_init *op = container_of(_op, 615 + struct sectors_dirty_init, op); 616 + if (KEY_INODE(k) > op->inode) 617 + return MAP_DONE; 618 + 619 + if (KEY_DIRTY(k)) 620 + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 621 + KEY_START(k), KEY_SIZE(k)); 622 + 623 + return MAP_CONTINUE; 324 624 } 325 625 326 - void bch_cached_dev_writeback_init(struct cached_dev *dc) 626 + void bch_sectors_dirty_init(struct cached_dev *dc) 627 + { 628 + struct sectors_dirty_init op; 629 + 630 + bch_btree_op_init(&op.op, -1); 631 + op.inode = dc->disk.id; 632 + 633 + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), 634 + sectors_dirty_init_fn, 0); 635 + } 636 + 637 + int bch_cached_dev_writeback_init(struct cached_dev *dc) 327 638 { 328 639 sema_init(&dc->in_flight, 64); 329 - closure_init_unlocked(&dc->writeback); 330 640 init_rwsem(&dc->writeback_lock); 331 - 332 641 bch_keybuf_init(&dc->writeback_keys); 333 642 334 643 dc->writeback_metadata = true; ··· 495 498 dc->writeback_rate_p_term_inverse = 64; 496 499 dc->writeback_rate_d_smooth = 8; 497 500 501 + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, 502 + "bcache_writeback"); 503 + if (IS_ERR(dc->writeback_thread)) 504 + return PTR_ERR(dc->writeback_thread); 505 + 506 + set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); 507 + 498 508 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 499 509 schedule_delayed_work(&dc->writeback_rate_update, 500 510 dc->writeback_rate_update_seconds * HZ); 501 - } 502 - 503 - void 
bch_writeback_exit(void) 504 - { 505 - if (dirty_wq) 506 - destroy_workqueue(dirty_wq); 507 - } 508 - 509 - int __init bch_writeback_init(void) 510 - { 511 - dirty_wq = create_workqueue("bcache_writeback"); 512 - if (!dirty_wq) 513 - return -ENOMEM; 514 511 515 512 return 0; 516 513 }
+36 -10
drivers/md/bcache/writeback.h
··· 14 14 return ret; 15 15 } 16 16 17 - static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, 17 + static inline unsigned offset_to_stripe(struct bcache_device *d, 18 + uint64_t offset) 19 + { 20 + do_div(offset, d->stripe_size); 21 + return offset; 22 + } 23 + 24 + static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, 18 25 uint64_t offset, 19 26 unsigned nr_sectors) 20 27 { 21 - uint64_t stripe = offset >> d->stripe_size_bits; 28 + unsigned stripe = offset_to_stripe(&dc->disk, offset); 22 29 23 30 while (1) { 24 - if (atomic_read(d->stripe_sectors_dirty + stripe)) 31 + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) 25 32 return true; 26 33 27 - if (nr_sectors <= 1 << d->stripe_size_bits) 34 + if (nr_sectors <= dc->disk.stripe_size) 28 35 return false; 29 36 30 - nr_sectors -= 1 << d->stripe_size_bits; 37 + nr_sectors -= dc->disk.stripe_size; 31 38 stripe++; 32 39 } 33 40 } ··· 45 38 unsigned in_use = dc->disk.c->gc_stats.in_use; 46 39 47 40 if (cache_mode != CACHE_MODE_WRITEBACK || 48 - atomic_read(&dc->disk.detaching) || 41 + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || 49 42 in_use > CUTOFF_WRITEBACK_SYNC) 50 43 return false; 51 44 52 45 if (dc->partial_stripes_expensive && 53 - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, 46 + bcache_dev_stripe_dirty(dc, bio->bi_sector, 54 47 bio_sectors(bio))) 55 48 return true; 56 49 ··· 61 54 in_use <= CUTOFF_WRITEBACK; 62 55 } 63 56 57 + static inline void bch_writeback_queue(struct cached_dev *dc) 58 + { 59 + wake_up_process(dc->writeback_thread); 60 + } 61 + 62 + static inline void bch_writeback_add(struct cached_dev *dc) 63 + { 64 + if (!atomic_read(&dc->has_dirty) && 65 + !atomic_xchg(&dc->has_dirty, 1)) { 66 + atomic_inc(&dc->count); 67 + 68 + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { 69 + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); 70 + /* XXX: should do this synchronously */ 71 + bch_write_bdev_super(dc, NULL); 72 + } 73 + 74 + bch_writeback_queue(dc); 75 + } 76 + 
} 77 + 64 78 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); 65 - void bch_writeback_queue(struct cached_dev *); 66 - void bch_writeback_add(struct cached_dev *); 67 79 68 80 void bch_sectors_dirty_init(struct cached_dev *dc); 69 - void bch_cached_dev_writeback_init(struct cached_dev *); 81 + int bch_cached_dev_writeback_init(struct cached_dev *); 70 82 71 83 #endif
+37 -10
include/trace/events/bcache.h
··· 6 6 7 7 #include <linux/tracepoint.h> 8 8 9 - struct search; 10 - 11 9 DECLARE_EVENT_CLASS(bcache_request, 12 - TP_PROTO(struct search *s, struct bio *bio), 13 - TP_ARGS(s, bio), 10 + TP_PROTO(struct bcache_device *d, struct bio *bio), 11 + TP_ARGS(d, bio), 14 12 15 13 TP_STRUCT__entry( 16 14 __field(dev_t, dev ) ··· 22 24 23 25 TP_fast_assign( 24 26 __entry->dev = bio->bi_bdev->bd_dev; 25 - __entry->orig_major = s->d->disk->major; 26 - __entry->orig_minor = s->d->disk->first_minor; 27 + __entry->orig_major = d->disk->major; 28 + __entry->orig_minor = d->disk->first_minor; 27 29 __entry->sector = bio->bi_sector; 28 30 __entry->orig_sector = bio->bi_sector - 16; 29 31 __entry->nr_sector = bio->bi_size >> 9; ··· 77 79 /* request.c */ 78 80 79 81 DEFINE_EVENT(bcache_request, bcache_request_start, 80 - TP_PROTO(struct search *s, struct bio *bio), 81 - TP_ARGS(s, bio) 82 + TP_PROTO(struct bcache_device *d, struct bio *bio), 83 + TP_ARGS(d, bio) 82 84 ); 83 85 84 86 DEFINE_EVENT(bcache_request, bcache_request_end, 85 - TP_PROTO(struct search *s, struct bio *bio), 86 - TP_ARGS(s, bio) 87 + TP_PROTO(struct bcache_device *d, struct bio *bio), 88 + TP_ARGS(d, bio) 87 89 ); 88 90 89 91 DECLARE_EVENT_CLASS(bcache_bio, ··· 366 368 DEFINE_EVENT(btree_node, bcache_btree_set_root, 367 369 TP_PROTO(struct btree *b), 368 370 TP_ARGS(b) 371 + ); 372 + 373 + TRACE_EVENT(bcache_keyscan, 374 + TP_PROTO(unsigned nr_found, 375 + unsigned start_inode, uint64_t start_offset, 376 + unsigned end_inode, uint64_t end_offset), 377 + TP_ARGS(nr_found, 378 + start_inode, start_offset, 379 + end_inode, end_offset), 380 + 381 + TP_STRUCT__entry( 382 + __field(__u32, nr_found ) 383 + __field(__u32, start_inode ) 384 + __field(__u64, start_offset ) 385 + __field(__u32, end_inode ) 386 + __field(__u64, end_offset ) 387 + ), 388 + 389 + TP_fast_assign( 390 + __entry->nr_found = nr_found; 391 + __entry->start_inode = start_inode; 392 + __entry->start_offset = start_offset; 393 + __entry->end_inode = 
end_inode; 394 + __entry->end_offset = end_offset; 395 + ), 396 + 397 + TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, 398 + __entry->start_inode, __entry->start_offset, 399 + __entry->end_inode, __entry->end_offset) 369 400 ); 370 401 371 402 /* Allocator */
+373
include/uapi/linux/bcache.h
··· 1 + #ifndef _LINUX_BCACHE_H 2 + #define _LINUX_BCACHE_H 3 + 4 + /* 5 + * Bcache on disk data structures 6 + */ 7 + 8 + #include <asm/types.h> 9 + 10 + #define BITMASK(name, type, field, offset, size) \ 11 + static inline __u64 name(const type *k) \ 12 + { return (k->field >> offset) & ~(~0ULL << size); } \ 13 + \ 14 + static inline void SET_##name(type *k, __u64 v) \ 15 + { \ 16 + k->field &= ~(~(~0ULL << size) << offset); \ 17 + k->field |= (v & ~(~0ULL << size)) << offset; \ 18 + } 19 + 20 + /* Btree keys - all units are in sectors */ 21 + 22 + struct bkey { 23 + __u64 high; 24 + __u64 low; 25 + __u64 ptr[]; 26 + }; 27 + 28 + #define KEY_FIELD(name, field, offset, size) \ 29 + BITMASK(name, struct bkey, field, offset, size) 30 + 31 + #define PTR_FIELD(name, offset, size) \ 32 + static inline __u64 name(const struct bkey *k, unsigned i) \ 33 + { return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ 34 + \ 35 + static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ 36 + { \ 37 + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ 38 + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ 39 + } 40 + 41 + #define KEY_SIZE_BITS 16 42 + 43 + KEY_FIELD(KEY_PTRS, high, 60, 3) 44 + KEY_FIELD(HEADER_SIZE, high, 58, 2) 45 + KEY_FIELD(KEY_CSUM, high, 56, 2) 46 + KEY_FIELD(KEY_PINNED, high, 55, 1) 47 + KEY_FIELD(KEY_DIRTY, high, 36, 1) 48 + 49 + KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) 50 + KEY_FIELD(KEY_INODE, high, 0, 20) 51 + 52 + /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ 53 + 54 + static inline __u64 KEY_OFFSET(const struct bkey *k) 55 + { 56 + return k->low; 57 + } 58 + 59 + static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) 60 + { 61 + k->low = v; 62 + } 63 + 64 + /* 65 + * The high bit being set is a relic from when we used it to do binary 66 + * searches - it told you where a key started. It's not used anymore, 67 + * and can probably be safely dropped. 
68 + */ 69 + #define KEY(inode, offset, size) \ 70 + ((struct bkey) { \ 71 + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ 72 + .low = (offset) \ 73 + }) 74 + 75 + #define ZERO_KEY KEY(0, 0, 0) 76 + 77 + #define MAX_KEY_INODE (~(~0 << 20)) 78 + #define MAX_KEY_OFFSET (~0ULL >> 1) 79 + #define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) 80 + 81 + #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) 82 + #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) 83 + 84 + #define PTR_DEV_BITS 12 85 + 86 + PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) 87 + PTR_FIELD(PTR_OFFSET, 8, 43) 88 + PTR_FIELD(PTR_GEN, 0, 8) 89 + 90 + #define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) 91 + 92 + #define PTR(gen, offset, dev) \ 93 + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) 94 + 95 + /* Bkey utility code */ 96 + 97 + static inline unsigned long bkey_u64s(const struct bkey *k) 98 + { 99 + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); 100 + } 101 + 102 + static inline unsigned long bkey_bytes(const struct bkey *k) 103 + { 104 + return bkey_u64s(k) * sizeof(__u64); 105 + } 106 + 107 + #define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) 108 + 109 + static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) 110 + { 111 + SET_KEY_INODE(dest, KEY_INODE(src)); 112 + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); 113 + } 114 + 115 + static inline struct bkey *bkey_next(const struct bkey *k) 116 + { 117 + __u64 *d = (void *) k; 118 + return (struct bkey *) (d + bkey_u64s(k)); 119 + } 120 + 121 + static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) 122 + { 123 + __u64 *d = (void *) k; 124 + return (struct bkey *) (d + nr_keys); 125 + } 126 + /* Enough for a key with 6 pointers */ 127 + #define BKEY_PAD 8 128 + 129 + #define BKEY_PADDED(key) \ 130 + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } 131 + 132 + /* Superblock */ 133 + 134 + /* Version 0: Cache device 135 + * Version 1: Backing device 136 + * Version 
2: Seed pointer into btree node checksum 137 + * Version 3: Cache device with new UUID format 138 + * Version 4: Backing device with data offset 139 + */ 140 + #define BCACHE_SB_VERSION_CDEV 0 141 + #define BCACHE_SB_VERSION_BDEV 1 142 + #define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 143 + #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 144 + #define BCACHE_SB_MAX_VERSION 4 145 + 146 + #define SB_SECTOR 8 147 + #define SB_SIZE 4096 148 + #define SB_LABEL_SIZE 32 149 + #define SB_JOURNAL_BUCKETS 256U 150 + /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ 151 + #define MAX_CACHES_PER_SET 8 152 + 153 + #define BDEV_DATA_START_DEFAULT 16 /* sectors */ 154 + 155 + struct cache_sb { 156 + __u64 csum; 157 + __u64 offset; /* sector where this sb was written */ 158 + __u64 version; 159 + 160 + __u8 magic[16]; 161 + 162 + __u8 uuid[16]; 163 + union { 164 + __u8 set_uuid[16]; 165 + __u64 set_magic; 166 + }; 167 + __u8 label[SB_LABEL_SIZE]; 168 + 169 + __u64 flags; 170 + __u64 seq; 171 + __u64 pad[8]; 172 + 173 + union { 174 + struct { 175 + /* Cache devices */ 176 + __u64 nbuckets; /* device size */ 177 + 178 + __u16 block_size; /* sectors */ 179 + __u16 bucket_size; /* sectors */ 180 + 181 + __u16 nr_in_set; 182 + __u16 nr_this_dev; 183 + }; 184 + struct { 185 + /* Backing devices */ 186 + __u64 data_offset; 187 + 188 + /* 189 + * block_size from the cache device section is still used by 190 + * backing devices, so don't add anything here until we fix 191 + * things to not need it for backing devices anymore 192 + */ 193 + }; 194 + }; 195 + 196 + __u32 last_mount; /* time_t */ 197 + 198 + __u16 first_bucket; 199 + union { 200 + __u16 njournal_buckets; 201 + __u16 keys; 202 + }; 203 + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ 204 + }; 205 + 206 + static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) 207 + { 208 + return sb->version == BCACHE_SB_VERSION_BDEV 209 + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; 210 + } 211 + 212 + BITMASK(CACHE_SYNC, 
struct cache_sb, flags, 0, 1); 213 + BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); 214 + BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); 215 + #define CACHE_REPLACEMENT_LRU 0U 216 + #define CACHE_REPLACEMENT_FIFO 1U 217 + #define CACHE_REPLACEMENT_RANDOM 2U 218 + 219 + BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); 220 + #define CACHE_MODE_WRITETHROUGH 0U 221 + #define CACHE_MODE_WRITEBACK 1U 222 + #define CACHE_MODE_WRITEAROUND 2U 223 + #define CACHE_MODE_NONE 3U 224 + BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); 225 + #define BDEV_STATE_NONE 0U 226 + #define BDEV_STATE_CLEAN 1U 227 + #define BDEV_STATE_DIRTY 2U 228 + #define BDEV_STATE_STALE 3U 229 + 230 + /* 231 + * Magic numbers 232 + * 233 + * The various other data structures have their own magic numbers, which are 234 + * xored with the first part of the cache set's UUID 235 + */ 236 + 237 + #define JSET_MAGIC 0x245235c1a3625032ULL 238 + #define PSET_MAGIC 0x6750e15f87337f91ULL 239 + #define BSET_MAGIC 0x90135c78b99e07f5ULL 240 + 241 + static inline __u64 jset_magic(struct cache_sb *sb) 242 + { 243 + return sb->set_magic ^ JSET_MAGIC; 244 + } 245 + 246 + static inline __u64 pset_magic(struct cache_sb *sb) 247 + { 248 + return sb->set_magic ^ PSET_MAGIC; 249 + } 250 + 251 + static inline __u64 bset_magic(struct cache_sb *sb) 252 + { 253 + return sb->set_magic ^ BSET_MAGIC; 254 + } 255 + 256 + /* 257 + * Journal 258 + * 259 + * On disk format for a journal entry: 260 + * seq is monotonically increasing; every journal entry has its own unique 261 + * sequence number. 262 + * 263 + * last_seq is the oldest journal entry that still has keys the btree hasn't 264 + * flushed to disk yet. 265 + * 266 + * version is for on disk format changes. 
267 + */ 268 + 269 + #define BCACHE_JSET_VERSION_UUIDv1 1 270 + #define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ 271 + #define BCACHE_JSET_VERSION 1 272 + 273 + struct jset { 274 + __u64 csum; 275 + __u64 magic; 276 + __u64 seq; 277 + __u32 version; 278 + __u32 keys; 279 + 280 + __u64 last_seq; 281 + 282 + BKEY_PADDED(uuid_bucket); 283 + BKEY_PADDED(btree_root); 284 + __u16 btree_level; 285 + __u16 pad[3]; 286 + 287 + __u64 prio_bucket[MAX_CACHES_PER_SET]; 288 + 289 + union { 290 + struct bkey start[0]; 291 + __u64 d[0]; 292 + }; 293 + }; 294 + 295 + /* Bucket prios/gens */ 296 + 297 + struct prio_set { 298 + __u64 csum; 299 + __u64 magic; 300 + __u64 seq; 301 + __u32 version; 302 + __u32 pad; 303 + 304 + __u64 next_bucket; 305 + 306 + struct bucket_disk { 307 + __u16 prio; 308 + __u8 gen; 309 + } __attribute((packed)) data[]; 310 + }; 311 + 312 + /* UUIDS - per backing device/flash only volume metadata */ 313 + 314 + struct uuid_entry { 315 + union { 316 + struct { 317 + __u8 uuid[16]; 318 + __u8 label[32]; 319 + __u32 first_reg; 320 + __u32 last_reg; 321 + __u32 invalidated; 322 + 323 + __u32 flags; 324 + /* Size of flash only volumes */ 325 + __u64 sectors; 326 + }; 327 + 328 + __u8 pad[128]; 329 + }; 330 + }; 331 + 332 + BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); 333 + 334 + /* Btree nodes */ 335 + 336 + /* Version 1: Seed pointer into btree node checksum 337 + */ 338 + #define BCACHE_BSET_CSUM 1 339 + #define BCACHE_BSET_VERSION 1 340 + 341 + /* 342 + * Btree nodes 343 + * 344 + * On disk a btree node is a list/log of these; within each set the keys are 345 + * sorted 346 + */ 347 + struct bset { 348 + __u64 csum; 349 + __u64 magic; 350 + __u64 seq; 351 + __u32 version; 352 + __u32 keys; 353 + 354 + union { 355 + struct bkey start[0]; 356 + __u64 d[0]; 357 + }; 358 + }; 359 + 360 + /* OBSOLETE */ 361 + 362 + /* UUIDS - per backing device/flash only volume metadata */ 363 + 364 + struct uuid_entry_v0 { 365 + __u8 uuid[16]; 366 + 
__u8 label[32]; 367 + __u32 first_reg; 368 + __u32 last_reg; 369 + __u32 invalidated; 370 + __u32 pad; 371 + }; 372 + 373 + #endif /* _LINUX_BCACHE_H */