Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

- vdpa sim refactoring

- virtio mem: Big Block Mode support

- misc cleanups, fixes

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (61 commits)
vdpa: Use simpler version of ida allocation
vdpa: Add missing comment for virtqueue count
uapi: virtio_ids: add missing device type IDs from OASIS spec
uapi: virtio_ids.h: consistent indentions
vhost scsi: fix error return code in vhost_scsi_set_endpoint()
virtio_ring: Fix two use after free bugs
virtio_net: Fix error code in probe()
virtio_ring: Cut and paste bugs in vring_create_virtqueue_packed()
tools/virtio: add barrier for aarch64
tools/virtio: add krealloc_array
tools/virtio: include asm/bug.h
vdpa/mlx5: Use write memory barrier after updating CQ index
vdpa: split vdpasim to core and net modules
vdpa_sim: split vdpasim_virtqueue's iov field in out_iov and in_iov
vdpa_sim: make vdpasim->buffer size configurable
vdpa_sim: use kvmalloc to allocate vdpasim->buffer
vdpa_sim: set vringh notify callback
vdpa_sim: add set_config callback in vdpasim_dev_attr
vdpa_sim: add get_config callback in vdpasim_dev_attr
vdpa_sim: make 'config' generic and usable for any device type
...

+1843 -810
+1
drivers/net/virtio_net.c
··· 3072 3072 dev_err(&vdev->dev, 3073 3073 "device MTU appears to have changed it is now %d < %d", 3074 3074 mtu, dev->min_mtu); 3075 + err = -EINVAL; 3075 3076 goto free; 3076 3077 } 3077 3078
+10 -8
drivers/vdpa/Kconfig
··· 9 9 if VDPA 10 10 11 11 config VDPA_SIM 12 - tristate "vDPA device simulator" 12 + tristate "vDPA device simulator core" 13 13 depends on RUNTIME_TESTING_MENU && HAS_DMA 14 14 select DMA_OPS 15 15 select VHOST_RING 16 - select GENERIC_NET_UTILS 17 - default n 18 16 help 19 - vDPA networking device simulator which loop TX traffic back 20 - to RX. This device is used for testing, prototyping and 21 - development of vDPA. 17 + Enable this module to support vDPA device simulators. These devices 18 + are used for testing, prototyping and development of vDPA. 19 + 20 + config VDPA_SIM_NET 21 + tristate "vDPA simulator for networking device" 22 + depends on VDPA_SIM 23 + select GENERIC_NET_UTILS 24 + help 25 + vDPA networking device simulator which loops TX traffic back to RX. 22 26 23 27 config IFCVF 24 28 tristate "Intel IFC VF vDPA driver" 25 29 depends on PCI_MSI 26 - default n 27 30 help 28 31 This kernel module can drive Intel IFC VF NIC to offload 29 32 virtio dataplane traffic to hardware. ··· 45 42 tristate "vDPA driver for ConnectX devices" 46 43 select MLX5_VDPA 47 44 depends on MLX5_CORE 48 - default n 49 45 help 50 46 VDPA network driver for ConnectX6 and newer. Provides offloading 51 47 of virtio net datapath such that descriptors put on the ring will
+2 -9
drivers/vdpa/ifcvf/ifcvf_main.c
··· 417 417 return ret; 418 418 } 419 419 420 - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); 420 + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); 421 421 if (ret) { 422 - IFCVF_ERR(pdev, "No usable DMA confiugration\n"); 423 - return ret; 424 - } 425 - 426 - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); 427 - if (ret) { 428 - IFCVF_ERR(pdev, 429 - "No usable coherent DMA confiugration\n"); 422 + IFCVF_ERR(pdev, "No usable DMA configuration\n"); 430 423 return ret; 431 424 } 432 425
+5
drivers/vdpa/mlx5/net/mlx5_vnet.c
··· 479 479 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) 480 480 { 481 481 mlx5_cq_set_ci(&mvq->cq.mcq); 482 + 483 + /* make sure CQ cosumer update is visible to the hardware before updating 484 + * RX doorbell record. 485 + */ 486 + dma_wmb(); 482 487 rx_post(&mvq->vqqp, num); 483 488 if (mvq->event_cb.callback) 484 489 mvq->event_cb.callback(mvq->event_cb.private);
+1 -1
drivers/vdpa/vdpa.c
··· 89 89 if (!vdev) 90 90 goto err; 91 91 92 - err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL); 92 + err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); 93 93 if (err < 0) 94 94 goto err_ida; 95 95
+1
drivers/vdpa/vdpa_sim/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o 3 + obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
+82 -216
drivers/vdpa/vdpa_sim/vdpa_sim.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * VDPA networking device simulator. 3 + * VDPA device simulator core. 4 4 * 5 5 * Copyright (c) 2020, Red Hat Inc. All rights reserved. 6 6 * Author: Jason Wang <jasowang@redhat.com> ··· 11 11 #include <linux/module.h> 12 12 #include <linux/device.h> 13 13 #include <linux/kernel.h> 14 - #include <linux/fs.h> 15 - #include <linux/poll.h> 16 14 #include <linux/slab.h> 17 15 #include <linux/sched.h> 18 - #include <linux/wait.h> 19 - #include <linux/uuid.h> 20 - #include <linux/iommu.h> 21 16 #include <linux/dma-map-ops.h> 22 - #include <linux/sysfs.h> 23 - #include <linux/file.h> 24 - #include <linux/etherdevice.h> 25 17 #include <linux/vringh.h> 26 18 #include <linux/vdpa.h> 27 - #include <linux/virtio_byteorder.h> 28 19 #include <linux/vhost_iotlb.h> 29 - #include <uapi/linux/virtio_config.h> 30 - #include <uapi/linux/virtio_net.h> 20 + 21 + #include "vdpa_sim.h" 31 22 32 23 #define DRV_VERSION "0.1" 33 24 #define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>" 34 - #define DRV_DESC "vDPA Device Simulator" 25 + #define DRV_DESC "vDPA Device Simulator core" 35 26 #define DRV_LICENSE "GPL v2" 36 27 37 28 static int batch_mapping = 1; 38 29 module_param(batch_mapping, int, 0444); 39 30 MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable"); 40 31 41 - static char *macaddr; 42 - module_param(macaddr, charp, 0); 43 - MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); 44 - 45 - struct vdpasim_virtqueue { 46 - struct vringh vring; 47 - struct vringh_kiov iov; 48 - unsigned short head; 49 - bool ready; 50 - u64 desc_addr; 51 - u64 device_addr; 52 - u64 driver_addr; 53 - u32 num; 54 - void *private; 55 - irqreturn_t (*cb)(void *data); 56 - }; 32 + static int max_iotlb_entries = 2048; 33 + module_param(max_iotlb_entries, int, 0444); 34 + MODULE_PARM_DESC(max_iotlb_entries, 35 + "Maximum number of iotlb entries. 0 means unlimited. 
(default: 2048)"); 57 36 58 37 #define VDPASIM_QUEUE_ALIGN PAGE_SIZE 59 38 #define VDPASIM_QUEUE_MAX 256 60 - #define VDPASIM_DEVICE_ID 0x1 61 39 #define VDPASIM_VENDOR_ID 0 62 - #define VDPASIM_VQ_NUM 0x2 63 - #define VDPASIM_NAME "vdpasim-netdev" 64 - 65 - static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | 66 - (1ULL << VIRTIO_F_VERSION_1) | 67 - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | 68 - (1ULL << VIRTIO_NET_F_MAC); 69 - 70 - /* State of each vdpasim device */ 71 - struct vdpasim { 72 - struct vdpa_device vdpa; 73 - struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; 74 - struct work_struct work; 75 - /* spinlock to synchronize virtqueue state */ 76 - spinlock_t lock; 77 - struct virtio_net_config config; 78 - struct vhost_iotlb *iommu; 79 - void *buffer; 80 - u32 status; 81 - u32 generation; 82 - u64 features; 83 - /* spinlock to synchronize iommu table */ 84 - spinlock_t iommu_lock; 85 - }; 86 - 87 - /* TODO: cross-endian support */ 88 - static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) 89 - { 90 - return virtio_legacy_is_little_endian() || 91 - (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); 92 - } 93 - 94 - static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) 95 - { 96 - return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); 97 - } 98 - 99 - static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) 100 - { 101 - return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); 102 - } 103 - 104 - static struct vdpasim *vdpasim_dev; 105 40 106 41 static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) 107 42 { ··· 50 115 return vdpa_to_sim(vdpa); 51 116 } 52 117 118 + static void vdpasim_vq_notify(struct vringh *vring) 119 + { 120 + struct vdpasim_virtqueue *vq = 121 + container_of(vring, struct vdpasim_virtqueue, vring); 122 + 123 + if (!vq->cb) 124 + return; 125 + 126 + vq->cb(vq->private); 127 + } 128 + 53 129 static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned 
int idx) 54 130 { 55 131 struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; 56 132 57 - vringh_init_iotlb(&vq->vring, vdpasim_features, 133 + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, 58 134 VDPASIM_QUEUE_MAX, false, 59 135 (struct vring_desc *)(uintptr_t)vq->desc_addr, 60 136 (struct vring_avail *) 61 137 (uintptr_t)vq->driver_addr, 62 138 (struct vring_used *) 63 139 (uintptr_t)vq->device_addr); 140 + 141 + vq->vring.notify = vdpasim_vq_notify; 64 142 } 65 143 66 - static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) 144 + static void vdpasim_vq_reset(struct vdpasim *vdpasim, 145 + struct vdpasim_virtqueue *vq) 67 146 { 68 147 vq->ready = false; 69 148 vq->desc_addr = 0; ··· 85 136 vq->device_addr = 0; 86 137 vq->cb = NULL; 87 138 vq->private = NULL; 88 - vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX, 89 - false, NULL, NULL, NULL); 139 + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, 140 + VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); 141 + 142 + vq->vring.notify = NULL; 90 143 } 91 144 92 145 static void vdpasim_reset(struct vdpasim *vdpasim) 93 146 { 94 147 int i; 95 148 96 - for (i = 0; i < VDPASIM_VQ_NUM; i++) 97 - vdpasim_vq_reset(&vdpasim->vqs[i]); 149 + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) 150 + vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); 98 151 99 152 spin_lock(&vdpasim->iommu_lock); 100 153 vhost_iotlb_reset(vdpasim->iommu); ··· 105 154 vdpasim->features = 0; 106 155 vdpasim->status = 0; 107 156 ++vdpasim->generation; 108 - } 109 - 110 - static void vdpasim_work(struct work_struct *work) 111 - { 112 - struct vdpasim *vdpasim = container_of(work, struct 113 - vdpasim, work); 114 - struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; 115 - struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; 116 - ssize_t read, write; 117 - size_t total_write; 118 - int pkts = 0; 119 - int err; 120 - 121 - spin_lock(&vdpasim->lock); 122 - 123 - if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) 124 
- goto out; 125 - 126 - if (!txq->ready || !rxq->ready) 127 - goto out; 128 - 129 - while (true) { 130 - total_write = 0; 131 - err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL, 132 - &txq->head, GFP_ATOMIC); 133 - if (err <= 0) 134 - break; 135 - 136 - err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov, 137 - &rxq->head, GFP_ATOMIC); 138 - if (err <= 0) { 139 - vringh_complete_iotlb(&txq->vring, txq->head, 0); 140 - break; 141 - } 142 - 143 - while (true) { 144 - read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov, 145 - vdpasim->buffer, 146 - PAGE_SIZE); 147 - if (read <= 0) 148 - break; 149 - 150 - write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov, 151 - vdpasim->buffer, read); 152 - if (write <= 0) 153 - break; 154 - 155 - total_write += write; 156 - } 157 - 158 - /* Make sure data is wrote before advancing index */ 159 - smp_wmb(); 160 - 161 - vringh_complete_iotlb(&txq->vring, txq->head, 0); 162 - vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); 163 - 164 - /* Make sure used is visible before rasing the interrupt. 
*/ 165 - smp_wmb(); 166 - 167 - local_bh_disable(); 168 - if (txq->cb) 169 - txq->cb(txq->private); 170 - if (rxq->cb) 171 - rxq->cb(rxq->private); 172 - local_bh_enable(); 173 - 174 - if (++pkts > 4) { 175 - schedule_work(&vdpasim->work); 176 - goto out; 177 - } 178 - } 179 - 180 - out: 181 - spin_unlock(&vdpasim->lock); 182 157 } 183 158 184 159 static int dir_to_perm(enum dma_data_direction dir) ··· 219 342 .free = vdpasim_free_coherent, 220 343 }; 221 344 222 - static const struct vdpa_config_ops vdpasim_net_config_ops; 223 - static const struct vdpa_config_ops vdpasim_net_batch_config_ops; 345 + static const struct vdpa_config_ops vdpasim_config_ops; 346 + static const struct vdpa_config_ops vdpasim_batch_config_ops; 224 347 225 - static struct vdpasim *vdpasim_create(void) 348 + struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) 226 349 { 227 350 const struct vdpa_config_ops *ops; 228 351 struct vdpasim *vdpasim; 229 352 struct device *dev; 230 - int ret = -ENOMEM; 353 + int i, ret = -ENOMEM; 231 354 232 355 if (batch_mapping) 233 - ops = &vdpasim_net_batch_config_ops; 356 + ops = &vdpasim_batch_config_ops; 234 357 else 235 - ops = &vdpasim_net_config_ops; 358 + ops = &vdpasim_config_ops; 236 359 237 - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); 360 + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, 361 + dev_attr->nvqs); 238 362 if (!vdpasim) 239 363 goto err_alloc; 240 364 241 - INIT_WORK(&vdpasim->work, vdpasim_work); 365 + vdpasim->dev_attr = *dev_attr; 366 + INIT_WORK(&vdpasim->work, dev_attr->work_fn); 242 367 spin_lock_init(&vdpasim->lock); 243 368 spin_lock_init(&vdpasim->iommu_lock); 244 369 ··· 250 371 goto err_iommu; 251 372 set_dma_ops(dev, &vdpasim_dma_ops); 252 373 253 - vdpasim->iommu = vhost_iotlb_alloc(2048, 0); 374 + vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); 375 + if (!vdpasim->config) 376 + goto err_iommu; 377 + 378 + vdpasim->vqs = kcalloc(dev_attr->nvqs, 
sizeof(struct vdpasim_virtqueue), 379 + GFP_KERNEL); 380 + if (!vdpasim->vqs) 381 + goto err_iommu; 382 + 383 + vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0); 254 384 if (!vdpasim->iommu) 255 385 goto err_iommu; 256 386 257 - vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); 387 + vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); 258 388 if (!vdpasim->buffer) 259 389 goto err_iommu; 260 390 261 - if (macaddr) { 262 - mac_pton(macaddr, vdpasim->config.mac); 263 - if (!is_valid_ether_addr(vdpasim->config.mac)) { 264 - ret = -EADDRNOTAVAIL; 265 - goto err_iommu; 266 - } 267 - } else { 268 - eth_random_addr(vdpasim->config.mac); 269 - } 270 - 271 - vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu); 272 - vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu); 391 + for (i = 0; i < dev_attr->nvqs; i++) 392 + vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); 273 393 274 394 vdpasim->vdpa.dma_dev = dev; 275 - ret = vdpa_register_device(&vdpasim->vdpa); 276 - if (ret) 277 - goto err_iommu; 278 395 279 396 return vdpasim; 280 397 ··· 279 404 err_alloc: 280 405 return ERR_PTR(ret); 281 406 } 407 + EXPORT_SYMBOL_GPL(vdpasim_create); 282 408 283 409 static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, 284 410 u64 desc_area, u64 driver_area, ··· 374 498 375 499 static u64 vdpasim_get_features(struct vdpa_device *vdpa) 376 500 { 377 - return vdpasim_features; 501 + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 502 + 503 + return vdpasim->dev_attr.supported_features; 378 504 } 379 505 380 506 static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) 381 507 { 382 508 struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 383 - struct virtio_net_config *config = &vdpasim->config; 384 509 385 510 /* DMA mapping must be done by driver */ 386 511 if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) 387 512 return -EINVAL; 388 513 389 - vdpasim->features = features & vdpasim_features; 514 + vdpasim->features = features & 
vdpasim->dev_attr.supported_features; 390 515 391 - /* We generally only know whether guest is using the legacy interface 392 - * here, so generally that's the earliest we can set config fields. 393 - * Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which 394 - * implies VIRTIO_F_VERSION_1, but let's not try to be clever here. 395 - */ 396 - 397 - config->mtu = cpu_to_vdpasim16(vdpasim, 1500); 398 - config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); 399 516 return 0; 400 517 } 401 518 ··· 405 536 406 537 static u32 vdpasim_get_device_id(struct vdpa_device *vdpa) 407 538 { 408 - return VDPASIM_DEVICE_ID; 539 + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 540 + 541 + return vdpasim->dev_attr.id; 409 542 } 410 543 411 544 static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa) ··· 443 572 { 444 573 struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 445 574 446 - if (offset + len < sizeof(struct virtio_net_config)) 447 - memcpy(buf, (u8 *)&vdpasim->config + offset, len); 575 + if (offset + len > vdpasim->dev_attr.config_size) 576 + return; 577 + 578 + if (vdpasim->dev_attr.get_config) 579 + vdpasim->dev_attr.get_config(vdpasim, vdpasim->config); 580 + 581 + memcpy(buf, vdpasim->config + offset, len); 448 582 } 449 583 450 584 static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, 451 585 const void *buf, unsigned int len) 452 586 { 453 - /* No writable config supportted by vdpasim */ 587 + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 588 + 589 + if (offset + len > vdpasim->dev_attr.config_size) 590 + return; 591 + 592 + memcpy(vdpasim->config + offset, buf, len); 593 + 594 + if (vdpasim->dev_attr.set_config) 595 + vdpasim->dev_attr.set_config(vdpasim, vdpasim->config); 454 596 } 455 597 456 598 static u32 vdpasim_get_generation(struct vdpa_device *vdpa) ··· 540 656 struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 541 657 542 658 cancel_work_sync(&vdpasim->work); 543 - kfree(vdpasim->buffer); 659 + kvfree(vdpasim->buffer); 544 
660 if (vdpasim->iommu) 545 661 vhost_iotlb_free(vdpasim->iommu); 662 + kfree(vdpasim->vqs); 663 + kfree(vdpasim->config); 546 664 } 547 665 548 - static const struct vdpa_config_ops vdpasim_net_config_ops = { 666 + static const struct vdpa_config_ops vdpasim_config_ops = { 549 667 .set_vq_address = vdpasim_set_vq_address, 550 668 .set_vq_num = vdpasim_set_vq_num, 551 669 .kick_vq = vdpasim_kick_vq, ··· 574 688 .free = vdpasim_free, 575 689 }; 576 690 577 - static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { 691 + static const struct vdpa_config_ops vdpasim_batch_config_ops = { 578 692 .set_vq_address = vdpasim_set_vq_address, 579 693 .set_vq_num = vdpasim_set_vq_num, 580 694 .kick_vq = vdpasim_kick_vq, ··· 599 713 .set_map = vdpasim_set_map, 600 714 .free = vdpasim_free, 601 715 }; 602 - 603 - static int __init vdpasim_dev_init(void) 604 - { 605 - vdpasim_dev = vdpasim_create(); 606 - 607 - if (!IS_ERR(vdpasim_dev)) 608 - return 0; 609 - 610 - return PTR_ERR(vdpasim_dev); 611 - } 612 - 613 - static void __exit vdpasim_dev_exit(void) 614 - { 615 - struct vdpa_device *vdpa = &vdpasim_dev->vdpa; 616 - 617 - vdpa_unregister_device(vdpa); 618 - } 619 - 620 - module_init(vdpasim_dev_init) 621 - module_exit(vdpasim_dev_exit) 622 716 623 717 MODULE_VERSION(DRV_VERSION); 624 718 MODULE_LICENSE(DRV_LICENSE);
+105
drivers/vdpa/vdpa_sim/vdpa_sim.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2020, Red Hat Inc. All rights reserved. 4 + */ 5 + 6 + #ifndef _VDPA_SIM_H 7 + #define _VDPA_SIM_H 8 + 9 + #include <linux/vringh.h> 10 + #include <linux/vdpa.h> 11 + #include <linux/virtio_byteorder.h> 12 + #include <linux/vhost_iotlb.h> 13 + #include <uapi/linux/virtio_config.h> 14 + 15 + #define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ 16 + (1ULL << VIRTIO_F_VERSION_1) | \ 17 + (1ULL << VIRTIO_F_ACCESS_PLATFORM)) 18 + 19 + struct vdpasim; 20 + 21 + struct vdpasim_virtqueue { 22 + struct vringh vring; 23 + struct vringh_kiov in_iov; 24 + struct vringh_kiov out_iov; 25 + unsigned short head; 26 + bool ready; 27 + u64 desc_addr; 28 + u64 device_addr; 29 + u64 driver_addr; 30 + u32 num; 31 + void *private; 32 + irqreturn_t (*cb)(void *data); 33 + }; 34 + 35 + struct vdpasim_dev_attr { 36 + u64 supported_features; 37 + size_t config_size; 38 + size_t buffer_size; 39 + int nvqs; 40 + u32 id; 41 + 42 + work_func_t work_fn; 43 + void (*get_config)(struct vdpasim *vdpasim, void *config); 44 + void (*set_config)(struct vdpasim *vdpasim, const void *config); 45 + }; 46 + 47 + /* State of each vdpasim device */ 48 + struct vdpasim { 49 + struct vdpa_device vdpa; 50 + struct vdpasim_virtqueue *vqs; 51 + struct work_struct work; 52 + struct vdpasim_dev_attr dev_attr; 53 + /* spinlock to synchronize virtqueue state */ 54 + spinlock_t lock; 55 + /* virtio config according to device type */ 56 + void *config; 57 + struct vhost_iotlb *iommu; 58 + void *buffer; 59 + u32 status; 60 + u32 generation; 61 + u64 features; 62 + /* spinlock to synchronize iommu table */ 63 + spinlock_t iommu_lock; 64 + }; 65 + 66 + struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr); 67 + 68 + /* TODO: cross-endian support */ 69 + static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) 70 + { 71 + return virtio_legacy_is_little_endian() || 72 + (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); 73 
+ } 74 + 75 + static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) 76 + { 77 + return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); 78 + } 79 + 80 + static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) 81 + { 82 + return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); 83 + } 84 + 85 + static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val) 86 + { 87 + return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val); 88 + } 89 + 90 + static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val) 91 + { 92 + return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val); 93 + } 94 + 95 + static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val) 96 + { 97 + return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val); 98 + } 99 + 100 + static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val) 101 + { 102 + return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val); 103 + } 104 + 105 + #endif
+177
drivers/vdpa/vdpa_sim/vdpa_sim_net.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * VDPA simulator for networking device. 4 + * 5 + * Copyright (c) 2020, Red Hat Inc. All rights reserved. 6 + * Author: Jason Wang <jasowang@redhat.com> 7 + * 8 + */ 9 + 10 + #include <linux/init.h> 11 + #include <linux/module.h> 12 + #include <linux/device.h> 13 + #include <linux/kernel.h> 14 + #include <linux/sched.h> 15 + #include <linux/etherdevice.h> 16 + #include <linux/vringh.h> 17 + #include <linux/vdpa.h> 18 + #include <uapi/linux/virtio_net.h> 19 + 20 + #include "vdpa_sim.h" 21 + 22 + #define DRV_VERSION "0.1" 23 + #define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>" 24 + #define DRV_DESC "vDPA Device Simulator for networking device" 25 + #define DRV_LICENSE "GPL v2" 26 + 27 + #define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ 28 + (1ULL << VIRTIO_NET_F_MAC)) 29 + 30 + #define VDPASIM_NET_VQ_NUM 2 31 + 32 + static char *macaddr; 33 + module_param(macaddr, charp, 0); 34 + MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); 35 + 36 + u8 macaddr_buf[ETH_ALEN]; 37 + 38 + static struct vdpasim *vdpasim_net_dev; 39 + 40 + static void vdpasim_net_work(struct work_struct *work) 41 + { 42 + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); 43 + struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; 44 + struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; 45 + ssize_t read, write; 46 + size_t total_write; 47 + int pkts = 0; 48 + int err; 49 + 50 + spin_lock(&vdpasim->lock); 51 + 52 + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) 53 + goto out; 54 + 55 + if (!txq->ready || !rxq->ready) 56 + goto out; 57 + 58 + while (true) { 59 + total_write = 0; 60 + err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, 61 + &txq->head, GFP_ATOMIC); 62 + if (err <= 0) 63 + break; 64 + 65 + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, 66 + &rxq->head, GFP_ATOMIC); 67 + if (err <= 0) { 68 + vringh_complete_iotlb(&txq->vring, txq->head, 0); 69 + break; 70 + } 71 + 72 + while (true) { 73 
+ read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, 74 + vdpasim->buffer, 75 + PAGE_SIZE); 76 + if (read <= 0) 77 + break; 78 + 79 + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, 80 + vdpasim->buffer, read); 81 + if (write <= 0) 82 + break; 83 + 84 + total_write += write; 85 + } 86 + 87 + /* Make sure data is wrote before advancing index */ 88 + smp_wmb(); 89 + 90 + vringh_complete_iotlb(&txq->vring, txq->head, 0); 91 + vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); 92 + 93 + /* Make sure used is visible before rasing the interrupt. */ 94 + smp_wmb(); 95 + 96 + local_bh_disable(); 97 + if (vringh_need_notify_iotlb(&txq->vring) > 0) 98 + vringh_notify(&txq->vring); 99 + if (vringh_need_notify_iotlb(&rxq->vring) > 0) 100 + vringh_notify(&rxq->vring); 101 + local_bh_enable(); 102 + 103 + if (++pkts > 4) { 104 + schedule_work(&vdpasim->work); 105 + goto out; 106 + } 107 + } 108 + 109 + out: 110 + spin_unlock(&vdpasim->lock); 111 + } 112 + 113 + static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) 114 + { 115 + struct virtio_net_config *net_config = 116 + (struct virtio_net_config *)config; 117 + 118 + net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); 119 + net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); 120 + memcpy(net_config->mac, macaddr_buf, ETH_ALEN); 121 + } 122 + 123 + static int __init vdpasim_net_init(void) 124 + { 125 + struct vdpasim_dev_attr dev_attr = {}; 126 + int ret; 127 + 128 + if (macaddr) { 129 + mac_pton(macaddr, macaddr_buf); 130 + if (!is_valid_ether_addr(macaddr_buf)) { 131 + ret = -EADDRNOTAVAIL; 132 + goto out; 133 + } 134 + } else { 135 + eth_random_addr(macaddr_buf); 136 + } 137 + 138 + dev_attr.id = VIRTIO_ID_NET; 139 + dev_attr.supported_features = VDPASIM_NET_FEATURES; 140 + dev_attr.nvqs = VDPASIM_NET_VQ_NUM; 141 + dev_attr.config_size = sizeof(struct virtio_net_config); 142 + dev_attr.get_config = vdpasim_net_get_config; 143 + dev_attr.work_fn = vdpasim_net_work; 
144 + dev_attr.buffer_size = PAGE_SIZE; 145 + 146 + vdpasim_net_dev = vdpasim_create(&dev_attr); 147 + if (IS_ERR(vdpasim_net_dev)) { 148 + ret = PTR_ERR(vdpasim_net_dev); 149 + goto out; 150 + } 151 + 152 + ret = vdpa_register_device(&vdpasim_net_dev->vdpa); 153 + if (ret) 154 + goto put_dev; 155 + 156 + return 0; 157 + 158 + put_dev: 159 + put_device(&vdpasim_net_dev->vdpa.dev); 160 + out: 161 + return ret; 162 + } 163 + 164 + static void __exit vdpasim_net_exit(void) 165 + { 166 + struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa; 167 + 168 + vdpa_unregister_device(vdpa); 169 + } 170 + 171 + module_init(vdpasim_net_init); 172 + module_exit(vdpasim_net_exit); 173 + 174 + MODULE_VERSION(DRV_VERSION); 175 + MODULE_LICENSE(DRV_LICENSE); 176 + MODULE_AUTHOR(DRV_AUTHOR); 177 + MODULE_DESCRIPTION(DRV_DESC);
+2 -1
drivers/vhost/scsi.c
··· 1643 1643 if (!vhost_vq_is_setup(vq)) 1644 1644 continue; 1645 1645 1646 - if (vhost_scsi_setup_vq_cmds(vq, vq->num)) 1646 + ret = vhost_scsi_setup_vq_cmds(vq, vq->num); 1647 + if (ret) 1647 1648 goto destroy_vq_cmds; 1648 1649 } 1649 1650
+3 -7
drivers/vhost/vdpa.c
··· 245 245 return -EFAULT; 246 246 if (vhost_vdpa_config_validate(v, &config)) 247 247 return -EINVAL; 248 - buf = kvzalloc(config.len, GFP_KERNEL); 249 - if (!buf) 250 - return -ENOMEM; 251 248 252 - if (copy_from_user(buf, c->buf, config.len)) { 253 - kvfree(buf); 254 - return -EFAULT; 255 - } 249 + buf = vmemdup_user(c->buf, config.len); 250 + if (IS_ERR(buf)) 251 + return PTR_ERR(buf); 256 252 257 253 ops->set_config(vdpa, config.off, buf, config.len); 258 254
+1310 -525
drivers/virtio/virtio_mem.c
··· 27 27 module_param(unplug_online, bool, 0644); 28 28 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 29 29 30 - enum virtio_mem_mb_state { 30 + static bool force_bbm; 31 + module_param(force_bbm, bool, 0444); 32 + MODULE_PARM_DESC(force_bbm, 33 + "Force Big Block Mode. Default is 0 (auto-selection)"); 34 + 35 + static unsigned long bbm_block_size; 36 + module_param(bbm_block_size, ulong, 0444); 37 + MODULE_PARM_DESC(bbm_block_size, 38 + "Big Block size in bytes. Default is 0 (auto-detection)."); 39 + 40 + static bool bbm_safe_unplug = true; 41 + module_param(bbm_safe_unplug, bool, 0444); 42 + MODULE_PARM_DESC(bbm_safe_unplug, 43 + "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); 44 + 45 + /* 46 + * virtio-mem currently supports the following modes of operation: 47 + * 48 + * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 49 + * size of a Sub Block (SB) is determined based on the device block size, the 50 + * pageblock size, and the maximum allocation granularity of the buddy. 51 + * Subblocks within a Linux memory block might either be plugged or unplugged. 52 + * Memory is added/removed to Linux MM in Linux memory block granularity. 53 + * 54 + * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 55 + * Memory is added/removed to Linux MM in Big Block granularity. 56 + * 57 + * The mode is determined automatically based on the Linux memory block size 58 + * and the device block size. 59 + * 60 + * User space / core MM (auto onlining) is responsible for onlining added 61 + * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are 62 + * always onlined separately, and all memory within a Linux memory block is 63 + * onlined to the same zone - virtio-mem relies on this behavior. 64 + */ 65 + 66 + /* 67 + * State of a Linux memory block in SBM. 68 + */ 69 + enum virtio_mem_sbm_mb_state { 31 70 /* Unplugged, not added to Linux. Can be reused later. 
*/ 32 - VIRTIO_MEM_MB_STATE_UNUSED = 0, 71 + VIRTIO_MEM_SBM_MB_UNUSED = 0, 33 72 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 34 - VIRTIO_MEM_MB_STATE_PLUGGED, 73 + VIRTIO_MEM_SBM_MB_PLUGGED, 35 74 /* Fully plugged, fully added to Linux, offline. */ 36 - VIRTIO_MEM_MB_STATE_OFFLINE, 75 + VIRTIO_MEM_SBM_MB_OFFLINE, 37 76 /* Partially plugged, fully added to Linux, offline. */ 38 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, 77 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 39 78 /* Fully plugged, fully added to Linux, online. */ 40 - VIRTIO_MEM_MB_STATE_ONLINE, 79 + VIRTIO_MEM_SBM_MB_ONLINE, 41 80 /* Partially plugged, fully added to Linux, online. */ 42 - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, 43 - VIRTIO_MEM_MB_STATE_COUNT 81 + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, 82 + VIRTIO_MEM_SBM_MB_COUNT 83 + }; 84 + 85 + /* 86 + * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 87 + */ 88 + enum virtio_mem_bbm_bb_state { 89 + /* Unplugged, not added to Linux. Can be reused later. */ 90 + VIRTIO_MEM_BBM_BB_UNUSED = 0, 91 + /* Plugged, not added to Linux. Error on add_memory(). */ 92 + VIRTIO_MEM_BBM_BB_PLUGGED, 93 + /* Plugged and added to Linux. */ 94 + VIRTIO_MEM_BBM_BB_ADDED, 95 + /* All online parts are fake-offline, ready to remove. */ 96 + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 97 + VIRTIO_MEM_BBM_BB_COUNT 44 98 }; 45 99 46 100 struct virtio_mem { ··· 105 51 106 52 /* Workqueue that processes the plug/unplug requests. */ 107 53 struct work_struct wq; 54 + atomic_t wq_active; 108 55 atomic_t config_changed; 109 56 110 57 /* Virtqueue for guest->host requests. */ ··· 125 70 126 71 /* The device block size (for communicating with the device). */ 127 72 uint64_t device_block_size; 128 - /* The translated node id. NUMA_NO_NODE in case not specified. */ 73 + /* The determined node id for all memory of the device. */ 129 74 int nid; 130 75 /* Physical start address of the memory region. */ 131 76 uint64_t addr; 132 77 /* Maximum region size in bytes. 
*/ 133 78 uint64_t region_size; 134 - 135 - /* The subblock size. */ 136 - uint64_t subblock_size; 137 - /* The number of subblocks per memory block. */ 138 - uint32_t nb_sb_per_mb; 139 - 140 - /* Id of the first memory block of this device. */ 141 - unsigned long first_mb_id; 142 - /* Id of the last memory block of this device. */ 143 - unsigned long last_mb_id; 144 - /* Id of the last usable memory block of this device. */ 145 - unsigned long last_usable_mb_id; 146 - /* Id of the next memory bock to prepare when needed. */ 147 - unsigned long next_mb_id; 148 79 149 80 /* The parent resource for all memory added via this device. */ 150 81 struct resource *parent_resource; ··· 140 99 */ 141 100 const char *resource_name; 142 101 143 - /* Summary of all memory block states. */ 144 - unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; 145 - #define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10 146 - 147 102 /* 148 - * One byte state per memory block. 149 - * 150 - * Allocated via vmalloc(). When preparing new blocks, resized 151 - * (alloc+copy+free) when needed (crossing pages with the next mb). 152 - * (when crossing pages). 153 - * 154 - * With 128MB memory blocks, we have states for 512GB of memory in one 155 - * page. 103 + * We don't want to add too much memory if it's not getting onlined, 104 + * to avoid running OOM. Besides this threshold, we allow to have at 105 + * least two offline blocks at a time (whatever is bigger). 156 106 */ 157 - uint8_t *mb_state; 107 + #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 108 + atomic64_t offline_size; 109 + uint64_t offline_threshold; 110 + 111 + /* If set, the driver is in SBM, otherwise in BBM. */ 112 + bool in_sbm; 113 + 114 + union { 115 + struct { 116 + /* Id of the first memory block of this device. */ 117 + unsigned long first_mb_id; 118 + /* Id of the last usable memory block of this device. */ 119 + unsigned long last_usable_mb_id; 120 + /* Id of the next memory bock to prepare when needed. 
*/ 121 + unsigned long next_mb_id; 122 + 123 + /* The subblock size. */ 124 + uint64_t sb_size; 125 + /* The number of subblocks per Linux memory block. */ 126 + uint32_t sbs_per_mb; 127 + 128 + /* Summary of all memory block states. */ 129 + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 130 + 131 + /* 132 + * One byte state per memory block. Allocated via 133 + * vmalloc(). Resized (alloc+copy+free) on demand. 134 + * 135 + * With 128 MiB memory blocks, we have states for 512 136 + * GiB of memory in one 4 KiB page. 137 + */ 138 + uint8_t *mb_states; 139 + 140 + /* 141 + * Bitmap: one bit per subblock. Allocated similar to 142 + * sbm.mb_states. 143 + * 144 + * A set bit means the corresponding subblock is 145 + * plugged, otherwise it's unblocked. 146 + * 147 + * With 4 MiB subblocks, we manage 128 GiB of memory 148 + * in one 4 KiB page. 149 + */ 150 + unsigned long *sb_states; 151 + } sbm; 152 + 153 + struct { 154 + /* Id of the first big block of this device. */ 155 + unsigned long first_bb_id; 156 + /* Id of the last usable big block of this device. */ 157 + unsigned long last_usable_bb_id; 158 + /* Id of the next device bock to prepare when needed. */ 159 + unsigned long next_bb_id; 160 + 161 + /* Summary of all big block states. */ 162 + unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 163 + 164 + /* One byte state per big block. See sbm.mb_states. */ 165 + uint8_t *bb_states; 166 + 167 + /* The block size used for plugging/adding/removing. */ 168 + uint64_t bb_size; 169 + } bbm; 170 + }; 158 171 159 172 /* 160 - * $nb_sb_per_mb bit per memory block. Handled similar to mb_state. 161 - * 162 - * With 4MB subblocks, we manage 128GB of memory in one page. 163 - */ 164 - unsigned long *sb_bitmap; 165 - 166 - /* 167 - * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap. 
173 + * Mutex that protects the sbm.mb_count, sbm.mb_states, 174 + * sbm.sb_states, bbm.bb_count, and bbm.bb_states 168 175 * 169 176 * When this lock is held the pointers can't change, ONLINE and 170 177 * OFFLINE blocks can't change the state and no subblocks will get ··· 249 160 static LIST_HEAD(virtio_mem_devices); 250 161 251 162 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 163 + static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 164 + unsigned long nr_pages); 165 + static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 166 + unsigned long nr_pages); 167 + static void virtio_mem_retry(struct virtio_mem *vm); 252 168 253 169 /* 254 170 * Register a virtio-mem device so it will be considered for the online_page ··· 307 213 } 308 214 309 215 /* 216 + * Calculate the big block id of a given address. 217 + */ 218 + static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 219 + uint64_t addr) 220 + { 221 + return addr / vm->bbm.bb_size; 222 + } 223 + 224 + /* 225 + * Calculate the physical start address of a given big block id. 226 + */ 227 + static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 228 + unsigned long bb_id) 229 + { 230 + return bb_id * vm->bbm.bb_size; 231 + } 232 + 233 + /* 310 234 * Calculate the subblock id of a given address. 311 235 */ 312 236 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, ··· 333 221 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 334 222 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 335 223 336 - return (addr - mb_addr) / vm->subblock_size; 224 + return (addr - mb_addr) / vm->sbm.sb_size; 337 225 } 338 226 339 227 /* 340 - * Set the state of a memory block, taking care of the state counter. 228 + * Set the state of a big block, taking care of the state counter. 
341 229 */ 342 - static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, 343 - enum virtio_mem_mb_state state) 230 + static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 231 + unsigned long bb_id, 232 + enum virtio_mem_bbm_bb_state state) 344 233 { 345 - const unsigned long idx = mb_id - vm->first_mb_id; 346 - enum virtio_mem_mb_state old_state; 234 + const unsigned long idx = bb_id - vm->bbm.first_bb_id; 235 + enum virtio_mem_bbm_bb_state old_state; 347 236 348 - old_state = vm->mb_state[idx]; 349 - vm->mb_state[idx] = state; 237 + old_state = vm->bbm.bb_states[idx]; 238 + vm->bbm.bb_states[idx] = state; 350 239 351 - BUG_ON(vm->nb_mb_state[old_state] == 0); 352 - vm->nb_mb_state[old_state]--; 353 - vm->nb_mb_state[state]++; 240 + BUG_ON(vm->bbm.bb_count[old_state] == 0); 241 + vm->bbm.bb_count[old_state]--; 242 + vm->bbm.bb_count[state]++; 354 243 } 355 244 356 245 /* 357 - * Get the state of a memory block. 246 + * Get the state of a big block. 358 247 */ 359 - static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, 360 - unsigned long mb_id) 248 + static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 249 + unsigned long bb_id) 361 250 { 362 - const unsigned long idx = mb_id - vm->first_mb_id; 363 - 364 - return vm->mb_state[idx]; 251 + return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 365 252 } 366 253 367 254 /* 368 - * Prepare the state array for the next memory block. 255 + * Prepare the big block state array for the next big block. 
369 256 */ 370 - static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) 257 + static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 371 258 { 372 - unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; 373 - unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; 259 + unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 260 + unsigned long new_bytes = old_bytes + 1; 374 261 int old_pages = PFN_UP(old_bytes); 375 262 int new_pages = PFN_UP(new_bytes); 376 - uint8_t *new_mb_state; 263 + uint8_t *new_array; 377 264 378 - if (vm->mb_state && old_pages == new_pages) 265 + if (vm->bbm.bb_states && old_pages == new_pages) 379 266 return 0; 380 267 381 - new_mb_state = vzalloc(new_pages * PAGE_SIZE); 382 - if (!new_mb_state) 268 + new_array = vzalloc(new_pages * PAGE_SIZE); 269 + if (!new_array) 383 270 return -ENOMEM; 384 271 385 272 mutex_lock(&vm->hotplug_mutex); 386 - if (vm->mb_state) 387 - memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); 388 - vfree(vm->mb_state); 389 - vm->mb_state = new_mb_state; 273 + if (vm->bbm.bb_states) 274 + memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 275 + vfree(vm->bbm.bb_states); 276 + vm->bbm.bb_states = new_array; 390 277 mutex_unlock(&vm->hotplug_mutex); 391 278 392 279 return 0; 393 280 } 394 281 395 - #define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ 396 - for (_mb_id = _vm->first_mb_id; \ 397 - _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ 398 - _mb_id++) \ 399 - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) 282 + #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 283 + for (_bb_id = vm->bbm.first_bb_id; \ 284 + _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 285 + _bb_id++) \ 286 + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 400 287 401 - #define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \ 402 - for (_mb_id = _vm->next_mb_id - 1; \ 403 - _mb_id >= _vm->first_mb_id && 
_vm->nb_mb_state[_state]; \ 288 + #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 289 + for (_bb_id = vm->bbm.next_bb_id - 1; \ 290 + _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 291 + _bb_id--) \ 292 + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 293 + 294 + /* 295 + * Set the state of a memory block, taking care of the state counter. 296 + */ 297 + static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 298 + unsigned long mb_id, uint8_t state) 299 + { 300 + const unsigned long idx = mb_id - vm->sbm.first_mb_id; 301 + uint8_t old_state; 302 + 303 + old_state = vm->sbm.mb_states[idx]; 304 + vm->sbm.mb_states[idx] = state; 305 + 306 + BUG_ON(vm->sbm.mb_count[old_state] == 0); 307 + vm->sbm.mb_count[old_state]--; 308 + vm->sbm.mb_count[state]++; 309 + } 310 + 311 + /* 312 + * Get the state of a memory block. 313 + */ 314 + static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 315 + unsigned long mb_id) 316 + { 317 + const unsigned long idx = mb_id - vm->sbm.first_mb_id; 318 + 319 + return vm->sbm.mb_states[idx]; 320 + } 321 + 322 + /* 323 + * Prepare the state array for the next memory block. 
324 + */ 325 + static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 326 + { 327 + int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 328 + int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 329 + uint8_t *new_array; 330 + 331 + if (vm->sbm.mb_states && old_pages == new_pages) 332 + return 0; 333 + 334 + new_array = vzalloc(new_pages * PAGE_SIZE); 335 + if (!new_array) 336 + return -ENOMEM; 337 + 338 + mutex_lock(&vm->hotplug_mutex); 339 + if (vm->sbm.mb_states) 340 + memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 341 + vfree(vm->sbm.mb_states); 342 + vm->sbm.mb_states = new_array; 343 + mutex_unlock(&vm->hotplug_mutex); 344 + 345 + return 0; 346 + } 347 + 348 + #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 349 + for (_mb_id = _vm->sbm.first_mb_id; \ 350 + _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 351 + _mb_id++) \ 352 + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 353 + 354 + #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 355 + for (_mb_id = _vm->sbm.next_mb_id - 1; \ 356 + _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 404 357 _mb_id--) \ 405 - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) 358 + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 359 + 360 + /* 361 + * Calculate the bit number in the subblock bitmap for the given subblock 362 + * inside the given memory block. 363 + */ 364 + static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 365 + unsigned long mb_id, int sb_id) 366 + { 367 + return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 368 + } 406 369 407 370 /* 408 371 * Mark all selected subblocks plugged. 409 372 * 410 373 * Will not modify the state of the memory block. 
411 374 */ 412 - static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, 413 - unsigned long mb_id, int sb_id, 414 - int count) 375 + static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 376 + unsigned long mb_id, int sb_id, 377 + int count) 415 378 { 416 - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; 379 + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 417 380 418 - __bitmap_set(vm->sb_bitmap, bit, count); 381 + __bitmap_set(vm->sbm.sb_states, bit, count); 419 382 } 420 383 421 384 /* ··· 498 311 * 499 312 * Will not modify the state of the memory block. 500 313 */ 501 - static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, 502 - unsigned long mb_id, int sb_id, 503 - int count) 314 + static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 315 + unsigned long mb_id, int sb_id, 316 + int count) 504 317 { 505 - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; 318 + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 506 319 507 - __bitmap_clear(vm->sb_bitmap, bit, count); 320 + __bitmap_clear(vm->sbm.sb_states, bit, count); 508 321 } 509 322 510 323 /* 511 324 * Test if all selected subblocks are plugged. 
512 325 */ 513 - static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, 514 - unsigned long mb_id, int sb_id, 515 - int count) 326 + static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 327 + unsigned long mb_id, int sb_id, 328 + int count) 516 329 { 517 - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; 330 + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 518 331 519 332 if (count == 1) 520 - return test_bit(bit, vm->sb_bitmap); 333 + return test_bit(bit, vm->sbm.sb_states); 521 334 522 335 /* TODO: Helper similar to bitmap_set() */ 523 - return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= 336 + return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 524 337 bit + count; 525 338 } 526 339 527 340 /* 528 341 * Test if all selected subblocks are unplugged. 529 342 */ 530 - static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, 531 - unsigned long mb_id, int sb_id, 532 - int count) 343 + static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 344 + unsigned long mb_id, int sb_id, 345 + int count) 533 346 { 534 - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; 347 + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 535 348 536 349 /* TODO: Helper similar to bitmap_set() */ 537 - return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; 350 + return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 351 + bit + count; 538 352 } 539 353 540 354 /* 541 - * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is 355 + * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 542 356 * none. 
543 357 */ 544 - static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, 358 + static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 545 359 unsigned long mb_id) 546 360 { 547 - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; 361 + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 548 362 549 - return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - 550 - bit; 363 + return find_next_zero_bit(vm->sbm.sb_states, 364 + bit + vm->sbm.sbs_per_mb, bit) - bit; 551 365 } 552 366 553 367 /* 554 368 * Prepare the subblock bitmap for the next memory block. 555 369 */ 556 - static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) 370 + static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 557 371 { 558 - const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; 559 - const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; 560 - const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; 372 + const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 373 + const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 374 + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 561 375 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 562 376 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 563 - unsigned long *new_sb_bitmap, *old_sb_bitmap; 377 + unsigned long *new_bitmap, *old_bitmap; 564 378 565 - if (vm->sb_bitmap && old_pages == new_pages) 379 + if (vm->sbm.sb_states && old_pages == new_pages) 566 380 return 0; 567 381 568 - new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); 569 - if (!new_sb_bitmap) 382 + new_bitmap = vzalloc(new_pages * PAGE_SIZE); 383 + if (!new_bitmap) 570 384 return -ENOMEM; 571 385 572 386 mutex_lock(&vm->hotplug_mutex); 573 - if (new_sb_bitmap) 574 - memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); 387 + if (new_bitmap) 388 + memcpy(new_bitmap, 
vm->sbm.sb_states, old_pages * PAGE_SIZE); 575 389 576 - old_sb_bitmap = vm->sb_bitmap; 577 - vm->sb_bitmap = new_sb_bitmap; 390 + old_bitmap = vm->sbm.sb_states; 391 + vm->sbm.sb_states = new_bitmap; 578 392 mutex_unlock(&vm->hotplug_mutex); 579 393 580 - vfree(old_sb_bitmap); 394 + vfree(old_bitmap); 581 395 return 0; 582 396 } 583 397 584 398 /* 585 - * Try to add a memory block to Linux. This will usually only fail 586 - * if out of memory. 399 + * Test if we could add memory without creating too much offline memory - 400 + * to avoid running OOM if memory is getting onlined deferred. 401 + */ 402 + static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 403 + { 404 + if (WARN_ON_ONCE(size > vm->offline_threshold)) 405 + return false; 406 + 407 + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 408 + } 409 + 410 + /* 411 + * Try adding memory to Linux. Will usually only fail if out of memory. 587 412 * 588 413 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 589 414 * onlining code). 590 415 * 591 - * Will not modify the state of the memory block. 416 + * Will not modify the state of memory blocks in virtio-mem. 
592 417 */ 593 - static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) 418 + static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 419 + uint64_t size) 594 420 { 595 - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 596 - int nid = vm->nid; 597 - 598 - if (nid == NUMA_NO_NODE) 599 - nid = memory_add_physaddr_to_nid(addr); 421 + int rc; 600 422 601 423 /* 602 424 * When force-unloading the driver and we still have memory added to ··· 618 422 return -ENOMEM; 619 423 } 620 424 621 - dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); 622 - return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), 623 - vm->resource_name, 624 - MEMHP_MERGE_RESOURCE); 425 + dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 426 + addr + size - 1); 427 + /* Memory might get onlined immediately. */ 428 + atomic64_add(size, &vm->offline_size); 429 + rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, 430 + MEMHP_MERGE_RESOURCE); 431 + if (rc) { 432 + atomic64_sub(size, &vm->offline_size); 433 + dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 434 + /* 435 + * TODO: Linux MM does not properly clean up yet in all cases 436 + * where adding of memory failed - especially on -ENOMEM. 437 + */ 438 + } 439 + return rc; 625 440 } 626 441 627 442 /* 628 - * Try to remove a memory block from Linux. Will only fail if the memory block 629 - * is not offline. 443 + * See virtio_mem_add_memory(): Try adding a single Linux memory block. 444 + */ 445 + static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 446 + { 447 + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 448 + const uint64_t size = memory_block_size_bytes(); 449 + 450 + return virtio_mem_add_memory(vm, addr, size); 451 + } 452 + 453 + /* 454 + * See virtio_mem_add_memory(): Try adding a big block. 
455 + */ 456 + static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 457 + { 458 + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 459 + const uint64_t size = vm->bbm.bb_size; 460 + 461 + return virtio_mem_add_memory(vm, addr, size); 462 + } 463 + 464 + /* 465 + * Try removing memory from Linux. Will only fail if memory blocks aren't 466 + * offline. 630 467 * 631 468 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 632 469 * onlining code). 633 470 * 634 - * Will not modify the state of the memory block. 471 + * Will not modify the state of memory blocks in virtio-mem. 635 472 */ 636 - static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) 473 + static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 474 + uint64_t size) 637 475 { 638 - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 639 - int nid = vm->nid; 476 + int rc; 640 477 641 - if (nid == NUMA_NO_NODE) 642 - nid = memory_add_physaddr_to_nid(addr); 643 - 644 - dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); 645 - return remove_memory(nid, addr, memory_block_size_bytes()); 478 + dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 479 + addr + size - 1); 480 + rc = remove_memory(vm->nid, addr, size); 481 + if (!rc) { 482 + atomic64_sub(size, &vm->offline_size); 483 + /* 484 + * We might have freed up memory we can now unplug, retry 485 + * immediately instead of waiting. 486 + */ 487 + virtio_mem_retry(vm); 488 + } else { 489 + dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 490 + } 491 + return rc; 646 492 } 647 493 648 494 /* 649 - * Try to offline and remove a memory block from Linux. 495 + * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 
496 + */ 497 + static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 498 + { 499 + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 500 + const uint64_t size = memory_block_size_bytes(); 501 + 502 + return virtio_mem_remove_memory(vm, addr, size); 503 + } 504 + 505 + /* 506 + * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered 507 + * by the big block. 508 + */ 509 + static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) 510 + { 511 + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 512 + const uint64_t size = vm->bbm.bb_size; 513 + 514 + return virtio_mem_remove_memory(vm, addr, size); 515 + } 516 + 517 + /* 518 + * Try offlining and removing memory from Linux. 650 519 * 651 520 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 652 521 * onlining code). 653 522 * 654 - * Will not modify the state of the memory block. 523 + * Will not modify the state of memory blocks in virtio-mem. 655 524 */ 656 - static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, 657 - unsigned long mb_id) 525 + static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 526 + uint64_t addr, 527 + uint64_t size) 528 + { 529 + int rc; 530 + 531 + dev_dbg(&vm->vdev->dev, 532 + "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 533 + addr + size - 1); 534 + 535 + rc = offline_and_remove_memory(vm->nid, addr, size); 536 + if (!rc) { 537 + atomic64_sub(size, &vm->offline_size); 538 + /* 539 + * We might have freed up memory we can now unplug, retry 540 + * immediately instead of waiting. 541 + */ 542 + virtio_mem_retry(vm); 543 + } else { 544 + dev_dbg(&vm->vdev->dev, 545 + "offlining and removing memory failed: %d\n", rc); 546 + } 547 + return rc; 548 + } 549 + 550 + /* 551 + * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 552 + * a single Linux memory block. 
553 + */ 554 + static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 555 + unsigned long mb_id) 658 556 { 659 557 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 660 - int nid = vm->nid; 558 + const uint64_t size = memory_block_size_bytes(); 661 559 662 - if (nid == NUMA_NO_NODE) 663 - nid = memory_add_physaddr_to_nid(addr); 560 + return virtio_mem_offline_and_remove_memory(vm, addr, size); 561 + } 664 562 665 - dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", 666 - mb_id); 667 - return offline_and_remove_memory(nid, addr, memory_block_size_bytes()); 563 + /* 564 + * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 565 + * all Linux memory blocks covered by the big block. 566 + */ 567 + static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 568 + unsigned long bb_id) 569 + { 570 + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 571 + const uint64_t size = vm->bbm.bb_size; 572 + 573 + return virtio_mem_offline_and_remove_memory(vm, addr, size); 668 574 } 669 575 670 576 /* ··· 797 499 * Test if a virtio-mem device overlaps with the given range. Can be called 798 500 * from (notifier) callbacks lockless. 799 501 */ 800 - static bool virtio_mem_overlaps_range(struct virtio_mem *vm, 801 - unsigned long start, unsigned long size) 502 + static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 503 + uint64_t size) 802 504 { 803 - unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id); 804 - unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) + 805 - memory_block_size_bytes(); 806 - 807 - return start < dev_end && dev_start < start + size; 505 + return start < vm->addr + vm->region_size && vm->addr < start + size; 808 506 } 809 507 810 508 /* 811 - * Test if a virtio-mem device owns a memory block. Can be called from 509 + * Test if a virtio-mem device contains a given range. Can be called from 812 510 * (notifier) callbacks lockless. 
813 511 */ 814 - static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) 512 + static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 513 + uint64_t size) 815 514 { 816 - return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id; 515 + return start >= vm->addr && start + size <= vm->addr + vm->region_size; 817 516 } 818 517 819 - static int virtio_mem_notify_going_online(struct virtio_mem *vm, 820 - unsigned long mb_id) 518 + static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 519 + unsigned long mb_id) 821 520 { 822 - switch (virtio_mem_mb_get_state(vm, mb_id)) { 823 - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: 824 - case VIRTIO_MEM_MB_STATE_OFFLINE: 521 + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 522 + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 523 + case VIRTIO_MEM_SBM_MB_OFFLINE: 825 524 return NOTIFY_OK; 826 525 default: 827 526 break; ··· 828 533 return NOTIFY_BAD; 829 534 } 830 535 831 - static void virtio_mem_notify_offline(struct virtio_mem *vm, 832 - unsigned long mb_id) 536 + static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 537 + unsigned long mb_id) 833 538 { 834 - switch (virtio_mem_mb_get_state(vm, mb_id)) { 835 - case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL: 836 - virtio_mem_mb_set_state(vm, mb_id, 837 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); 539 + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 540 + case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: 541 + virtio_mem_sbm_set_mb_state(vm, mb_id, 542 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 838 543 break; 839 - case VIRTIO_MEM_MB_STATE_ONLINE: 840 - virtio_mem_mb_set_state(vm, mb_id, 841 - VIRTIO_MEM_MB_STATE_OFFLINE); 544 + case VIRTIO_MEM_SBM_MB_ONLINE: 545 + virtio_mem_sbm_set_mb_state(vm, mb_id, 546 + VIRTIO_MEM_SBM_MB_OFFLINE); 842 547 break; 843 548 default: 844 549 BUG(); 845 550 break; 846 551 } 552 + } 847 553 554 + static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 555 + unsigned long mb_id) 556 + { 557 + switch 
(virtio_mem_sbm_get_mb_state(vm, mb_id)) { 558 + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 559 + virtio_mem_sbm_set_mb_state(vm, mb_id, 560 + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); 561 + break; 562 + case VIRTIO_MEM_SBM_MB_OFFLINE: 563 + virtio_mem_sbm_set_mb_state(vm, mb_id, 564 + VIRTIO_MEM_SBM_MB_ONLINE); 565 + break; 566 + default: 567 + BUG(); 568 + break; 569 + } 570 + } 571 + 572 + static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 573 + unsigned long mb_id) 574 + { 575 + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 576 + unsigned long pfn; 577 + int sb_id; 578 + 579 + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 580 + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 581 + continue; 582 + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 583 + sb_id * vm->sbm.sb_size); 584 + virtio_mem_fake_offline_going_offline(pfn, nr_pages); 585 + } 586 + } 587 + 588 + static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 589 + unsigned long mb_id) 590 + { 591 + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 592 + unsigned long pfn; 593 + int sb_id; 594 + 595 + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 596 + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 597 + continue; 598 + pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 599 + sb_id * vm->sbm.sb_size); 600 + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 601 + } 602 + } 603 + 604 + static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 605 + unsigned long bb_id, 606 + unsigned long pfn, 607 + unsigned long nr_pages) 608 + { 848 609 /* 849 - * Trigger the workqueue, maybe we can now unplug memory. Also, 850 - * when we offline and remove a memory block, this will re-trigger 851 - * us immediately - which is often nice because the removal of 852 - * the memory block (e.g., memmap) might have freed up memory 853 - * on other memory blocks we manage. 
610 + * When marked as "fake-offline", all online memory of this device block 611 + * is allocated by us. Otherwise, we don't have any memory allocated. 854 612 */ 855 - virtio_mem_retry(vm); 613 + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 614 + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 615 + return; 616 + virtio_mem_fake_offline_going_offline(pfn, nr_pages); 856 617 } 857 618 858 - static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) 619 + static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 620 + unsigned long bb_id, 621 + unsigned long pfn, 622 + unsigned long nr_pages) 859 623 { 860 - unsigned long nb_offline; 861 - 862 - switch (virtio_mem_mb_get_state(vm, mb_id)) { 863 - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: 864 - virtio_mem_mb_set_state(vm, mb_id, 865 - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); 866 - break; 867 - case VIRTIO_MEM_MB_STATE_OFFLINE: 868 - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); 869 - break; 870 - default: 871 - BUG(); 872 - break; 873 - } 874 - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + 875 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; 876 - 877 - /* see if we can add new blocks now that we onlined one block */ 878 - if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1) 879 - virtio_mem_retry(vm); 880 - } 881 - 882 - static void virtio_mem_notify_going_offline(struct virtio_mem *vm, 883 - unsigned long mb_id) 884 - { 885 - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); 886 - struct page *page; 887 - unsigned long pfn; 888 - int sb_id, i; 889 - 890 - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { 891 - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) 892 - continue; 893 - /* 894 - * Drop our reference to the pages so the memory can get 895 - * offlined and add the unplugged pages to the managed 896 - * page counters (so offlining code can correctly subtract 897 - * them again). 
898 - */ 899 - pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 900 - sb_id * vm->subblock_size); 901 - adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 902 - for (i = 0; i < nr_pages; i++) { 903 - page = pfn_to_page(pfn + i); 904 - if (WARN_ON(!page_ref_dec_and_test(page))) 905 - dump_page(page, "unplugged page referenced"); 906 - } 907 - } 908 - } 909 - 910 - static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, 911 - unsigned long mb_id) 912 - { 913 - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); 914 - unsigned long pfn; 915 - int sb_id, i; 916 - 917 - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { 918 - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) 919 - continue; 920 - /* 921 - * Get the reference we dropped when going offline and 922 - * subtract the unplugged pages from the managed page 923 - * counters. 924 - */ 925 - pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 926 - sb_id * vm->subblock_size); 927 - adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 928 - for (i = 0; i < nr_pages; i++) 929 - page_ref_inc(pfn_to_page(pfn + i)); 930 - } 624 + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 625 + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 626 + return; 627 + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 931 628 } 932 629 933 630 /* ··· 935 648 struct memory_notify *mhp = arg; 936 649 const unsigned long start = PFN_PHYS(mhp->start_pfn); 937 650 const unsigned long size = PFN_PHYS(mhp->nr_pages); 938 - const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); 939 651 int rc = NOTIFY_OK; 652 + unsigned long id; 940 653 941 654 if (!virtio_mem_overlaps_range(vm, start, size)) 942 655 return NOTIFY_DONE; 943 656 944 - /* 945 - * Memory is onlined/offlined in memory block granularity. We cannot 946 - * cross virtio-mem device boundaries and memory block boundaries. Bail 947 - * out if this ever changes. 
948 - */ 949 - if (WARN_ON_ONCE(size != memory_block_size_bytes() || 950 - !IS_ALIGNED(start, memory_block_size_bytes()))) 951 - return NOTIFY_BAD; 657 + if (vm->in_sbm) { 658 + id = virtio_mem_phys_to_mb_id(start); 659 + /* 660 + * In SBM, we add memory in separate memory blocks - we expect 661 + * it to be onlined/offlined in the same granularity. Bail out 662 + * if this ever changes. 663 + */ 664 + if (WARN_ON_ONCE(size != memory_block_size_bytes() || 665 + !IS_ALIGNED(start, memory_block_size_bytes()))) 666 + return NOTIFY_BAD; 667 + } else { 668 + id = virtio_mem_phys_to_bb_id(vm, start); 669 + /* 670 + * In BBM, we only care about onlining/offlining happening 671 + * within a single big block, we don't care about the 672 + * actual granularity as we don't track individual Linux 673 + * memory blocks. 674 + */ 675 + if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 676 + return NOTIFY_BAD; 677 + } 952 678 953 679 /* 954 680 * Avoid circular locking lockdep warnings. We lock the mutex ··· 980 680 break; 981 681 } 982 682 vm->hotplug_active = true; 983 - virtio_mem_notify_going_offline(vm, mb_id); 683 + if (vm->in_sbm) 684 + virtio_mem_sbm_notify_going_offline(vm, id); 685 + else 686 + virtio_mem_bbm_notify_going_offline(vm, id, 687 + mhp->start_pfn, 688 + mhp->nr_pages); 984 689 break; 985 690 case MEM_GOING_ONLINE: 986 691 mutex_lock(&vm->hotplug_mutex); ··· 995 690 break; 996 691 } 997 692 vm->hotplug_active = true; 998 - rc = virtio_mem_notify_going_online(vm, mb_id); 693 + if (vm->in_sbm) 694 + rc = virtio_mem_sbm_notify_going_online(vm, id); 999 695 break; 1000 696 case MEM_OFFLINE: 1001 - virtio_mem_notify_offline(vm, mb_id); 697 + if (vm->in_sbm) 698 + virtio_mem_sbm_notify_offline(vm, id); 699 + 700 + atomic64_add(size, &vm->offline_size); 701 + /* 702 + * Trigger the workqueue. Now that we have some offline memory, 703 + * maybe we can handle pending unplug requests. 
704 + */ 705 + if (!unplug_online) 706 + virtio_mem_retry(vm); 707 + 1002 708 vm->hotplug_active = false; 1003 709 mutex_unlock(&vm->hotplug_mutex); 1004 710 break; 1005 711 case MEM_ONLINE: 1006 - virtio_mem_notify_online(vm, mb_id); 712 + if (vm->in_sbm) 713 + virtio_mem_sbm_notify_online(vm, id); 714 + 715 + atomic64_sub(size, &vm->offline_size); 716 + /* 717 + * Start adding more memory once we onlined half of our 718 + * threshold. Don't trigger if it's possibly due to our actipn 719 + * (e.g., us adding memory which gets onlined immediately from 720 + * the core). 721 + */ 722 + if (!atomic_read(&vm->wq_active) && 723 + virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 724 + virtio_mem_retry(vm); 725 + 1007 726 vm->hotplug_active = false; 1008 727 mutex_unlock(&vm->hotplug_mutex); 1009 728 break; 1010 729 case MEM_CANCEL_OFFLINE: 1011 730 if (!vm->hotplug_active) 1012 731 break; 1013 - virtio_mem_notify_cancel_offline(vm, mb_id); 732 + if (vm->in_sbm) 733 + virtio_mem_sbm_notify_cancel_offline(vm, id); 734 + else 735 + virtio_mem_bbm_notify_cancel_offline(vm, id, 736 + mhp->start_pfn, 737 + mhp->nr_pages); 1014 738 vm->hotplug_active = false; 1015 739 mutex_unlock(&vm->hotplug_mutex); 1016 740 break; ··· 1063 729 * (via generic_online_page()) using PageDirty(). 1064 730 */ 1065 731 static void virtio_mem_set_fake_offline(unsigned long pfn, 1066 - unsigned int nr_pages, bool onlined) 732 + unsigned long nr_pages, bool onlined) 1067 733 { 1068 734 for (; nr_pages--; pfn++) { 1069 735 struct page *page = pfn_to_page(pfn); ··· 1082 748 * (via generic_online_page()), clear PageDirty(). 1083 749 */ 1084 750 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1085 - unsigned int nr_pages, bool onlined) 751 + unsigned long nr_pages, bool onlined) 1086 752 { 1087 753 for (; nr_pages--; pfn++) { 1088 754 struct page *page = pfn_to_page(pfn); ··· 1097 763 * Release a range of fake-offline pages to the buddy, effectively 1098 764 * fake-onlining them. 
1099 765 */ 1100 - static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) 766 + static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1101 767 { 1102 - const int order = MAX_ORDER - 1; 1103 - int i; 768 + const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; 769 + unsigned long i; 1104 770 1105 771 /* 1106 - * We are always called with subblock granularity, which is at least 1107 - * aligned to MAX_ORDER - 1. 772 + * We are always called at least with MAX_ORDER_NR_PAGES 773 + * granularity/alignment (e.g., the way subblocks work). All pages 774 + * inside such a block are alike. 1108 775 */ 1109 - for (i = 0; i < nr_pages; i += 1 << order) { 776 + for (i = 0; i < nr_pages; i += max_nr_pages) { 1110 777 struct page *page = pfn_to_page(pfn + i); 1111 778 1112 779 /* ··· 1117 782 * alike. 1118 783 */ 1119 784 if (PageDirty(page)) { 1120 - virtio_mem_clear_fake_offline(pfn + i, 1 << order, 785 + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1121 786 false); 1122 - generic_online_page(page, order); 787 + generic_online_page(page, MAX_ORDER - 1); 1123 788 } else { 1124 - virtio_mem_clear_fake_offline(pfn + i, 1 << order, 789 + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, 1125 790 true); 1126 - free_contig_range(pfn + i, 1 << order); 1127 - adjust_managed_page_count(page, 1 << order); 791 + free_contig_range(pfn + i, max_nr_pages); 792 + adjust_managed_page_count(page, max_nr_pages); 1128 793 } 1129 794 } 795 + } 796 + 797 + /* 798 + * Try to allocate a range, marking pages fake-offline, effectively 799 + * fake-offlining them. 
800 + */ 801 + static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 802 + { 803 + const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == 804 + ZONE_MOVABLE; 805 + int rc, retry_count; 806 + 807 + /* 808 + * TODO: We want an alloc_contig_range() mode that tries to allocate 809 + * harder (e.g., dealing with temporarily pinned pages, PCP), especially 810 + * with ZONE_MOVABLE. So for now, retry a couple of times with 811 + * ZONE_MOVABLE before giving up - because that zone is supposed to give 812 + * some guarantees. 813 + */ 814 + for (retry_count = 0; retry_count < 5; retry_count++) { 815 + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 816 + GFP_KERNEL); 817 + if (rc == -ENOMEM) 818 + /* whoops, out of memory */ 819 + return rc; 820 + else if (rc && !is_movable) 821 + break; 822 + else if (rc) 823 + continue; 824 + 825 + virtio_mem_set_fake_offline(pfn, nr_pages, true); 826 + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 827 + return 0; 828 + } 829 + 830 + return -EBUSY; 831 + } 832 + 833 + /* 834 + * Handle fake-offline pages when memory is going offline - such that the 835 + * pages can be skipped by mm-core when offlining. 836 + */ 837 + static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 838 + unsigned long nr_pages) 839 + { 840 + struct page *page; 841 + unsigned long i; 842 + 843 + /* 844 + * Drop our reference to the pages so the memory can get offlined 845 + * and add the unplugged pages to the managed page counters (so 846 + * offlining code can correctly subtract them again). 847 + */ 848 + adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 849 + /* Drop our reference to the pages so the memory can get offlined. 
*/ 850 + for (i = 0; i < nr_pages; i++) { 851 + page = pfn_to_page(pfn + i); 852 + if (WARN_ON(!page_ref_dec_and_test(page))) 853 + dump_page(page, "fake-offline page referenced"); 854 + } 855 + } 856 + 857 + /* 858 + * Handle fake-offline pages when memory offlining is canceled - to undo 859 + * what we did in virtio_mem_fake_offline_going_offline(). 860 + */ 861 + static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 862 + unsigned long nr_pages) 863 + { 864 + unsigned long i; 865 + 866 + /* 867 + * Get the reference we dropped when going offline and subtract the 868 + * unplugged pages from the managed page counters. 869 + */ 870 + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 871 + for (i = 0; i < nr_pages; i++) 872 + page_ref_inc(pfn_to_page(pfn + i)); 1130 873 } 1131 874 1132 875 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1133 876 { 1134 877 const unsigned long addr = page_to_phys(page); 1135 - const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 878 + unsigned long id, sb_id; 1136 879 struct virtio_mem *vm; 1137 - int sb_id; 880 + bool do_online; 1138 881 1139 - /* 1140 - * We exploit here that subblocks have at least MAX_ORDER - 1 1141 - * size/alignment and that this callback is is called with such a 1142 - * size/alignment. So we cannot cross subblocks and therefore 1143 - * also not memory blocks. 1144 - */ 1145 882 rcu_read_lock(); 1146 883 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1147 - if (!virtio_mem_owned_mb(vm, mb_id)) 884 + if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1148 885 continue; 1149 886 1150 - sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1151 - /* 1152 - * If plugged, online the pages, otherwise, set them fake 1153 - * offline (PageOffline). 
1154 - */ 1155 - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) 887 + if (vm->in_sbm) { 888 + /* 889 + * We exploit here that subblocks have at least 890 + * MAX_ORDER_NR_PAGES size/alignment - so we cannot 891 + * cross subblocks within one call. 892 + */ 893 + id = virtio_mem_phys_to_mb_id(addr); 894 + sb_id = virtio_mem_phys_to_sb_id(vm, addr); 895 + do_online = virtio_mem_sbm_test_sb_plugged(vm, id, 896 + sb_id, 1); 897 + } else { 898 + /* 899 + * If the whole block is marked fake offline, keep 900 + * everything that way. 901 + */ 902 + id = virtio_mem_phys_to_bb_id(vm, addr); 903 + do_online = virtio_mem_bbm_get_bb_state(vm, id) != 904 + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 905 + } 906 + if (do_online) 1156 907 generic_online_page(page, order); 1157 908 else 1158 909 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, ··· 1291 870 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1292 871 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1293 872 }; 873 + int rc = -ENOMEM; 1294 874 1295 875 if (atomic_read(&vm->config_changed)) 1296 876 return -EAGAIN; 877 + 878 + dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 879 + addr + size - 1); 1297 880 1298 881 switch (virtio_mem_send_request(vm, &req)) { 1299 882 case VIRTIO_MEM_RESP_ACK: 1300 883 vm->plugged_size += size; 1301 884 return 0; 1302 885 case VIRTIO_MEM_RESP_NACK: 1303 - return -EAGAIN; 886 + rc = -EAGAIN; 887 + break; 1304 888 case VIRTIO_MEM_RESP_BUSY: 1305 - return -ETXTBSY; 889 + rc = -ETXTBSY; 890 + break; 1306 891 case VIRTIO_MEM_RESP_ERROR: 1307 - return -EINVAL; 892 + rc = -EINVAL; 893 + break; 1308 894 default: 1309 - return -ENOMEM; 895 + break; 1310 896 } 897 + 898 + dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 899 + return rc; 1311 900 } 1312 901 1313 902 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, ··· 1329 898 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1330 899 .u.unplug.nb_blocks = 
cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1331 900 }; 901 + int rc = -ENOMEM; 1332 902 1333 903 if (atomic_read(&vm->config_changed)) 1334 904 return -EAGAIN; 905 + 906 + dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 907 + addr + size - 1); 1335 908 1336 909 switch (virtio_mem_send_request(vm, &req)) { 1337 910 case VIRTIO_MEM_RESP_ACK: 1338 911 vm->plugged_size -= size; 1339 912 return 0; 1340 913 case VIRTIO_MEM_RESP_BUSY: 1341 - return -ETXTBSY; 914 + rc = -ETXTBSY; 915 + break; 1342 916 case VIRTIO_MEM_RESP_ERROR: 1343 - return -EINVAL; 917 + rc = -EINVAL; 918 + break; 1344 919 default: 1345 - return -ENOMEM; 920 + break; 1346 921 } 922 + 923 + dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 924 + return rc; 1347 925 } 1348 926 1349 927 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) ··· 1360 920 const struct virtio_mem_req req = { 1361 921 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1362 922 }; 923 + int rc = -ENOMEM; 924 + 925 + dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1363 926 1364 927 switch (virtio_mem_send_request(vm, &req)) { 1365 928 case VIRTIO_MEM_RESP_ACK: ··· 1372 929 atomic_set(&vm->config_changed, 1); 1373 930 return 0; 1374 931 case VIRTIO_MEM_RESP_BUSY: 1375 - return -ETXTBSY; 932 + rc = -ETXTBSY; 933 + break; 1376 934 default: 1377 - return -ENOMEM; 935 + break; 1378 936 } 937 + 938 + dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 939 + return rc; 1379 940 } 1380 941 1381 942 /* 1382 943 * Plug selected subblocks. Updates the plugged state, but not the state 1383 944 * of the memory block. 
1384 945 */ 1385 - static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1386 - int sb_id, int count) 946 + static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 947 + int sb_id, int count) 1387 948 { 1388 949 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1389 - sb_id * vm->subblock_size; 1390 - const uint64_t size = count * vm->subblock_size; 950 + sb_id * vm->sbm.sb_size; 951 + const uint64_t size = count * vm->sbm.sb_size; 1391 952 int rc; 1392 - 1393 - dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id, 1394 - sb_id, sb_id + count - 1); 1395 953 1396 954 rc = virtio_mem_send_plug_request(vm, addr, size); 1397 955 if (!rc) 1398 - virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count); 956 + virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1399 957 return rc; 1400 958 } 1401 959 ··· 1404 960 * Unplug selected subblocks. Updates the plugged state, but not the state 1405 961 * of the memory block. 1406 962 */ 1407 - static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1408 - int sb_id, int count) 963 + static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 964 + int sb_id, int count) 1409 965 { 1410 966 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1411 - sb_id * vm->subblock_size; 1412 - const uint64_t size = count * vm->subblock_size; 967 + sb_id * vm->sbm.sb_size; 968 + const uint64_t size = count * vm->sbm.sb_size; 1413 969 int rc; 1414 - 1415 - dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n", 1416 - mb_id, sb_id, sb_id + count - 1); 1417 970 1418 971 rc = virtio_mem_send_unplug_request(vm, addr, size); 1419 972 if (!rc) 1420 - virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count); 973 + virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1421 974 return rc; 975 + } 976 + 977 + /* 978 + * Request to unplug a big block. 979 + * 980 + * Will not modify the state of the big block. 
981 + */ 982 + static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 983 + { 984 + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 985 + const uint64_t size = vm->bbm.bb_size; 986 + 987 + return virtio_mem_send_unplug_request(vm, addr, size); 988 + } 989 + 990 + /* 991 + * Request to plug a big block. 992 + * 993 + * Will not modify the state of the big block. 994 + */ 995 + static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 996 + { 997 + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 998 + const uint64_t size = vm->bbm.bb_size; 999 + 1000 + return virtio_mem_send_plug_request(vm, addr, size); 1422 1001 } 1423 1002 1424 1003 /* ··· 1453 986 * 1454 987 * Note: can fail after some subblocks were unplugged. 1455 988 */ 1456 - static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, 1457 - unsigned long mb_id, uint64_t *nb_sb) 989 + static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 990 + unsigned long mb_id, uint64_t *nb_sb) 1458 991 { 1459 992 int sb_id, count; 1460 993 int rc; 1461 994 1462 - sb_id = vm->nb_sb_per_mb - 1; 995 + sb_id = vm->sbm.sbs_per_mb - 1; 1463 996 while (*nb_sb) { 1464 997 /* Find the next candidate subblock */ 1465 998 while (sb_id >= 0 && 1466 - virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1)) 999 + virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1467 1000 sb_id--; 1468 1001 if (sb_id < 0) 1469 1002 break; 1470 1003 /* Try to unplug multiple subblocks at a time */ 1471 1004 count = 1; 1472 1005 while (count < *nb_sb && sb_id > 0 && 1473 - virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1006 + virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1474 1007 count++; 1475 1008 sb_id--; 1476 1009 } 1477 1010 1478 - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); 1011 + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1479 1012 if (rc) 1480 1013 return rc; 1481 1014 *nb_sb -= count; ··· 1492 1025 * 1493 1026 * Note: 
can fail after some subblocks were unplugged. 1494 1027 */ 1495 - static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id) 1028 + static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1496 1029 { 1497 - uint64_t nb_sb = vm->nb_sb_per_mb; 1030 + uint64_t nb_sb = vm->sbm.sbs_per_mb; 1498 1031 1499 - return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb); 1032 + return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 1500 1033 } 1501 1034 1502 1035 /* 1503 1036 * Prepare tracking data for the next memory block. 1504 1037 */ 1505 - static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, 1506 - unsigned long *mb_id) 1038 + static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1039 + unsigned long *mb_id) 1507 1040 { 1508 1041 int rc; 1509 1042 1510 - if (vm->next_mb_id > vm->last_usable_mb_id) 1043 + if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1511 1044 return -ENOSPC; 1512 1045 1513 1046 /* Resize the state array if required. */ 1514 - rc = virtio_mem_mb_state_prepare_next_mb(vm); 1047 + rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1515 1048 if (rc) 1516 1049 return rc; 1517 1050 1518 1051 /* Resize the subblock bitmap if required. */ 1519 - rc = virtio_mem_sb_bitmap_prepare_next_mb(vm); 1052 + rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1520 1053 if (rc) 1521 1054 return rc; 1522 1055 1523 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++; 1524 - *mb_id = vm->next_mb_id++; 1056 + vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1057 + *mb_id = vm->sbm.next_mb_id++; 1525 1058 return 0; 1526 - } 1527 - 1528 - /* 1529 - * Don't add too many blocks that are not onlined yet to avoid running OOM. 
1530 - */ 1531 - static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm) 1532 - { 1533 - unsigned long nb_offline; 1534 - 1535 - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + 1536 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; 1537 - return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD; 1538 1059 } 1539 1060 1540 1061 /* ··· 1531 1076 * 1532 1077 * Will modify the state of the memory block. 1533 1078 */ 1534 - static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, 1535 - unsigned long mb_id, 1536 - uint64_t *nb_sb) 1079 + static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1080 + unsigned long mb_id, uint64_t *nb_sb) 1537 1081 { 1538 - const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb); 1539 - int rc, rc2; 1082 + const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1083 + int rc; 1540 1084 1541 1085 if (WARN_ON_ONCE(!count)) 1542 1086 return -EINVAL; ··· 1544 1090 * Plug the requested number of subblocks before adding it to linux, 1545 1091 * so that onlining will directly online all plugged subblocks. 1546 1092 */ 1547 - rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count); 1093 + rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1548 1094 if (rc) 1549 1095 return rc; 1550 1096 ··· 1552 1098 * Mark the block properly offline before adding it to Linux, 1553 1099 * so the memory notifiers will find the block in the right state. 1554 1100 */ 1555 - if (count == vm->nb_sb_per_mb) 1556 - virtio_mem_mb_set_state(vm, mb_id, 1557 - VIRTIO_MEM_MB_STATE_OFFLINE); 1101 + if (count == vm->sbm.sbs_per_mb) 1102 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1103 + VIRTIO_MEM_SBM_MB_OFFLINE); 1558 1104 else 1559 - virtio_mem_mb_set_state(vm, mb_id, 1560 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); 1105 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1106 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1561 1107 1562 1108 /* Add the memory block to linux - if that fails, try to unplug. 
*/ 1563 - rc = virtio_mem_mb_add(vm, mb_id); 1109 + rc = virtio_mem_sbm_add_mb(vm, mb_id); 1564 1110 if (rc) { 1565 - enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED; 1111 + int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1566 1112 1567 - dev_err(&vm->vdev->dev, 1568 - "adding memory block %lu failed with %d\n", mb_id, rc); 1569 - rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count); 1570 - 1571 - /* 1572 - * TODO: Linux MM does not properly clean up yet in all cases 1573 - * where adding of memory failed - especially on -ENOMEM. 1574 - */ 1575 - if (rc2) 1576 - new_state = VIRTIO_MEM_MB_STATE_PLUGGED; 1577 - virtio_mem_mb_set_state(vm, mb_id, new_state); 1113 + if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1114 + new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1115 + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1578 1116 return rc; 1579 1117 } 1580 1118 ··· 1582 1136 * 1583 1137 * Note: Can fail after some subblocks were successfully plugged. 1584 1138 */ 1585 - static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, 1586 - uint64_t *nb_sb, bool online) 1139 + static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1140 + unsigned long mb_id, uint64_t *nb_sb, 1141 + bool online) 1587 1142 { 1588 1143 unsigned long pfn, nr_pages; 1589 1144 int sb_id, count; ··· 1594 1147 return -EINVAL; 1595 1148 1596 1149 while (*nb_sb) { 1597 - sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id); 1598 - if (sb_id >= vm->nb_sb_per_mb) 1150 + sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1151 + if (sb_id >= vm->sbm.sbs_per_mb) 1599 1152 break; 1600 1153 count = 1; 1601 1154 while (count < *nb_sb && 1602 - sb_id + count < vm->nb_sb_per_mb && 1603 - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count, 1604 - 1)) 1155 + sb_id + count < vm->sbm.sbs_per_mb && 1156 + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1605 1157 count++; 1606 1158 1607 - rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count); 1159 + rc = 
virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1608 1160 if (rc) 1609 1161 return rc; 1610 1162 *nb_sb -= count; ··· 1612 1166 1613 1167 /* fake-online the pages if the memory block is online */ 1614 1168 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1615 - sb_id * vm->subblock_size); 1616 - nr_pages = PFN_DOWN(count * vm->subblock_size); 1169 + sb_id * vm->sbm.sb_size); 1170 + nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1617 1171 virtio_mem_fake_online(pfn, nr_pages); 1618 1172 } 1619 1173 1620 - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { 1174 + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1621 1175 if (online) 1622 - virtio_mem_mb_set_state(vm, mb_id, 1623 - VIRTIO_MEM_MB_STATE_ONLINE); 1176 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1177 + VIRTIO_MEM_SBM_MB_ONLINE); 1624 1178 else 1625 - virtio_mem_mb_set_state(vm, mb_id, 1626 - VIRTIO_MEM_MB_STATE_OFFLINE); 1179 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1180 + VIRTIO_MEM_SBM_MB_OFFLINE); 1627 1181 } 1628 1182 1629 1183 return 0; 1630 1184 } 1631 1185 1632 - /* 1633 - * Try to plug the requested amount of memory. 1634 - */ 1635 - static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1186 + static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1636 1187 { 1637 - uint64_t nb_sb = diff / vm->subblock_size; 1188 + uint64_t nb_sb = diff / vm->sbm.sb_size; 1638 1189 unsigned long mb_id; 1639 1190 int rc; 1640 1191 ··· 1642 1199 mutex_lock(&vm->hotplug_mutex); 1643 1200 1644 1201 /* Try to plug subblocks of partially plugged online blocks. 
*/ 1645 - virtio_mem_for_each_mb_state(vm, mb_id, 1646 - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { 1647 - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true); 1202 + virtio_mem_sbm_for_each_mb(vm, mb_id, 1203 + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { 1204 + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); 1648 1205 if (rc || !nb_sb) 1649 1206 goto out_unlock; 1650 1207 cond_resched(); 1651 1208 } 1652 1209 1653 1210 /* Try to plug subblocks of partially plugged offline blocks. */ 1654 - virtio_mem_for_each_mb_state(vm, mb_id, 1655 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { 1656 - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false); 1211 + virtio_mem_sbm_for_each_mb(vm, mb_id, 1212 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 1213 + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); 1657 1214 if (rc || !nb_sb) 1658 1215 goto out_unlock; 1659 1216 cond_resched(); ··· 1666 1223 mutex_unlock(&vm->hotplug_mutex); 1667 1224 1668 1225 /* Try to plug and add unused blocks */ 1669 - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) { 1670 - if (virtio_mem_too_many_mb_offline(vm)) 1226 + virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1227 + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1671 1228 return -ENOSPC; 1672 1229 1673 - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); 1230 + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1674 1231 if (rc || !nb_sb) 1675 1232 return rc; 1676 1233 cond_resched(); ··· 1678 1235 1679 1236 /* Try to prepare, plug and add new blocks */ 1680 1237 while (nb_sb) { 1681 - if (virtio_mem_too_many_mb_offline(vm)) 1238 + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1682 1239 return -ENOSPC; 1683 1240 1684 - rc = virtio_mem_prepare_next_mb(vm, &mb_id); 1241 + rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1685 1242 if (rc) 1686 1243 return rc; 1687 - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); 1244 + rc = virtio_mem_sbm_plug_and_add_mb(vm, 
mb_id, &nb_sb); 1688 1245 if (rc) 1689 1246 return rc; 1690 1247 cond_resched(); ··· 1697 1254 } 1698 1255 1699 1256 /* 1257 + * Plug a big block and add it to Linux. 1258 + * 1259 + * Will modify the state of the big block. 1260 + */ 1261 + static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1262 + unsigned long bb_id) 1263 + { 1264 + int rc; 1265 + 1266 + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1267 + VIRTIO_MEM_BBM_BB_UNUSED)) 1268 + return -EINVAL; 1269 + 1270 + rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1271 + if (rc) 1272 + return rc; 1273 + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1274 + 1275 + rc = virtio_mem_bbm_add_bb(vm, bb_id); 1276 + if (rc) { 1277 + if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1278 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1279 + VIRTIO_MEM_BBM_BB_UNUSED); 1280 + else 1281 + /* Retry from the main loop. */ 1282 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1283 + VIRTIO_MEM_BBM_BB_PLUGGED); 1284 + return rc; 1285 + } 1286 + return 0; 1287 + } 1288 + 1289 + /* 1290 + * Prepare tracking data for the next big block. 1291 + */ 1292 + static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1293 + unsigned long *bb_id) 1294 + { 1295 + int rc; 1296 + 1297 + if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1298 + return -ENOSPC; 1299 + 1300 + /* Resize the big block state array if required. 
*/ 1301 + rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1302 + if (rc) 1303 + return rc; 1304 + 1305 + vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1306 + *bb_id = vm->bbm.next_bb_id; 1307 + vm->bbm.next_bb_id++; 1308 + return 0; 1309 + } 1310 + 1311 + static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1312 + { 1313 + uint64_t nb_bb = diff / vm->bbm.bb_size; 1314 + unsigned long bb_id; 1315 + int rc; 1316 + 1317 + if (!nb_bb) 1318 + return 0; 1319 + 1320 + /* Try to plug and add unused big blocks */ 1321 + virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1322 + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1323 + return -ENOSPC; 1324 + 1325 + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1326 + if (!rc) 1327 + nb_bb--; 1328 + if (rc || !nb_bb) 1329 + return rc; 1330 + cond_resched(); 1331 + } 1332 + 1333 + /* Try to prepare, plug and add new big blocks */ 1334 + while (nb_bb) { 1335 + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1336 + return -ENOSPC; 1337 + 1338 + rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1339 + if (rc) 1340 + return rc; 1341 + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1342 + if (!rc) 1343 + nb_bb--; 1344 + if (rc) 1345 + return rc; 1346 + cond_resched(); 1347 + } 1348 + 1349 + return 0; 1350 + } 1351 + 1352 + /* 1353 + * Try to plug the requested amount of memory. 1354 + */ 1355 + static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1356 + { 1357 + if (vm->in_sbm) 1358 + return virtio_mem_sbm_plug_request(vm, diff); 1359 + return virtio_mem_bbm_plug_request(vm, diff); 1360 + } 1361 + 1362 + /* 1700 1363 * Unplug the desired number of plugged subblocks of an offline memory block. 1701 1364 * Will fail if any subblock cannot get unplugged (instead of skipping it). 1702 1365 * ··· 1811 1262 * 1812 1263 * Note: Can fail after some subblocks were successfully unplugged. 
1813 1264 */ 1814 - static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, 1815 - unsigned long mb_id, 1816 - uint64_t *nb_sb) 1265 + static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1266 + unsigned long mb_id, 1267 + uint64_t *nb_sb) 1817 1268 { 1818 1269 int rc; 1819 1270 1820 - rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb); 1271 + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); 1821 1272 1822 1273 /* some subblocks might have been unplugged even on failure */ 1823 - if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) 1824 - virtio_mem_mb_set_state(vm, mb_id, 1825 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); 1274 + if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1275 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1276 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1826 1277 if (rc) 1827 1278 return rc; 1828 1279 1829 - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { 1280 + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1830 1281 /* 1831 1282 * Remove the block from Linux - this should never fail. 1832 1283 * Hinder the block from getting onlined by marking it 1833 1284 * unplugged. Temporarily drop the mutex, so 1834 1285 * any pending GOING_ONLINE requests can be serviced/rejected. 1835 1286 */ 1836 - virtio_mem_mb_set_state(vm, mb_id, 1837 - VIRTIO_MEM_MB_STATE_UNUSED); 1287 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1288 + VIRTIO_MEM_SBM_MB_UNUSED); 1838 1289 1839 1290 mutex_unlock(&vm->hotplug_mutex); 1840 - rc = virtio_mem_mb_remove(vm, mb_id); 1291 + rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1841 1292 BUG_ON(rc); 1842 1293 mutex_lock(&vm->hotplug_mutex); 1843 1294 } ··· 1849 1300 * 1850 1301 * Will modify the state of the memory block. 
1851 1302 */ 1852 - static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, 1853 - unsigned long mb_id, int sb_id, 1854 - int count) 1303 + static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1304 + unsigned long mb_id, int sb_id, 1305 + int count) 1855 1306 { 1856 - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count; 1307 + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1857 1308 unsigned long start_pfn; 1858 1309 int rc; 1859 1310 1860 1311 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1861 - sb_id * vm->subblock_size); 1862 - rc = alloc_contig_range(start_pfn, start_pfn + nr_pages, 1863 - MIGRATE_MOVABLE, GFP_KERNEL); 1864 - if (rc == -ENOMEM) 1865 - /* whoops, out of memory */ 1866 - return rc; 1867 - if (rc) 1868 - return -EBUSY; 1312 + sb_id * vm->sbm.sb_size); 1869 1313 1870 - /* Mark it as fake-offline before unplugging it */ 1871 - virtio_mem_set_fake_offline(start_pfn, nr_pages, true); 1872 - adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); 1314 + rc = virtio_mem_fake_offline(start_pfn, nr_pages); 1315 + if (rc) 1316 + return rc; 1873 1317 1874 1318 /* Try to unplug the allocated memory */ 1875 - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); 1319 + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1876 1320 if (rc) { 1877 1321 /* Return the memory to the buddy. */ 1878 1322 virtio_mem_fake_online(start_pfn, nr_pages); 1879 1323 return rc; 1880 1324 } 1881 1325 1882 - virtio_mem_mb_set_state(vm, mb_id, 1883 - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); 1326 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1327 + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); 1884 1328 return 0; 1885 1329 } 1886 1330 ··· 1887 1345 * Note: Can fail after some subblocks were successfully unplugged. Can 1888 1346 * return 0 even if subblocks were busy and could not get unplugged. 
1889 1347 */ 1890 - static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, 1891 - unsigned long mb_id, 1892 - uint64_t *nb_sb) 1348 + static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 1349 + unsigned long mb_id, 1350 + uint64_t *nb_sb) 1893 1351 { 1894 1352 int rc, sb_id; 1895 1353 1896 1354 /* If possible, try to unplug the complete block in one shot. */ 1897 - if (*nb_sb >= vm->nb_sb_per_mb && 1898 - virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { 1899 - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0, 1900 - vm->nb_sb_per_mb); 1355 + if (*nb_sb >= vm->sbm.sbs_per_mb && 1356 + virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1357 + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 1358 + vm->sbm.sbs_per_mb); 1901 1359 if (!rc) { 1902 - *nb_sb -= vm->nb_sb_per_mb; 1360 + *nb_sb -= vm->sbm.sbs_per_mb; 1903 1361 goto unplugged; 1904 1362 } else if (rc != -EBUSY) 1905 1363 return rc; 1906 1364 } 1907 1365 1908 1366 /* Fallback to single subblocks. */ 1909 - for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 1367 + for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 1910 1368 /* Find the next candidate subblock */ 1911 1369 while (sb_id >= 0 && 1912 - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) 1370 + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 1913 1371 sb_id--; 1914 1372 if (sb_id < 0) 1915 1373 break; 1916 1374 1917 - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1); 1375 + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 1918 1376 if (rc == -EBUSY) 1919 1377 continue; 1920 1378 else if (rc) ··· 1928 1386 * remove it. This will usually not fail, as no memory is in use 1929 1387 * anymore - however some other notifiers might NACK the request. 
1930 1388 */ 1931 - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { 1389 + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1932 1390 mutex_unlock(&vm->hotplug_mutex); 1933 - rc = virtio_mem_mb_offline_and_remove(vm, mb_id); 1391 + rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 1934 1392 mutex_lock(&vm->hotplug_mutex); 1935 1393 if (!rc) 1936 - virtio_mem_mb_set_state(vm, mb_id, 1937 - VIRTIO_MEM_MB_STATE_UNUSED); 1394 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1395 + VIRTIO_MEM_SBM_MB_UNUSED); 1938 1396 } 1939 1397 1940 1398 return 0; 1941 1399 } 1942 1400 1943 - /* 1944 - * Try to unplug the requested amount of memory. 1945 - */ 1946 - static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 1401 + static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 1947 1402 { 1948 - uint64_t nb_sb = diff / vm->subblock_size; 1403 + uint64_t nb_sb = diff / vm->sbm.sb_size; 1949 1404 unsigned long mb_id; 1950 1405 int rc; 1951 1406 ··· 1957 1418 mutex_lock(&vm->hotplug_mutex); 1958 1419 1959 1420 /* Try to unplug subblocks of partially plugged offline blocks. */ 1960 - virtio_mem_for_each_mb_state_rev(vm, mb_id, 1961 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { 1962 - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, 1963 - &nb_sb); 1421 + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, 1422 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 1423 + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); 1964 1424 if (rc || !nb_sb) 1965 1425 goto out_unlock; 1966 1426 cond_resched(); 1967 1427 } 1968 1428 1969 1429 /* Try to unplug subblocks of plugged offline blocks. 
*/ 1970 - virtio_mem_for_each_mb_state_rev(vm, mb_id, 1971 - VIRTIO_MEM_MB_STATE_OFFLINE) { 1972 - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, 1973 - &nb_sb); 1430 + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { 1431 + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); 1974 1432 if (rc || !nb_sb) 1975 1433 goto out_unlock; 1976 1434 cond_resched(); ··· 1979 1443 } 1980 1444 1981 1445 /* Try to unplug subblocks of partially plugged online blocks. */ 1982 - virtio_mem_for_each_mb_state_rev(vm, mb_id, 1983 - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { 1984 - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, 1985 - &nb_sb); 1446 + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, 1447 + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { 1448 + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); 1986 1449 if (rc || !nb_sb) 1987 1450 goto out_unlock; 1988 1451 mutex_unlock(&vm->hotplug_mutex); ··· 1990 1455 } 1991 1456 1992 1457 /* Try to unplug subblocks of plugged online blocks. */ 1993 - virtio_mem_for_each_mb_state_rev(vm, mb_id, 1994 - VIRTIO_MEM_MB_STATE_ONLINE) { 1995 - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, 1996 - &nb_sb); 1458 + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { 1459 + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); 1997 1460 if (rc || !nb_sb) 1998 1461 goto out_unlock; 1999 1462 mutex_unlock(&vm->hotplug_mutex); ··· 2007 1474 } 2008 1475 2009 1476 /* 1477 + * Try to offline and remove a big block from Linux and unplug it. Will fail 1478 + * with -EBUSY if some memory is busy and cannot get unplugged. 1479 + * 1480 + * Will modify the state of the memory block. Might temporarily drop the 1481 + * hotplug_mutex. 
1482 + */ 1483 + static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, 1484 + unsigned long bb_id) 1485 + { 1486 + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 1487 + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 1488 + unsigned long end_pfn = start_pfn + nr_pages; 1489 + unsigned long pfn; 1490 + struct page *page; 1491 + int rc; 1492 + 1493 + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1494 + VIRTIO_MEM_BBM_BB_ADDED)) 1495 + return -EINVAL; 1496 + 1497 + if (bbm_safe_unplug) { 1498 + /* 1499 + * Start by fake-offlining all memory. Once we marked the device 1500 + * block as fake-offline, all newly onlined memory will 1501 + * automatically be kept fake-offline. Protect from concurrent 1502 + * onlining/offlining until we have a consistent state. 1503 + */ 1504 + mutex_lock(&vm->hotplug_mutex); 1505 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1506 + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); 1507 + 1508 + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1509 + page = pfn_to_online_page(pfn); 1510 + if (!page) 1511 + continue; 1512 + 1513 + rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); 1514 + if (rc) { 1515 + end_pfn = pfn; 1516 + goto rollback_safe_unplug; 1517 + } 1518 + } 1519 + mutex_unlock(&vm->hotplug_mutex); 1520 + } 1521 + 1522 + rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); 1523 + if (rc) { 1524 + if (bbm_safe_unplug) { 1525 + mutex_lock(&vm->hotplug_mutex); 1526 + goto rollback_safe_unplug; 1527 + } 1528 + return rc; 1529 + } 1530 + 1531 + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 1532 + if (rc) 1533 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1534 + VIRTIO_MEM_BBM_BB_PLUGGED); 1535 + else 1536 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1537 + VIRTIO_MEM_BBM_BB_UNUSED); 1538 + return rc; 1539 + 1540 + rollback_safe_unplug: 1541 + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1542 + page = pfn_to_online_page(pfn); 1543 + if (!page) 1544 + 
continue; 1545 + virtio_mem_fake_online(pfn, PAGES_PER_SECTION); 1546 + } 1547 + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1548 + mutex_unlock(&vm->hotplug_mutex); 1549 + return rc; 1550 + } 1551 + 1552 + /* 1553 + * Try to remove a big block from Linux and unplug it. Will fail with 1554 + * -EBUSY if some memory is online. 1555 + * 1556 + * Will modify the state of the memory block. 1557 + */ 1558 + static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, 1559 + unsigned long bb_id) 1560 + { 1561 + int rc; 1562 + 1563 + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1564 + VIRTIO_MEM_BBM_BB_ADDED)) 1565 + return -EINVAL; 1566 + 1567 + rc = virtio_mem_bbm_remove_bb(vm, bb_id); 1568 + if (rc) 1569 + return -EBUSY; 1570 + 1571 + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 1572 + if (rc) 1573 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1574 + VIRTIO_MEM_BBM_BB_PLUGGED); 1575 + else 1576 + virtio_mem_bbm_set_bb_state(vm, bb_id, 1577 + VIRTIO_MEM_BBM_BB_UNUSED); 1578 + return rc; 1579 + } 1580 + 1581 + /* 1582 + * Test if a big block is completely offline. 1583 + */ 1584 + static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, 1585 + unsigned long bb_id) 1586 + { 1587 + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 1588 + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 1589 + unsigned long pfn; 1590 + 1591 + for (pfn = start_pfn; pfn < start_pfn + nr_pages; 1592 + pfn += PAGES_PER_SECTION) { 1593 + if (pfn_to_online_page(pfn)) 1594 + return false; 1595 + } 1596 + 1597 + return true; 1598 + } 1599 + 1600 + static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 1601 + { 1602 + uint64_t nb_bb = diff / vm->bbm.bb_size; 1603 + uint64_t bb_id; 1604 + int rc; 1605 + 1606 + if (!nb_bb) 1607 + return 0; 1608 + 1609 + /* Try to unplug completely offline big blocks first. 
*/ 1610 + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 1611 + cond_resched(); 1612 + /* 1613 + * As we're holding no locks, this check is racy as memory 1614 + * can get onlined in the meantime - but we'll fail gracefully. 1615 + */ 1616 + if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) 1617 + continue; 1618 + rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); 1619 + if (rc == -EBUSY) 1620 + continue; 1621 + if (!rc) 1622 + nb_bb--; 1623 + if (rc || !nb_bb) 1624 + return rc; 1625 + } 1626 + 1627 + if (!unplug_online) 1628 + return 0; 1629 + 1630 + /* Try to unplug any big blocks. */ 1631 + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 1632 + cond_resched(); 1633 + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 1634 + if (rc == -EBUSY) 1635 + continue; 1636 + if (!rc) 1637 + nb_bb--; 1638 + if (rc || !nb_bb) 1639 + return rc; 1640 + } 1641 + 1642 + return nb_bb ? -EBUSY : 0; 1643 + } 1644 + 1645 + /* 1646 + * Try to unplug the requested amount of memory. 1647 + */ 1648 + static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 1649 + { 1650 + if (vm->in_sbm) 1651 + return virtio_mem_sbm_unplug_request(vm, diff); 1652 + return virtio_mem_bbm_unplug_request(vm, diff); 1653 + } 1654 + 1655 + /* 2010 1656 * Try to unplug all blocks that couldn't be unplugged before, for example, 2011 1657 * because the hypervisor was busy. 
2012 1658 */ 2013 1659 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) 2014 1660 { 2015 - unsigned long mb_id; 1661 + unsigned long id; 2016 1662 int rc; 2017 1663 2018 - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) { 2019 - rc = virtio_mem_mb_unplug(vm, mb_id); 1664 + if (!vm->in_sbm) { 1665 + virtio_mem_bbm_for_each_bb(vm, id, 1666 + VIRTIO_MEM_BBM_BB_PLUGGED) { 1667 + rc = virtio_mem_bbm_unplug_bb(vm, id); 1668 + if (rc) 1669 + return rc; 1670 + virtio_mem_bbm_set_bb_state(vm, id, 1671 + VIRTIO_MEM_BBM_BB_UNUSED); 1672 + } 1673 + return 0; 1674 + } 1675 + 1676 + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 1677 + rc = virtio_mem_sbm_unplug_mb(vm, id); 2020 1678 if (rc) 2021 1679 return rc; 2022 - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); 1680 + virtio_mem_sbm_set_mb_state(vm, id, 1681 + VIRTIO_MEM_SBM_MB_UNUSED); 2023 1682 } 2024 1683 2025 1684 return 0; ··· 2236 1511 usable_region_size, &usable_region_size); 2237 1512 end_addr = vm->addr + usable_region_size; 2238 1513 end_addr = min(end_addr, phys_limit); 2239 - vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; 1514 + 1515 + if (vm->in_sbm) 1516 + vm->sbm.last_usable_mb_id = 1517 + virtio_mem_phys_to_mb_id(end_addr) - 1; 1518 + else 1519 + vm->bbm.last_usable_bb_id = 1520 + virtio_mem_phys_to_bb_id(vm, end_addr) - 1; 2240 1521 2241 1522 /* see if there is a request to change the size */ 2242 1523 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, ··· 2266 1535 if (vm->broken) 2267 1536 return; 2268 1537 1538 + atomic_set(&vm->wq_active, 1); 2269 1539 retry: 2270 1540 rc = 0; 2271 1541 ··· 2327 1595 "unknown error, marking device broken: %d\n", rc); 2328 1596 vm->broken = true; 2329 1597 } 1598 + 1599 + atomic_set(&vm->wq_active, 0); 2330 1600 } 2331 1601 2332 1602 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) ··· 2365 1631 static int virtio_mem_init(struct virtio_mem *vm) 
2366 1632 { 2367 1633 const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; 1634 + uint64_t sb_size, addr; 2368 1635 uint16_t node_id; 2369 1636 2370 1637 if (!vm->vdev->config->get) { ··· 2394 1659 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, 2395 1660 &vm->region_size); 2396 1661 2397 - /* 2398 - * We always hotplug memory in memory block granularity. This way, 2399 - * we have to wait for exactly one memory block to online. 2400 - */ 2401 - if (vm->device_block_size > memory_block_size_bytes()) { 2402 - dev_err(&vm->vdev->dev, 2403 - "The block size is not supported (too big).\n"); 2404 - return -EINVAL; 2405 - } 1662 + /* Determine the nid for the device based on the lowest address. */ 1663 + if (vm->nid == NUMA_NO_NODE) 1664 + vm->nid = memory_add_physaddr_to_nid(vm->addr); 2406 1665 2407 1666 /* bad device setup - warn only */ 2408 1667 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) ··· 2410 1681 "Some memory is not addressable. This can make some memory unusable.\n"); 2411 1682 2412 1683 /* 2413 - * Calculate the subblock size: 2414 - * - At least MAX_ORDER - 1 / pageblock_order. 2415 - * - At least the device block size. 2416 - * In the worst case, a single subblock per memory block. 1684 + * We want subblocks to span at least MAX_ORDER_NR_PAGES and 1685 + * pageblock_nr_pages pages. This: 1686 + * - Simplifies our page onlining code (virtio_mem_online_page_cb) 1687 + * and fake page onlining code (virtio_mem_fake_online). 1688 + * - Is required for now for alloc_contig_range() to work reliably - 1689 + * it doesn't properly handle smaller granularity on ZONE_NORMAL. 
2417 1690 */ 2418 - vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1, 2419 - pageblock_order); 2420 - vm->subblock_size = max_t(uint64_t, vm->device_block_size, 2421 - vm->subblock_size); 2422 - vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; 1691 + sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, 1692 + pageblock_nr_pages) * PAGE_SIZE; 1693 + sb_size = max_t(uint64_t, vm->device_block_size, sb_size); 2423 1694 2424 - /* Round up to the next full memory block */ 2425 - vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + 2426 - memory_block_size_bytes()); 2427 - vm->next_mb_id = vm->first_mb_id; 2428 - vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr + 2429 - vm->region_size) - 1; 1695 + if (sb_size < memory_block_size_bytes() && !force_bbm) { 1696 + /* SBM: At least two subblocks per Linux memory block. */ 1697 + vm->in_sbm = true; 1698 + vm->sbm.sb_size = sb_size; 1699 + vm->sbm.sbs_per_mb = memory_block_size_bytes() / 1700 + vm->sbm.sb_size; 1701 + 1702 + /* Round up to the next full memory block */ 1703 + addr = vm->addr + memory_block_size_bytes() - 1; 1704 + vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); 1705 + vm->sbm.next_mb_id = vm->sbm.first_mb_id; 1706 + } else { 1707 + /* BBM: At least one Linux memory block. 
*/ 1708 + vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, 1709 + memory_block_size_bytes()); 1710 + 1711 + if (bbm_block_size) { 1712 + if (!is_power_of_2(bbm_block_size)) { 1713 + dev_warn(&vm->vdev->dev, 1714 + "bbm_block_size is not a power of 2"); 1715 + } else if (bbm_block_size < vm->bbm.bb_size) { 1716 + dev_warn(&vm->vdev->dev, 1717 + "bbm_block_size is too small"); 1718 + } else { 1719 + vm->bbm.bb_size = bbm_block_size; 1720 + } 1721 + } 1722 + 1723 + /* Round up to the next aligned big block */ 1724 + addr = vm->addr + vm->bbm.bb_size - 1; 1725 + vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 1726 + vm->bbm.next_bb_id = vm->bbm.first_bb_id; 1727 + } 1728 + 1729 + /* Prepare the offline threshold - make sure we can add two blocks. */ 1730 + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 1731 + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 1732 + /* In BBM, we also want at least two big blocks. */ 1733 + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 1734 + vm->offline_threshold); 2430 1735 2431 1736 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2432 1737 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); ··· 2468 1705 (unsigned long long)vm->device_block_size); 2469 1706 dev_info(&vm->vdev->dev, "memory block size: 0x%lx", 2470 1707 memory_block_size_bytes()); 2471 - dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 2472 - (unsigned long long)vm->subblock_size); 2473 - if (vm->nid != NUMA_NO_NODE) 1708 + if (vm->in_sbm) 1709 + dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 1710 + (unsigned long long)vm->sbm.sb_size); 1711 + else 1712 + dev_info(&vm->vdev->dev, "big block size: 0x%llx", 1713 + (unsigned long long)vm->bbm.bb_size); 1714 + if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) 2474 1715 dev_info(&vm->vdev->dev, "nid: %d", vm->nid); 2475 1716 2476 1717 return 0; ··· 2518 1751 kfree(vm->parent_resource); 2519 1752 kfree(name); 2520 1753 vm->parent_resource = 
NULL; 1754 + } 1755 + 1756 + static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) 1757 + { 1758 + return 1; 1759 + } 1760 + 1761 + static bool virtio_mem_has_memory_added(struct virtio_mem *vm) 1762 + { 1763 + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 1764 + 1765 + return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, 1766 + vm->addr + vm->region_size, NULL, 1767 + virtio_mem_range_has_system_ram) == 1; 2521 1768 } 2522 1769 2523 1770 static int virtio_mem_probe(struct virtio_device *vdev) ··· 2630 1849 cancel_work_sync(&vm->wq); 2631 1850 hrtimer_cancel(&vm->retry_timer); 2632 1851 2633 - /* 2634 - * After we unregistered our callbacks, user space can online partially 2635 - * plugged offline blocks. Make sure to remove them. 2636 - */ 2637 - virtio_mem_for_each_mb_state(vm, mb_id, 2638 - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { 2639 - rc = virtio_mem_mb_remove(vm, mb_id); 2640 - BUG_ON(rc); 2641 - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); 1852 + if (vm->in_sbm) { 1853 + /* 1854 + * After we unregistered our callbacks, user space can online 1855 + * partially plugged offline blocks. Make sure to remove them. 1856 + */ 1857 + virtio_mem_sbm_for_each_mb(vm, mb_id, 1858 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 1859 + rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1860 + BUG_ON(rc); 1861 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1862 + VIRTIO_MEM_SBM_MB_UNUSED); 1863 + } 1864 + /* 1865 + * After we unregistered our callbacks, user space can no longer 1866 + * offline partially plugged online memory blocks. No need to 1867 + * worry about them. 1868 + */ 2642 1869 } 2643 - /* 2644 - * After we unregistered our callbacks, user space can no longer 2645 - * offline partially plugged online memory blocks. No need to worry 2646 - * about them. 2647 - */ 2648 1870 2649 1871 /* unregister callbacks */ 2650 1872 unregister_virtio_mem_device(vm); ··· 2658 1874 * the system. 
And there is no way to stop the driver/device from going 2659 1875 * away. Warn at least. 2660 1876 */ 2661 - if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || 2662 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || 2663 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || 2664 - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) { 1877 + if (virtio_mem_has_memory_added(vm)) { 2665 1878 dev_warn(&vdev->dev, "device still has system memory added\n"); 2666 1879 } else { 2667 1880 virtio_mem_delete_resource(vm); ··· 2666 1885 } 2667 1886 2668 1887 /* remove all tracking data - no locking needed */ 2669 - vfree(vm->mb_state); 2670 - vfree(vm->sb_bitmap); 1888 + if (vm->in_sbm) { 1889 + vfree(vm->sbm.mb_states); 1890 + vfree(vm->sbm.sb_states); 1891 + } else { 1892 + vfree(vm->bbm.bb_states); 1893 + } 2671 1894 2672 1895 /* reset the device and cleanup the queues */ 2673 1896 vdev->config->reset(vdev);
+4 -4
drivers/virtio/virtio_ring.c
··· 1608 1608 vq->num_added = 0; 1609 1609 vq->packed_ring = true; 1610 1610 vq->use_dma_api = vring_use_dma_api(vdev); 1611 - list_add_tail(&vq->vq.list, &vdev->vqs); 1612 1611 #ifdef DEBUG 1613 1612 vq->in_use = false; 1614 1613 vq->last_add_time_valid = false; ··· 1668 1669 cpu_to_le16(vq->packed.event_flags_shadow); 1669 1670 } 1670 1671 1672 + list_add_tail(&vq->vq.list, &vdev->vqs); 1671 1673 return &vq->vq; 1672 1674 1673 1675 err_desc_extra: ··· 1676 1676 err_desc_state: 1677 1677 kfree(vq); 1678 1678 err_vq: 1679 - vring_free_queue(vdev, event_size_in_bytes, device, ring_dma_addr); 1679 + vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr); 1680 1680 err_device: 1681 - vring_free_queue(vdev, event_size_in_bytes, driver, ring_dma_addr); 1681 + vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr); 1682 1682 err_driver: 1683 1683 vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr); 1684 1684 err_ring: ··· 2085 2085 vq->last_used_idx = 0; 2086 2086 vq->num_added = 0; 2087 2087 vq->use_dma_api = vring_use_dma_api(vdev); 2088 - list_add_tail(&vq->vq.list, &vdev->vqs); 2089 2088 #ifdef DEBUG 2090 2089 vq->in_use = false; 2091 2090 vq->last_add_time_valid = false; ··· 2126 2127 memset(vq->split.desc_state, 0, vring.num * 2127 2128 sizeof(struct vring_desc_state_split)); 2128 2129 2130 + list_add_tail(&vq->vq.list, &vdev->vqs); 2129 2131 return &vq->vq; 2130 2132 } 2131 2133 EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
+1
include/linux/vdpa.h
··· 42 42 * @config: the configuration ops for this device. 43 43 * @index: device index 44 44 * @features_valid: were features initialized? for legacy guests 45 + * @nvqs: maximum number of supported virtqueues 45 46 */ 46 47 struct vdpa_device { 47 48 struct device dev;
+25 -19
include/uapi/linux/virtio_ids.h
··· 29 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 30 * SUCH DAMAGE. */ 31 31 32 - #define VIRTIO_ID_NET 1 /* virtio net */ 33 - #define VIRTIO_ID_BLOCK 2 /* virtio block */ 34 - #define VIRTIO_ID_CONSOLE 3 /* virtio console */ 35 - #define VIRTIO_ID_RNG 4 /* virtio rng */ 36 - #define VIRTIO_ID_BALLOON 5 /* virtio balloon */ 37 - #define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ 38 - #define VIRTIO_ID_SCSI 8 /* virtio scsi */ 39 - #define VIRTIO_ID_9P 9 /* 9p virtio console */ 40 - #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ 41 - #define VIRTIO_ID_CAIF 12 /* Virtio caif */ 42 - #define VIRTIO_ID_GPU 16 /* virtio GPU */ 43 - #define VIRTIO_ID_INPUT 18 /* virtio input */ 44 - #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ 45 - #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ 46 - #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ 47 - #define VIRTIO_ID_MEM 24 /* virtio mem */ 48 - #define VIRTIO_ID_FS 26 /* virtio filesystem */ 49 - #define VIRTIO_ID_PMEM 27 /* virtio pmem */ 50 - #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ 32 + #define VIRTIO_ID_NET 1 /* virtio net */ 33 + #define VIRTIO_ID_BLOCK 2 /* virtio block */ 34 + #define VIRTIO_ID_CONSOLE 3 /* virtio console */ 35 + #define VIRTIO_ID_RNG 4 /* virtio rng */ 36 + #define VIRTIO_ID_BALLOON 5 /* virtio balloon */ 37 + #define VIRTIO_ID_IOMEM 6 /* virtio ioMemory */ 38 + #define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ 39 + #define VIRTIO_ID_SCSI 8 /* virtio scsi */ 40 + #define VIRTIO_ID_9P 9 /* 9p virtio console */ 41 + #define VIRTIO_ID_MAC80211_WLAN 10 /* virtio WLAN MAC */ 42 + #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ 43 + #define VIRTIO_ID_CAIF 12 /* Virtio caif */ 44 + #define VIRTIO_ID_MEMORY_BALLOON 13 /* virtio memory balloon */ 45 + #define VIRTIO_ID_GPU 16 /* virtio GPU */ 46 + #define VIRTIO_ID_CLOCK 17 /* virtio clock/timer */ 47 + #define 
VIRTIO_ID_INPUT 18 /* virtio input */ 48 + #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ 49 + #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ 50 + #define VIRTIO_ID_SIGNAL_DIST 21 /* virtio signal distribution device */ 51 + #define VIRTIO_ID_PSTORE 22 /* virtio pstore device */ 52 + #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ 53 + #define VIRTIO_ID_MEM 24 /* virtio mem */ 54 + #define VIRTIO_ID_FS 26 /* virtio filesystem */ 55 + #define VIRTIO_ID_PMEM 27 /* virtio pmem */ 56 + #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ 51 57 52 58 #endif /* _LINUX_VIRTIO_IDS_H */
+91 -18
mm/memory_hotplug.c
··· 1784 1784 } 1785 1785 EXPORT_SYMBOL_GPL(remove_memory); 1786 1786 1787 + static int try_offline_memory_block(struct memory_block *mem, void *arg) 1788 + { 1789 + uint8_t online_type = MMOP_ONLINE_KERNEL; 1790 + uint8_t **online_types = arg; 1791 + struct page *page; 1792 + int rc; 1793 + 1794 + /* 1795 + * Sense the online_type via the zone of the memory block. Offlining 1796 + * with multiple zones within one memory block will be rejected 1797 + * by offlining code ... so we don't care about that. 1798 + */ 1799 + page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); 1800 + if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) 1801 + online_type = MMOP_ONLINE_MOVABLE; 1802 + 1803 + rc = device_offline(&mem->dev); 1804 + /* 1805 + * Default is MMOP_OFFLINE - change it only if offlining succeeded, 1806 + * so try_reonline_memory_block() can do the right thing. 1807 + */ 1808 + if (!rc) 1809 + **online_types = online_type; 1810 + 1811 + (*online_types)++; 1812 + /* Ignore if already offline. */ 1813 + return rc < 0 ? rc : 0; 1814 + } 1815 + 1816 + static int try_reonline_memory_block(struct memory_block *mem, void *arg) 1817 + { 1818 + uint8_t **online_types = arg; 1819 + int rc; 1820 + 1821 + if (**online_types != MMOP_OFFLINE) { 1822 + mem->online_type = **online_types; 1823 + rc = device_online(&mem->dev); 1824 + if (rc < 0) 1825 + pr_warn("%s: Failed to re-online memory: %d", 1826 + __func__, rc); 1827 + } 1828 + 1829 + /* Continue processing all remaining memory blocks. */ 1830 + (*online_types)++; 1831 + return 0; 1832 + } 1833 + 1787 1834 /* 1788 - * Try to offline and remove a memory block. Might take a long time to 1789 - * finish in case memory is still in use. Primarily useful for memory devices 1790 - * that logically unplugged all memory (so it's no longer in use) and want to 1791 - * offline + remove the memory block. 1835 + * Try to offline and remove memory. Might take a long time to finish in case 1836 + * memory is still in use. 
Primarily useful for memory devices that logically 1837 + * unplugged all memory (so it's no longer in use) and want to offline + remove 1838 + * that memory. 1792 1839 */ 1793 1840 int offline_and_remove_memory(int nid, u64 start, u64 size) 1794 1841 { 1795 - struct memory_block *mem; 1796 - int rc = -EINVAL; 1842 + const unsigned long mb_count = size / memory_block_size_bytes(); 1843 + uint8_t *online_types, *tmp; 1844 + int rc; 1797 1845 1798 1846 if (!IS_ALIGNED(start, memory_block_size_bytes()) || 1799 - size != memory_block_size_bytes()) 1800 - return rc; 1801 - 1802 - lock_device_hotplug(); 1803 - mem = find_memory_block(__pfn_to_section(PFN_DOWN(start))); 1804 - if (mem) 1805 - rc = device_offline(&mem->dev); 1806 - /* Ignore if the device is already offline. */ 1807 - if (rc > 0) 1808 - rc = 0; 1847 + !IS_ALIGNED(size, memory_block_size_bytes()) || !size) 1848 + return -EINVAL; 1809 1849 1810 1850 /* 1811 - * In case we succeeded to offline the memory block, remove it. 1851 + * We'll remember the old online type of each memory block, so we can 1852 + * try to revert whatever we did when offlining one memory block fails 1853 + * after offlining some others succeeded. 1854 + */ 1855 + online_types = kmalloc_array(mb_count, sizeof(*online_types), 1856 + GFP_KERNEL); 1857 + if (!online_types) 1858 + return -ENOMEM; 1859 + /* 1860 + * Initialize all states to MMOP_OFFLINE, so when we abort processing in 1861 + * try_offline_memory_block(), we'll skip all unprocessed blocks in 1862 + * try_reonline_memory_block(). 1863 + */ 1864 + memset(online_types, MMOP_OFFLINE, mb_count); 1865 + 1866 + lock_device_hotplug(); 1867 + 1868 + tmp = online_types; 1869 + rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); 1870 + 1871 + /* 1872 + * In case we succeeded to offline all memory, remove it. 1812 1873 * This cannot fail as it cannot get onlined in the meantime. 
1813 1874 */ 1814 1875 if (!rc) { 1815 1876 rc = try_remove_memory(nid, start, size); 1816 - WARN_ON_ONCE(rc); 1877 + if (rc) 1878 + pr_err("%s: Failed to remove memory: %d", __func__, rc); 1879 + } 1880 + 1881 + /* 1882 + * Rollback what we did. While memory onlining might theoretically fail 1883 + * (nacked by a notifier), it barely ever happens. 1884 + */ 1885 + if (rc) { 1886 + tmp = online_types; 1887 + walk_memory_blocks(start, size, &tmp, 1888 + try_reonline_memory_block); 1817 1889 } 1818 1890 unlock_device_hotplug(); 1819 1891 1892 + kfree(online_types); 1820 1893 return rc; 1821 1894 } 1822 1895 EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+10
tools/virtio/asm/barrier.h
··· 16 16 # define mb() abort() 17 17 # define dma_rmb() abort() 18 18 # define dma_wmb() abort() 19 + #elif defined(__aarch64__) 20 + #define dmb(opt) asm volatile("dmb " #opt : : : "memory") 21 + #define virt_mb() __sync_synchronize() 22 + #define virt_rmb() dmb(ishld) 23 + #define virt_wmb() dmb(ishst) 24 + #define virt_store_mb(var, value) do { WRITE_ONCE(var, value); dmb(ish); } while (0) 25 + /* Weak barriers should be used. If not - it's a bug */ 26 + # define mb() abort() 27 + # define dma_rmb() abort() 28 + # define dma_wmb() abort() 19 29 #else 20 30 #error Please fill in barrier macros 21 31 #endif
+2
tools/virtio/linux/bug.h
··· 2 2 #ifndef BUG_H 3 3 #define BUG_H 4 4 5 + #include <asm/bug.h> 6 + 5 7 #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) 6 8 7 9 #define BUILD_BUG_ON(x)
+11 -2
tools/virtio/linux/kernel.h
··· 11 11 12 12 #include <linux/compiler.h> 13 13 #include <linux/types.h> 14 + #include <linux/overflow.h> 14 15 #include <linux/list.h> 15 16 #include <linux/printk.h> 16 17 #include <linux/bug.h> ··· 118 117 # define unlikely(x) (__builtin_expect(!!(x), 0)) 119 118 # endif 120 119 120 + static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp) 121 + { 122 + size_t bytes; 123 + 124 + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) 125 + return NULL; 126 + 127 + return krealloc(p, bytes, gfp); 128 + } 129 + 121 130 #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) 122 131 #ifdef DEBUG 123 132 #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) ··· 136 125 #endif 137 126 #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) 138 127 #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) 139 - 140 - #define WARN_ON_ONCE(cond) (unlikely(cond) ? fprintf (stderr, "WARNING\n") : 0) 141 128 142 129 #define min(x, y) ({ \ 143 130 typeof(x) _min1 = (x); \