Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Virtio-mem device driver.
4 *
5 * Copyright Red Hat, Inc. 2020
6 *
7 * Author(s): David Hildenbrand <david@redhat.com>
8 */
9
10#include <linux/virtio.h>
11#include <linux/virtio_mem.h>
12#include <linux/workqueue.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/memory_hotplug.h>
17#include <linux/memory.h>
18#include <linux/hrtimer.h>
19#include <linux/crash_dump.h>
20#include <linux/mutex.h>
21#include <linux/bitmap.h>
22#include <linux/lockdep.h>
23#include <linux/log2.h>
24#include <linux/vmalloc.h>
25#include <linux/suspend.h>
26
27#include <acpi/acpi_numa.h>
28
29static bool unplug_online = true;
30module_param(unplug_online, bool, 0644);
31MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
32
33static bool force_bbm;
34module_param(force_bbm, bool, 0444);
35MODULE_PARM_DESC(force_bbm,
36 "Force Big Block Mode. Default is 0 (auto-selection)");
37
38static unsigned long bbm_block_size;
39module_param(bbm_block_size, ulong, 0444);
40MODULE_PARM_DESC(bbm_block_size,
41 "Big Block size in bytes. Default is 0 (auto-detection).");
42
43/*
44 * virtio-mem currently supports the following modes of operation:
45 *
46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
47 * size of a Sub Block (SB) is determined based on the device block size, the
48 * pageblock size, and the maximum allocation granularity of the buddy.
49 * Subblocks within a Linux memory block might either be plugged or unplugged.
50 * Memory is added/removed to Linux MM in Linux memory block granularity.
51 *
52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
53 * Memory is added/removed to Linux MM in Big Block granularity.
54 *
55 * The mode is determined automatically based on the Linux memory block size
56 * and the device block size.
57 *
58 * User space / core MM (auto onlining) is responsible for onlining added
59 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
60 * always onlined separately, and all memory within a Linux memory block is
61 * onlined to the same zone - virtio-mem relies on this behavior.
62 */
63
/*
 * Per Linux memory block state used in Sub Block Mode (SBM).
 *
 * The numeric values index virtio_mem::sbm.mb_count[], so
 * VIRTIO_MEM_SBM_MB_COUNT has to stay the last enumerator.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged and not added to Linux; may be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED		= 0,
	/* (Partially) plugged but not added to Linux: add_memory() failed. */
	VIRTIO_MEM_SBM_MB_PLUGGED		= 1,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE		= 2,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL	= 3,
	/* Fully plugged, fully added to Linux, online in a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL		= 4,
	/* Partially plugged, fully added to Linux, online in a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL	= 5,
	/* Fully plugged, fully added to Linux, online in ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE		= 6,
	/* Partially plugged, fully added to Linux, online in ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL	= 7,
	/* Number of states; not a state itself. */
	VIRTIO_MEM_SBM_MB_COUNT
};
86
/*
 * Per Big Block (BB) state used in Big Block Mode (BBM); one big block
 * covers 1..X Linux memory blocks.
 *
 * The numeric values index virtio_mem::bbm.bb_count[], so
 * VIRTIO_MEM_BBM_BB_COUNT has to stay the last enumerator.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged and not added to Linux; may be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED	= 0,
	/* Plugged but not added to Linux: add_memory() failed. */
	VIRTIO_MEM_BBM_BB_PLUGGED	= 1,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED		= 2,
	/* Every online part is fake-offline; the block is ready for removal. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE	= 3,
	/* Number of states; not a state itself. */
	VIRTIO_MEM_BBM_BB_COUNT
};
101
102struct virtio_mem {
103 struct virtio_device *vdev;
104
105 /* We might first have to unplug all memory when starting up. */
106 bool unplug_all_required;
107
108 /* Workqueue that processes the plug/unplug requests. */
109 struct work_struct wq;
110 atomic_t wq_active;
111 atomic_t config_changed;
112
113 /* Virtqueue for guest->host requests. */
114 struct virtqueue *vq;
115
116 /* Wait for a host response to a guest request. */
117 wait_queue_head_t host_resp;
118
119 /* Space for one guest request and the host response. */
120 struct virtio_mem_req req;
121 struct virtio_mem_resp resp;
122
123 /* The current size of the device. */
124 uint64_t plugged_size;
125 /* The requested size of the device. */
126 uint64_t requested_size;
127
128 /* The device block size (for communicating with the device). */
129 uint64_t device_block_size;
130 /* The determined node id for all memory of the device. */
131 int nid;
132 /* Physical start address of the memory region. */
133 uint64_t addr;
134 /* Maximum region size in bytes. */
135 uint64_t region_size;
136
137 /* The parent resource for all memory added via this device. */
138 struct resource *parent_resource;
139 /*
140 * Copy of "System RAM (virtio_mem)" to be used for
141 * add_memory_driver_managed().
142 */
143 const char *resource_name;
144 /* Memory group identification. */
145 int mgid;
146
147 /*
148 * We don't want to add too much memory if it's not getting onlined,
149 * to avoid running OOM. Besides this threshold, we allow to have at
150 * least two offline blocks at a time (whatever is bigger).
151 */
152#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
153 atomic64_t offline_size;
154 uint64_t offline_threshold;
155
156 /* If set, the driver is in SBM, otherwise in BBM. */
157 bool in_sbm;
158
159 union {
160 struct {
161 /* Id of the first memory block of this device. */
162 unsigned long first_mb_id;
163 /* Id of the last usable memory block of this device. */
164 unsigned long last_usable_mb_id;
165 /* Id of the next memory bock to prepare when needed. */
166 unsigned long next_mb_id;
167
168 /* The subblock size. */
169 uint64_t sb_size;
170 /* The number of subblocks per Linux memory block. */
171 uint32_t sbs_per_mb;
172
173 /*
174 * Some of the Linux memory blocks tracked as "partially
175 * plugged" are completely unplugged and can be offlined
176 * and removed -- which previously failed.
177 */
178 bool have_unplugged_mb;
179
180 /* Summary of all memory block states. */
181 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
182
183 /*
184 * One byte state per memory block. Allocated via
185 * vmalloc(). Resized (alloc+copy+free) on demand.
186 *
187 * With 128 MiB memory blocks, we have states for 512
188 * GiB of memory in one 4 KiB page.
189 */
190 uint8_t *mb_states;
191
192 /*
193 * Bitmap: one bit per subblock. Allocated similar to
194 * sbm.mb_states.
195 *
196 * A set bit means the corresponding subblock is
197 * plugged, otherwise it's unblocked.
198 *
199 * With 4 MiB subblocks, we manage 128 GiB of memory
200 * in one 4 KiB page.
201 */
202 unsigned long *sb_states;
203 } sbm;
204
205 struct {
206 /* Id of the first big block of this device. */
207 unsigned long first_bb_id;
208 /* Id of the last usable big block of this device. */
209 unsigned long last_usable_bb_id;
210 /* Id of the next device bock to prepare when needed. */
211 unsigned long next_bb_id;
212
213 /* Summary of all big block states. */
214 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
215
216 /* One byte state per big block. See sbm.mb_states. */
217 uint8_t *bb_states;
218
219 /* The block size used for plugging/adding/removing. */
220 uint64_t bb_size;
221 } bbm;
222 };
223
224 /*
225 * Mutex that protects the sbm.mb_count, sbm.mb_states,
226 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
227 *
228 * When this lock is held the pointers can't change, ONLINE and
229 * OFFLINE blocks can't change the state and no subblocks will get
230 * plugged/unplugged.
231 *
232 * In kdump mode, used to serialize requests, last_block_addr and
233 * last_block_plugged.
234 */
235 struct mutex hotplug_mutex;
236 bool hotplug_active;
237
238 /* An error occurred we cannot handle - stop processing requests. */
239 bool broken;
240
241 /* Cached valued of is_kdump_kernel() when the device was probed. */
242 bool in_kdump;
243
244 /* The driver is being removed. */
245 spinlock_t removal_lock;
246 bool removing;
247
248 /* Timer for retrying to plug/unplug memory. */
249 struct hrtimer retry_timer;
250 unsigned int retry_timer_ms;
251#define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000
252#define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000
253
254 /* Memory notifier (online/offline events). */
255 struct notifier_block memory_notifier;
256
257 /* Notifier to block hibernation image storing/reloading. */
258 struct notifier_block pm_notifier;
259
260#ifdef CONFIG_PROC_VMCORE
261 /* vmcore callback for /proc/vmcore handling in kdump mode */
262 struct vmcore_cb vmcore_cb;
263 uint64_t last_block_addr;
264 bool last_block_plugged;
265#endif /* CONFIG_PROC_VMCORE */
266
267 /* Next device in the list of virtio-mem devices. */
268 struct list_head next;
269};
270
/*
 * A single online_page callback is shared by all virtio-mem devices. The
 * callback iterates the device list using RCU; virtio_mem_mutex is taken
 * when devices are added to or removed from the list.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

/* Forward declarations for helpers that are defined further down. */
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
static int virtio_mem_create_resource(struct virtio_mem *vm);
static void virtio_mem_delete_resource(struct virtio_mem *vm);
286
287/*
288 * Register a virtio-mem device so it will be considered for the online_page
289 * callback.
290 */
291static int register_virtio_mem_device(struct virtio_mem *vm)
292{
293 int rc = 0;
294
295 /* First device registers the callback. */
296 mutex_lock(&virtio_mem_mutex);
297 if (list_empty(&virtio_mem_devices))
298 rc = set_online_page_callback(&virtio_mem_online_page_cb);
299 if (!rc)
300 list_add_rcu(&vm->next, &virtio_mem_devices);
301 mutex_unlock(&virtio_mem_mutex);
302
303 return rc;
304}
305
306/*
307 * Unregister a virtio-mem device so it will no longer be considered for the
308 * online_page callback.
309 */
310static void unregister_virtio_mem_device(struct virtio_mem *vm)
311{
312 /* Last device unregisters the callback. */
313 mutex_lock(&virtio_mem_mutex);
314 list_del_rcu(&vm->next);
315 if (list_empty(&virtio_mem_devices))
316 restore_online_page_callback(&virtio_mem_online_page_cb);
317 mutex_unlock(&virtio_mem_mutex);
318
319 synchronize_rcu();
320}
321
/*
 * Translate a physical address into the id of the Linux memory block that
 * contains it.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return addr / mb_size;
}
329
/*
 * Translate a Linux memory block id into the physical start address of that
 * memory block.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	const unsigned long mb_size = memory_block_size_bytes();

	return mb_id * mb_size;
}
337
338/*
339 * Calculate the big block id of a given address.
340 */
341static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
342 uint64_t addr)
343{
344 return addr / vm->bbm.bb_size;
345}
346
347/*
348 * Calculate the physical start address of a given big block id.
349 */
350static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
351 unsigned long bb_id)
352{
353 return bb_id * vm->bbm.bb_size;
354}
355
356/*
357 * Calculate the subblock id of a given address.
358 */
359static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
360 unsigned long addr)
361{
362 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
363 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
364
365 return (addr - mb_addr) / vm->sbm.sb_size;
366}
367
368/*
369 * Set the state of a big block, taking care of the state counter.
370 */
371static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
372 unsigned long bb_id,
373 enum virtio_mem_bbm_bb_state state)
374{
375 const unsigned long idx = bb_id - vm->bbm.first_bb_id;
376 enum virtio_mem_bbm_bb_state old_state;
377
378 old_state = vm->bbm.bb_states[idx];
379 vm->bbm.bb_states[idx] = state;
380
381 BUG_ON(vm->bbm.bb_count[old_state] == 0);
382 vm->bbm.bb_count[old_state]--;
383 vm->bbm.bb_count[state]++;
384}
385
386/*
387 * Get the state of a big block.
388 */
389static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
390 unsigned long bb_id)
391{
392 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
393}
394
395/*
396 * Prepare the big block state array for the next big block.
397 */
398static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
399{
400 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
401 unsigned long new_bytes = old_bytes + 1;
402 int old_pages = PFN_UP(old_bytes);
403 int new_pages = PFN_UP(new_bytes);
404 uint8_t *new_array;
405
406 if (vm->bbm.bb_states && old_pages == new_pages)
407 return 0;
408
409 new_array = vzalloc(new_pages * PAGE_SIZE);
410 if (!new_array)
411 return -ENOMEM;
412
413 mutex_lock(&vm->hotplug_mutex);
414 if (vm->bbm.bb_states)
415 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
416 vfree(vm->bbm.bb_states);
417 vm->bbm.bb_states = new_array;
418 mutex_unlock(&vm->hotplug_mutex);
419
420 return 0;
421}
422
/*
 * Iterate over all prepared big blocks of @_vm that are in state @_state, in
 * ascending id order. @_bb_id is the iterator and must be an lvalue.
 *
 * The loop stops early once bb_count[@_state] is zero. The macro expands to
 * a "for" followed by an "if", so the loop body attaches to that "if"; do
 * not follow the body with an "else".
 *
 * Every access goes through the @_vm parameter, so the macro does not depend
 * on the caller's variable being named "vm".
 */
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for ((_bb_id) = (_vm)->bbm.first_bb_id; \
	     (_bb_id) < (_vm)->bbm.next_bb_id && (_vm)->bbm.bb_count[_state]; \
	     (_bb_id)++) \
		if (virtio_mem_bbm_get_bb_state((_vm), (_bb_id)) == (_state))
428
/*
 * Iterate over all prepared big blocks of @_vm that are in state @_state, in
 * descending id order. @_bb_id is the iterator and must be an lvalue.
 *
 * The loop stops early once bb_count[@_state] is zero. The macro expands to
 * a "for" followed by an "if", so the loop body attaches to that "if"; do
 * not follow the body with an "else".
 *
 * Every access goes through the @_vm parameter, so the macro does not depend
 * on the caller's variable being named "vm".
 *
 * NOTE(review): with an unsigned iterator and first_bb_id == 0 the lower
 * bound check can never fail -- confirm first_bb_id is never 0.
 */
#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for ((_bb_id) = (_vm)->bbm.next_bb_id - 1; \
	     (_bb_id) >= (_vm)->bbm.first_bb_id && (_vm)->bbm.bb_count[_state]; \
	     (_bb_id)--) \
		if (virtio_mem_bbm_get_bb_state((_vm), (_bb_id)) == (_state))
434
435/*
436 * Set the state of a memory block, taking care of the state counter.
437 */
438static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
439 unsigned long mb_id, uint8_t state)
440{
441 const unsigned long idx = mb_id - vm->sbm.first_mb_id;
442 uint8_t old_state;
443
444 old_state = vm->sbm.mb_states[idx];
445 vm->sbm.mb_states[idx] = state;
446
447 BUG_ON(vm->sbm.mb_count[old_state] == 0);
448 vm->sbm.mb_count[old_state]--;
449 vm->sbm.mb_count[state]++;
450}
451
452/*
453 * Get the state of a memory block.
454 */
455static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
456 unsigned long mb_id)
457{
458 const unsigned long idx = mb_id - vm->sbm.first_mb_id;
459
460 return vm->sbm.mb_states[idx];
461}
462
463/*
464 * Prepare the state array for the next memory block.
465 */
466static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
467{
468 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
469 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
470 uint8_t *new_array;
471
472 if (vm->sbm.mb_states && old_pages == new_pages)
473 return 0;
474
475 new_array = vzalloc(new_pages * PAGE_SIZE);
476 if (!new_array)
477 return -ENOMEM;
478
479 mutex_lock(&vm->hotplug_mutex);
480 if (vm->sbm.mb_states)
481 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
482 vfree(vm->sbm.mb_states);
483 vm->sbm.mb_states = new_array;
484 mutex_unlock(&vm->hotplug_mutex);
485
486 return 0;
487}
488
/*
 * Iterate over all prepared Linux memory blocks of @_vm that are in state
 * @_state, in ascending id order. @_mb_id is the iterator and must be an
 * lvalue.
 *
 * The loop stops early once mb_count[@_state] is zero. The macro expands to
 * a "for" followed by an "if", so the loop body attaches to that "if"; do
 * not follow the body with an "else".
 *
 * All macro arguments are parenthesized so that expressions such as "&dev"
 * can be passed safely.
 */
#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for ((_mb_id) = (_vm)->sbm.first_mb_id; \
	     (_mb_id) < (_vm)->sbm.next_mb_id && (_vm)->sbm.mb_count[_state]; \
	     (_mb_id)++) \
		if (virtio_mem_sbm_get_mb_state((_vm), (_mb_id)) == (_state))
494
/*
 * Iterate over all prepared Linux memory blocks of @_vm that are in state
 * @_state, in descending id order. @_mb_id is the iterator and must be an
 * lvalue.
 *
 * The loop stops early once mb_count[@_state] is zero. The macro expands to
 * a "for" followed by an "if", so the loop body attaches to that "if"; do
 * not follow the body with an "else".
 *
 * All macro arguments are parenthesized so that expressions such as "&dev"
 * can be passed safely.
 *
 * NOTE(review): with an unsigned iterator and first_mb_id == 0 the lower
 * bound check can never fail -- confirm first_mb_id is never 0.
 */
#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for ((_mb_id) = (_vm)->sbm.next_mb_id - 1; \
	     (_mb_id) >= (_vm)->sbm.first_mb_id && (_vm)->sbm.mb_count[_state]; \
	     (_mb_id)--) \
		if (virtio_mem_sbm_get_mb_state((_vm), (_mb_id)) == (_state))
500
501/*
502 * Calculate the bit number in the subblock bitmap for the given subblock
503 * inside the given memory block.
504 */
505static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
506 unsigned long mb_id, int sb_id)
507{
508 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
509}
510
511/*
512 * Mark all selected subblocks plugged.
513 *
514 * Will not modify the state of the memory block.
515 */
516static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
517 unsigned long mb_id, int sb_id,
518 int count)
519{
520 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
521
522 __bitmap_set(vm->sbm.sb_states, bit, count);
523}
524
525/*
526 * Mark all selected subblocks unplugged.
527 *
528 * Will not modify the state of the memory block.
529 */
530static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
531 unsigned long mb_id, int sb_id,
532 int count)
533{
534 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
535
536 __bitmap_clear(vm->sbm.sb_states, bit, count);
537}
538
539/*
540 * Test if all selected subblocks are plugged.
541 */
542static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
543 unsigned long mb_id, int sb_id,
544 int count)
545{
546 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
547
548 if (count == 1)
549 return test_bit(bit, vm->sbm.sb_states);
550
551 /* TODO: Helper similar to bitmap_set() */
552 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
553 bit + count;
554}
555
556/*
557 * Test if all selected subblocks are unplugged.
558 */
559static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
560 unsigned long mb_id, int sb_id,
561 int count)
562{
563 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
564
565 /* TODO: Helper similar to bitmap_set() */
566 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
567 bit + count;
568}
569
570/*
571 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
572 * none.
573 */
574static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
575 unsigned long mb_id)
576{
577 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
578
579 return find_next_zero_bit(vm->sbm.sb_states,
580 bit + vm->sbm.sbs_per_mb, bit) - bit;
581}
582
583/*
584 * Prepare the subblock bitmap for the next memory block.
585 */
586static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
587{
588 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
589 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
590 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
591 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
592 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
593 unsigned long *new_bitmap, *old_bitmap;
594
595 if (vm->sbm.sb_states && old_pages == new_pages)
596 return 0;
597
598 new_bitmap = vzalloc(new_pages * PAGE_SIZE);
599 if (!new_bitmap)
600 return -ENOMEM;
601
602 mutex_lock(&vm->hotplug_mutex);
603 if (vm->sbm.sb_states)
604 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
605
606 old_bitmap = vm->sbm.sb_states;
607 vm->sbm.sb_states = new_bitmap;
608 mutex_unlock(&vm->hotplug_mutex);
609
610 vfree(old_bitmap);
611 return 0;
612}
613
614/*
615 * Test if we could add memory without creating too much offline memory -
616 * to avoid running OOM if memory is getting onlined deferred.
617 */
618static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
619{
620 if (WARN_ON_ONCE(size > vm->offline_threshold))
621 return false;
622
623 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
624}
625
626/*
627 * Try adding memory to Linux. Will usually only fail if out of memory.
628 *
629 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
630 * onlining code).
631 *
632 * Will not modify the state of memory blocks in virtio-mem.
633 */
634static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
635 uint64_t size)
636{
637 int rc;
638
639 /*
640 * When force-unloading the driver and we still have memory added to
641 * Linux, the resource name has to stay.
642 */
643 if (!vm->resource_name) {
644 vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
645 GFP_KERNEL);
646 if (!vm->resource_name)
647 return -ENOMEM;
648 }
649
650 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
651 addr + size - 1);
652 /* Memory might get onlined immediately. */
653 atomic64_add(size, &vm->offline_size);
654 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
655 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
656 if (rc) {
657 atomic64_sub(size, &vm->offline_size);
658 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
659 /*
660 * TODO: Linux MM does not properly clean up yet in all cases
661 * where adding of memory failed - especially on -ENOMEM.
662 */
663 }
664 return rc;
665}
666
667/*
668 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
669 */
670static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
671{
672 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
673 const uint64_t size = memory_block_size_bytes();
674
675 return virtio_mem_add_memory(vm, addr, size);
676}
677
678/*
679 * See virtio_mem_add_memory(): Try adding a big block.
680 */
681static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
682{
683 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
684 const uint64_t size = vm->bbm.bb_size;
685
686 return virtio_mem_add_memory(vm, addr, size);
687}
688
689/*
690 * Try removing memory from Linux. Will only fail if memory blocks aren't
691 * offline.
692 *
693 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
694 * onlining code).
695 *
696 * Will not modify the state of memory blocks in virtio-mem.
697 */
698static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
699 uint64_t size)
700{
701 int rc;
702
703 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
704 addr + size - 1);
705 rc = remove_memory(addr, size);
706 if (!rc) {
707 atomic64_sub(size, &vm->offline_size);
708 /*
709 * We might have freed up memory we can now unplug, retry
710 * immediately instead of waiting.
711 */
712 virtio_mem_retry(vm);
713 } else {
714 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
715 }
716 return rc;
717}
718
719/*
720 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
721 */
722static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
723{
724 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
725 const uint64_t size = memory_block_size_bytes();
726
727 return virtio_mem_remove_memory(vm, addr, size);
728}
729
730/*
731 * Try offlining and removing memory from Linux.
732 *
733 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
734 * onlining code).
735 *
736 * Will not modify the state of memory blocks in virtio-mem.
737 */
738static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
739 uint64_t addr,
740 uint64_t size)
741{
742 int rc;
743
744 dev_dbg(&vm->vdev->dev,
745 "offlining and removing memory: 0x%llx - 0x%llx\n", addr,
746 addr + size - 1);
747
748 rc = offline_and_remove_memory(addr, size);
749 if (!rc) {
750 atomic64_sub(size, &vm->offline_size);
751 /*
752 * We might have freed up memory we can now unplug, retry
753 * immediately instead of waiting.
754 */
755 virtio_mem_retry(vm);
756 return 0;
757 }
758 dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc);
759 /*
760 * We don't really expect this to fail, because we fake-offlined all
761 * memory already. But it could fail in corner cases.
762 */
763 WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
764 return rc == -ENOMEM ? -ENOMEM : -EBUSY;
765}
766
767/*
768 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
769 * a single Linux memory block.
770 */
771static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
772 unsigned long mb_id)
773{
774 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
775 const uint64_t size = memory_block_size_bytes();
776
777 return virtio_mem_offline_and_remove_memory(vm, addr, size);
778}
779
780/*
781 * Try (offlining and) removing memory from Linux in case all subblocks are
782 * unplugged. Can be called on online and offline memory blocks.
783 *
784 * May modify the state of memory blocks in virtio-mem.
785 */
786static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
787 unsigned long mb_id)
788{
789 int rc;
790
791 /*
792 * Once all subblocks of a memory block were unplugged, offline and
793 * remove it.
794 */
795 if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
796 return 0;
797
798 /* offline_and_remove_memory() works for online and offline memory. */
799 mutex_unlock(&vm->hotplug_mutex);
800 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
801 mutex_lock(&vm->hotplug_mutex);
802 if (!rc)
803 virtio_mem_sbm_set_mb_state(vm, mb_id,
804 VIRTIO_MEM_SBM_MB_UNUSED);
805 return rc;
806}
807
808/*
809 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a
810 * all Linux memory blocks covered by the big block.
811 */
812static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
813 unsigned long bb_id)
814{
815 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
816 const uint64_t size = vm->bbm.bb_size;
817
818 return virtio_mem_offline_and_remove_memory(vm, addr, size);
819}
820
821/*
822 * Trigger the workqueue so the device can perform its magic.
823 */
824static void virtio_mem_retry(struct virtio_mem *vm)
825{
826 unsigned long flags;
827
828 spin_lock_irqsave(&vm->removal_lock, flags);
829 if (!vm->removing)
830 queue_work(system_freezable_wq, &vm->wq);
831 spin_unlock_irqrestore(&vm->removal_lock, flags);
832}
833
834static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
835{
836 int node = NUMA_NO_NODE;
837
838#if defined(CONFIG_ACPI_NUMA)
839 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
840 node = pxm_to_node(node_id);
841#endif
842 return node;
843}
844
845/*
846 * Test if a virtio-mem device overlaps with the given range. Can be called
847 * from (notifier) callbacks lockless.
848 */
849static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
850 uint64_t size)
851{
852 return start < vm->addr + vm->region_size && vm->addr < start + size;
853}
854
855/*
856 * Test if a virtio-mem device contains a given range. Can be called from
857 * (notifier) callbacks lockless.
858 */
859static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
860 uint64_t size)
861{
862 return start >= vm->addr && start + size <= vm->addr + vm->region_size;
863}
864
865static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
866 unsigned long mb_id)
867{
868 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
869 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
870 case VIRTIO_MEM_SBM_MB_OFFLINE:
871 return NOTIFY_OK;
872 default:
873 break;
874 }
875 dev_warn_ratelimited(&vm->vdev->dev,
876 "memory block onlining denied\n");
877 return NOTIFY_BAD;
878}
879
880static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
881 unsigned long mb_id)
882{
883 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
884 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
885 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
886 virtio_mem_sbm_set_mb_state(vm, mb_id,
887 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
888 break;
889 case VIRTIO_MEM_SBM_MB_KERNEL:
890 case VIRTIO_MEM_SBM_MB_MOVABLE:
891 virtio_mem_sbm_set_mb_state(vm, mb_id,
892 VIRTIO_MEM_SBM_MB_OFFLINE);
893 break;
894 default:
895 BUG();
896 break;
897 }
898}
899
900static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
901 unsigned long mb_id,
902 unsigned long start_pfn)
903{
904 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn));
905 int new_state;
906
907 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
908 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
909 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
910 if (is_movable)
911 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
912 break;
913 case VIRTIO_MEM_SBM_MB_OFFLINE:
914 new_state = VIRTIO_MEM_SBM_MB_KERNEL;
915 if (is_movable)
916 new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
917 break;
918 default:
919 BUG();
920 break;
921 }
922 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
923}
924
925static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
926 unsigned long mb_id)
927{
928 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
929 unsigned long pfn;
930 int sb_id;
931
932 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
933 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
934 continue;
935 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
936 sb_id * vm->sbm.sb_size);
937 virtio_mem_fake_offline_going_offline(pfn, nr_pages);
938 }
939}
940
941static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
942 unsigned long mb_id)
943{
944 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
945 unsigned long pfn;
946 int sb_id;
947
948 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
949 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
950 continue;
951 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
952 sb_id * vm->sbm.sb_size);
953 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
954 }
955}
956
957static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
958 unsigned long bb_id,
959 unsigned long pfn,
960 unsigned long nr_pages)
961{
962 /*
963 * When marked as "fake-offline", all online memory of this device block
964 * is allocated by us. Otherwise, we don't have any memory allocated.
965 */
966 if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
967 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
968 return;
969 virtio_mem_fake_offline_going_offline(pfn, nr_pages);
970}
971
972static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
973 unsigned long bb_id,
974 unsigned long pfn,
975 unsigned long nr_pages)
976{
977 if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
978 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
979 return;
980 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
981}
982
983/*
984 * This callback will either be called synchronously from add_memory() or
985 * asynchronously (e.g., triggered via user space). We have to be careful
986 * with locking when calling add_memory().
987 */
988static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
989 unsigned long action, void *arg)
990{
991 struct virtio_mem *vm = container_of(nb, struct virtio_mem,
992 memory_notifier);
993 struct memory_notify *mhp = arg;
994 const unsigned long start = PFN_PHYS(mhp->start_pfn);
995 const unsigned long size = PFN_PHYS(mhp->nr_pages);
996 int rc = NOTIFY_OK;
997 unsigned long id;
998
999 if (!virtio_mem_overlaps_range(vm, start, size))
1000 return NOTIFY_DONE;
1001
1002 if (vm->in_sbm) {
1003 id = virtio_mem_phys_to_mb_id(start);
1004 /*
1005 * In SBM, we add memory in separate memory blocks - we expect
1006 * it to be onlined/offlined in the same granularity. Bail out
1007 * if this ever changes.
1008 */
1009 if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
1010 !IS_ALIGNED(start, memory_block_size_bytes())))
1011 return NOTIFY_BAD;
1012 } else {
1013 id = virtio_mem_phys_to_bb_id(vm, start);
1014 /*
1015 * In BBM, we only care about onlining/offlining happening
1016 * within a single big block, we don't care about the
1017 * actual granularity as we don't track individual Linux
1018 * memory blocks.
1019 */
1020 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
1021 return NOTIFY_BAD;
1022 }
1023
1024 /*
1025 * Avoid circular locking lockdep warnings. We lock the mutex
1026 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
1027 * blocking_notifier_call_chain() has its own lock, which gets unlocked
1028 * between both notifier calls and will bail out. False positive.
1029 */
1030 lockdep_off();
1031
1032 switch (action) {
1033 case MEM_GOING_OFFLINE:
1034 mutex_lock(&vm->hotplug_mutex);
1035 if (vm->removing) {
1036 rc = notifier_from_errno(-EBUSY);
1037 mutex_unlock(&vm->hotplug_mutex);
1038 break;
1039 }
1040 vm->hotplug_active = true;
1041 if (vm->in_sbm)
1042 virtio_mem_sbm_notify_going_offline(vm, id);
1043 else
1044 virtio_mem_bbm_notify_going_offline(vm, id,
1045 mhp->start_pfn,
1046 mhp->nr_pages);
1047 break;
1048 case MEM_GOING_ONLINE:
1049 mutex_lock(&vm->hotplug_mutex);
1050 if (vm->removing) {
1051 rc = notifier_from_errno(-EBUSY);
1052 mutex_unlock(&vm->hotplug_mutex);
1053 break;
1054 }
1055 vm->hotplug_active = true;
1056 if (vm->in_sbm)
1057 rc = virtio_mem_sbm_notify_going_online(vm, id);
1058 break;
1059 case MEM_OFFLINE:
1060 if (vm->in_sbm)
1061 virtio_mem_sbm_notify_offline(vm, id);
1062
1063 atomic64_add(size, &vm->offline_size);
1064 /*
1065 * Trigger the workqueue. Now that we have some offline memory,
1066 * maybe we can handle pending unplug requests.
1067 */
1068 if (!unplug_online)
1069 virtio_mem_retry(vm);
1070
1071 vm->hotplug_active = false;
1072 mutex_unlock(&vm->hotplug_mutex);
1073 break;
1074 case MEM_ONLINE:
1075 if (vm->in_sbm)
1076 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1077
1078 atomic64_sub(size, &vm->offline_size);
1079 /*
1080 * Start adding more memory once we onlined half of our
1081 * threshold. Don't trigger if it's possibly due to our actipn
1082 * (e.g., us adding memory which gets onlined immediately from
1083 * the core).
1084 */
1085 if (!atomic_read(&vm->wq_active) &&
1086 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1087 virtio_mem_retry(vm);
1088
1089 vm->hotplug_active = false;
1090 mutex_unlock(&vm->hotplug_mutex);
1091 break;
1092 case MEM_CANCEL_OFFLINE:
1093 if (!vm->hotplug_active)
1094 break;
1095 if (vm->in_sbm)
1096 virtio_mem_sbm_notify_cancel_offline(vm, id);
1097 else
1098 virtio_mem_bbm_notify_cancel_offline(vm, id,
1099 mhp->start_pfn,
1100 mhp->nr_pages);
1101 vm->hotplug_active = false;
1102 mutex_unlock(&vm->hotplug_mutex);
1103 break;
1104 case MEM_CANCEL_ONLINE:
1105 if (!vm->hotplug_active)
1106 break;
1107 vm->hotplug_active = false;
1108 mutex_unlock(&vm->hotplug_mutex);
1109 break;
1110 default:
1111 break;
1112 }
1113
1114 lockdep_on();
1115
1116 return rc;
1117}
1118
1119static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
1120 unsigned long action, void *arg)
1121{
1122 struct virtio_mem *vm = container_of(nb, struct virtio_mem,
1123 pm_notifier);
1124 switch (action) {
1125 case PM_HIBERNATION_PREPARE:
1126 case PM_RESTORE_PREPARE:
1127 /*
1128 * When restarting the VM, all memory is unplugged. Don't
1129 * allow to hibernate and restore from an image.
1130 */
1131 dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
1132 return NOTIFY_BAD;
1133 default:
1134 return NOTIFY_OK;
1135 }
1136}
1137
1138/*
1139 * Set a range of pages PG_offline. Remember pages that were never onlined
1140 * (via generic_online_page()) using PageDirty().
1141 */
1142static void virtio_mem_set_fake_offline(unsigned long pfn,
1143 unsigned long nr_pages, bool onlined)
1144{
1145 page_offline_begin();
1146 for (; nr_pages--; pfn++) {
1147 struct page *page = pfn_to_page(pfn);
1148
1149 __SetPageOffline(page);
1150 if (!onlined) {
1151 SetPageDirty(page);
1152 /* FIXME: remove after cleanups */
1153 ClearPageReserved(page);
1154 }
1155 }
1156 page_offline_end();
1157}
1158
1159/*
1160 * Clear PG_offline from a range of pages. If the pages were never onlined,
1161 * (via generic_online_page()), clear PageDirty().
1162 */
1163static void virtio_mem_clear_fake_offline(unsigned long pfn,
1164 unsigned long nr_pages, bool onlined)
1165{
1166 for (; nr_pages--; pfn++) {
1167 struct page *page = pfn_to_page(pfn);
1168
1169 __ClearPageOffline(page);
1170 if (!onlined)
1171 ClearPageDirty(page);
1172 }
1173}
1174
1175/*
1176 * Release a range of fake-offline pages to the buddy, effectively
1177 * fake-onlining them.
1178 */
1179static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1180{
1181 unsigned long order = MAX_PAGE_ORDER;
1182 unsigned long i;
1183
1184 /*
1185 * We might get called for ranges that don't cover properly aligned
1186 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
1187 * pages with an order of MAX_PAGE_ORDER at maximum.
1188 */
1189 while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
1190 order--;
1191
1192 for (i = 0; i < nr_pages; i += 1 << order) {
1193 struct page *page = pfn_to_page(pfn + i);
1194
1195 /*
1196 * If the page is PageDirty(), it was kept fake-offline when
1197 * onlining the memory block. Otherwise, it was allocated
1198 * using alloc_contig_range(). All pages in a subblock are
1199 * alike.
1200 */
1201 if (PageDirty(page)) {
1202 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false);
1203 generic_online_page(page, order);
1204 } else {
1205 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true);
1206 free_contig_range(pfn + i, 1 << order);
1207 adjust_managed_page_count(page, 1 << order);
1208 }
1209 }
1210}
1211
1212/*
1213 * Try to allocate a range, marking pages fake-offline, effectively
1214 * fake-offlining them.
1215 */
1216static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
1217 unsigned long nr_pages)
1218{
1219 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
1220 int rc, retry_count;
1221
1222 /*
1223 * TODO: We want an alloc_contig_range() mode that tries to allocate
1224 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1225 * with ZONE_MOVABLE. So for now, retry a couple of times with
1226 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1227 * some guarantees.
1228 */
1229 for (retry_count = 0; retry_count < 5; retry_count++) {
1230 /*
1231 * If the config changed, stop immediately and go back to the
1232 * main loop: avoid trying to keep unplugging if the device
1233 * might have decided to not remove any more memory.
1234 */
1235 if (atomic_read(&vm->config_changed))
1236 return -EAGAIN;
1237
1238 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1239 GFP_KERNEL);
1240 if (rc == -ENOMEM)
1241 /* whoops, out of memory */
1242 return rc;
1243 else if (rc && !is_movable)
1244 break;
1245 else if (rc)
1246 continue;
1247
1248 virtio_mem_set_fake_offline(pfn, nr_pages, true);
1249 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1250 return 0;
1251 }
1252
1253 return -EBUSY;
1254}
1255
/*
 * Prepare fake-offline pages for memory offlining - such that the pages can
 * be skipped by mm-core when offlining.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	unsigned long i;

	/*
	 * Add the unplugged pages to the managed page counters (so offlining
	 * code can correctly subtract them again).
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);

	/*
	 * Drop our reference to the pages so the memory can get offlined.
	 * Warn and dump the page if that was not the last reference.
	 */
	for (i = 0; i < nr_pages; i++) {
		struct page *page = pfn_to_page(pfn + i);

		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}
1279
/*
 * Revert virtio_mem_fake_offline_going_offline() after memory offlining
 * was canceled.
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	unsigned long i = 0;

	/* Subtract the unplugged pages from the managed page counters again. */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);

	/* Re-take the reference we dropped when going offline. */
	while (i < nr_pages) {
		page_ref_inc(pfn_to_page(pfn + i));
		i++;
	}
}
1297
/*
 * Online a page of the given order that belongs to this device: ranges that
 * are plugged (SBM) or not part of a fake-offline big block (BBM) are handed
 * to generic_online_page(); everything else is marked fake-offline via
 * virtio_mem_set_fake_offline(..., false).
 */
static void virtio_mem_online_page(struct virtio_mem *vm,
				   struct page *page, unsigned int order)
{
	const unsigned long start = page_to_phys(page);
	const unsigned long end = start + PFN_PHYS(1 << order);
	unsigned long addr, next, id, sb_id, count;
	bool do_online;

	/*
	 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
	 * size is smaller than that and we have a mixture of plugged and
	 * unplugged subblocks within such a page, we have to process in
	 * smaller granularity. In that case we'll adjust the order exactly once
	 * within the loop.
	 */
	for (addr = start; addr < end; ) {
		/* End of the chunk processed in this iteration. */
		next = addr + PFN_PHYS(1 << order);

		if (vm->in_sbm) {
			id = virtio_mem_phys_to_mb_id(addr);
			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
			/* Number of subblocks touched by [addr, next). */
			count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;

			if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
				/* Fully plugged. */
				do_online = true;
			} else if (count == 1 ||
				   virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
				/* Fully unplugged. */
				do_online = false;
			} else {
				/*
				 * Mixture, process sub-blocks instead. This
				 * will be at least the size of a pageblock.
				 * We'll run into this case exactly once.
				 */
				order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
				/*
				 * NOTE(review): addr is not advanced, so the
				 * same address is re-evaluated with the smaller
				 * order; do_online is assigned again on that
				 * pass before it is read, so this assignment
				 * looks redundant.
				 */
				do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
				continue;
			}
		} else {
			/*
			 * If the whole block is marked fake offline, keep
			 * everything that way.
			 */
			id = virtio_mem_phys_to_bb_id(vm, addr);
			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
		}

		if (do_online)
			generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
		else
			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
						    false);
		addr = next;
	}
}
1356
1357static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1358{
1359 const unsigned long addr = page_to_phys(page);
1360 struct virtio_mem *vm;
1361
1362 rcu_read_lock();
1363 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1364 /*
1365 * Pages we're onlining will never cross memory blocks and,
1366 * therefore, not virtio-mem devices.
1367 */
1368 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1369 continue;
1370
1371 /*
1372 * virtio_mem_set_fake_offline() might sleep. We can safely
1373 * drop the RCU lock at this point because the device
1374 * cannot go away. See virtio_mem_remove() how races
1375 * between memory onlining and device removal are handled.
1376 */
1377 rcu_read_unlock();
1378
1379 virtio_mem_online_page(vm, page, order);
1380 return;
1381 }
1382 rcu_read_unlock();
1383
1384 /* not virtio-mem memory, but e.g., a DIMM. online it */
1385 generic_online_page(page, order);
1386}
1387
1388static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1389 const struct virtio_mem_req *req)
1390{
1391 struct scatterlist *sgs[2], sg_req, sg_resp;
1392 unsigned int len;
1393 int rc;
1394
1395 /* don't use the request residing on the stack (vaddr) */
1396 vm->req = *req;
1397
1398 /* out: buffer for request */
1399 sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1400 sgs[0] = &sg_req;
1401
1402 /* in: buffer for response */
1403 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1404 sgs[1] = &sg_resp;
1405
1406 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1407 if (rc < 0)
1408 return rc;
1409
1410 virtqueue_kick(vm->vq);
1411
1412 /* wait for a response */
1413 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1414
1415 return virtio16_to_cpu(vm->vdev, vm->resp.type);
1416}
1417
1418static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1419 uint64_t size)
1420{
1421 const uint64_t nb_vm_blocks = size / vm->device_block_size;
1422 const struct virtio_mem_req req = {
1423 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1424 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1425 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1426 };
1427 int rc = -ENOMEM;
1428
1429 if (atomic_read(&vm->config_changed))
1430 return -EAGAIN;
1431
1432 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1433 addr + size - 1);
1434
1435 switch (virtio_mem_send_request(vm, &req)) {
1436 case VIRTIO_MEM_RESP_ACK:
1437 vm->plugged_size += size;
1438 return 0;
1439 case VIRTIO_MEM_RESP_NACK:
1440 rc = -EAGAIN;
1441 break;
1442 case VIRTIO_MEM_RESP_BUSY:
1443 rc = -ETXTBSY;
1444 break;
1445 case VIRTIO_MEM_RESP_ERROR:
1446 rc = -EINVAL;
1447 break;
1448 default:
1449 break;
1450 }
1451
1452 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1453 return rc;
1454}
1455
1456static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1457 uint64_t size)
1458{
1459 const uint64_t nb_vm_blocks = size / vm->device_block_size;
1460 const struct virtio_mem_req req = {
1461 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1462 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1463 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1464 };
1465 int rc = -ENOMEM;
1466
1467 if (atomic_read(&vm->config_changed))
1468 return -EAGAIN;
1469
1470 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1471 addr + size - 1);
1472
1473 switch (virtio_mem_send_request(vm, &req)) {
1474 case VIRTIO_MEM_RESP_ACK:
1475 vm->plugged_size -= size;
1476 return 0;
1477 case VIRTIO_MEM_RESP_BUSY:
1478 rc = -ETXTBSY;
1479 break;
1480 case VIRTIO_MEM_RESP_ERROR:
1481 rc = -EINVAL;
1482 break;
1483 default:
1484 break;
1485 }
1486
1487 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1488 return rc;
1489}
1490
1491static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1492{
1493 const struct virtio_mem_req req = {
1494 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1495 };
1496 int rc = -ENOMEM;
1497
1498 dev_dbg(&vm->vdev->dev, "unplugging all memory");
1499
1500 switch (virtio_mem_send_request(vm, &req)) {
1501 case VIRTIO_MEM_RESP_ACK:
1502 vm->unplug_all_required = false;
1503 vm->plugged_size = 0;
1504 /* usable region might have shrunk */
1505 atomic_set(&vm->config_changed, 1);
1506 return 0;
1507 case VIRTIO_MEM_RESP_BUSY:
1508 rc = -ETXTBSY;
1509 break;
1510 default:
1511 break;
1512 }
1513
1514 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1515 return rc;
1516}
1517
1518/*
1519 * Plug selected subblocks. Updates the plugged state, but not the state
1520 * of the memory block.
1521 */
1522static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1523 int sb_id, int count)
1524{
1525 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1526 sb_id * vm->sbm.sb_size;
1527 const uint64_t size = count * vm->sbm.sb_size;
1528 int rc;
1529
1530 rc = virtio_mem_send_plug_request(vm, addr, size);
1531 if (!rc)
1532 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1533 return rc;
1534}
1535
1536/*
1537 * Unplug selected subblocks. Updates the plugged state, but not the state
1538 * of the memory block.
1539 */
1540static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1541 int sb_id, int count)
1542{
1543 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1544 sb_id * vm->sbm.sb_size;
1545 const uint64_t size = count * vm->sbm.sb_size;
1546 int rc;
1547
1548 rc = virtio_mem_send_unplug_request(vm, addr, size);
1549 if (!rc)
1550 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1551 return rc;
1552}
1553
1554/*
1555 * Request to unplug a big block.
1556 *
1557 * Will not modify the state of the big block.
1558 */
1559static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1560{
1561 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1562 const uint64_t size = vm->bbm.bb_size;
1563
1564 return virtio_mem_send_unplug_request(vm, addr, size);
1565}
1566
1567/*
1568 * Request to plug a big block.
1569 *
1570 * Will not modify the state of the big block.
1571 */
1572static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1573{
1574 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1575 const uint64_t size = vm->bbm.bb_size;
1576
1577 return virtio_mem_send_plug_request(vm, addr, size);
1578}
1579
/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Subblocks are searched from the end of the memory block towards its start
 * and contiguous plugged subblocks are unplugged with a single request.
 * @nb_sb is decremented by the number of subblocks that were unplugged.
 *
 * Will not modify the state of the memory block.
 *
 * Returns 0 once @nb_sb reached zero or no plugged subblock is left, or the
 * error returned by virtio_mem_sbm_unplug_sb().
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
					    unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	/* Start with the last subblock and walk backwards. */
	sb_id = vm->sbm.sbs_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		/*
		 * Extend the run downwards while the preceding subblock is
		 * plugged as well; sb_id ends up at the start of the run.
		 */
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		/* Continue the search below the run we just unplugged. */
		sb_id--;
	}

	return 0;
}
1620
1621/*
1622 * Unplug all plugged subblocks of an offline or not-added memory block.
1623 *
1624 * Will not modify the state of the memory block.
1625 *
1626 * Note: can fail after some subblocks were unplugged.
1627 */
1628static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1629{
1630 uint64_t nb_sb = vm->sbm.sbs_per_mb;
1631
1632 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1633}
1634
1635/*
1636 * Prepare tracking data for the next memory block.
1637 */
1638static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1639 unsigned long *mb_id)
1640{
1641 int rc;
1642
1643 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1644 return -ENOSPC;
1645
1646 /* Resize the state array if required. */
1647 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1648 if (rc)
1649 return rc;
1650
1651 /* Resize the subblock bitmap if required. */
1652 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1653 if (rc)
1654 return rc;
1655
1656 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1657 *mb_id = vm->sbm.next_mb_id++;
1658 return 0;
1659}
1660
1661/*
1662 * Try to plug the desired number of subblocks and add the memory block
1663 * to Linux.
1664 *
1665 * Will modify the state of the memory block.
1666 */
1667static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1668 unsigned long mb_id, uint64_t *nb_sb)
1669{
1670 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1671 int rc;
1672
1673 if (WARN_ON_ONCE(!count))
1674 return -EINVAL;
1675
1676 /*
1677 * Plug the requested number of subblocks before adding it to linux,
1678 * so that onlining will directly online all plugged subblocks.
1679 */
1680 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1681 if (rc)
1682 return rc;
1683
1684 /*
1685 * Mark the block properly offline before adding it to Linux,
1686 * so the memory notifiers will find the block in the right state.
1687 */
1688 if (count == vm->sbm.sbs_per_mb)
1689 virtio_mem_sbm_set_mb_state(vm, mb_id,
1690 VIRTIO_MEM_SBM_MB_OFFLINE);
1691 else
1692 virtio_mem_sbm_set_mb_state(vm, mb_id,
1693 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1694
1695 /* Add the memory block to linux - if that fails, try to unplug. */
1696 rc = virtio_mem_sbm_add_mb(vm, mb_id);
1697 if (rc) {
1698 int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1699
1700 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1701 new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1702 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1703 return rc;
1704 }
1705
1706 *nb_sb -= count;
1707 return 0;
1708}
1709
1710/*
1711 * Try to plug the desired number of subblocks of a memory block that
1712 * is already added to Linux.
1713 *
1714 * Will modify the state of the memory block.
1715 *
1716 * Note: Can fail after some subblocks were successfully plugged.
1717 */
1718static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1719 unsigned long mb_id, uint64_t *nb_sb)
1720{
1721 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1722 unsigned long pfn, nr_pages;
1723 int sb_id, count;
1724 int rc;
1725
1726 if (WARN_ON_ONCE(!*nb_sb))
1727 return -EINVAL;
1728
1729 while (*nb_sb) {
1730 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1731 if (sb_id >= vm->sbm.sbs_per_mb)
1732 break;
1733 count = 1;
1734 while (count < *nb_sb &&
1735 sb_id + count < vm->sbm.sbs_per_mb &&
1736 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1737 count++;
1738
1739 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1740 if (rc)
1741 return rc;
1742 *nb_sb -= count;
1743 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1744 continue;
1745
1746 /* fake-online the pages if the memory block is online */
1747 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1748 sb_id * vm->sbm.sb_size);
1749 nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1750 virtio_mem_fake_online(pfn, nr_pages);
1751 }
1752
1753 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1754 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1755
1756 return 0;
1757}
1758
/*
 * Try to plug @diff bytes of memory in Sub Block Mode. @diff is rounded
 * down to full subblocks; nothing is done if it covers less than one.
 *
 * Subblocks are plugged in three stages: (1) unplugged subblocks of partially
 * plugged memory blocks that are already added to Linux, (2) unused memory
 * blocks and (3) newly prepared memory blocks.
 *
 * Returns 0 on success, -ENOSPC if virtio_mem_could_add_memory() refuses to
 * add another memory block, or the error of the failing helper.
 */
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
	/* Memory block states to consider in stage (1), in this order. */
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/* Don't race with onlining/offlining */
	mutex_lock(&vm->hotplug_mutex);

	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
			/* Stop on error or once everything was plugged. */
			if (rc || !nb_sb)
				goto out_unlock;
			cond_resched();
		}
	}

	/*
	 * We won't be working on online/offline memory blocks from this point,
	 * so we can't race with memory onlining/offlining. Drop the mutex.
	 */
	mutex_unlock(&vm->hotplug_mutex);

	/* Try to plug and add unused blocks */
	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc || !nb_sb)
			return rc;
		cond_resched();
	}

	/* Try to prepare, plug and add new blocks */
	while (nb_sb) {
		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
			return -ENOSPC;

		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
		if (rc)
			return rc;
		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
		if (rc)
			return rc;
		cond_resched();
	}

	return 0;
out_unlock:
	/* Only reached from stage (1), where the mutex is still held. */
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}
1821
1822/*
1823 * Plug a big block and add it to Linux.
1824 *
1825 * Will modify the state of the big block.
1826 */
1827static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1828 unsigned long bb_id)
1829{
1830 int rc;
1831
1832 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1833 VIRTIO_MEM_BBM_BB_UNUSED))
1834 return -EINVAL;
1835
1836 rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1837 if (rc)
1838 return rc;
1839 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1840
1841 rc = virtio_mem_bbm_add_bb(vm, bb_id);
1842 if (rc) {
1843 if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1844 virtio_mem_bbm_set_bb_state(vm, bb_id,
1845 VIRTIO_MEM_BBM_BB_UNUSED);
1846 else
1847 /* Retry from the main loop. */
1848 virtio_mem_bbm_set_bb_state(vm, bb_id,
1849 VIRTIO_MEM_BBM_BB_PLUGGED);
1850 return rc;
1851 }
1852 return 0;
1853}
1854
1855/*
1856 * Prepare tracking data for the next big block.
1857 */
1858static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1859 unsigned long *bb_id)
1860{
1861 int rc;
1862
1863 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1864 return -ENOSPC;
1865
1866 /* Resize the big block state array if required. */
1867 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1868 if (rc)
1869 return rc;
1870
1871 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1872 *bb_id = vm->bbm.next_bb_id;
1873 vm->bbm.next_bb_id++;
1874 return 0;
1875}
1876
1877static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1878{
1879 uint64_t nb_bb = diff / vm->bbm.bb_size;
1880 unsigned long bb_id;
1881 int rc;
1882
1883 if (!nb_bb)
1884 return 0;
1885
1886 /* Try to plug and add unused big blocks */
1887 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1888 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1889 return -ENOSPC;
1890
1891 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1892 if (!rc)
1893 nb_bb--;
1894 if (rc || !nb_bb)
1895 return rc;
1896 cond_resched();
1897 }
1898
1899 /* Try to prepare, plug and add new big blocks */
1900 while (nb_bb) {
1901 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1902 return -ENOSPC;
1903
1904 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1905 if (rc)
1906 return rc;
1907 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1908 if (!rc)
1909 nb_bb--;
1910 if (rc)
1911 return rc;
1912 cond_resched();
1913 }
1914
1915 return 0;
1916}
1917
1918/*
1919 * Try to plug the requested amount of memory.
1920 */
1921static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1922{
1923 if (vm->in_sbm)
1924 return virtio_mem_sbm_plug_request(vm, diff);
1925 return virtio_mem_bbm_plug_request(vm, diff);
1926}
1927
/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
 *
 * If no plugged subblock is left afterwards, the memory block is marked
 * unused and removed from Linux.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Returns 0 on success or the error of virtio_mem_sbm_unplug_any_sb_raw().
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
						unsigned long mb_id,
						uint64_t *nb_sb)
{
	int rc;

	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);

	/* some subblocks might have been unplugged even on failure */
	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
	/* Report the failure only after the state was brought up to date. */
	if (rc)
		return rc;

	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		/*
		 * Remove the block from Linux - this should never fail.
		 * Hinder the block from getting onlined by marking it
		 * unplugged. Temporarily drop the mutex, so
		 * any pending GOING_ONLINE requests can be serviced/rejected.
		 */
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_UNUSED);

		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
		BUG_ON(rc);
		mutex_lock(&vm->hotplug_mutex);
	}
	return 0;
}
1969
1970/*
1971 * Unplug the given plugged subblocks of an online memory block.
1972 *
1973 * Will modify the state of the memory block.
1974 */
1975static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1976 unsigned long mb_id, int sb_id,
1977 int count)
1978{
1979 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1980 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1981 unsigned long start_pfn;
1982 int rc;
1983
1984 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1985 sb_id * vm->sbm.sb_size);
1986
1987 rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
1988 if (rc)
1989 return rc;
1990
1991 /* Try to unplug the allocated memory */
1992 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1993 if (rc) {
1994 /* Return the memory to the buddy. */
1995 virtio_mem_fake_online(start_pfn, nr_pages);
1996 return rc;
1997 }
1998
1999 switch (old_state) {
2000 case VIRTIO_MEM_SBM_MB_KERNEL:
2001 virtio_mem_sbm_set_mb_state(vm, mb_id,
2002 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
2003 break;
2004 case VIRTIO_MEM_SBM_MB_MOVABLE:
2005 virtio_mem_sbm_set_mb_state(vm, mb_id,
2006 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
2007 break;
2008 }
2009
2010 return 0;
2011}
2012
/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * @nb_sb is decremented by the number of subblocks that were unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
					       unsigned long mb_id,
					       uint64_t *nb_sb)
{
	int rc, sb_id;

	/* If possible, try to unplug the complete block in one shot. */
	if (*nb_sb >= vm->sbm.sbs_per_mb &&
	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
						     vm->sbm.sbs_per_mb);
		if (!rc) {
			*nb_sb -= vm->sbm.sbs_per_mb;
			goto unplugged;
		} else if (rc != -EBUSY)
			return rc;
		/* -EBUSY: fall through and retry with single subblocks. */
	}

	/* Fallback to single subblocks. */
	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
		/* Busy subblocks are skipped, any other error is fatal. */
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
	/* Remember the failure in have_unplugged_mb. */
	if (rc)
		vm->sbm.have_unplugged_mb = 1;
	/* Ignore errors, this is not critical. We'll retry later. */
	return 0;
}
2065
2066/*
2067 * Unplug the desired number of plugged subblocks of a memory block that is
2068 * already added to Linux. Will skip subblock of online memory blocks that are
2069 * busy (by the OS). Will fail if any subblock that's not busy cannot get
2070 * unplugged.
2071 *
2072 * Will modify the state of the memory block. Might temporarily drop the
2073 * hotplug_mutex.
2074 *
2075 * Note: Can fail after some subblocks were successfully unplugged. Can
2076 * return 0 even if subblocks were busy and could not get unplugged.
2077 */
2078static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
2079 unsigned long mb_id,
2080 uint64_t *nb_sb)
2081{
2082 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
2083
2084 switch (old_state) {
2085 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
2086 case VIRTIO_MEM_SBM_MB_KERNEL:
2087 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
2088 case VIRTIO_MEM_SBM_MB_MOVABLE:
2089 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
2090 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
2091 case VIRTIO_MEM_SBM_MB_OFFLINE:
2092 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
2093 }
2094 return -EINVAL;
2095}
2096
/*
 * Try to unplug @diff bytes of memory in Sub Block Mode. @diff is rounded
 * down to full subblocks; nothing is done if it covers less than one.
 *
 * Returns 0 if everything was unplugged (or, with unplug_online disabled,
 * once all offline memory blocks were processed), -EBUSY if subblocks remain
 * to be unplugged after all memory blocks were processed, or the error of
 * virtio_mem_sbm_unplug_any_sb().
 */
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	/* Memory block states to process, in this order. */
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try unplug from partially plugged blocks first, to try removing
	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
	 * as it's more reliable to unplug memory and remove whole memory
	 * blocks, and we don't want to trigger a zone imbalances by
	 * accidentally removing too much kernel memory.
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			/* Stop on error or once everything was unplugged. */
			if (rc || !nb_sb)
				goto out_unlock;
			/* Reschedule without holding the mutex. */
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		/*
		 * Index 1 is the last offline state in mb_states[]: stop here
		 * if unplugging online memory is disabled.
		 */
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}
2149
2150/*
2151 * Try to offline and remove a big block from Linux and unplug it. Will fail
2152 * with -EBUSY if some memory is busy and cannot get unplugged.
2153 *
2154 * Will modify the state of the memory block. Might temporarily drop the
2155 * hotplug_mutex.
2156 */
2157static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2158 unsigned long bb_id)
2159{
2160 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2161 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2162 unsigned long end_pfn = start_pfn + nr_pages;
2163 unsigned long pfn;
2164 struct page *page;
2165 int rc;
2166
2167 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2168 VIRTIO_MEM_BBM_BB_ADDED))
2169 return -EINVAL;
2170
2171 /*
2172 * Start by fake-offlining all memory. Once we marked the device
2173 * block as fake-offline, all newly onlined memory will
2174 * automatically be kept fake-offline. Protect from concurrent
2175 * onlining/offlining until we have a consistent state.
2176 */
2177 mutex_lock(&vm->hotplug_mutex);
2178 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2179
2180 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2181 page = pfn_to_online_page(pfn);
2182 if (!page)
2183 continue;
2184
2185 rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
2186 if (rc) {
2187 end_pfn = pfn;
2188 goto rollback;
2189 }
2190 }
2191 mutex_unlock(&vm->hotplug_mutex);
2192
2193 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2194 if (rc) {
2195 mutex_lock(&vm->hotplug_mutex);
2196 goto rollback;
2197 }
2198
2199 rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2200 if (rc)
2201 virtio_mem_bbm_set_bb_state(vm, bb_id,
2202 VIRTIO_MEM_BBM_BB_PLUGGED);
2203 else
2204 virtio_mem_bbm_set_bb_state(vm, bb_id,
2205 VIRTIO_MEM_BBM_BB_UNUSED);
2206 return rc;
2207
2208rollback:
2209 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2210 page = pfn_to_online_page(pfn);
2211 if (!page)
2212 continue;
2213 virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2214 }
2215 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2216 mutex_unlock(&vm->hotplug_mutex);
2217 return rc;
2218}
2219
2220/*
2221 * Test if a big block is completely offline.
2222 */
2223static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2224 unsigned long bb_id)
2225{
2226 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2227 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2228 unsigned long pfn;
2229
2230 for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2231 pfn += PAGES_PER_SECTION) {
2232 if (pfn_to_online_page(pfn))
2233 return false;
2234 }
2235
2236 return true;
2237}
2238
2239/*
2240 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2241 */
2242static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2243 unsigned long bb_id)
2244{
2245 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2246 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2247 struct page *page;
2248 unsigned long pfn;
2249
2250 for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2251 pfn += PAGES_PER_SECTION) {
2252 page = pfn_to_online_page(pfn);
2253 if (!page)
2254 continue;
2255 if (page_zonenum(page) != ZONE_MOVABLE)
2256 return false;
2257 }
2258
2259 return true;
2260}
2261
2262static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2263{
2264 uint64_t nb_bb = diff / vm->bbm.bb_size;
2265 uint64_t bb_id;
2266 int rc, i;
2267
2268 if (!nb_bb)
2269 return 0;
2270
2271 /*
2272 * Try to unplug big blocks. Similar to SBM, start with offline
2273 * big blocks.
2274 */
2275 for (i = 0; i < 3; i++) {
2276 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2277 cond_resched();
2278
2279 /*
2280 * As we're holding no locks, these checks are racy,
2281 * but we don't care.
2282 */
2283 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2284 continue;
2285 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2286 continue;
2287 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2288 if (rc == -EBUSY)
2289 continue;
2290 if (!rc)
2291 nb_bb--;
2292 if (rc || !nb_bb)
2293 return rc;
2294 }
2295 if (i == 0 && !unplug_online)
2296 return 0;
2297 }
2298
2299 return nb_bb ? -EBUSY : 0;
2300}
2301
2302/*
2303 * Try to unplug the requested amount of memory.
2304 */
2305static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2306{
2307 if (vm->in_sbm)
2308 return virtio_mem_sbm_unplug_request(vm, diff);
2309 return virtio_mem_bbm_unplug_request(vm, diff);
2310}
2311
2312/*
2313 * Try to unplug all blocks that couldn't be unplugged before, for example,
2314 * because the hypervisor was busy. Further, offline and remove any memory
2315 * blocks where we previously failed.
2316 */
2317static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
2318{
2319 unsigned long id;
2320 int rc = 0;
2321
2322 if (!vm->in_sbm) {
2323 virtio_mem_bbm_for_each_bb(vm, id,
2324 VIRTIO_MEM_BBM_BB_PLUGGED) {
2325 rc = virtio_mem_bbm_unplug_bb(vm, id);
2326 if (rc)
2327 return rc;
2328 virtio_mem_bbm_set_bb_state(vm, id,
2329 VIRTIO_MEM_BBM_BB_UNUSED);
2330 }
2331 return 0;
2332 }
2333
2334 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2335 rc = virtio_mem_sbm_unplug_mb(vm, id);
2336 if (rc)
2337 return rc;
2338 virtio_mem_sbm_set_mb_state(vm, id,
2339 VIRTIO_MEM_SBM_MB_UNUSED);
2340 }
2341
2342 if (!vm->sbm.have_unplugged_mb)
2343 return 0;
2344
2345 /*
2346 * Let's retry (offlining and) removing completely unplugged Linux
2347 * memory blocks.
2348 */
2349 vm->sbm.have_unplugged_mb = false;
2350
2351 mutex_lock(&vm->hotplug_mutex);
2352 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
2353 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2354 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
2355 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2356 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
2357 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2358 mutex_unlock(&vm->hotplug_mutex);
2359
2360 if (rc)
2361 vm->sbm.have_unplugged_mb = true;
2362 /* Ignore errors, this is not critical. We'll retry later. */
2363 return 0;
2364}
2365
2366/*
2367 * Update all parts of the config that could have changed.
2368 */
2369static void virtio_mem_refresh_config(struct virtio_mem *vm)
2370{
2371 const struct range pluggable_range = mhp_get_pluggable_range(true);
2372 uint64_t new_plugged_size, usable_region_size, end_addr;
2373
2374 /* the plugged_size is just a reflection of what _we_ did previously */
2375 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2376 &new_plugged_size);
2377 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2378 vm->plugged_size = new_plugged_size;
2379
2380 /* calculate the last usable memory block id */
2381 virtio_cread_le(vm->vdev, struct virtio_mem_config,
2382 usable_region_size, &usable_region_size);
2383 end_addr = min(vm->addr + usable_region_size - 1,
2384 pluggable_range.end);
2385
2386 if (vm->in_sbm) {
2387 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2388 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2389 vm->sbm.last_usable_mb_id--;
2390 } else {
2391 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2392 end_addr);
2393 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2394 vm->bbm.last_usable_bb_id--;
2395 }
2396 /*
2397 * If we cannot plug any of our device memory (e.g., nothing in the
2398 * usable region is addressable), the last usable memory block id will
2399 * be smaller than the first usable memory block id. We'll stop
2400 * attempting to add memory with -ENOSPC from our main loop.
2401 */
2402
2403 /* see if there is a request to change the size */
2404 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2405 &vm->requested_size);
2406
2407 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2408 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2409}
2410
2411/*
2412 * Workqueue function for handling plug/unplug requests and config updates.
2413 */
2414static void virtio_mem_run_wq(struct work_struct *work)
2415{
2416 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2417 uint64_t diff;
2418 int rc;
2419
2420 if (unlikely(vm->in_kdump)) {
2421 dev_warn_once(&vm->vdev->dev,
2422 "unexpected workqueue run in kdump kernel\n");
2423 return;
2424 }
2425
2426 hrtimer_cancel(&vm->retry_timer);
2427
2428 if (vm->broken)
2429 return;
2430
2431 atomic_set(&vm->wq_active, 1);
2432retry:
2433 rc = 0;
2434
2435 /* Make sure we start with a clean state if there are leftovers. */
2436 if (unlikely(vm->unplug_all_required))
2437 rc = virtio_mem_send_unplug_all_request(vm);
2438
2439 if (atomic_read(&vm->config_changed)) {
2440 atomic_set(&vm->config_changed, 0);
2441 virtio_mem_refresh_config(vm);
2442 }
2443
2444 /* Cleanup any leftovers from previous runs */
2445 if (!rc)
2446 rc = virtio_mem_cleanup_pending_mb(vm);
2447
2448 if (!rc && vm->requested_size != vm->plugged_size) {
2449 if (vm->requested_size > vm->plugged_size) {
2450 diff = vm->requested_size - vm->plugged_size;
2451 rc = virtio_mem_plug_request(vm, diff);
2452 } else {
2453 diff = vm->plugged_size - vm->requested_size;
2454 rc = virtio_mem_unplug_request(vm, diff);
2455 }
2456 }
2457
2458 /*
2459 * Keep retrying to offline and remove completely unplugged Linux
2460 * memory blocks.
2461 */
2462 if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
2463 rc = -EBUSY;
2464
2465 switch (rc) {
2466 case 0:
2467 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2468 break;
2469 case -ENOSPC:
2470 /*
2471 * We cannot add any more memory (alignment, physical limit)
2472 * or we have too many offline memory blocks.
2473 */
2474 break;
2475 case -ETXTBSY:
2476 /*
2477 * The hypervisor cannot process our request right now
2478 * (e.g., out of memory, migrating);
2479 */
2480 case -EBUSY:
2481 /*
2482 * We cannot free up any memory to unplug it (all plugged memory
2483 * is busy).
2484 */
2485 case -ENOMEM:
2486 /* Out of memory, try again later. */
2487 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2488 HRTIMER_MODE_REL);
2489 break;
2490 case -EAGAIN:
2491 /* Retry immediately (e.g., the config changed). */
2492 goto retry;
2493 default:
2494 /* Unknown error, mark as broken */
2495 dev_err(&vm->vdev->dev,
2496 "unknown error, marking device broken: %d\n", rc);
2497 vm->broken = true;
2498 }
2499
2500 atomic_set(&vm->wq_active, 0);
2501}
2502
2503static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2504{
2505 struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2506 retry_timer);
2507
2508 virtio_mem_retry(vm);
2509 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2510 VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2511 return HRTIMER_NORESTART;
2512}
2513
2514static void virtio_mem_handle_response(struct virtqueue *vq)
2515{
2516 struct virtio_mem *vm = vq->vdev->priv;
2517
2518 wake_up(&vm->host_resp);
2519}
2520
2521static int virtio_mem_init_vq(struct virtio_mem *vm)
2522{
2523 struct virtqueue *vq;
2524
2525 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2526 "guest-request");
2527 if (IS_ERR(vq))
2528 return PTR_ERR(vq);
2529 vm->vq = vq;
2530
2531 return 0;
2532}
2533
2534static int virtio_mem_init_hotplug(struct virtio_mem *vm)
2535{
2536 const struct range pluggable_range = mhp_get_pluggable_range(true);
2537 uint64_t unit_pages, sb_size, addr;
2538 int rc;
2539
2540 /* bad device setup - warn only */
2541 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2542 dev_warn(&vm->vdev->dev,
2543 "The alignment of the physical start address can make some memory unusable.\n");
2544 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2545 dev_warn(&vm->vdev->dev,
2546 "The alignment of the physical end address can make some memory unusable.\n");
2547 if (vm->addr < pluggable_range.start ||
2548 vm->addr + vm->region_size - 1 > pluggable_range.end)
2549 dev_warn(&vm->vdev->dev,
2550 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2551
2552 /* Prepare the offline threshold - make sure we can add two blocks. */
2553 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2554 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2555
2556 /*
2557 * alloc_contig_range() works reliably with pageblock
2558 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
2559 */
2560 sb_size = PAGE_SIZE * pageblock_nr_pages;
2561 sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2562
2563 if (sb_size < memory_block_size_bytes() && !force_bbm) {
2564 /* SBM: At least two subblocks per Linux memory block. */
2565 vm->in_sbm = true;
2566 vm->sbm.sb_size = sb_size;
2567 vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2568 vm->sbm.sb_size;
2569
2570 /* Round up to the next full memory block */
2571 addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2572 memory_block_size_bytes() - 1;
2573 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2574 vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2575 } else {
2576 /* BBM: At least one Linux memory block. */
2577 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2578 memory_block_size_bytes());
2579
2580 if (bbm_block_size) {
2581 if (!is_power_of_2(bbm_block_size)) {
2582 dev_warn(&vm->vdev->dev,
2583 "bbm_block_size is not a power of 2");
2584 } else if (bbm_block_size < vm->bbm.bb_size) {
2585 dev_warn(&vm->vdev->dev,
2586 "bbm_block_size is too small");
2587 } else {
2588 vm->bbm.bb_size = bbm_block_size;
2589 }
2590 }
2591
2592 /* Round up to the next aligned big block */
2593 addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2594 vm->bbm.bb_size - 1;
2595 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2596 vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2597
2598 /* Make sure we can add two big blocks. */
2599 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2600 vm->offline_threshold);
2601 }
2602
2603 dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2604 memory_block_size_bytes());
2605 if (vm->in_sbm)
2606 dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2607 (unsigned long long)vm->sbm.sb_size);
2608 else
2609 dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2610 (unsigned long long)vm->bbm.bb_size);
2611
2612 /* create the parent resource for all memory */
2613 rc = virtio_mem_create_resource(vm);
2614 if (rc)
2615 return rc;
2616
2617 /* use a single dynamic memory group to cover the whole memory device */
2618 if (vm->in_sbm)
2619 unit_pages = PHYS_PFN(memory_block_size_bytes());
2620 else
2621 unit_pages = PHYS_PFN(vm->bbm.bb_size);
2622 rc = memory_group_register_dynamic(vm->nid, unit_pages);
2623 if (rc < 0)
2624 goto out_del_resource;
2625 vm->mgid = rc;
2626
2627 /*
2628 * If we still have memory plugged, we have to unplug all memory first.
2629 * Registering our parent resource makes sure that this memory isn't
2630 * actually in use (e.g., trying to reload the driver).
2631 */
2632 if (vm->plugged_size) {
2633 vm->unplug_all_required = true;
2634 dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2635 }
2636
2637 /* register callbacks */
2638 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2639 rc = register_memory_notifier(&vm->memory_notifier);
2640 if (rc)
2641 goto out_unreg_group;
2642 /* Block hibernation as early as possible. */
2643 vm->pm_notifier.priority = INT_MAX;
2644 vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
2645 rc = register_pm_notifier(&vm->pm_notifier);
2646 if (rc)
2647 goto out_unreg_mem;
2648 rc = register_virtio_mem_device(vm);
2649 if (rc)
2650 goto out_unreg_pm;
2651
2652 return 0;
2653out_unreg_pm:
2654 unregister_pm_notifier(&vm->pm_notifier);
2655out_unreg_mem:
2656 unregister_memory_notifier(&vm->memory_notifier);
2657out_unreg_group:
2658 memory_group_unregister(vm->mgid);
2659out_del_resource:
2660 virtio_mem_delete_resource(vm);
2661 return rc;
2662}
2663
2664#ifdef CONFIG_PROC_VMCORE
2665static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
2666 uint64_t size)
2667{
2668 const uint64_t nb_vm_blocks = size / vm->device_block_size;
2669 const struct virtio_mem_req req = {
2670 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
2671 .u.state.addr = cpu_to_virtio64(vm->vdev, addr),
2672 .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
2673 };
2674 int rc = -ENOMEM;
2675
2676 dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
2677 addr + size - 1);
2678
2679 switch (virtio_mem_send_request(vm, &req)) {
2680 case VIRTIO_MEM_RESP_ACK:
2681 return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
2682 case VIRTIO_MEM_RESP_ERROR:
2683 rc = -EINVAL;
2684 break;
2685 default:
2686 break;
2687 }
2688
2689 dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
2690 return rc;
2691}
2692
2693static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
2694 unsigned long pfn)
2695{
2696 struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2697 vmcore_cb);
2698 uint64_t addr = PFN_PHYS(pfn);
2699 bool is_ram;
2700 int rc;
2701
2702 if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
2703 return true;
2704 if (!vm->plugged_size)
2705 return false;
2706
2707 /*
2708 * We have to serialize device requests and access to the information
2709 * about the block queried last.
2710 */
2711 mutex_lock(&vm->hotplug_mutex);
2712
2713 addr = ALIGN_DOWN(addr, vm->device_block_size);
2714 if (addr != vm->last_block_addr) {
2715 rc = virtio_mem_send_state_request(vm, addr,
2716 vm->device_block_size);
2717 /* On any kind of error, we're going to signal !ram. */
2718 if (rc == VIRTIO_MEM_STATE_PLUGGED)
2719 vm->last_block_plugged = true;
2720 else
2721 vm->last_block_plugged = false;
2722 vm->last_block_addr = addr;
2723 }
2724
2725 is_ram = vm->last_block_plugged;
2726 mutex_unlock(&vm->hotplug_mutex);
2727 return is_ram;
2728}
2729#endif /* CONFIG_PROC_VMCORE */
2730
2731static int virtio_mem_init_kdump(struct virtio_mem *vm)
2732{
2733#ifdef CONFIG_PROC_VMCORE
2734 dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
2735 vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
2736 register_vmcore_cb(&vm->vmcore_cb);
2737 return 0;
2738#else /* CONFIG_PROC_VMCORE */
2739 dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
2740 return -EBUSY;
2741#endif /* CONFIG_PROC_VMCORE */
2742}
2743
2744static int virtio_mem_init(struct virtio_mem *vm)
2745{
2746 uint16_t node_id;
2747
2748 if (!vm->vdev->config->get) {
2749 dev_err(&vm->vdev->dev, "config access disabled\n");
2750 return -EINVAL;
2751 }
2752
2753 /* Fetch all properties that can't change. */
2754 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2755 &vm->plugged_size);
2756 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2757 &vm->device_block_size);
2758 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2759 &node_id);
2760 vm->nid = virtio_mem_translate_node_id(vm, node_id);
2761 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2762 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2763 &vm->region_size);
2764
2765 /* Determine the nid for the device based on the lowest address. */
2766 if (vm->nid == NUMA_NO_NODE)
2767 vm->nid = memory_add_physaddr_to_nid(vm->addr);
2768
2769 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2770 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2771 dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2772 (unsigned long long)vm->device_block_size);
2773 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2774 dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2775
2776 /*
2777 * We don't want to (un)plug or reuse any memory when in kdump. The
2778 * memory is still accessible (but not exposed to Linux).
2779 */
2780 if (vm->in_kdump)
2781 return virtio_mem_init_kdump(vm);
2782 return virtio_mem_init_hotplug(vm);
2783}
2784
2785static int virtio_mem_create_resource(struct virtio_mem *vm)
2786{
2787 /*
2788 * When force-unloading the driver and removing the device, we
2789 * could have a garbage pointer. Duplicate the string.
2790 */
2791 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2792
2793 if (!name)
2794 return -ENOMEM;
2795
2796 /* Disallow mapping device memory via /dev/mem completely. */
2797 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2798 name, IORESOURCE_SYSTEM_RAM |
2799 IORESOURCE_EXCLUSIVE);
2800 if (!vm->parent_resource) {
2801 kfree(name);
2802 dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2803 dev_info(&vm->vdev->dev,
2804 "reloading the driver is not supported\n");
2805 return -EBUSY;
2806 }
2807
2808 /* The memory is not actually busy - make add_memory() work. */
2809 vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2810 return 0;
2811}
2812
2813static void virtio_mem_delete_resource(struct virtio_mem *vm)
2814{
2815 const char *name;
2816
2817 if (!vm->parent_resource)
2818 return;
2819
2820 name = vm->parent_resource->name;
2821 release_resource(vm->parent_resource);
2822 kfree(vm->parent_resource);
2823 kfree(name);
2824 vm->parent_resource = NULL;
2825}
2826
/*
 * Resource walk callback: unconditionally report a hit by returning 1.
 * Neither the resource nor the argument is inspected.
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}
2831
2832static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2833{
2834 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2835
2836 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2837 vm->addr + vm->region_size, NULL,
2838 virtio_mem_range_has_system_ram) == 1;
2839}
2840
2841static int virtio_mem_probe(struct virtio_device *vdev)
2842{
2843 struct virtio_mem *vm;
2844 int rc;
2845
2846 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2847 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2848
2849 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2850 if (!vm)
2851 return -ENOMEM;
2852
2853 init_waitqueue_head(&vm->host_resp);
2854 vm->vdev = vdev;
2855 INIT_WORK(&vm->wq, virtio_mem_run_wq);
2856 mutex_init(&vm->hotplug_mutex);
2857 INIT_LIST_HEAD(&vm->next);
2858 spin_lock_init(&vm->removal_lock);
2859 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2860 vm->retry_timer.function = virtio_mem_timer_expired;
2861 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2862 vm->in_kdump = is_kdump_kernel();
2863
2864 /* register the virtqueue */
2865 rc = virtio_mem_init_vq(vm);
2866 if (rc)
2867 goto out_free_vm;
2868
2869 /* initialize the device by querying the config */
2870 rc = virtio_mem_init(vm);
2871 if (rc)
2872 goto out_del_vq;
2873
2874 virtio_device_ready(vdev);
2875
2876 /* trigger a config update to start processing the requested_size */
2877 if (!vm->in_kdump) {
2878 atomic_set(&vm->config_changed, 1);
2879 queue_work(system_freezable_wq, &vm->wq);
2880 }
2881
2882 return 0;
2883out_del_vq:
2884 vdev->config->del_vqs(vdev);
2885out_free_vm:
2886 kfree(vm);
2887 vdev->priv = NULL;
2888
2889 return rc;
2890}
2891
2892static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
2893{
2894 unsigned long mb_id;
2895 int rc;
2896
2897 /*
2898 * Make sure the workqueue won't be triggered anymore and no memory
2899 * blocks can be onlined/offlined until we're finished here.
2900 */
2901 mutex_lock(&vm->hotplug_mutex);
2902 spin_lock_irq(&vm->removal_lock);
2903 vm->removing = true;
2904 spin_unlock_irq(&vm->removal_lock);
2905 mutex_unlock(&vm->hotplug_mutex);
2906
2907 /* wait until the workqueue stopped */
2908 cancel_work_sync(&vm->wq);
2909 hrtimer_cancel(&vm->retry_timer);
2910
2911 if (vm->in_sbm) {
2912 /*
2913 * After we unregistered our callbacks, user space can online
2914 * partially plugged offline blocks. Make sure to remove them.
2915 */
2916 virtio_mem_sbm_for_each_mb(vm, mb_id,
2917 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2918 rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2919 BUG_ON(rc);
2920 virtio_mem_sbm_set_mb_state(vm, mb_id,
2921 VIRTIO_MEM_SBM_MB_UNUSED);
2922 }
2923 /*
2924 * After we unregistered our callbacks, user space can no longer
2925 * offline partially plugged online memory blocks. No need to
2926 * worry about them.
2927 */
2928 }
2929
2930 /* unregister callbacks */
2931 unregister_virtio_mem_device(vm);
2932 unregister_pm_notifier(&vm->pm_notifier);
2933 unregister_memory_notifier(&vm->memory_notifier);
2934
2935 /*
2936 * There is no way we could reliably remove all memory we have added to
2937 * the system. And there is no way to stop the driver/device from going
2938 * away. Warn at least.
2939 */
2940 if (virtio_mem_has_memory_added(vm)) {
2941 dev_warn(&vm->vdev->dev,
2942 "device still has system memory added\n");
2943 } else {
2944 virtio_mem_delete_resource(vm);
2945 kfree_const(vm->resource_name);
2946 memory_group_unregister(vm->mgid);
2947 }
2948
2949 /* remove all tracking data - no locking needed */
2950 if (vm->in_sbm) {
2951 vfree(vm->sbm.mb_states);
2952 vfree(vm->sbm.sb_states);
2953 } else {
2954 vfree(vm->bbm.bb_states);
2955 }
2956}
2957
/*
 * Tear down the kdump part of the device: unregister the vmcore callback.
 * Without CONFIG_PROC_VMCORE there is nothing to undo.
 */
static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	unregister_vmcore_cb(&vm->vmcore_cb);
#endif /* CONFIG_PROC_VMCORE */
}
2964
2965static void virtio_mem_remove(struct virtio_device *vdev)
2966{
2967 struct virtio_mem *vm = vdev->priv;
2968
2969 if (vm->in_kdump)
2970 virtio_mem_deinit_kdump(vm);
2971 else
2972 virtio_mem_deinit_hotplug(vm);
2973
2974 /* reset the device and cleanup the queues */
2975 virtio_reset_device(vdev);
2976 vdev->config->del_vqs(vdev);
2977
2978 kfree(vm);
2979 vdev->priv = NULL;
2980}
2981
2982static void virtio_mem_config_changed(struct virtio_device *vdev)
2983{
2984 struct virtio_mem *vm = vdev->priv;
2985
2986 if (unlikely(vm->in_kdump))
2987 return;
2988
2989 atomic_set(&vm->config_changed, 1);
2990 virtio_mem_retry(vm);
2991}
2992
2993#ifdef CONFIG_PM_SLEEP
2994static int virtio_mem_freeze(struct virtio_device *vdev)
2995{
2996 struct virtio_mem *vm = vdev->priv;
2997
2998 /*
2999 * We block hibernation using the PM notifier completely. The workqueue
3000 * is already frozen by the PM core at this point, so we simply
3001 * reset the device and cleanup the queues.
3002 */
3003 if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
3004 vm->plugged_size &&
3005 !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
3006 dev_err(&vm->vdev->dev,
3007 "suspending with plugged memory is not supported\n");
3008 return -EPERM;
3009 }
3010
3011 virtio_reset_device(vdev);
3012 vdev->config->del_vqs(vdev);
3013 vm->vq = NULL;
3014 return 0;
3015}
3016
3017static int virtio_mem_restore(struct virtio_device *vdev)
3018{
3019 struct virtio_mem *vm = vdev->priv;
3020 int ret;
3021
3022 ret = virtio_mem_init_vq(vm);
3023 if (ret)
3024 return ret;
3025 virtio_device_ready(vdev);
3026
3027 /* Let's check if anything changed. */
3028 virtio_mem_config_changed(vdev);
3029 return 0;
3030}
3031#endif
3032
3033static unsigned int virtio_mem_features[] = {
3034#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
3035 VIRTIO_MEM_F_ACPI_PXM,
3036#endif
3037 VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
3038 VIRTIO_MEM_F_PERSISTENT_SUSPEND,
3039};
3040
3041static const struct virtio_device_id virtio_mem_id_table[] = {
3042 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
3043 { 0 },
3044};
3045
3046static struct virtio_driver virtio_mem_driver = {
3047 .feature_table = virtio_mem_features,
3048 .feature_table_size = ARRAY_SIZE(virtio_mem_features),
3049 .driver.name = KBUILD_MODNAME,
3050 .id_table = virtio_mem_id_table,
3051 .probe = virtio_mem_probe,
3052 .remove = virtio_mem_remove,
3053 .config_changed = virtio_mem_config_changed,
3054#ifdef CONFIG_PM_SLEEP
3055 .freeze = virtio_mem_freeze,
3056 .restore = virtio_mem_restore,
3057#endif
3058};
3059
3060module_virtio_driver(virtio_mem_driver);
3061MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
3062MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
3063MODULE_DESCRIPTION("Virtio-mem driver");
3064MODULE_LICENSE("GPL");