Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/Hyper-V: Support for free page reporting

Linux now supports free page reporting (36e66c554b5c) for
virtualized environments. On Hyper-V, when virtually backed VMs are
configured, Hyper-V advertises the cold memory discard capability,
if supported. This patch adds support for hooking into the free
page reporting infrastructure and uses the Hyper-V cold memory
discard hint hypercall to report/free these pages back to the host.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Tested-by: Matheus Castello <matheus@castello.eng.br>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/SN4PR2101MB0880121FA4E2FEC67F35C1DCC0649@SN4PR2101MB0880.namprd21.prod.outlook.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>

authored by

Sunil Muthuswamy and committed by
Wei Liu
6dc2a774 1b602808

+180 -8
+50 -1
arch/x86/hyperv/hv_init.c
··· 498 498 x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; 499 499 #endif 500 500 501 + /* Query the VMs extended capability once, so that it can be cached. */ 502 + hv_query_ext_cap(0); 501 503 return; 502 504 503 505 remove_cpuhp_state: ··· 603 601 604 602 enum hv_isolation_type hv_get_isolation_type(void) 605 603 { 606 - if (!(ms_hyperv.features_b & HV_ISOLATION)) 604 + if (!(ms_hyperv.priv_high & HV_ISOLATION)) 607 605 return HV_ISOLATION_TYPE_NONE; 608 606 return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); 609 607 } ··· 614 612 return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; 615 613 } 616 614 EXPORT_SYMBOL_GPL(hv_is_isolation_supported); 615 + 616 + /* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ 617 + bool hv_query_ext_cap(u64 cap_query) 618 + { 619 + /* 620 + * The address of the 'hv_extended_cap' variable will be used as an 621 + * output parameter to the hypercall below and so it should be 622 + * compatible with 'virt_to_phys'. Which means, it's address should be 623 + * directly mapped. Use 'static' to keep it compatible; stack variables 624 + * can be virtually mapped, making them imcompatible with 625 + * 'virt_to_phys'. 626 + * Hypercall input/output addresses should also be 8-byte aligned. 627 + */ 628 + static u64 hv_extended_cap __aligned(8); 629 + static bool hv_extended_cap_queried; 630 + u64 status; 631 + 632 + /* 633 + * Querying extended capabilities is an extended hypercall. Check if the 634 + * partition supports extended hypercall, first. 635 + */ 636 + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) 637 + return false; 638 + 639 + /* Extended capabilities do not change at runtime. 
*/ 640 + if (hv_extended_cap_queried) 641 + return hv_extended_cap & cap_query; 642 + 643 + status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, 644 + &hv_extended_cap); 645 + 646 + /* 647 + * The query extended capabilities hypercall should not fail under 648 + * any normal circumstances. Avoid repeatedly making the hypercall, on 649 + * error. 650 + */ 651 + hv_extended_cap_queried = true; 652 + status &= HV_HYPERCALL_RESULT_MASK; 653 + if (status != HV_STATUS_SUCCESS) { 654 + pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", 655 + status); 656 + return false; 657 + } 658 + 659 + return hv_extended_cap & cap_query; 660 + } 661 + EXPORT_SYMBOL_GPL(hv_query_ext_cap);
+5 -4
arch/x86/kernel/cpu/mshyperv.c
··· 265 265 * Extract the features and hints 266 266 */ 267 267 ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); 268 - ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); 268 + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); 269 269 ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); 270 270 ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); 271 271 272 - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", 273 - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); 272 + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", 273 + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, 274 + ms_hyperv.misc_features); 274 275 275 276 ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); 276 277 ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); ··· 317 316 x86_platform.calibrate_cpu = hv_get_tsc_khz; 318 317 } 319 318 320 - if (ms_hyperv.features_b & HV_ISOLATION) { 319 + if (ms_hyperv.priv_high & HV_ISOLATION) { 321 320 ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); 322 321 ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); 323 322
+1
drivers/hv/Kconfig
··· 23 23 config HYPERV_BALLOON 24 24 tristate "Microsoft Hyper-V Balloon driver" 25 25 depends on HYPERV 26 + select PAGE_REPORTING 26 27 help 27 28 Select this option to enable Hyper-V Balloon driver. 28 29
+89
drivers/hv/hv_balloon.c
··· 21 21 #include <linux/memory.h> 22 22 #include <linux/notifier.h> 23 23 #include <linux/percpu_counter.h> 24 + #include <linux/page_reporting.h> 24 25 25 26 #include <linux/hyperv.h> 26 27 #include <asm/hyperv-tlfs.h> ··· 564 563 * The negotiated version agreed by host. 565 564 */ 566 565 __u32 version; 566 + 567 + struct page_reporting_dev_info pr_dev_info; 567 568 }; 568 569 569 570 static struct hv_dynmem_device dm_device; ··· 1571 1568 1572 1569 } 1573 1570 1571 + /* Hyper-V only supports reporting 2MB pages or higher */ 1572 + #define HV_MIN_PAGE_REPORTING_ORDER 9 1573 + #define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER) 1574 + static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, 1575 + struct scatterlist *sgl, unsigned int nents) 1576 + { 1577 + unsigned long flags; 1578 + struct hv_memory_hint *hint; 1579 + int i; 1580 + u64 status; 1581 + struct scatterlist *sg; 1582 + 1583 + WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); 1584 + WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN); 1585 + local_irq_save(flags); 1586 + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); 1587 + if (!hint) { 1588 + local_irq_restore(flags); 1589 + return -ENOSPC; 1590 + } 1591 + 1592 + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; 1593 + hint->reserved = 0; 1594 + for_each_sg(sgl, sg, nents, i) { 1595 + union hv_gpa_page_range *range; 1596 + 1597 + range = &hint->ranges[i]; 1598 + range->address_space = 0; 1599 + /* page reporting only reports 2MB pages or higher */ 1600 + range->page.largepage = 1; 1601 + range->page.additional_pages = 1602 + (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1; 1603 + range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; 1604 + range->base_large_pfn = 1605 + page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER; 1606 + } 1607 + 1608 + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, 1609 + hint, NULL); 1610 + 
local_irq_restore(flags); 1611 + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { 1612 + pr_err("Cold memory discard hypercall failed with status %llx\n", 1613 + status); 1614 + return -EINVAL; 1615 + } 1616 + 1617 + return 0; 1618 + } 1619 + 1620 + static void enable_page_reporting(void) 1621 + { 1622 + int ret; 1623 + 1624 + /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */ 1625 + if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) { 1626 + pr_debug("Cold memory discard is only supported on 2MB pages and above\n"); 1627 + return; 1628 + } 1629 + 1630 + if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) { 1631 + pr_debug("Cold memory discard hint not supported by Hyper-V\n"); 1632 + return; 1633 + } 1634 + 1635 + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); 1636 + dm_device.pr_dev_info.report = hv_free_page_report; 1637 + ret = page_reporting_register(&dm_device.pr_dev_info); 1638 + if (ret < 0) { 1639 + dm_device.pr_dev_info.report = NULL; 1640 + pr_err("Failed to enable cold memory discard: %d\n", ret); 1641 + } else { 1642 + pr_info("Cold memory discard hint enabled\n"); 1643 + } 1644 + } 1645 + 1646 + static void disable_page_reporting(void) 1647 + { 1648 + if (dm_device.pr_dev_info.report) { 1649 + page_reporting_unregister(&dm_device.pr_dev_info); 1650 + dm_device.pr_dev_info.report = NULL; 1651 + } 1652 + } 1653 + 1574 1654 static int balloon_connect_vsp(struct hv_device *dev) 1575 1655 { 1576 1656 struct dm_version_request version_req; ··· 1799 1713 if (ret != 0) 1800 1714 return ret; 1801 1715 1716 + enable_page_reporting(); 1802 1717 dm_device.state = DM_INITIALIZED; 1803 1718 1804 1719 dm_device.thread = ··· 1814 1727 probe_error: 1815 1728 dm_device.state = DM_INIT_ERROR; 1816 1729 dm_device.thread = NULL; 1730 + disable_page_reporting(); 1817 1731 vmbus_close(dev->channel); 1818 1732 #ifdef CONFIG_MEMORY_HOTPLUG 1819 1733 unregister_memory_notifier(&hv_memory_nb); ··· 
1837 1749 cancel_work_sync(&dm->ha_wrk.wrk); 1838 1750 1839 1751 kthread_stop(dm->thread); 1752 + disable_page_reporting(); 1840 1753 vmbus_close(dev->channel); 1841 1754 #ifdef CONFIG_MEMORY_HOTPLUG 1842 1755 unregister_memory_notifier(&hv_memory_nb);
+33 -2
include/asm-generic/hyperv-tlfs.h
··· 89 89 #define HV_ACCESS_STATS BIT(8) 90 90 #define HV_DEBUGGING BIT(11) 91 91 #define HV_CPU_MANAGEMENT BIT(12) 92 + #define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) 92 93 #define HV_ISOLATION BIT(22) 93 - 94 94 95 95 /* 96 96 * TSC page layout. ··· 159 159 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 160 160 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 161 161 162 + /* Extended hypercalls */ 163 + #define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 164 + #define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 165 + 162 166 #define HV_FLUSH_ALL_PROCESSORS BIT(0) 163 167 #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) 164 168 #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) 165 169 #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) 170 + 171 + /* Extended capability bits */ 172 + #define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8) 166 173 167 174 enum HV_GENERIC_SET_FORMAT { 168 175 HV_GENERIC_SET_SPARSE_4K, ··· 415 408 * by the bitwidth of "additional_pages" in union hv_gpa_page_range. 416 409 */ 417 410 #define HV_MAX_FLUSH_PAGES (2048) 411 + #define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0 412 + #define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1 418 413 419 - /* HvFlushGuestPhysicalAddressList hypercall */ 414 + /* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */ 420 415 union hv_gpa_page_range { 421 416 u64 address_space; 422 417 struct { ··· 426 417 u64 largepage:1; 427 418 u64 basepfn:52; 428 419 } page; 420 + struct { 421 + u64 reserved:12; 422 + u64 page_size:1; 423 + u64 reserved1:8; 424 + u64 base_large_pfn:43; 425 + }; 429 426 }; 430 427 431 428 /* ··· 788 773 789 774 #define HV_SOURCE_SHADOW_NONE 0x0 790 775 #define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 776 + 777 + /* 778 + * The whole argument should fit in a page to be able to pass to the hypervisor 779 + * in one hypercall. 
780 + */ 781 + #define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \ 782 + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \ 783 + sizeof(union hv_gpa_page_range)) 784 + 785 + /* HvExtCallMemoryHeatHint hypercall */ 786 + #define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2 787 + struct hv_memory_hint { 788 + u64 type:2; 789 + u64 reserved:62; 790 + union hv_gpa_page_range ranges[]; 791 + } __packed; 791 792 792 793 #endif
+2 -1
include/asm-generic/mshyperv.h
··· 27 27 28 28 struct ms_hyperv_info { 29 29 u32 features; 30 - u32 features_b; 30 + u32 priv_high; 31 31 u32 misc_features; 32 32 u32 hints; 33 33 u32 nested_features; ··· 179 179 enum hv_isolation_type hv_get_isolation_type(void); 180 180 bool hv_is_isolation_supported(void); 181 181 void hyperv_cleanup(void); 182 + bool hv_query_ext_cap(u64 cap_query); 182 183 #else /* CONFIG_HYPERV */ 183 184 static inline bool hv_is_hyperv_initialized(void) { return false; } 184 185 static inline bool hv_is_hibernation_supported(void) { return false; }