Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

mm/memory_hotplug: track present pages in memory groups

Let's track all present pages in each memory group.  In particular,
track memory present in ZONE_MOVABLE and memory present in one of the
kernel zones (which currently is only ZONE_NORMAL, because memory
groups only apply to hotplugged memory) separately within a memory
group, to prepare for making smart auto-online decisions for individual
memory blocks within a memory group based on group statistics.

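As a rough illustration of where these per-group statistics are heading,
the sketch below shows one possible ratio-based auto-online check.  It is
not part of this patch: the function name, the movable_ratio parameter and
the policy itself are hypothetical, written as plain C for readability.

#include <stdbool.h>

/*
 * Hypothetical sketch, not from this patch: decide whether another
 * nr_pages of a memory group may still be onlined to ZONE_MOVABLE,
 * given how much of the group already sits in kernel zones vs.
 * ZONE_MOVABLE.  movable_ratio is a made-up tunable: the maximum
 * percentage of movable pages allowed per kernel-zone page
 * (e.g. 300 allows a 3:1 movable:kernel split).
 */
static bool group_can_online_movable(unsigned long present_kernel_pages,
				     unsigned long present_movable_pages,
				     unsigned long nr_pages,
				     unsigned long movable_ratio)
{
	unsigned long max_movable = present_kernel_pages * movable_ratio / 100;

	return present_movable_pages + nr_pages <= max_movable;
}
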
Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hui Zhu <teawater@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by David Hildenbrand, committed by Linus Torvalds
836809ec 028fc57a

4 files changed, 34 insertions(+), 14 deletions(-)

drivers/base/memory.c (+5 -5)
···
 	}
 
 	ret = online_pages(start_pfn + nr_vmemmap_pages,
-			   nr_pages - nr_vmemmap_pages, zone);
+			   nr_pages - nr_vmemmap_pages, zone, mem->group);
 	if (ret) {
 		if (nr_vmemmap_pages)
 			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
···
 	 * now already properly populated.
 	 */
 	if (nr_vmemmap_pages)
-		adjust_present_page_count(pfn_to_page(start_pfn),
+		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 					  nr_vmemmap_pages);
 
 	return ret;
···
 	 * can properly be torn down in offline_pages().
 	 */
 	if (nr_vmemmap_pages)
-		adjust_present_page_count(pfn_to_page(start_pfn),
+		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 					  -nr_vmemmap_pages);
 
 	ret = offline_pages(start_pfn + nr_vmemmap_pages,
-			    nr_pages - nr_vmemmap_pages);
+			    nr_pages - nr_vmemmap_pages, mem->group);
 	if (ret) {
 		/* offline_pages() failed. Account back. */
 		if (nr_vmemmap_pages)
 			adjust_present_page_count(pfn_to_page(start_pfn),
-						  nr_vmemmap_pages);
+						  mem->group, nr_vmemmap_pages);
 		return ret;
 	}

include/linux/memory.h (+6 -0)
···
  * struct memory_group - a logical group of memory blocks
  * @nid: The node id for all memory blocks inside the memory group.
  * @blocks: List of all memory blocks belonging to this memory group.
+ * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
+ *			  memory group.
+ * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
+ *			   memory group.
  * @is_dynamic: The memory group type: static vs. dynamic
  * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum
  *		 number of pages we'll have in this static memory group.
···
 struct memory_group {
 	int nid;
 	struct list_head memory_blocks;
+	unsigned long present_kernel_pages;
+	unsigned long present_movable_pages;
 	bool is_dynamic;
 	union {
 		struct {
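
Because the two new counters split a group's present pages by zone type,
the total present memory of a group is simply their sum.  A minimal helper
along these lines (hypothetical, not added by this patch, assuming
<linux/memory.h> is included) could look like:

/* Sketch only: total present (online) pages of a memory group. */
static inline unsigned long
memory_group_present_pages(const struct memory_group *group)
{
	return group->present_kernel_pages + group->present_movable_pages;
}
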
include/linux/memory_hotplug.h (+9 -4)
···
 struct pglist_data;
 struct mem_section;
 struct memory_block;
+struct memory_group;
 struct resource;
 struct vmem_altmap;
 
···
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
-extern void adjust_present_page_count(struct page *page, long nr_pages);
+extern void adjust_present_page_count(struct page *page,
+				      struct memory_group *group,
+				      long nr_pages);
 /* VM interface that may be used by firmware interface */
 extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 				     struct zone *zone);
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-			struct zone *zone);
+			struct zone *zone, struct memory_group *group);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
···
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+			 struct memory_group *group);
 extern int remove_memory(u64 start, u64 size);
 extern void __remove_memory(u64 start, u64 size);
 extern int offline_and_remove_memory(u64 start, u64 size);
···
 #else
 static inline void try_offline_node(int nid) {}
 
-static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+				struct memory_group *group)
 {
 	return -EINVAL;
 }

mm/memory_hotplug.c (+14 -5)
···
  * This function should only be called by memory_block_{online,offline},
  * and {online,offline}_pages.
  */
-void adjust_present_page_count(struct page *page, long nr_pages)
+void adjust_present_page_count(struct page *page, struct memory_group *group,
+			       long nr_pages)
 {
 	struct zone *zone = page_zone(page);
+	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
 
 	/*
 	 * We only support onlining/offlining/adding/removing of complete
···
 		zone->present_early_pages += nr_pages;
 	zone->present_pages += nr_pages;
 	zone->zone_pgdat->node_present_pages += nr_pages;
+
+	if (group && movable)
+		group->present_movable_pages += nr_pages;
+	else if (group && !movable)
+		group->present_kernel_pages += nr_pages;
 }
 
 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
···
 	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+		       struct zone *zone, struct memory_group *group)
 {
 	unsigned long flags;
 	int need_zonelists_rebuild = 0;
···
 	}
 
 	online_pages_range(pfn, nr_pages);
-	adjust_present_page_count(pfn_to_page(pfn), nr_pages);
+	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
 
 	node_states_set_node(nid, &arg);
 	if (need_zonelists_rebuild)
···
 	return 0;
 }
 
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+			struct memory_group *group)
 {
 	const unsigned long end_pfn = start_pfn + nr_pages;
 	unsigned long pfn, system_ram_pages = 0;
···
 
 	/* removal success */
 	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
-	adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages);
+	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
 
 	/* reinitialise watermarks and update pcp limits */
 	init_per_zone_wmark_min();
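
Note that the new accounting in adjust_present_page_count() only takes
effect when a memory group is passed: callers that online or offline
memory without an associated group hand in NULL and keep the existing
zone/node accounting unchanged.  The two added branches are equivalent to
the following slightly more explicit form (sketch only, same behaviour):

	/* Per-group accounting only applies when a group was supplied. */
	if (group) {
		if (movable)
			group->present_movable_pages += nr_pages;
		else
			group->present_kernel_pages += nr_pages;
	}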