[PATCH] x86_64: add x86-64 support for memory hot-add

Add x86-64 specific memory hot-add functions, Kconfig options,
and runtime kernel page table update functions to make
hot-add usable on x86-64 machines. Also, fixup the nefarious
conditional locking and exports pointed out by Andi.

Tested on Intel and IBM x86-64 memory hot-add capable systems.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Matt Tolentino, committed by Linus Torvalds (44df75e6, 8817210d)

2 files changed, 135 insertions(+), 34 deletions(-)

arch/x86_64/Kconfig (+6 -1)
@@ ... @@
 
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
-       depends on NUMA
+       depends on (NUMA || EXPERIMENTAL)
+
+config ARCH_MEMORY_PROBE
+       def_bool y
+       depends on MEMORY_HOTPLUG
 
 config ARCH_FLATMEM_ENABLE
        def_bool y
@@ ... @@
 
 config HAVE_ARCH_EARLY_PFN_TO_NID
        def_bool y
+       depends on NUMA
 
 config NR_CPUS
        int "Maximum number of CPUs (2-256)"

arch/x86_64/mm/init.c (+129 -33)
@@ ... @@
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ ... @@
         {}
 };
 
-static __init void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
 {
         struct temp_map *ti;
         int i;
         unsigned long pfn = table_end++, paddr;
         void *adr;
+
+        if (after_bootmem) {
+                adr = (void *)get_zeroed_page(GFP_ATOMIC);
+                *phys = __pa(adr);
+                return adr;
+        }
 
         if (pfn >= end_pfn)
                 panic("alloc_low_page: ran out of memory");
@@ ... @@
         ti->allocated = 1;
         __flush_tlb();
         adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
+        memset(adr, 0, PAGE_SIZE);
         *index = i;
         *phys = pfn * PAGE_SIZE;
         return adr;
 }
 
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
 {
-        struct temp_map *ti = &temp_mappings[i];
+        struct temp_map *ti;
+
+        if (after_bootmem)
+                return;
+
+        ti = &temp_mappings[i];
         set_pmd(ti->pmd, __pmd(0));
         ti->allocated = 0;
 }
 
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
-{
-        long i, j;
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+        int i;
 
-        i = pud_index(address);
+        for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+                unsigned long entry;
+
+                if (address > end) {
+                        for (; i < PTRS_PER_PMD; i++, pmd++)
+                                set_pmd(pmd, __pmd(0));
+                        break;
+                }
+                entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+                entry &= __supported_pte_mask;
+                set_pmd(pmd, __pmd(entry));
+        }
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+        if (pmd_none(*pmd)) {
+                spin_lock(&init_mm.page_table_lock);
+                phys_pmd_init(pmd, address, end);
+                spin_unlock(&init_mm.page_table_lock);
+                __flush_tlb_all();
+        }
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+        long i = pud_index(address);
+
         pud = pud + i;
+
+        if (after_bootmem && pud_val(*pud)) {
+                phys_pmd_update(pud, address, end);
+                return;
+        }
+
         for (; i < PTRS_PER_PUD; pud++, i++) {
                 int map;
                 unsigned long paddr, pmd_phys;
                 pmd_t *pmd;
 
-                paddr = address + i*PUD_SIZE;
-                if (paddr >= end) {
-                        for (; i < PTRS_PER_PUD; i++, pud++)
-                                set_pud(pud, __pud(0));
+                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+                if (paddr >= end)
                         break;
-                }
 
-                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+                if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                         set_pud(pud, __pud(0));
                         continue;
                 }
 
                 pmd = alloc_low_page(&map, &pmd_phys);
+                spin_lock(&init_mm.page_table_lock);
                 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-                        unsigned long pe;
-
-                        if (paddr >= end) {
-                                for (; j < PTRS_PER_PMD; j++, pmd++)
-                                        set_pmd(pmd, __pmd(0));
-                                break;
-                        }
-                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
-                        pe &= __supported_pte_mask;
-                        set_pmd(pmd, __pmd(pe));
-                }
+                phys_pmd_init(pmd, paddr, end);
+                spin_unlock(&init_mm.page_table_lock);
                 unmap_low_page(map);
         }
         __flush_tlb();
@@ ... @@
 
         table_start >>= PAGE_SHIFT;
         table_end = table_start;
+
+        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+                end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the
    physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 {
         unsigned long next;
 
@@ ... @@
          * mapped. Unfortunately this is done currently before the nodes are
          * discovered.
          */
-        find_early_table_space(end);
+        if (!after_bootmem)
+                find_early_table_space(end);
 
         start = (unsigned long)__va(start);
         end = (unsigned long)__va(end);
@@ ... @@
         for (; start < end; start = next) {
                 int map;
                 unsigned long pud_phys;
-                pud_t *pud = alloc_low_page(&map, &pud_phys);
+                pgd_t *pgd = pgd_offset_k(start);
+                pud_t *pud;
+
+                if (after_bootmem)
+                        pud = pud_offset_k(pgd, __PAGE_OFFSET);
+                else
+                        pud = alloc_low_page(&map, &pud_phys);
+
                 next = start + PGDIR_SIZE;
                 if (next > end)
                         next = end;
                 phys_pud_init(pud, __pa(start), __pa(next));
-                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+                if (!after_bootmem)
+                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                 unmap_low_page(map);
         }
 
-        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+        if (!after_bootmem)
+                asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
         __flush_tlb_all();
-        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
-                table_start<<PAGE_SHIFT,
-                table_end<<PAGE_SHIFT);
 }
 
 void __cpuinit zap_low_mappings(int cpu)
@@ ... @@
 void __init paging_init(void)
 {
         unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+        memory_present(0, 0, end_pfn);
+        sparse_init();
         size_zones(zones, holes, 0, end_pfn);
         free_area_init_node(0, NODE_DATA(0), zones,
                             __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ ... @@
         }
         __flush_tlb_all();
 }
+
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+        ClearPageReserved(page);
+        set_page_count(page, 1);
+        __free_page(page);
+        totalram_pages++;
+        num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+        struct pglist_data *pgdat = NODE_DATA(0);
+        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+        unsigned long start_pfn = start >> PAGE_SHIFT;
+        unsigned long nr_pages = size >> PAGE_SHIFT;
+        int ret;
+
+        ret = __add_pages(zone, start_pfn, nr_pages);
+        if (ret)
+                goto error;
+
+        init_memory_mapping(start, (start + size -1));
+
+        return ret;
+error:
+        printk("%s: Problem encountered in __add_pages!\n", __func__);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+        return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#endif
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                          kcore_vsyscall;
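
For context only, and not part of the patch itself: the sketch below shows how a caller might hand a newly discovered physical range to the new add_memory() entry point. The function name, base address, and size are invented for illustration; in practice the range would come from firmware (e.g. an ACPI notification) or from the sysfs probe interface that ARCH_MEMORY_PROBE enables, and the added pages only become usable once the corresponding memory block is onlined, at which point the hotplug core hands each page to online_page().

/*
 * Hypothetical usage sketch (assumed caller, not part of this patch).
 * A driver that has discovered a new DIMM passes its physical range to
 * the x86-64 hot-add path added above; the address and size below are
 * made-up example values.
 */
#include <linux/kernel.h>
#include <linux/memory_hotplug.h>

static int example_hotadd(void)
{
        u64 start = 0x100000000ULL;     /* assumed base of the new memory (4GB) */
        u64 size  = 0x10000000ULL;      /* assumed 256MB of new memory */
        int ret;

        /* Registers the new pages with sparsemem and extends the direct mapping. */
        ret = add_memory(start, size);
        if (ret)
                printk(KERN_ERR "memory hot-add of %llx-%llx failed: %d\n",
                       (unsigned long long)start,
                       (unsigned long long)(start + size - 1), ret);

        /*
         * The pages are added but still offline here; they are freed into
         * the page allocator via online_page() when the memory block is
         * onlined from userspace.
         */
        return ret;
}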