Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.12 (2032 lines, 50 kB)
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>
#include <asm/set_memory.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgd_t		*pgd;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	unsigned long	numpages;
	int		flags;
	unsigned long	pfn;
	unsigned	force_split : 1;
	int		curpage;
	struct page	**pages;
};

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
 * entries to change a page attribute in parallel while another CPU is
 * splitting a large page entry and changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	/* Protect against CPA */
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
}

static void split_page_count(int level)
{
	if (direct_pages_count[level] == 0)
		return;

	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k: %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa_symbol(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	/* Do not reference physical address outside the kernel. */
	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}

#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr <= end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
	void *vend = vaddr + size;

	if (p >= vend)
		return;

	mb();

	for (; p < vend; p += clflush_size)
		clflushopt(p);

	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);

static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around errata in early Athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize that further and do individual per page
	 * tlb invalidates for a low number of pages. Caveat: we must
	 * flush the high aliases on 64bit as well.
	 */
	__flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long addr;

	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
	WARN_ON(PAGE_ALIGN(start) != start);

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache,
			    int in_flags, struct page **pages)
{
	unsigned int i, level;
#ifdef CONFIG_PREEMPT
	/*
	 * Avoid wbinvd() because it causes latencies on all CPUs,
	 * regardless of any CPU isolation that may be in effect.
	 *
	 * This should be extended for CAT enabled systems independent of
	 * PREEMPT because wbinvd() does not respect the CAT partitions and
	 * this is exposed to unprivileged users through the graphics
	 * subsystem.
	 */
	unsigned long do_wbinvd = 0;
#else
	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif

	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);

	if (!cache || do_wbinvd)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0; i < numpages; i++) {
		unsigned long addr;
		pte_t *pte;

		if (in_flags & CPA_PAGES_ARRAY)
			addr = (unsigned long)page_address(pages[i]);
		else
			addr = start[i];

		pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *)addr, PAGE_SIZE);
	}
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
					  unsigned long pfn)
{
	pgprot_t forbidden = __pgprot(0);

	/*
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
	 */
#ifdef CONFIG_PCI_BIOS
	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_NX;
#endif

	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * Does not cover __inittext since that is gone later on. On
	 * 64bit we do not enforce !NX on the low mapping.
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
	 */
	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_RW;

#if defined(CONFIG_X86_64)
	/*
	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
	 * the kernel text mappings for the large page aligned text and rodata
	 * sections will always be read-only. The kernel identity mappings
	 * covering the holes caused by this alignment can be anything the
	 * user asks for.
	 *
	 * This will preserve the large page mappings for kernel text/data
	 * at no extra cost.
	 */
	if (kernel_set_to_readonly &&
	    within(address, (unsigned long)_text,
		   (unsigned long)__end_rodata_hpage_align)) {
		unsigned int level;

		/*
		 * Don't enforce the !RW mapping for the kernel text mapping,
		 * if the current mapping is already using small page mapping.
		 * No need to work hard to preserve large page mappings in this
		 * case.
		 *
		 * This also fixes the Linux Xen paravirt guest boot failure
		 * (because of unexpected read-only mappings for kernel identity
		 * mappings). In this paravirt guest case, the kernel text
		 * mapping and the kernel identity mapping share the same
		 * page-table pages. Thus we can't really use different
		 * protections for the kernel text and identity mappings. Also,
		 * these shared mappings are made of small page mappings.
		 * Thus, not enforcing the !RW mapping for small page kernel
		 * text mappings helps the Linux Xen paravirt guest boot
		 * as well.
		 */
		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
			pgprot_val(forbidden) |= _PAGE_RW;
	}
#endif

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

	return prot;
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry and the level of the mapping.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
			     unsigned int *level)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return NULL;

	*level = PG_LEVEL_512G;
	if (p4d_large(*p4d) || !p4d_present(*p4d))
		return (pte_t *)p4d;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexistent mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);

static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
				  unsigned int *level)
{
	if (cpa->pgd)
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					     address, level);

	return lookup_address(address, level);
}

/*
 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 * or NULL if not present.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(address);
	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems. The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * This could be optimized, but it is only intended to be
 * used at initialization time, and keeping it
 * unoptimized should increase the testing coverage for
 * the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
	unsigned long virt_addr = (unsigned long)__virt_addr;
	phys_addr_t phys_addr;
	unsigned long offset;
	enum pg_level level;
	pte_t *pte;

	pte = lookup_address(virt_addr, &level);
	BUG_ON(!pte);

	/*
	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
	 * make 32-bit PAE kernels work correctly.
463 */ 464 switch (level) { 465 case PG_LEVEL_1G: 466 phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; 467 offset = virt_addr & ~PUD_PAGE_MASK; 468 break; 469 case PG_LEVEL_2M: 470 phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; 471 offset = virt_addr & ~PMD_PAGE_MASK; 472 break; 473 default: 474 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 475 offset = virt_addr & ~PAGE_MASK; 476 } 477 478 return (phys_addr_t)(phys_addr | offset); 479} 480EXPORT_SYMBOL_GPL(slow_virt_to_phys); 481 482/* 483 * Set the new pmd in all the pgds we know about: 484 */ 485static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 486{ 487 /* change init_mm */ 488 set_pte_atomic(kpte, pte); 489#ifdef CONFIG_X86_32 490 if (!SHARED_KERNEL_PMD) { 491 struct page *page; 492 493 list_for_each_entry(page, &pgd_list, lru) { 494 pgd_t *pgd; 495 p4d_t *p4d; 496 pud_t *pud; 497 pmd_t *pmd; 498 499 pgd = (pgd_t *)page_address(page) + pgd_index(address); 500 p4d = p4d_offset(pgd, address); 501 pud = pud_offset(p4d, address); 502 pmd = pmd_offset(pud, address); 503 set_pte_atomic((pte_t *)pmd, pte); 504 } 505 } 506#endif 507} 508 509static int 510try_preserve_large_page(pte_t *kpte, unsigned long address, 511 struct cpa_data *cpa) 512{ 513 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; 514 pte_t new_pte, old_pte, *tmp; 515 pgprot_t old_prot, new_prot, req_prot; 516 int i, do_split = 1; 517 enum pg_level level; 518 519 if (cpa->force_split) 520 return 1; 521 522 spin_lock(&pgd_lock); 523 /* 524 * Check for races, another CPU might have split this page 525 * up already: 526 */ 527 tmp = _lookup_address_cpa(cpa, address, &level); 528 if (tmp != kpte) 529 goto out_unlock; 530 531 switch (level) { 532 case PG_LEVEL_2M: 533 old_prot = pmd_pgprot(*(pmd_t *)kpte); 534 old_pfn = pmd_pfn(*(pmd_t *)kpte); 535 break; 536 case PG_LEVEL_1G: 537 old_prot = pud_pgprot(*(pud_t *)kpte); 538 old_pfn = pud_pfn(*(pud_t *)kpte); 539 break; 540 default: 541 do_split = -EINVAL; 542 goto out_unlock; 543 } 544 545 psize = page_level_size(level); 546 pmask = page_level_mask(level); 547 548 /* 549 * Calculate the number of pages, which fit into this large 550 * page starting at address: 551 */ 552 nextpage_addr = (address + psize) & pmask; 553 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 554 if (numpages < cpa->numpages) 555 cpa->numpages = numpages; 556 557 /* 558 * We are safe now. Check whether the new pgprot is the same: 559 * Convert protection attributes to 4k-format, as cpa->mask* are set 560 * up accordingly. 561 */ 562 old_pte = *kpte; 563 req_prot = pgprot_large_2_4k(old_prot); 564 565 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); 566 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); 567 568 /* 569 * req_prot is in format of 4k pages. It must be converted to large 570 * page format: the caching mode includes the PAT bit located at 571 * different bit positions in the two formats. 572 */ 573 req_prot = pgprot_4k_2_large(req_prot); 574 575 /* 576 * Set the PSE and GLOBAL flags only if the PRESENT flag is 577 * set otherwise pmd_present/pmd_huge will return true even on 578 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL 579 * for the ancient hardware that doesn't support it. 
580 */ 581 if (pgprot_val(req_prot) & _PAGE_PRESENT) 582 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; 583 else 584 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); 585 586 req_prot = canon_pgprot(req_prot); 587 588 /* 589 * old_pfn points to the large page base pfn. So we need 590 * to add the offset of the virtual address: 591 */ 592 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); 593 cpa->pfn = pfn; 594 595 new_prot = static_protections(req_prot, address, pfn); 596 597 /* 598 * We need to check the full range, whether 599 * static_protection() requires a different pgprot for one of 600 * the pages in the range we try to preserve: 601 */ 602 addr = address & pmask; 603 pfn = old_pfn; 604 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { 605 pgprot_t chk_prot = static_protections(req_prot, addr, pfn); 606 607 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 608 goto out_unlock; 609 } 610 611 /* 612 * If there are no changes, return. maxpages has been updated 613 * above: 614 */ 615 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 616 do_split = 0; 617 goto out_unlock; 618 } 619 620 /* 621 * We need to change the attributes. Check, whether we can 622 * change the large page in one go. We request a split, when 623 * the address is not aligned and the number of pages is 624 * smaller than the number of pages in the large page. Note 625 * that we limited the number of possible pages already to 626 * the number of pages in the large page. 627 */ 628 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 629 /* 630 * The address is aligned and the number of pages 631 * covers the full page. 632 */ 633 new_pte = pfn_pte(old_pfn, new_prot); 634 __set_pmd_pte(kpte, address, new_pte); 635 cpa->flags |= CPA_FLUSHTLB; 636 do_split = 0; 637 } 638 639out_unlock: 640 spin_unlock(&pgd_lock); 641 642 return do_split; 643} 644 645static int 646__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 647 struct page *base) 648{ 649 pte_t *pbase = (pte_t *)page_address(base); 650 unsigned long ref_pfn, pfn, pfninc = 1; 651 unsigned int i, level; 652 pte_t *tmp; 653 pgprot_t ref_prot; 654 655 spin_lock(&pgd_lock); 656 /* 657 * Check for races, another CPU might have split this page 658 * up for us already: 659 */ 660 tmp = _lookup_address_cpa(cpa, address, &level); 661 if (tmp != kpte) { 662 spin_unlock(&pgd_lock); 663 return 1; 664 } 665 666 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 667 668 switch (level) { 669 case PG_LEVEL_2M: 670 ref_prot = pmd_pgprot(*(pmd_t *)kpte); 671 /* clear PSE and promote PAT bit to correct position */ 672 ref_prot = pgprot_large_2_4k(ref_prot); 673 ref_pfn = pmd_pfn(*(pmd_t *)kpte); 674 break; 675 676 case PG_LEVEL_1G: 677 ref_prot = pud_pgprot(*(pud_t *)kpte); 678 ref_pfn = pud_pfn(*(pud_t *)kpte); 679 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 680 681 /* 682 * Clear the PSE flags if the PRESENT flag is not set 683 * otherwise pmd_present/pmd_huge will return true 684 * even on a non present pmd. 685 */ 686 if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) 687 pgprot_val(ref_prot) &= ~_PAGE_PSE; 688 break; 689 690 default: 691 spin_unlock(&pgd_lock); 692 return 1; 693 } 694 695 /* 696 * Set the GLOBAL flags only if the PRESENT flag is set 697 * otherwise pmd/pte_present will return true even on a non 698 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL 699 * for the ancient hardware that doesn't support it. 
700 */ 701 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 702 pgprot_val(ref_prot) |= _PAGE_GLOBAL; 703 else 704 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; 705 706 /* 707 * Get the target pfn from the original entry: 708 */ 709 pfn = ref_pfn; 710 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 711 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); 712 713 if (virt_addr_valid(address)) { 714 unsigned long pfn = PFN_DOWN(__pa(address)); 715 716 if (pfn_range_is_mapped(pfn, pfn + 1)) 717 split_page_count(level); 718 } 719 720 /* 721 * Install the new, split up pagetable. 722 * 723 * We use the standard kernel pagetable protections for the new 724 * pagetable protections, the actual ptes set above control the 725 * primary protection behavior: 726 */ 727 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 728 729 /* 730 * Intel Atom errata AAH41 workaround. 731 * 732 * The real fix should be in hw or in a microcode update, but 733 * we also probabilistically try to reduce the window of having 734 * a large TLB mixed with 4K TLBs while instruction fetches are 735 * going on. 736 */ 737 __flush_tlb_all(); 738 spin_unlock(&pgd_lock); 739 740 return 0; 741} 742 743static int split_large_page(struct cpa_data *cpa, pte_t *kpte, 744 unsigned long address) 745{ 746 struct page *base; 747 748 if (!debug_pagealloc_enabled()) 749 spin_unlock(&cpa_lock); 750 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 751 if (!debug_pagealloc_enabled()) 752 spin_lock(&cpa_lock); 753 if (!base) 754 return -ENOMEM; 755 756 if (__split_large_page(cpa, kpte, address, base)) 757 __free_page(base); 758 759 return 0; 760} 761 762static bool try_to_free_pte_page(pte_t *pte) 763{ 764 int i; 765 766 for (i = 0; i < PTRS_PER_PTE; i++) 767 if (!pte_none(pte[i])) 768 return false; 769 770 free_page((unsigned long)pte); 771 return true; 772} 773 774static bool try_to_free_pmd_page(pmd_t *pmd) 775{ 776 int i; 777 778 for (i = 0; i < PTRS_PER_PMD; i++) 779 if (!pmd_none(pmd[i])) 780 return false; 781 782 free_page((unsigned long)pmd); 783 return true; 784} 785 786static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) 787{ 788 pte_t *pte = pte_offset_kernel(pmd, start); 789 790 while (start < end) { 791 set_pte(pte, __pte(0)); 792 793 start += PAGE_SIZE; 794 pte++; 795 } 796 797 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { 798 pmd_clear(pmd); 799 return true; 800 } 801 return false; 802} 803 804static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, 805 unsigned long start, unsigned long end) 806{ 807 if (unmap_pte_range(pmd, start, end)) 808 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 809 pud_clear(pud); 810} 811 812static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) 813{ 814 pmd_t *pmd = pmd_offset(pud, start); 815 816 /* 817 * Not on a 2MB page boundary? 818 */ 819 if (start & (PMD_SIZE - 1)) { 820 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 821 unsigned long pre_end = min_t(unsigned long, end, next_page); 822 823 __unmap_pmd_range(pud, pmd, start, pre_end); 824 825 start = pre_end; 826 pmd++; 827 } 828 829 /* 830 * Try to unmap in 2M chunks. 831 */ 832 while (end - start >= PMD_SIZE) { 833 if (pmd_large(*pmd)) 834 pmd_clear(pmd); 835 else 836 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); 837 838 start += PMD_SIZE; 839 pmd++; 840 } 841 842 /* 843 * 4K leftovers? 844 */ 845 if (start < end) 846 return __unmap_pmd_range(pud, pmd, start, end); 847 848 /* 849 * Try again to free the PMD page if haven't succeeded above. 
850 */ 851 if (!pud_none(*pud)) 852 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 853 pud_clear(pud); 854} 855 856static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) 857{ 858 pud_t *pud = pud_offset(p4d, start); 859 860 /* 861 * Not on a GB page boundary? 862 */ 863 if (start & (PUD_SIZE - 1)) { 864 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 865 unsigned long pre_end = min_t(unsigned long, end, next_page); 866 867 unmap_pmd_range(pud, start, pre_end); 868 869 start = pre_end; 870 pud++; 871 } 872 873 /* 874 * Try to unmap in 1G chunks? 875 */ 876 while (end - start >= PUD_SIZE) { 877 878 if (pud_large(*pud)) 879 pud_clear(pud); 880 else 881 unmap_pmd_range(pud, start, start + PUD_SIZE); 882 883 start += PUD_SIZE; 884 pud++; 885 } 886 887 /* 888 * 2M leftovers? 889 */ 890 if (start < end) 891 unmap_pmd_range(pud, start, end); 892 893 /* 894 * No need to try to free the PUD page because we'll free it in 895 * populate_pgd's error path 896 */ 897} 898 899static int alloc_pte_page(pmd_t *pmd) 900{ 901 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 902 if (!pte) 903 return -1; 904 905 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 906 return 0; 907} 908 909static int alloc_pmd_page(pud_t *pud) 910{ 911 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 912 if (!pmd) 913 return -1; 914 915 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 916 return 0; 917} 918 919static void populate_pte(struct cpa_data *cpa, 920 unsigned long start, unsigned long end, 921 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) 922{ 923 pte_t *pte; 924 925 pte = pte_offset_kernel(pmd, start); 926 927 /* 928 * Set the GLOBAL flags only if the PRESENT flag is 929 * set otherwise pte_present will return true even on 930 * a non present pte. The canon_pgprot will clear 931 * _PAGE_GLOBAL for the ancient hardware that doesn't 932 * support it. 933 */ 934 if (pgprot_val(pgprot) & _PAGE_PRESENT) 935 pgprot_val(pgprot) |= _PAGE_GLOBAL; 936 else 937 pgprot_val(pgprot) &= ~_PAGE_GLOBAL; 938 939 pgprot = canon_pgprot(pgprot); 940 941 while (num_pages-- && start < end) { 942 set_pte(pte, pfn_pte(cpa->pfn, pgprot)); 943 944 start += PAGE_SIZE; 945 cpa->pfn++; 946 pte++; 947 } 948} 949 950static long populate_pmd(struct cpa_data *cpa, 951 unsigned long start, unsigned long end, 952 unsigned num_pages, pud_t *pud, pgprot_t pgprot) 953{ 954 long cur_pages = 0; 955 pmd_t *pmd; 956 pgprot_t pmd_pgprot; 957 958 /* 959 * Not on a 2M boundary? 960 */ 961 if (start & (PMD_SIZE - 1)) { 962 unsigned long pre_end = start + (num_pages << PAGE_SHIFT); 963 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 964 965 pre_end = min_t(unsigned long, pre_end, next_page); 966 cur_pages = (pre_end - start) >> PAGE_SHIFT; 967 cur_pages = min_t(unsigned int, num_pages, cur_pages); 968 969 /* 970 * Need a PTE page? 971 */ 972 pmd = pmd_offset(pud, start); 973 if (pmd_none(*pmd)) 974 if (alloc_pte_page(pmd)) 975 return -1; 976 977 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); 978 979 start = pre_end; 980 } 981 982 /* 983 * We mapped them all? 984 */ 985 if (num_pages == cur_pages) 986 return cur_pages; 987 988 pmd_pgprot = pgprot_4k_2_large(pgprot); 989 990 while (end - start >= PMD_SIZE) { 991 992 /* 993 * We cannot use a 1G page so allocate a PMD page if needed. 
994 */ 995 if (pud_none(*pud)) 996 if (alloc_pmd_page(pud)) 997 return -1; 998 999 pmd = pmd_offset(pud, start); 1000 1001 set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | 1002 massage_pgprot(pmd_pgprot))); 1003 1004 start += PMD_SIZE; 1005 cpa->pfn += PMD_SIZE >> PAGE_SHIFT; 1006 cur_pages += PMD_SIZE >> PAGE_SHIFT; 1007 } 1008 1009 /* 1010 * Map trailing 4K pages. 1011 */ 1012 if (start < end) { 1013 pmd = pmd_offset(pud, start); 1014 if (pmd_none(*pmd)) 1015 if (alloc_pte_page(pmd)) 1016 return -1; 1017 1018 populate_pte(cpa, start, end, num_pages - cur_pages, 1019 pmd, pgprot); 1020 } 1021 return num_pages; 1022} 1023 1024static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, 1025 pgprot_t pgprot) 1026{ 1027 pud_t *pud; 1028 unsigned long end; 1029 long cur_pages = 0; 1030 pgprot_t pud_pgprot; 1031 1032 end = start + (cpa->numpages << PAGE_SHIFT); 1033 1034 /* 1035 * Not on a Gb page boundary? => map everything up to it with 1036 * smaller pages. 1037 */ 1038 if (start & (PUD_SIZE - 1)) { 1039 unsigned long pre_end; 1040 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 1041 1042 pre_end = min_t(unsigned long, end, next_page); 1043 cur_pages = (pre_end - start) >> PAGE_SHIFT; 1044 cur_pages = min_t(int, (int)cpa->numpages, cur_pages); 1045 1046 pud = pud_offset(p4d, start); 1047 1048 /* 1049 * Need a PMD page? 1050 */ 1051 if (pud_none(*pud)) 1052 if (alloc_pmd_page(pud)) 1053 return -1; 1054 1055 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, 1056 pud, pgprot); 1057 if (cur_pages < 0) 1058 return cur_pages; 1059 1060 start = pre_end; 1061 } 1062 1063 /* We mapped them all? */ 1064 if (cpa->numpages == cur_pages) 1065 return cur_pages; 1066 1067 pud = pud_offset(p4d, start); 1068 pud_pgprot = pgprot_4k_2_large(pgprot); 1069 1070 /* 1071 * Map everything starting from the Gb boundary, possibly with 1G pages 1072 */ 1073 while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { 1074 set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | 1075 massage_pgprot(pud_pgprot))); 1076 1077 start += PUD_SIZE; 1078 cpa->pfn += PUD_SIZE >> PAGE_SHIFT; 1079 cur_pages += PUD_SIZE >> PAGE_SHIFT; 1080 pud++; 1081 } 1082 1083 /* Map trailing leftover */ 1084 if (start < end) { 1085 long tmp; 1086 1087 pud = pud_offset(p4d, start); 1088 if (pud_none(*pud)) 1089 if (alloc_pmd_page(pud)) 1090 return -1; 1091 1092 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, 1093 pud, pgprot); 1094 if (tmp < 0) 1095 return cur_pages; 1096 1097 cur_pages += tmp; 1098 } 1099 return cur_pages; 1100} 1101 1102/* 1103 * Restrictions for kernel page table do not necessarily apply when mapping in 1104 * an alternate PGD. 1105 */ 1106static int populate_pgd(struct cpa_data *cpa, unsigned long addr) 1107{ 1108 pgprot_t pgprot = __pgprot(_KERNPG_TABLE); 1109 pud_t *pud = NULL; /* shut up gcc */ 1110 p4d_t *p4d; 1111 pgd_t *pgd_entry; 1112 long ret; 1113 1114 pgd_entry = cpa->pgd + pgd_index(addr); 1115 1116 if (pgd_none(*pgd_entry)) { 1117 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1118 if (!p4d) 1119 return -1; 1120 1121 set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); 1122 } 1123 1124 /* 1125 * Allocate a PUD page and hand it down for mapping. 
1126 */ 1127 p4d = p4d_offset(pgd_entry, addr); 1128 if (p4d_none(*p4d)) { 1129 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1130 if (!pud) 1131 return -1; 1132 1133 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); 1134 } 1135 1136 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); 1137 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); 1138 1139 ret = populate_pud(cpa, addr, p4d, pgprot); 1140 if (ret < 0) { 1141 /* 1142 * Leave the PUD page in place in case some other CPU or thread 1143 * already found it, but remove any useless entries we just 1144 * added to it. 1145 */ 1146 unmap_pud_range(p4d, addr, 1147 addr + (cpa->numpages << PAGE_SHIFT)); 1148 return ret; 1149 } 1150 1151 cpa->numpages = ret; 1152 return 0; 1153} 1154 1155static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1156 int primary) 1157{ 1158 if (cpa->pgd) { 1159 /* 1160 * Right now, we only execute this code path when mapping 1161 * the EFI virtual memory map regions, no other users 1162 * provide a ->pgd value. This may change in the future. 1163 */ 1164 return populate_pgd(cpa, vaddr); 1165 } 1166 1167 /* 1168 * Ignore all non primary paths. 1169 */ 1170 if (!primary) { 1171 cpa->numpages = 1; 1172 return 0; 1173 } 1174 1175 /* 1176 * Ignore the NULL PTE for kernel identity mapping, as it is expected 1177 * to have holes. 1178 * Also set numpages to '1' indicating that we processed cpa req for 1179 * one virtual address page and its pfn. TBD: numpages can be set based 1180 * on the initial value and the level returned by lookup_address(). 1181 */ 1182 if (within(vaddr, PAGE_OFFSET, 1183 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 1184 cpa->numpages = 1; 1185 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; 1186 return 0; 1187 } else { 1188 WARN(1, KERN_WARNING "CPA: called for zero pte. " 1189 "vaddr = %lx cpa->vaddr = %lx\n", vaddr, 1190 *cpa->vaddr); 1191 1192 return -EFAULT; 1193 } 1194} 1195 1196static int __change_page_attr(struct cpa_data *cpa, int primary) 1197{ 1198 unsigned long address; 1199 int do_split, err; 1200 unsigned int level; 1201 pte_t *kpte, old_pte; 1202 1203 if (cpa->flags & CPA_PAGES_ARRAY) { 1204 struct page *page = cpa->pages[cpa->curpage]; 1205 if (unlikely(PageHighMem(page))) 1206 return 0; 1207 address = (unsigned long)page_address(page); 1208 } else if (cpa->flags & CPA_ARRAY) 1209 address = cpa->vaddr[cpa->curpage]; 1210 else 1211 address = *cpa->vaddr; 1212repeat: 1213 kpte = _lookup_address_cpa(cpa, address, &level); 1214 if (!kpte) 1215 return __cpa_process_fault(cpa, address, primary); 1216 1217 old_pte = *kpte; 1218 if (pte_none(old_pte)) 1219 return __cpa_process_fault(cpa, address, primary); 1220 1221 if (level == PG_LEVEL_4K) { 1222 pte_t new_pte; 1223 pgprot_t new_prot = pte_pgprot(old_pte); 1224 unsigned long pfn = pte_pfn(old_pte); 1225 1226 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1227 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1228 1229 new_prot = static_protections(new_prot, address, pfn); 1230 1231 /* 1232 * Set the GLOBAL flags only if the PRESENT flag is 1233 * set otherwise pte_present will return true even on 1234 * a non present pte. The canon_pgprot will clear 1235 * _PAGE_GLOBAL for the ancient hardware that doesn't 1236 * support it. 
		 */
		if (pgprot_val(new_prot) & _PAGE_PRESENT)
			pgprot_val(new_prot) |= _PAGE_GLOBAL;
		else
			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = try_preserve_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and cpa->flags have been updated in
	 * try_preserve_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(cpa, kpte, address);
	if (!err) {
		/*
		 * Do a global tlb flush after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, which says
		 * "The TLBs may contain both ordinary and large-page
		 * translations for a 4-KByte range of linear addresses. This
		 * may occur if software modifies the paging structures so that
		 * the page size used for the address range changes. If the two
		 * translations differ with respect to page frame or attributes
		 * (e.g., permissions), processor behavior is undefined and may
		 * be implementation-specific."
		 *
		 * We do this global tlb flush inside the cpa_lock, so that we
		 * don't allow any other CPU with stale tlb entries to change
		 * the page attribute in parallel for an address that also
		 * falls into the just-split large page entry.
1296 */ 1297 flush_tlb_all(); 1298 goto repeat; 1299 } 1300 1301 return err; 1302} 1303 1304static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); 1305 1306static int cpa_process_alias(struct cpa_data *cpa) 1307{ 1308 struct cpa_data alias_cpa; 1309 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 1310 unsigned long vaddr; 1311 int ret; 1312 1313 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) 1314 return 0; 1315 1316 /* 1317 * No need to redo, when the primary call touched the direct 1318 * mapping already: 1319 */ 1320 if (cpa->flags & CPA_PAGES_ARRAY) { 1321 struct page *page = cpa->pages[cpa->curpage]; 1322 if (unlikely(PageHighMem(page))) 1323 return 0; 1324 vaddr = (unsigned long)page_address(page); 1325 } else if (cpa->flags & CPA_ARRAY) 1326 vaddr = cpa->vaddr[cpa->curpage]; 1327 else 1328 vaddr = *cpa->vaddr; 1329 1330 if (!(within(vaddr, PAGE_OFFSET, 1331 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 1332 1333 alias_cpa = *cpa; 1334 alias_cpa.vaddr = &laddr; 1335 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1336 1337 ret = __change_page_attr_set_clr(&alias_cpa, 0); 1338 if (ret) 1339 return ret; 1340 } 1341 1342#ifdef CONFIG_X86_64 1343 /* 1344 * If the primary call didn't touch the high mapping already 1345 * and the physical address is inside the kernel map, we need 1346 * to touch the high mapped kernel as well: 1347 */ 1348 if (!within(vaddr, (unsigned long)_text, _brk_end) && 1349 within_inclusive(cpa->pfn, highmap_start_pfn(), 1350 highmap_end_pfn())) { 1351 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + 1352 __START_KERNEL_map - phys_base; 1353 alias_cpa = *cpa; 1354 alias_cpa.vaddr = &temp_cpa_vaddr; 1355 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1356 1357 /* 1358 * The high mapping range is imprecise, so ignore the 1359 * return value. 1360 */ 1361 __change_page_attr_set_clr(&alias_cpa, 0); 1362 } 1363#endif 1364 1365 return 0; 1366} 1367 1368static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 1369{ 1370 unsigned long numpages = cpa->numpages; 1371 int ret; 1372 1373 while (numpages) { 1374 /* 1375 * Store the remaining nr of pages for the large page 1376 * preservation check. 1377 */ 1378 cpa->numpages = numpages; 1379 /* for array changes, we can't use large page */ 1380 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1381 cpa->numpages = 1; 1382 1383 if (!debug_pagealloc_enabled()) 1384 spin_lock(&cpa_lock); 1385 ret = __change_page_attr(cpa, checkalias); 1386 if (!debug_pagealloc_enabled()) 1387 spin_unlock(&cpa_lock); 1388 if (ret) 1389 return ret; 1390 1391 if (checkalias) { 1392 ret = cpa_process_alias(cpa); 1393 if (ret) 1394 return ret; 1395 } 1396 1397 /* 1398 * Adjust the number of pages with the result of the 1399 * CPA operation. Either a large page has been 1400 * preserved or a single page update happened. 
		 */
		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
		numpages -= cpa->numpages;
		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
			cpa->curpage++;
		else
			*cpa->vaddr += cpa->numpages * PAGE_SIZE;

	}
	return 0;
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;
	unsigned long baddr = 0;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check whether we are requested to change a feature that is not
	 * supported:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case.
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
		/*
		 * Save address for cache flush. *addr is modified in the call
		 * to __change_page_attr_set_clr() below.
		 */
		baddr = *addr;
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
		cpa.flags |= in_flag;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On success we use CLFLUSH, when the CPU supports it, to
	 * avoid the WBINVD. If the CPU does not support it, or in the
	 * error case, we fall back to cpa_flush_all() (which uses
	 * WBINVD):
	 */
	if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
			cpa_flush_array(addr, numpages, cache,
					cpa.flags, pages);
		} else
			cpa_flush_range(baddr, numpages, cache);
	} else
		cpa_flush_all(cache);

out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ?
CPA_ARRAY : 0), NULL); 1526} 1527 1528static inline int cpa_set_pages_array(struct page **pages, int numpages, 1529 pgprot_t mask) 1530{ 1531 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, 1532 CPA_PAGES_ARRAY, pages); 1533} 1534 1535static inline int cpa_clear_pages_array(struct page **pages, int numpages, 1536 pgprot_t mask) 1537{ 1538 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, 1539 CPA_PAGES_ARRAY, pages); 1540} 1541 1542int _set_memory_uc(unsigned long addr, int numpages) 1543{ 1544 /* 1545 * for now UC MINUS. see comments in ioremap_nocache() 1546 * If you really need strong UC use ioremap_uc(), but note 1547 * that you cannot override IO areas with set_memory_*() as 1548 * these helpers cannot work with IO memory. 1549 */ 1550 return change_page_attr_set(&addr, numpages, 1551 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1552 0); 1553} 1554 1555int set_memory_uc(unsigned long addr, int numpages) 1556{ 1557 int ret; 1558 1559 /* 1560 * for now UC MINUS. see comments in ioremap_nocache() 1561 */ 1562 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1563 _PAGE_CACHE_MODE_UC_MINUS, NULL); 1564 if (ret) 1565 goto out_err; 1566 1567 ret = _set_memory_uc(addr, numpages); 1568 if (ret) 1569 goto out_free; 1570 1571 return 0; 1572 1573out_free: 1574 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1575out_err: 1576 return ret; 1577} 1578EXPORT_SYMBOL(set_memory_uc); 1579 1580static int _set_memory_array(unsigned long *addr, int addrinarray, 1581 enum page_cache_mode new_type) 1582{ 1583 enum page_cache_mode set_type; 1584 int i, j; 1585 int ret; 1586 1587 for (i = 0; i < addrinarray; i++) { 1588 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1589 new_type, NULL); 1590 if (ret) 1591 goto out_free; 1592 } 1593 1594 /* If WC, set to UC- first and then WC */ 1595 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
1596 _PAGE_CACHE_MODE_UC_MINUS : new_type; 1597 1598 ret = change_page_attr_set(addr, addrinarray, 1599 cachemode2pgprot(set_type), 1); 1600 1601 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1602 ret = change_page_attr_set_clr(addr, addrinarray, 1603 cachemode2pgprot( 1604 _PAGE_CACHE_MODE_WC), 1605 __pgprot(_PAGE_CACHE_MASK), 1606 0, CPA_ARRAY, NULL); 1607 if (ret) 1608 goto out_free; 1609 1610 return 0; 1611 1612out_free: 1613 for (j = 0; j < i; j++) 1614 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); 1615 1616 return ret; 1617} 1618 1619int set_memory_array_uc(unsigned long *addr, int addrinarray) 1620{ 1621 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1622} 1623EXPORT_SYMBOL(set_memory_array_uc); 1624 1625int set_memory_array_wc(unsigned long *addr, int addrinarray) 1626{ 1627 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); 1628} 1629EXPORT_SYMBOL(set_memory_array_wc); 1630 1631int set_memory_array_wt(unsigned long *addr, int addrinarray) 1632{ 1633 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); 1634} 1635EXPORT_SYMBOL_GPL(set_memory_array_wt); 1636 1637int _set_memory_wc(unsigned long addr, int numpages) 1638{ 1639 int ret; 1640 unsigned long addr_copy = addr; 1641 1642 ret = change_page_attr_set(&addr, numpages, 1643 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1644 0); 1645 if (!ret) { 1646 ret = change_page_attr_set_clr(&addr_copy, numpages, 1647 cachemode2pgprot( 1648 _PAGE_CACHE_MODE_WC), 1649 __pgprot(_PAGE_CACHE_MASK), 1650 0, 0, NULL); 1651 } 1652 return ret; 1653} 1654 1655int set_memory_wc(unsigned long addr, int numpages) 1656{ 1657 int ret; 1658 1659 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1660 _PAGE_CACHE_MODE_WC, NULL); 1661 if (ret) 1662 return ret; 1663 1664 ret = _set_memory_wc(addr, numpages); 1665 if (ret) 1666 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1667 1668 return ret; 1669} 1670EXPORT_SYMBOL(set_memory_wc); 1671 1672int _set_memory_wt(unsigned long addr, int numpages) 1673{ 1674 return change_page_attr_set(&addr, numpages, 1675 cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); 1676} 1677 1678int set_memory_wt(unsigned long addr, int numpages) 1679{ 1680 int ret; 1681 1682 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1683 _PAGE_CACHE_MODE_WT, NULL); 1684 if (ret) 1685 return ret; 1686 1687 ret = _set_memory_wt(addr, numpages); 1688 if (ret) 1689 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1690 1691 return ret; 1692} 1693EXPORT_SYMBOL_GPL(set_memory_wt); 1694 1695int _set_memory_wb(unsigned long addr, int numpages) 1696{ 1697 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1698 return change_page_attr_clear(&addr, numpages, 1699 __pgprot(_PAGE_CACHE_MASK), 0); 1700} 1701 1702int set_memory_wb(unsigned long addr, int numpages) 1703{ 1704 int ret; 1705 1706 ret = _set_memory_wb(addr, numpages); 1707 if (ret) 1708 return ret; 1709 1710 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1711 return 0; 1712} 1713EXPORT_SYMBOL(set_memory_wb); 1714 1715int set_memory_array_wb(unsigned long *addr, int addrinarray) 1716{ 1717 int i; 1718 int ret; 1719 1720 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1721 ret = change_page_attr_clear(addr, addrinarray, 1722 __pgprot(_PAGE_CACHE_MASK), 1); 1723 if (ret) 1724 return ret; 1725 1726 for (i = 0; i < addrinarray; i++) 1727 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); 1728 1729 return 0; 1730} 
1731EXPORT_SYMBOL(set_memory_array_wb); 1732 1733int set_memory_x(unsigned long addr, int numpages) 1734{ 1735 if (!(__supported_pte_mask & _PAGE_NX)) 1736 return 0; 1737 1738 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1739} 1740EXPORT_SYMBOL(set_memory_x); 1741 1742int set_memory_nx(unsigned long addr, int numpages) 1743{ 1744 if (!(__supported_pte_mask & _PAGE_NX)) 1745 return 0; 1746 1747 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1748} 1749EXPORT_SYMBOL(set_memory_nx); 1750 1751int set_memory_ro(unsigned long addr, int numpages) 1752{ 1753 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1754} 1755 1756int set_memory_rw(unsigned long addr, int numpages) 1757{ 1758 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1759} 1760 1761int set_memory_np(unsigned long addr, int numpages) 1762{ 1763 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1764} 1765 1766int set_memory_4k(unsigned long addr, int numpages) 1767{ 1768 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1769 __pgprot(0), 1, 0, NULL); 1770} 1771 1772int set_pages_uc(struct page *page, int numpages) 1773{ 1774 unsigned long addr = (unsigned long)page_address(page); 1775 1776 return set_memory_uc(addr, numpages); 1777} 1778EXPORT_SYMBOL(set_pages_uc); 1779 1780static int _set_pages_array(struct page **pages, int addrinarray, 1781 enum page_cache_mode new_type) 1782{ 1783 unsigned long start; 1784 unsigned long end; 1785 enum page_cache_mode set_type; 1786 int i; 1787 int free_idx; 1788 int ret; 1789 1790 for (i = 0; i < addrinarray; i++) { 1791 if (PageHighMem(pages[i])) 1792 continue; 1793 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1794 end = start + PAGE_SIZE; 1795 if (reserve_memtype(start, end, new_type, NULL)) 1796 goto err_out; 1797 } 1798 1799 /* If WC, set to UC- first and then WC */ 1800 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
1801 _PAGE_CACHE_MODE_UC_MINUS : new_type; 1802 1803 ret = cpa_set_pages_array(pages, addrinarray, 1804 cachemode2pgprot(set_type)); 1805 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1806 ret = change_page_attr_set_clr(NULL, addrinarray, 1807 cachemode2pgprot( 1808 _PAGE_CACHE_MODE_WC), 1809 __pgprot(_PAGE_CACHE_MASK), 1810 0, CPA_PAGES_ARRAY, pages); 1811 if (ret) 1812 goto err_out; 1813 return 0; /* Success */ 1814err_out: 1815 free_idx = i; 1816 for (i = 0; i < free_idx; i++) { 1817 if (PageHighMem(pages[i])) 1818 continue; 1819 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1820 end = start + PAGE_SIZE; 1821 free_memtype(start, end); 1822 } 1823 return -EINVAL; 1824} 1825 1826int set_pages_array_uc(struct page **pages, int addrinarray) 1827{ 1828 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1829} 1830EXPORT_SYMBOL(set_pages_array_uc); 1831 1832int set_pages_array_wc(struct page **pages, int addrinarray) 1833{ 1834 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); 1835} 1836EXPORT_SYMBOL(set_pages_array_wc); 1837 1838int set_pages_array_wt(struct page **pages, int addrinarray) 1839{ 1840 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); 1841} 1842EXPORT_SYMBOL_GPL(set_pages_array_wt); 1843 1844int set_pages_wb(struct page *page, int numpages) 1845{ 1846 unsigned long addr = (unsigned long)page_address(page); 1847 1848 return set_memory_wb(addr, numpages); 1849} 1850EXPORT_SYMBOL(set_pages_wb); 1851 1852int set_pages_array_wb(struct page **pages, int addrinarray) 1853{ 1854 int retval; 1855 unsigned long start; 1856 unsigned long end; 1857 int i; 1858 1859 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1860 retval = cpa_clear_pages_array(pages, addrinarray, 1861 __pgprot(_PAGE_CACHE_MASK)); 1862 if (retval) 1863 return retval; 1864 1865 for (i = 0; i < addrinarray; i++) { 1866 if (PageHighMem(pages[i])) 1867 continue; 1868 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1869 end = start + PAGE_SIZE; 1870 free_memtype(start, end); 1871 } 1872 1873 return 0; 1874} 1875EXPORT_SYMBOL(set_pages_array_wb); 1876 1877int set_pages_x(struct page *page, int numpages) 1878{ 1879 unsigned long addr = (unsigned long)page_address(page); 1880 1881 return set_memory_x(addr, numpages); 1882} 1883EXPORT_SYMBOL(set_pages_x); 1884 1885int set_pages_nx(struct page *page, int numpages) 1886{ 1887 unsigned long addr = (unsigned long)page_address(page); 1888 1889 return set_memory_nx(addr, numpages); 1890} 1891EXPORT_SYMBOL(set_pages_nx); 1892 1893int set_pages_ro(struct page *page, int numpages) 1894{ 1895 unsigned long addr = (unsigned long)page_address(page); 1896 1897 return set_memory_ro(addr, numpages); 1898} 1899 1900int set_pages_rw(struct page *page, int numpages) 1901{ 1902 unsigned long addr = (unsigned long)page_address(page); 1903 1904 return set_memory_rw(addr, numpages); 1905} 1906 1907#ifdef CONFIG_DEBUG_PAGEALLOC 1908 1909static int __set_pages_p(struct page *page, int numpages) 1910{ 1911 unsigned long tempaddr = (unsigned long) page_address(page); 1912 struct cpa_data cpa = { .vaddr = &tempaddr, 1913 .pgd = NULL, 1914 .numpages = numpages, 1915 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1916 .mask_clr = __pgprot(0), 1917 .flags = 0}; 1918 1919 /* 1920 * No alias checking needed for setting present flag. otherwise, 1921 * we may need to break large pages for 64-bit kernel text 1922 * mappings (this adds to complexity if we want to do this from 1923 * atomic context especially). Let's keep it simple! 
1924 */ 1925 return __change_page_attr_set_clr(&cpa, 0); 1926} 1927 1928static int __set_pages_np(struct page *page, int numpages) 1929{ 1930 unsigned long tempaddr = (unsigned long) page_address(page); 1931 struct cpa_data cpa = { .vaddr = &tempaddr, 1932 .pgd = NULL, 1933 .numpages = numpages, 1934 .mask_set = __pgprot(0), 1935 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1936 .flags = 0}; 1937 1938 /* 1939 * No alias checking needed for setting not present flag. otherwise, 1940 * we may need to break large pages for 64-bit kernel text 1941 * mappings (this adds to complexity if we want to do this from 1942 * atomic context especially). Let's keep it simple! 1943 */ 1944 return __change_page_attr_set_clr(&cpa, 0); 1945} 1946 1947void __kernel_map_pages(struct page *page, int numpages, int enable) 1948{ 1949 if (PageHighMem(page)) 1950 return; 1951 if (!enable) { 1952 debug_check_no_locks_freed(page_address(page), 1953 numpages * PAGE_SIZE); 1954 } 1955 1956 /* 1957 * The return value is ignored as the calls cannot fail. 1958 * Large pages for identity mappings are not used at boot time 1959 * and hence no memory allocations during large page split. 1960 */ 1961 if (enable) 1962 __set_pages_p(page, numpages); 1963 else 1964 __set_pages_np(page, numpages); 1965 1966 /* 1967 * We should perform an IPI and flush all tlbs, 1968 * but that can deadlock->flush only current cpu: 1969 */ 1970 __flush_tlb_all(); 1971 1972 arch_flush_lazy_mmu_mode(); 1973} 1974 1975#ifdef CONFIG_HIBERNATION 1976 1977bool kernel_page_present(struct page *page) 1978{ 1979 unsigned int level; 1980 pte_t *pte; 1981 1982 if (PageHighMem(page)) 1983 return false; 1984 1985 pte = lookup_address((unsigned long)page_address(page), &level); 1986 return (pte_val(*pte) & _PAGE_PRESENT); 1987} 1988 1989#endif /* CONFIG_HIBERNATION */ 1990 1991#endif /* CONFIG_DEBUG_PAGEALLOC */ 1992 1993int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 1994 unsigned numpages, unsigned long page_flags) 1995{ 1996 int retval = -EINVAL; 1997 1998 struct cpa_data cpa = { 1999 .vaddr = &address, 2000 .pfn = pfn, 2001 .pgd = pgd, 2002 .numpages = numpages, 2003 .mask_set = __pgprot(0), 2004 .mask_clr = __pgprot(0), 2005 .flags = 0, 2006 }; 2007 2008 if (!(__supported_pte_mask & _PAGE_NX)) 2009 goto out; 2010 2011 if (!(page_flags & _PAGE_NX)) 2012 cpa.mask_clr = __pgprot(_PAGE_NX); 2013 2014 if (!(page_flags & _PAGE_RW)) 2015 cpa.mask_clr = __pgprot(_PAGE_RW); 2016 2017 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2018 2019 retval = __change_page_attr_set_clr(&cpa, 0); 2020 __flush_tlb_all(); 2021 2022out: 2023 return retval; 2024} 2025 2026/* 2027 * The testcases use internal knowledge of the implementation that shouldn't 2028 * be exposed to the rest of the kernel. Include these directly here. 2029 */ 2030#ifdef CONFIG_CPA_DEBUG 2031#include "pageattr-test.c" 2032#endif
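
For context, here is a minimal sketch (not part of pageattr.c) of how a caller might drive the set_memory_*() interface that this file implements. The module and identifier names (cpa_demo_*) are hypothetical, and the example assumes a single page obtained with get_zeroed_page(); page-aligned addresses are expected, as change_page_attr_set_clr() warns about unaligned input.

/* Illustrative sketch only; cpa_demo_* names are made up for the example. */
#include <linux/module.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

static unsigned long cpa_demo_buf;

static int __init cpa_demo_init(void)
{
	int ret;

	/* One zeroed, page-aligned page. */
	cpa_demo_buf = get_zeroed_page(GFP_KERNEL);
	if (!cpa_demo_buf)
		return -ENOMEM;

	/* Make the page uncached (UC-); this reserves the memtype via PAT. */
	ret = set_memory_uc(cpa_demo_buf, 1);
	if (ret) {
		free_page(cpa_demo_buf);
		return ret;
	}
	return 0;
}

static void __exit cpa_demo_exit(void)
{
	/* Restore write-back caching before handing the page back. */
	set_memory_wb(cpa_demo_buf, 1);
	free_page(cpa_demo_buf);
}

module_init(cpa_demo_init);
module_exit(cpa_demo_exit);
MODULE_LICENSE("GPL");

The pairing matters: set_memory_uc() reserves the region's memtype through reserve_memtype(), and set_memory_wb() both restores the cache attribute and releases that reservation, so leaking the UC mapping would leave a stale PAT entry behind.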