Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.12 (2032 lines, 50 kB)
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>
#include <asm/set_memory.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgd_t		*pgd;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	unsigned long	numpages;
	int		flags;
	unsigned long	pfn;
	unsigned	force_split : 1;
	int		curpage;
	struct page	**pages;
};

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
 * entries to change a page attribute in parallel while another CPU is
 * splitting a large page entry and changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	/* Protect against CPA */
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
}

static void split_page_count(int level)
{
	if (direct_pages_count[level] == 0)
		return;

	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k: %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa_symbol(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	/* Do not reference physical address outside the kernel. */
	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}

#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr <= end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
	void *vend = vaddr + size;

	if (p >= vend)
		return;

	mb();

	for (; p < vend; p += clflush_size)
		clflushopt(p);

	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);

static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around errata in early Athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize that further and do individual per page
	 * tlb invalidates for a low number of pages. Caveat: we must
	 * flush the high aliases on 64bit as well.
	 */
	__flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long addr;

	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
	WARN_ON(PAGE_ALIGN(start) != start);

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache,
			    int in_flags, struct page **pages)
{
	unsigned int i, level;
#ifdef CONFIG_PREEMPT
	/*
	 * Avoid wbinvd() because it causes latencies on all CPUs,
	 * regardless of any CPU isolation that may be in effect.
	 *
	 * This should be extended for CAT enabled systems independent of
	 * PREEMPT because wbinvd() does not respect the CAT partitions and
	 * this is exposed to unprivileged users through the graphics
	 * subsystem.
	 */
	unsigned long do_wbinvd = 0;
#else
	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif

	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);

	if (!cache || do_wbinvd)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0; i < numpages; i++) {
		unsigned long addr;
		pte_t *pte;

		if (in_flags & CPA_PAGES_ARRAY)
			addr = (unsigned long)page_address(pages[i]);
		else
			addr = start[i];

		pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *)addr, PAGE_SIZE);
	}
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
					  unsigned long pfn)
{
	pgprot_t forbidden = __pgprot(0);

	/*
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
	 */
#ifdef CONFIG_PCI_BIOS
	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_NX;
#endif

	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * Does not cover __inittext since that is gone later on. On
	 * 64bit we do not enforce !NX on the low mapping.
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
	 */
	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_RW;

#if defined(CONFIG_X86_64)
	/*
	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
	 * the kernel text mappings for the large page aligned text and rodata
	 * sections will always be read-only. The kernel identity mappings
	 * covering the holes caused by this alignment can be anything the
	 * user asks for.
	 *
	 * This will preserve the large page mappings for kernel text/data
	 * at no extra cost.
	 */
	if (kernel_set_to_readonly &&
	    within(address, (unsigned long)_text,
		   (unsigned long)__end_rodata_hpage_align)) {
		unsigned int level;

		/*
		 * Don't enforce the !RW mapping for the kernel text mapping,
		 * if the current mapping is already using small page mapping.
		 * No need to work hard to preserve large page mappings in this
		 * case.
		 *
		 * This also fixes the Linux Xen paravirt guest boot failure
		 * (because of unexpected read-only mappings for kernel identity
		 * mappings). In this paravirt guest case, the kernel text
		 * mapping and the kernel identity mapping share the same
		 * page-table pages. Thus we can't really use different
		 * protections for the kernel text and identity mappings. Also,
		 * these shared mappings are made of small page mappings.
		 * Thus, not enforcing the !RW mapping for small page kernel
		 * text mappings helps the Linux Xen paravirt guest boot
		 * as well.
		 */
		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
			pgprot_val(forbidden) |= _PAGE_RW;
	}
#endif

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

	return prot;
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry and the level of the mapping.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
			     unsigned int *level)
{
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return NULL;

	*level = PG_LEVEL_512G;
	if (p4d_large(*p4d) || !p4d_present(*p4d))
		return (pte_t *)p4d;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexistent mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);

static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
				  unsigned int *level)
{
	if (cpa->pgd)
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					     address, level);

	return lookup_address(address, level);
}

/*
 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 * or NULL if not present.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(address);
	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems. The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * This could be optimized, but it is only intended to be
 * used at initialization time, and keeping it
 * unoptimized should increase the testing coverage for
 * the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
	unsigned long virt_addr = (unsigned long)__virt_addr;
	phys_addr_t phys_addr;
	unsigned long offset;
	enum pg_level level;
	pte_t *pte;

	pte = lookup_address(virt_addr, &level);
	BUG_ON(!pte);

	/*
	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
	 * make 32-bit PAE kernels work correctly.
463 */ 464 switch (level) { 465 case PG_LEVEL_1G: 466 phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; 467 offset = virt_addr & ~PUD_PAGE_MASK; 468 break; 469 case PG_LEVEL_2M: 470 phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; 471 offset = virt_addr & ~PMD_PAGE_MASK; 472 break; 473 default: 474 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 475 offset = virt_addr & ~PAGE_MASK; 476 } 477 478 return (phys_addr_t)(phys_addr | offset); 479} 480EXPORT_SYMBOL_GPL(slow_virt_to_phys); 481 482/* 483 * Set the new pmd in all the pgds we know about: 484 */ 485static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 486{ 487 /* change init_mm */ 488 set_pte_atomic(kpte, pte); 489#ifdef CONFIG_X86_32 490 if (!SHARED_KERNEL_PMD) { 491 struct page *page; 492 493 list_for_each_entry(page, &pgd_list, lru) { 494 pgd_t *pgd; 495 p4d_t *p4d; 496 pud_t *pud; 497 pmd_t *pmd; 498 499 pgd = (pgd_t *)page_address(page) + pgd_index(address); 500 p4d = p4d_offset(pgd, address); 501 pud = pud_offset(p4d, address); 502 pmd = pmd_offset(pud, address); 503 set_pte_atomic((pte_t *)pmd, pte); 504 } 505 } 506#endif 507} 508 509static int 510try_preserve_large_page(pte_t *kpte, unsigned long address, 511 struct cpa_data *cpa) 512{ 513 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; 514 pte_t new_pte, old_pte, *tmp; 515 pgprot_t old_prot, new_prot, req_prot; 516 int i, do_split = 1; 517 enum pg_level level; 518 519 if (cpa->force_split) 520 return 1; 521 522 spin_lock(&pgd_lock); 523 /* 524 * Check for races, another CPU might have split this page 525 * up already: 526 */ 527 tmp = _lookup_address_cpa(cpa, address, &level); 528 if (tmp != kpte) 529 goto out_unlock; 530 531 switch (level) { 532 case PG_LEVEL_2M: 533 old_prot = pmd_pgprot(*(pmd_t *)kpte); 534 old_pfn = pmd_pfn(*(pmd_t *)kpte); 535 break; 536 case PG_LEVEL_1G: 537 old_prot = pud_pgprot(*(pud_t *)kpte); 538 old_pfn = pud_pfn(*(pud_t *)kpte); 539 break; 540 default: 541 do_split = -EINVAL; 542 goto out_unlock; 543 } 544 545 psize = page_level_size(level); 546 pmask = page_level_mask(level); 547 548 /* 549 * Calculate the number of pages, which fit into this large 550 * page starting at address: 551 */ 552 nextpage_addr = (address + psize) & pmask; 553 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 554 if (numpages < cpa->numpages) 555 cpa->numpages = numpages; 556 557 /* 558 * We are safe now. Check whether the new pgprot is the same: 559 * Convert protection attributes to 4k-format, as cpa->mask* are set 560 * up accordingly. 561 */ 562 old_pte = *kpte; 563 req_prot = pgprot_large_2_4k(old_prot); 564 565 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); 566 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); 567 568 /* 569 * req_prot is in format of 4k pages. It must be converted to large 570 * page format: the caching mode includes the PAT bit located at 571 * different bit positions in the two formats. 572 */ 573 req_prot = pgprot_4k_2_large(req_prot); 574 575 /* 576 * Set the PSE and GLOBAL flags only if the PRESENT flag is 577 * set otherwise pmd_present/pmd_huge will return true even on 578 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL 579 * for the ancient hardware that doesn't support it. 
580 */ 581 if (pgprot_val(req_prot) & _PAGE_PRESENT) 582 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; 583 else 584 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); 585 586 req_prot = canon_pgprot(req_prot); 587 588 /* 589 * old_pfn points to the large page base pfn. So we need 590 * to add the offset of the virtual address: 591 */ 592 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); 593 cpa->pfn = pfn; 594 595 new_prot = static_protections(req_prot, address, pfn); 596 597 /* 598 * We need to check the full range, whether 599 * static_protection() requires a different pgprot for one of 600 * the pages in the range we try to preserve: 601 */ 602 addr = address & pmask; 603 pfn = old_pfn; 604 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { 605 pgprot_t chk_prot = static_protections(req_prot, addr, pfn); 606 607 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 608 goto out_unlock; 609 } 610 611 /* 612 * If there are no changes, return. maxpages has been updated 613 * above: 614 */ 615 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 616 do_split = 0; 617 goto out_unlock; 618 } 619 620 /* 621 * We need to change the attributes. Check, whether we can 622 * change the large page in one go. We request a split, when 623 * the address is not aligned and the number of pages is 624 * smaller than the number of pages in the large page. Note 625 * that we limited the number of possible pages already to 626 * the number of pages in the large page. 627 */ 628 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 629 /* 630 * The address is aligned and the number of pages 631 * covers the full page. 632 */ 633 new_pte = pfn_pte(old_pfn, new_prot); 634 __set_pmd_pte(kpte, address, new_pte); 635 cpa->flags |= CPA_FLUSHTLB; 636 do_split = 0; 637 } 638 639out_unlock: 640 spin_unlock(&pgd_lock); 641 642 return do_split; 643} 644 645static int 646__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 647 struct page *base) 648{ 649 pte_t *pbase = (pte_t *)page_address(base); 650 unsigned long ref_pfn, pfn, pfninc = 1; 651 unsigned int i, level; 652 pte_t *tmp; 653 pgprot_t ref_prot; 654 655 spin_lock(&pgd_lock); 656 /* 657 * Check for races, another CPU might have split this page 658 * up for us already: 659 */ 660 tmp = _lookup_address_cpa(cpa, address, &level); 661 if (tmp != kpte) { 662 spin_unlock(&pgd_lock); 663 return 1; 664 } 665 666 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 667 668 switch (level) { 669 case PG_LEVEL_2M: 670 ref_prot = pmd_pgprot(*(pmd_t *)kpte); 671 /* clear PSE and promote PAT bit to correct position */ 672 ref_prot = pgprot_large_2_4k(ref_prot); 673 ref_pfn = pmd_pfn(*(pmd_t *)kpte); 674 break; 675 676 case PG_LEVEL_1G: 677 ref_prot = pud_pgprot(*(pud_t *)kpte); 678 ref_pfn = pud_pfn(*(pud_t *)kpte); 679 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 680 681 /* 682 * Clear the PSE flags if the PRESENT flag is not set 683 * otherwise pmd_present/pmd_huge will return true 684 * even on a non present pmd. 685 */ 686 if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) 687 pgprot_val(ref_prot) &= ~_PAGE_PSE; 688 break; 689 690 default: 691 spin_unlock(&pgd_lock); 692 return 1; 693 } 694 695 /* 696 * Set the GLOBAL flags only if the PRESENT flag is set 697 * otherwise pmd/pte_present will return true even on a non 698 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL 699 * for the ancient hardware that doesn't support it. 
700 */ 701 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 702 pgprot_val(ref_prot) |= _PAGE_GLOBAL; 703 else 704 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; 705 706 /* 707 * Get the target pfn from the original entry: 708 */ 709 pfn = ref_pfn; 710 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 711 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); 712 713 if (virt_addr_valid(address)) { 714 unsigned long pfn = PFN_DOWN(__pa(address)); 715 716 if (pfn_range_is_mapped(pfn, pfn + 1)) 717 split_page_count(level); 718 } 719 720 /* 721 * Install the new, split up pagetable. 722 * 723 * We use the standard kernel pagetable protections for the new 724 * pagetable protections, the actual ptes set above control the 725 * primary protection behavior: 726 */ 727 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 728 729 /* 730 * Intel Atom errata AAH41 workaround. 731 * 732 * The real fix should be in hw or in a microcode update, but 733 * we also probabilistically try to reduce the window of having 734 * a large TLB mixed with 4K TLBs while instruction fetches are 735 * going on. 736 */ 737 __flush_tlb_all(); 738 spin_unlock(&pgd_lock); 739 740 return 0; 741} 742 743static int split_large_page(struct cpa_data *cpa, pte_t *kpte, 744 unsigned long address) 745{ 746 struct page *base; 747 748 if (!debug_pagealloc_enabled()) 749 spin_unlock(&cpa_lock); 750 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 751 if (!debug_pagealloc_enabled()) 752 spin_lock(&cpa_lock); 753 if (!base) 754 return -ENOMEM; 755 756 if (__split_large_page(cpa, kpte, address, base)) 757 __free_page(base); 758 759 return 0; 760} 761 762static bool try_to_free_pte_page(pte_t *pte) 763{ 764 int i; 765 766 for (i = 0; i < PTRS_PER_PTE; i++) 767 if (!pte_none(pte[i])) 768 return false; 769 770 free_page((unsigned long)pte); 771 return true; 772} 773 774static bool try_to_free_pmd_page(pmd_t *pmd) 775{ 776 int i; 777 778 for (i = 0; i < PTRS_PER_PMD; i++) 779 if (!pmd_none(pmd[i])) 780 return false; 781 782 free_page((unsigned long)pmd); 783 return true; 784} 785 786static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) 787{ 788 pte_t *pte = pte_offset_kernel(pmd, start); 789 790 while (start < end) { 791 set_pte(pte, __pte(0)); 792 793 start += PAGE_SIZE; 794 pte++; 795 } 796 797 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { 798 pmd_clear(pmd); 799 return true; 800 } 801 return false; 802} 803 804static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, 805 unsigned long start, unsigned long end) 806{ 807 if (unmap_pte_range(pmd, start, end)) 808 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 809 pud_clear(pud); 810} 811 812static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) 813{ 814 pmd_t *pmd = pmd_offset(pud, start); 815 816 /* 817 * Not on a 2MB page boundary? 818 */ 819 if (start & (PMD_SIZE - 1)) { 820 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 821 unsigned long pre_end = min_t(unsigned long, end, next_page); 822 823 __unmap_pmd_range(pud, pmd, start, pre_end); 824 825 start = pre_end; 826 pmd++; 827 } 828 829 /* 830 * Try to unmap in 2M chunks. 831 */ 832 while (end - start >= PMD_SIZE) { 833 if (pmd_large(*pmd)) 834 pmd_clear(pmd); 835 else 836 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); 837 838 start += PMD_SIZE; 839 pmd++; 840 } 841 842 /* 843 * 4K leftovers? 844 */ 845 if (start < end) 846 return __unmap_pmd_range(pud, pmd, start, end); 847 848 /* 849 * Try again to free the PMD page if haven't succeeded above. 
850 */ 851 if (!pud_none(*pud)) 852 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 853 pud_clear(pud); 854} 855 856static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) 857{ 858 pud_t *pud = pud_offset(p4d, start); 859 860 /* 861 * Not on a GB page boundary? 862 */ 863 if (start & (PUD_SIZE - 1)) { 864 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 865 unsigned long pre_end = min_t(unsigned long, end, next_page); 866 867 unmap_pmd_range(pud, start, pre_end); 868 869 start = pre_end; 870 pud++; 871 } 872 873 /* 874 * Try to unmap in 1G chunks? 875 */ 876 while (end - start >= PUD_SIZE) { 877 878 if (pud_large(*pud)) 879 pud_clear(pud); 880 else 881 unmap_pmd_range(pud, start, start + PUD_SIZE); 882 883 start += PUD_SIZE; 884 pud++; 885 } 886 887 /* 888 * 2M leftovers? 889 */ 890 if (start < end) 891 unmap_pmd_range(pud, start, end); 892 893 /* 894 * No need to try to free the PUD page because we'll free it in 895 * populate_pgd's error path 896 */ 897} 898 899static int alloc_pte_page(pmd_t *pmd) 900{ 901 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 902 if (!pte) 903 return -1; 904 905 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 906 return 0; 907} 908 909static int alloc_pmd_page(pud_t *pud) 910{ 911 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 912 if (!pmd) 913 return -1; 914 915 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 916 return 0; 917} 918 919static void populate_pte(struct cpa_data *cpa, 920 unsigned long start, unsigned long end, 921 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) 922{ 923 pte_t *pte; 924 925 pte = pte_offset_kernel(pmd, start); 926 927 /* 928 * Set the GLOBAL flags only if the PRESENT flag is 929 * set otherwise pte_present will return true even on 930 * a non present pte. The canon_pgprot will clear 931 * _PAGE_GLOBAL for the ancient hardware that doesn't 932 * support it. 933 */ 934 if (pgprot_val(pgprot) & _PAGE_PRESENT) 935 pgprot_val(pgprot) |= _PAGE_GLOBAL; 936 else 937 pgprot_val(pgprot) &= ~_PAGE_GLOBAL; 938 939 pgprot = canon_pgprot(pgprot); 940 941 while (num_pages-- && start < end) { 942 set_pte(pte, pfn_pte(cpa->pfn, pgprot)); 943 944 start += PAGE_SIZE; 945 cpa->pfn++; 946 pte++; 947 } 948} 949 950static long populate_pmd(struct cpa_data *cpa, 951 unsigned long start, unsigned long end, 952 unsigned num_pages, pud_t *pud, pgprot_t pgprot) 953{ 954 long cur_pages = 0; 955 pmd_t *pmd; 956 pgprot_t pmd_pgprot; 957 958 /* 959 * Not on a 2M boundary? 960 */ 961 if (start & (PMD_SIZE - 1)) { 962 unsigned long pre_end = start + (num_pages << PAGE_SHIFT); 963 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 964 965 pre_end = min_t(unsigned long, pre_end, next_page); 966 cur_pages = (pre_end - start) >> PAGE_SHIFT; 967 cur_pages = min_t(unsigned int, num_pages, cur_pages); 968 969 /* 970 * Need a PTE page? 971 */ 972 pmd = pmd_offset(pud, start); 973 if (pmd_none(*pmd)) 974 if (alloc_pte_page(pmd)) 975 return -1; 976 977 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); 978 979 start = pre_end; 980 } 981 982 /* 983 * We mapped them all? 984 */ 985 if (num_pages == cur_pages) 986 return cur_pages; 987 988 pmd_pgprot = pgprot_4k_2_large(pgprot); 989 990 while (end - start >= PMD_SIZE) { 991 992 /* 993 * We cannot use a 1G page so allocate a PMD page if needed. 
994 */ 995 if (pud_none(*pud)) 996 if (alloc_pmd_page(pud)) 997 return -1; 998 999 pmd = pmd_offset(pud, start); 1000 1001 set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | 1002 massage_pgprot(pmd_pgprot))); 1003 1004 start += PMD_SIZE; 1005 cpa->pfn += PMD_SIZE >> PAGE_SHIFT; 1006 cur_pages += PMD_SIZE >> PAGE_SHIFT; 1007 } 1008 1009 /* 1010 * Map trailing 4K pages. 1011 */ 1012 if (start < end) { 1013 pmd = pmd_offset(pud, start); 1014 if (pmd_none(*pmd)) 1015 if (alloc_pte_page(pmd)) 1016 return -1; 1017 1018 populate_pte(cpa, start, end, num_pages - cur_pages, 1019 pmd, pgprot); 1020 } 1021 return num_pages; 1022} 1023 1024static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, 1025 pgprot_t pgprot) 1026{ 1027 pud_t *pud; 1028 unsigned long end; 1029 long cur_pages = 0; 1030 pgprot_t pud_pgprot; 1031 1032 end = start + (cpa->numpages << PAGE_SHIFT); 1033 1034 /* 1035 * Not on a Gb page boundary? => map everything up to it with 1036 * smaller pages. 1037 */ 1038 if (start & (PUD_SIZE - 1)) { 1039 unsigned long pre_end; 1040 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 1041 1042 pre_end = min_t(unsigned long, end, next_page); 1043 cur_pages = (pre_end - start) >> PAGE_SHIFT; 1044 cur_pages = min_t(int, (int)cpa->numpages, cur_pages); 1045 1046 pud = pud_offset(p4d, start); 1047 1048 /* 1049 * Need a PMD page? 1050 */ 1051 if (pud_none(*pud)) 1052 if (alloc_pmd_page(pud)) 1053 return -1; 1054 1055 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, 1056 pud, pgprot); 1057 if (cur_pages < 0) 1058 return cur_pages; 1059 1060 start = pre_end; 1061 } 1062 1063 /* We mapped them all? */ 1064 if (cpa->numpages == cur_pages) 1065 return cur_pages; 1066 1067 pud = pud_offset(p4d, start); 1068 pud_pgprot = pgprot_4k_2_large(pgprot); 1069 1070 /* 1071 * Map everything starting from the Gb boundary, possibly with 1G pages 1072 */ 1073 while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { 1074 set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | 1075 massage_pgprot(pud_pgprot))); 1076 1077 start += PUD_SIZE; 1078 cpa->pfn += PUD_SIZE >> PAGE_SHIFT; 1079 cur_pages += PUD_SIZE >> PAGE_SHIFT; 1080 pud++; 1081 } 1082 1083 /* Map trailing leftover */ 1084 if (start < end) { 1085 long tmp; 1086 1087 pud = pud_offset(p4d, start); 1088 if (pud_none(*pud)) 1089 if (alloc_pmd_page(pud)) 1090 return -1; 1091 1092 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, 1093 pud, pgprot); 1094 if (tmp < 0) 1095 return cur_pages; 1096 1097 cur_pages += tmp; 1098 } 1099 return cur_pages; 1100} 1101 1102/* 1103 * Restrictions for kernel page table do not necessarily apply when mapping in 1104 * an alternate PGD. 1105 */ 1106static int populate_pgd(struct cpa_data *cpa, unsigned long addr) 1107{ 1108 pgprot_t pgprot = __pgprot(_KERNPG_TABLE); 1109 pud_t *pud = NULL; /* shut up gcc */ 1110 p4d_t *p4d; 1111 pgd_t *pgd_entry; 1112 long ret; 1113 1114 pgd_entry = cpa->pgd + pgd_index(addr); 1115 1116 if (pgd_none(*pgd_entry)) { 1117 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1118 if (!p4d) 1119 return -1; 1120 1121 set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); 1122 } 1123 1124 /* 1125 * Allocate a PUD page and hand it down for mapping. 
1126 */ 1127 p4d = p4d_offset(pgd_entry, addr); 1128 if (p4d_none(*p4d)) { 1129 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1130 if (!pud) 1131 return -1; 1132 1133 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); 1134 } 1135 1136 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); 1137 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); 1138 1139 ret = populate_pud(cpa, addr, p4d, pgprot); 1140 if (ret < 0) { 1141 /* 1142 * Leave the PUD page in place in case some other CPU or thread 1143 * already found it, but remove any useless entries we just 1144 * added to it. 1145 */ 1146 unmap_pud_range(p4d, addr, 1147 addr + (cpa->numpages << PAGE_SHIFT)); 1148 return ret; 1149 } 1150 1151 cpa->numpages = ret; 1152 return 0; 1153} 1154 1155static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1156 int primary) 1157{ 1158 if (cpa->pgd) { 1159 /* 1160 * Right now, we only execute this code path when mapping 1161 * the EFI virtual memory map regions, no other users 1162 * provide a ->pgd value. This may change in the future. 1163 */ 1164 return populate_pgd(cpa, vaddr); 1165 } 1166 1167 /* 1168 * Ignore all non primary paths. 1169 */ 1170 if (!primary) { 1171 cpa->numpages = 1; 1172 return 0; 1173 } 1174 1175 /* 1176 * Ignore the NULL PTE for kernel identity mapping, as it is expected 1177 * to have holes. 1178 * Also set numpages to '1' indicating that we processed cpa req for 1179 * one virtual address page and its pfn. TBD: numpages can be set based 1180 * on the initial value and the level returned by lookup_address(). 1181 */ 1182 if (within(vaddr, PAGE_OFFSET, 1183 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 1184 cpa->numpages = 1; 1185 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; 1186 return 0; 1187 } else { 1188 WARN(1, KERN_WARNING "CPA: called for zero pte. " 1189 "vaddr = %lx cpa->vaddr = %lx\n", vaddr, 1190 *cpa->vaddr); 1191 1192 return -EFAULT; 1193 } 1194} 1195 1196static int __change_page_attr(struct cpa_data *cpa, int primary) 1197{ 1198 unsigned long address; 1199 int do_split, err; 1200 unsigned int level; 1201 pte_t *kpte, old_pte; 1202 1203 if (cpa->flags & CPA_PAGES_ARRAY) { 1204 struct page *page = cpa->pages[cpa->curpage]; 1205 if (unlikely(PageHighMem(page))) 1206 return 0; 1207 address = (unsigned long)page_address(page); 1208 } else if (cpa->flags & CPA_ARRAY) 1209 address = cpa->vaddr[cpa->curpage]; 1210 else 1211 address = *cpa->vaddr; 1212repeat: 1213 kpte = _lookup_address_cpa(cpa, address, &level); 1214 if (!kpte) 1215 return __cpa_process_fault(cpa, address, primary); 1216 1217 old_pte = *kpte; 1218 if (pte_none(old_pte)) 1219 return __cpa_process_fault(cpa, address, primary); 1220 1221 if (level == PG_LEVEL_4K) { 1222 pte_t new_pte; 1223 pgprot_t new_prot = pte_pgprot(old_pte); 1224 unsigned long pfn = pte_pfn(old_pte); 1225 1226 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1227 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1228 1229 new_prot = static_protections(new_prot, address, pfn); 1230 1231 /* 1232 * Set the GLOBAL flags only if the PRESENT flag is 1233 * set otherwise pte_present will return true even on 1234 * a non present pte. The canon_pgprot will clear 1235 * _PAGE_GLOBAL for the ancient hardware that doesn't 1236 * support it. 
		 */
		if (pgprot_val(new_prot) & _PAGE_PRESENT)
			pgprot_val(new_prot) |= _PAGE_GLOBAL;
		else
			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = try_preserve_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and cpa->flags have been updated in
	 * try_preserve_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(cpa, kpte, address);
	if (!err) {
		/*
		 * Do a global tlb flush after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, which says
		 * "The TLBs may contain both ordinary and large-page
		 * translations for a 4-KByte range of linear addresses. This
		 * may occur if software modifies the paging structures so that
		 * the page size used for the address range changes. If the two
		 * translations differ with respect to page frame or attributes
		 * (e.g., permissions), processor behavior is undefined and may
		 * be implementation-specific."
		 *
		 * We do this global tlb flush inside the cpa_lock, so that we
		 * don't allow any other CPU with stale tlb entries to change
		 * the page attribute in parallel for an address that also
		 * falls into the just-split large page entry.
1296 */ 1297 flush_tlb_all(); 1298 goto repeat; 1299 } 1300 1301 return err; 1302} 1303 1304static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); 1305 1306static int cpa_process_alias(struct cpa_data *cpa) 1307{ 1308 struct cpa_data alias_cpa; 1309 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 1310 unsigned long vaddr; 1311 int ret; 1312 1313 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) 1314 return 0; 1315 1316 /* 1317 * No need to redo, when the primary call touched the direct 1318 * mapping already: 1319 */ 1320 if (cpa->flags & CPA_PAGES_ARRAY) { 1321 struct page *page = cpa->pages[cpa->curpage]; 1322 if (unlikely(PageHighMem(page))) 1323 return 0; 1324 vaddr = (unsigned long)page_address(page); 1325 } else if (cpa->flags & CPA_ARRAY) 1326 vaddr = cpa->vaddr[cpa->curpage]; 1327 else 1328 vaddr = *cpa->vaddr; 1329 1330 if (!(within(vaddr, PAGE_OFFSET, 1331 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 1332 1333 alias_cpa = *cpa; 1334 alias_cpa.vaddr = &laddr; 1335 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1336 1337 ret = __change_page_attr_set_clr(&alias_cpa, 0); 1338 if (ret) 1339 return ret; 1340 } 1341 1342#ifdef CONFIG_X86_64 1343 /* 1344 * If the primary call didn't touch the high mapping already 1345 * and the physical address is inside the kernel map, we need 1346 * to touch the high mapped kernel as well: 1347 */ 1348 if (!within(vaddr, (unsigned long)_text, _brk_end) && 1349 within_inclusive(cpa->pfn, highmap_start_pfn(), 1350 highmap_end_pfn())) { 1351 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + 1352 __START_KERNEL_map - phys_base; 1353 alias_cpa = *cpa; 1354 alias_cpa.vaddr = &temp_cpa_vaddr; 1355 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1356 1357 /* 1358 * The high mapping range is imprecise, so ignore the 1359 * return value. 1360 */ 1361 __change_page_attr_set_clr(&alias_cpa, 0); 1362 } 1363#endif 1364 1365 return 0; 1366} 1367 1368static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 1369{ 1370 unsigned long numpages = cpa->numpages; 1371 int ret; 1372 1373 while (numpages) { 1374 /* 1375 * Store the remaining nr of pages for the large page 1376 * preservation check. 1377 */ 1378 cpa->numpages = numpages; 1379 /* for array changes, we can't use large page */ 1380 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1381 cpa->numpages = 1; 1382 1383 if (!debug_pagealloc_enabled()) 1384 spin_lock(&cpa_lock); 1385 ret = __change_page_attr(cpa, checkalias); 1386 if (!debug_pagealloc_enabled()) 1387 spin_unlock(&cpa_lock); 1388 if (ret) 1389 return ret; 1390 1391 if (checkalias) { 1392 ret = cpa_process_alias(cpa); 1393 if (ret) 1394 return ret; 1395 } 1396 1397 /* 1398 * Adjust the number of pages with the result of the 1399 * CPA operation. Either a large page has been 1400 * preserved or a single page update happened. 
		 */
		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
		numpages -= cpa->numpages;
		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
			cpa->curpage++;
		else
			*cpa->vaddr += cpa->numpages * PAGE_SIZE;

	}
	return 0;
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;
	unsigned long baddr = 0;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check whether we are requested to change a feature that is not
	 * supported:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case.
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
		/*
		 * Save address for cache flush. *addr is modified in the call
		 * to __change_page_attr_set_clr() below.
		 */
		baddr = *addr;
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
		cpa.flags |= in_flag;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On success we use CLFLUSH, when the CPU supports it, to
	 * avoid the WBINVD. If the CPU does not support it, or in the
	 * error case, we fall back to cpa_flush_all() (which uses
	 * WBINVD):
	 */
	if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
			cpa_flush_array(addr, numpages, cache,
					cpa.flags, pages);
		} else
			cpa_flush_range(baddr, numpages, cache);
	} else
		cpa_flush_all(cache);

out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ?
CPA_ARRAY : 0), NULL); 1526} 1527 1528static inline int cpa_set_pages_array(struct page **pages, int numpages, 1529 pgprot_t mask) 1530{ 1531 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, 1532 CPA_PAGES_ARRAY, pages); 1533} 1534 1535static inline int cpa_clear_pages_array(struct page **pages, int numpages, 1536 pgprot_t mask) 1537{ 1538 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, 1539 CPA_PAGES_ARRAY, pages); 1540} 1541 1542int _set_memory_uc(unsigned long addr, int numpages) 1543{ 1544 /* 1545 * for now UC MINUS. see comments in ioremap_nocache() 1546 * If you really need strong UC use ioremap_uc(), but note 1547 * that you cannot override IO areas with set_memory_*() as 1548 * these helpers cannot work with IO memory. 1549 */ 1550 return change_page_attr_set(&addr, numpages, 1551 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1552 0); 1553} 1554 1555int set_memory_uc(unsigned long addr, int numpages) 1556{ 1557 int ret; 1558 1559 /* 1560 * for now UC MINUS. see comments in ioremap_nocache() 1561 */ 1562 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1563 _PAGE_CACHE_MODE_UC_MINUS, NULL); 1564 if (ret) 1565 goto out_err; 1566 1567 ret = _set_memory_uc(addr, numpages); 1568 if (ret) 1569 goto out_free; 1570 1571 return 0; 1572 1573out_free: 1574 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1575out_err: 1576 return ret; 1577} 1578EXPORT_SYMBOL(set_memory_uc); 1579 1580static int _set_memory_array(unsigned long *addr, int addrinarray, 1581 enum page_cache_mode new_type) 1582{ 1583 enum page_cache_mode set_type; 1584 int i, j; 1585 int ret; 1586 1587 for (i = 0; i < addrinarray; i++) { 1588 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1589 new_type, NULL); 1590 if (ret) 1591 goto out_free; 1592 } 1593 1594 /* If WC, set to UC- first and then WC */ 1595 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
1596 _PAGE_CACHE_MODE_UC_MINUS : new_type; 1597 1598 ret = change_page_attr_set(addr, addrinarray, 1599 cachemode2pgprot(set_type), 1); 1600 1601 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1602 ret = change_page_attr_set_clr(addr, addrinarray, 1603 cachemode2pgprot( 1604 _PAGE_CACHE_MODE_WC), 1605 __pgprot(_PAGE_CACHE_MASK), 1606 0, CPA_ARRAY, NULL); 1607 if (ret) 1608 goto out_free; 1609 1610 return 0; 1611 1612out_free: 1613 for (j = 0; j < i; j++) 1614 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); 1615 1616 return ret; 1617} 1618 1619int set_memory_array_uc(unsigned long *addr, int addrinarray) 1620{ 1621 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1622} 1623EXPORT_SYMBOL(set_memory_array_uc); 1624 1625int set_memory_array_wc(unsigned long *addr, int addrinarray) 1626{ 1627 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); 1628} 1629EXPORT_SYMBOL(set_memory_array_wc); 1630 1631int set_memory_array_wt(unsigned long *addr, int addrinarray) 1632{ 1633 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); 1634} 1635EXPORT_SYMBOL_GPL(set_memory_array_wt); 1636 1637int _set_memory_wc(unsigned long addr, int numpages) 1638{ 1639 int ret; 1640 unsigned long addr_copy = addr; 1641 1642 ret = change_page_attr_set(&addr, numpages, 1643 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1644 0); 1645 if (!ret) { 1646 ret = change_page_attr_set_clr(&addr_copy, numpages, 1647 cachemode2pgprot( 1648 _PAGE_CACHE_MODE_WC), 1649 __pgprot(_PAGE_CACHE_MASK), 1650 0, 0, NULL); 1651 } 1652 return ret; 1653} 1654 1655int set_memory_wc(unsigned long addr, int numpages) 1656{ 1657 int ret; 1658 1659 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1660 _PAGE_CACHE_MODE_WC, NULL); 1661 if (ret) 1662 return ret; 1663 1664 ret = _set_memory_wc(addr, numpages); 1665 if (ret) 1666 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1667 1668 return ret; 1669} 1670EXPORT_SYMBOL(set_memory_wc); 1671 1672int _set_memory_wt(unsigned long addr, int numpages) 1673{ 1674 return change_page_attr_set(&addr, numpages, 1675 cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); 1676} 1677 1678int set_memory_wt(unsigned long addr, int numpages) 1679{ 1680 int ret; 1681 1682 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1683 _PAGE_CACHE_MODE_WT, NULL); 1684 if (ret) 1685 return ret; 1686 1687 ret = _set_memory_wt(addr, numpages); 1688 if (ret) 1689 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1690 1691 return ret; 1692} 1693EXPORT_SYMBOL_GPL(set_memory_wt); 1694 1695int _set_memory_wb(unsigned long addr, int numpages) 1696{ 1697 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1698 return change_page_attr_clear(&addr, numpages, 1699 __pgprot(_PAGE_CACHE_MASK), 0); 1700} 1701 1702int set_memory_wb(unsigned long addr, int numpages) 1703{ 1704 int ret; 1705 1706 ret = _set_memory_wb(addr, numpages); 1707 if (ret) 1708 return ret; 1709 1710 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1711 return 0; 1712} 1713EXPORT_SYMBOL(set_memory_wb); 1714 1715int set_memory_array_wb(unsigned long *addr, int addrinarray) 1716{ 1717 int i; 1718 int ret; 1719 1720 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1721 ret = change_page_attr_clear(addr, addrinarray, 1722 __pgprot(_PAGE_CACHE_MASK), 1); 1723 if (ret) 1724 return ret; 1725 1726 for (i = 0; i < addrinarray; i++) 1727 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); 1728 1729 return 0; 1730} 
1731EXPORT_SYMBOL(set_memory_array_wb); 1732 1733int set_memory_x(unsigned long addr, int numpages) 1734{ 1735 if (!(__supported_pte_mask & _PAGE_NX)) 1736 return 0; 1737 1738 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1739} 1740EXPORT_SYMBOL(set_memory_x); 1741 1742int set_memory_nx(unsigned long addr, int numpages) 1743{ 1744 if (!(__supported_pte_mask & _PAGE_NX)) 1745 return 0; 1746 1747 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1748} 1749EXPORT_SYMBOL(set_memory_nx); 1750 1751int set_memory_ro(unsigned long addr, int numpages) 1752{ 1753 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1754} 1755 1756int set_memory_rw(unsigned long addr, int numpages) 1757{ 1758 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1759} 1760 1761int set_memory_np(unsigned long addr, int numpages) 1762{ 1763 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1764} 1765 1766int set_memory_4k(unsigned long addr, int numpages) 1767{ 1768 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1769 __pgprot(0), 1, 0, NULL); 1770} 1771 1772int set_pages_uc(struct page *page, int numpages) 1773{ 1774 unsigned long addr = (unsigned long)page_address(page); 1775 1776 return set_memory_uc(addr, numpages); 1777} 1778EXPORT_SYMBOL(set_pages_uc); 1779 1780static int _set_pages_array(struct page **pages, int addrinarray, 1781 enum page_cache_mode new_type) 1782{ 1783 unsigned long start; 1784 unsigned long end; 1785 enum page_cache_mode set_type; 1786 int i; 1787 int free_idx; 1788 int ret; 1789 1790 for (i = 0; i < addrinarray; i++) { 1791 if (PageHighMem(pages[i])) 1792 continue; 1793 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1794 end = start + PAGE_SIZE; 1795 if (reserve_memtype(start, end, new_type, NULL)) 1796 goto err_out; 1797 } 1798 1799 /* If WC, set to UC- first and then WC */ 1800 set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
1801 _PAGE_CACHE_MODE_UC_MINUS : new_type; 1802 1803 ret = cpa_set_pages_array(pages, addrinarray, 1804 cachemode2pgprot(set_type)); 1805 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1806 ret = change_page_attr_set_clr(NULL, addrinarray, 1807 cachemode2pgprot( 1808 _PAGE_CACHE_MODE_WC), 1809 __pgprot(_PAGE_CACHE_MASK), 1810 0, CPA_PAGES_ARRAY, pages); 1811 if (ret) 1812 goto err_out; 1813 return 0; /* Success */ 1814err_out: 1815 free_idx = i; 1816 for (i = 0; i < free_idx; i++) { 1817 if (PageHighMem(pages[i])) 1818 continue; 1819 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1820 end = start + PAGE_SIZE; 1821 free_memtype(start, end); 1822 } 1823 return -EINVAL; 1824} 1825 1826int set_pages_array_uc(struct page **pages, int addrinarray) 1827{ 1828 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1829} 1830EXPORT_SYMBOL(set_pages_array_uc); 1831 1832int set_pages_array_wc(struct page **pages, int addrinarray) 1833{ 1834 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); 1835} 1836EXPORT_SYMBOL(set_pages_array_wc); 1837 1838int set_pages_array_wt(struct page **pages, int addrinarray) 1839{ 1840 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); 1841} 1842EXPORT_SYMBOL_GPL(set_pages_array_wt); 1843 1844int set_pages_wb(struct page *page, int numpages) 1845{ 1846 unsigned long addr = (unsigned long)page_address(page); 1847 1848 return set_memory_wb(addr, numpages); 1849} 1850EXPORT_SYMBOL(set_pages_wb); 1851 1852int set_pages_array_wb(struct page **pages, int addrinarray) 1853{ 1854 int retval; 1855 unsigned long start; 1856 unsigned long end; 1857 int i; 1858 1859 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1860 retval = cpa_clear_pages_array(pages, addrinarray, 1861 __pgprot(_PAGE_CACHE_MASK)); 1862 if (retval) 1863 return retval; 1864 1865 for (i = 0; i < addrinarray; i++) { 1866 if (PageHighMem(pages[i])) 1867 continue; 1868 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1869 end = start + PAGE_SIZE; 1870 free_memtype(start, end); 1871 } 1872 1873 return 0; 1874} 1875EXPORT_SYMBOL(set_pages_array_wb); 1876 1877int set_pages_x(struct page *page, int numpages) 1878{ 1879 unsigned long addr = (unsigned long)page_address(page); 1880 1881 return set_memory_x(addr, numpages); 1882} 1883EXPORT_SYMBOL(set_pages_x); 1884 1885int set_pages_nx(struct page *page, int numpages) 1886{ 1887 unsigned long addr = (unsigned long)page_address(page); 1888 1889 return set_memory_nx(addr, numpages); 1890} 1891EXPORT_SYMBOL(set_pages_nx); 1892 1893int set_pages_ro(struct page *page, int numpages) 1894{ 1895 unsigned long addr = (unsigned long)page_address(page); 1896 1897 return set_memory_ro(addr, numpages); 1898} 1899 1900int set_pages_rw(struct page *page, int numpages) 1901{ 1902 unsigned long addr = (unsigned long)page_address(page); 1903 1904 return set_memory_rw(addr, numpages); 1905} 1906 1907#ifdef CONFIG_DEBUG_PAGEALLOC 1908 1909static int __set_pages_p(struct page *page, int numpages) 1910{ 1911 unsigned long tempaddr = (unsigned long) page_address(page); 1912 struct cpa_data cpa = { .vaddr = &tempaddr, 1913 .pgd = NULL, 1914 .numpages = numpages, 1915 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1916 .mask_clr = __pgprot(0), 1917 .flags = 0}; 1918 1919 /* 1920 * No alias checking needed for setting present flag. otherwise, 1921 * we may need to break large pages for 64-bit kernel text 1922 * mappings (this adds to complexity if we want to do this from 1923 * atomic context especially). Let's keep it simple! 
1924 */ 1925 return __change_page_attr_set_clr(&cpa, 0); 1926} 1927 1928static int __set_pages_np(struct page *page, int numpages) 1929{ 1930 unsigned long tempaddr = (unsigned long) page_address(page); 1931 struct cpa_data cpa = { .vaddr = &tempaddr, 1932 .pgd = NULL, 1933 .numpages = numpages, 1934 .mask_set = __pgprot(0), 1935 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1936 .flags = 0}; 1937 1938 /* 1939 * No alias checking needed for setting not present flag. otherwise, 1940 * we may need to break large pages for 64-bit kernel text 1941 * mappings (this adds to complexity if we want to do this from 1942 * atomic context especially). Let's keep it simple! 1943 */ 1944 return __change_page_attr_set_clr(&cpa, 0); 1945} 1946 1947void __kernel_map_pages(struct page *page, int numpages, int enable) 1948{ 1949 if (PageHighMem(page)) 1950 return; 1951 if (!enable) { 1952 debug_check_no_locks_freed(page_address(page), 1953 numpages * PAGE_SIZE); 1954 } 1955 1956 /* 1957 * The return value is ignored as the calls cannot fail. 1958 * Large pages for identity mappings are not used at boot time 1959 * and hence no memory allocations during large page split. 1960 */ 1961 if (enable) 1962 __set_pages_p(page, numpages); 1963 else 1964 __set_pages_np(page, numpages); 1965 1966 /* 1967 * We should perform an IPI and flush all tlbs, 1968 * but that can deadlock->flush only current cpu: 1969 */ 1970 __flush_tlb_all(); 1971 1972 arch_flush_lazy_mmu_mode(); 1973} 1974 1975#ifdef CONFIG_HIBERNATION 1976 1977bool kernel_page_present(struct page *page) 1978{ 1979 unsigned int level; 1980 pte_t *pte; 1981 1982 if (PageHighMem(page)) 1983 return false; 1984 1985 pte = lookup_address((unsigned long)page_address(page), &level); 1986 return (pte_val(*pte) & _PAGE_PRESENT); 1987} 1988 1989#endif /* CONFIG_HIBERNATION */ 1990 1991#endif /* CONFIG_DEBUG_PAGEALLOC */ 1992 1993int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 1994 unsigned numpages, unsigned long page_flags) 1995{ 1996 int retval = -EINVAL; 1997 1998 struct cpa_data cpa = { 1999 .vaddr = &address, 2000 .pfn = pfn, 2001 .pgd = pgd, 2002 .numpages = numpages, 2003 .mask_set = __pgprot(0), 2004 .mask_clr = __pgprot(0), 2005 .flags = 0, 2006 }; 2007 2008 if (!(__supported_pte_mask & _PAGE_NX)) 2009 goto out; 2010 2011 if (!(page_flags & _PAGE_NX)) 2012 cpa.mask_clr = __pgprot(_PAGE_NX); 2013 2014 if (!(page_flags & _PAGE_RW)) 2015 cpa.mask_clr = __pgprot(_PAGE_RW); 2016 2017 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2018 2019 retval = __change_page_attr_set_clr(&cpa, 0); 2020 __flush_tlb_all(); 2021 2022out: 2023 return retval; 2024} 2025 2026/* 2027 * The testcases use internal knowledge of the implementation that shouldn't 2028 * be exposed to the rest of the kernel. Include these directly here. 2029 */ 2030#ifdef CONFIG_CPA_DEBUG 2031#include "pageattr-test.c" 2032#endif
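
For context, here is a minimal sketch (not part of pageattr.c) of how a caller might drive the set_memory_*() interface that this file implements. The module and identifier names (cpa_demo_*) are hypothetical, and the example assumes a single page obtained with get_zeroed_page(); page-aligned addresses are expected, as change_page_attr_set_clr() warns about unaligned input.

/* Illustrative sketch only; cpa_demo_* names are made up for the example. */
#include <linux/module.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

static unsigned long cpa_demo_buf;

static int __init cpa_demo_init(void)
{
	int ret;

	/* One zeroed, page-aligned page. */
	cpa_demo_buf = get_zeroed_page(GFP_KERNEL);
	if (!cpa_demo_buf)
		return -ENOMEM;

	/* Make the page uncached (UC-); this reserves the memtype via PAT. */
	ret = set_memory_uc(cpa_demo_buf, 1);
	if (ret) {
		free_page(cpa_demo_buf);
		return ret;
	}
	return 0;
}

static void __exit cpa_demo_exit(void)
{
	/* Restore write-back caching before handing the page back. */
	set_memory_wb(cpa_demo_buf, 1);
	free_page(cpa_demo_buf);
}

module_init(cpa_demo_init);
module_exit(cpa_demo_exit);
MODULE_LICENSE("GPL");

The pairing matters: set_memory_uc() reserves the region's memtype through reserve_memtype(), and set_memory_wb() both restores the cache attribute and releases that reservation, so leaking the UC mapping would leave a stale PAT entry behind.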