Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.9 · 342 lines · 9.7 kB
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/sched.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
					   unsigned long page_size,
					   unsigned long phys)
{
	int rc = htab_bolt_mapping(start, start + page_size, phys,
				   pgprot_val(PAGE_KERNEL),
				   mmu_vmemmap_psize, mmu_kernel_ssize);
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
				  unsigned long page_size)
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap.
 * It adds an entry to the ioremap page table and an entry
 * to the HPT, possibly bolting it.
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
						       __pgprot(flags)));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * Linux page table entry for this mapping. Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

	smp_wmb();
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					pmd_t *pmdp, unsigned long clr,
					unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}
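
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * typical callers drive the ldarx/stdcx. primitive above with a clear mask
 * and a set mask. The helper below is hypothetical, modeled on how the
 * v4.9 pmdp_set_wrprotect() path uses pmd_hugepage_update(); it only
 * shows the calling pattern and is compiled out.
 */
#if 0
static inline void example_pmd_wrprotect(struct mm_struct *mm,
					 unsigned long addr, pmd_t *pmdp)
{
	/* Clear the write bit; any required hash flush happens inside. */
	hash__pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0);
}
#endif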
pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to a hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with a local ptep pointer value. Such a hash_page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy it to a huge page. So wait for the parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This makes sure we take a
	 * fault and find the pmd as none, which will
	 * result in a major fault that takes mmap_sem and
	 * hence waits for the collapse to complete. Without this,
	 * __collapse_huge_page_copy can end up copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in the pmd and use the pgtable for tracking
 * the base page size hptes.
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				      pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	/*
	 * We store the pgtable in the second half of the PMD.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * Expose the deposited pgtable to other cpus before we set
	 * the hugepage PTE at the pmd level: the hash fault code looks
	 * at the deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * Zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}
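
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * deposit and withdraw are paired operations on the second half of the PMD
 * page, and both assert mm->page_table_lock as seen above. The caller
 * below is hypothetical and only shows the locking pattern; compiled out.
 */
#if 0
static void example_deposit_then_withdraw(struct mm_struct *mm, pmd_t *pmdp,
					  pgtable_t pgtable)
{
	spin_lock(&mm->page_table_lock);
	/* Stash the base-page pgtable behind the huge pmd ... */
	hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable);
	/* ... and reclaim it when the huge mapping is torn down. */
	pgtable = hash__pgtable_trans_huge_withdraw(mm, pmdp);
	spin_unlock(&mm->page_table_lock);
}
#endif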
void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
				   unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that would cause a
	 * race against exit_mmap. We need to keep the pmd marked TRANS
	 * HUGE while we split, but at the same time we want the rest of
	 * the ppc64 code not to insert a hash pte for it, because we
	 * will be modifying the deposited pgtable in the caller of this
	 * function. Hence set _PAGE_PRIVILEGED (the equivalent of
	 * clearing the old _PAGE_USER) so that fault handling moves to
	 * a higher level function, which serializes against the ptl.
	 * We need to flush the existing hash pte entries here even
	 * though the translation is still valid, because we will
	 * withdraw the pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}

/*
 * A Linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;
	const struct cpumask *tmp;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Zero out the old valid bit and hash index details; the
	 * hash fault path looks at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte, which does a
	 * lockless lookup in the page tables with local interrupts
	 * disabled. For huge pages it casts pmd_t to pte_t. Since the
	 * format of pte_t is different from pmd_t, we want to prevent
	 * a transition from a pmd pointing to a page table to a pmd
	 * pointing to a huge page (and back) while interrupts are
	 * disabled. We clear the pmd to possibly replace it with a
	 * page table pointer in different code paths. So make sure we
	 * wait for the parallel find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}
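
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the kick_all_cpus_sync() above waits out walkers like this one, which
 * (in v4.9) traverse the page tables locklessly with interrupts disabled.
 * Hypothetical reader-side pattern, compiled out:
 */
#if 0
static bool example_lockless_is_thp(struct mm_struct *mm, unsigned long ea)
{
	unsigned int shift;
	bool is_thp = false;
	pte_t *ptep;

	local_irq_disable();	/* holds off the IPI, pinning the tables */
	ptep = find_linux_pte_or_hugepte(mm->pgd, ea, &is_thp, &shift);
	/* ptep may only be dereferenced while interrupts stay off */
	local_irq_enable();
	return ptep && is_thp;
}
#endif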
int hash__has_transparent_hugepage(void)
{
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support a 16MB hugepage in a
	 * segment with base page size 64K or 4K. We only enable THP
	 * with a PAGE_SIZE of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will use that by default.
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * OK, we only have 4K HPTE.
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
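
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * in v4.9 the generic has_transparent_hugepage() on book3s64 dispatches
 * between the radix and hash variants, roughly as below; compiled out.
 */
#if 0
static inline int example_has_transparent_hugepage(void)
{
	if (radix_enabled())
		return radix__has_transparent_hugepage();
	return hash__has_transparent_hugepage();
}
#endif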