// SPDX-License-Identifier: GPL-2.0
/*
 * Helper functions for KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2025
 */

#include <linux/export.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>

/**
 * ptep_zap_softleaf_entry() - discard a software leaf entry.
 * @mm: the mm
 * @entry: the software leaf entry that needs to be zapped
 *
 * Discards the given software leaf entry. If the leaf entry was an actual
 * swap entry (and not a migration entry, for example), the swapped-out
 * page is also discarded from swap.
 */
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
	if (softleaf_is_swap(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (softleaf_is_migration(entry))
		dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
	swap_put_entries_direct(entry, 1);
}

/**
 * gmap_helper_zap_one_page() - discard a page if it was swapped.
 * @mm: the mm
 * @vmaddr: the userspace virtual address that needs to be discarded
 *
 * If the given address maps to a swap entry, discard it.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	mmap_assert_locked(mm);

	/* Find the vm address for the guest address */
	vma = vma_lookup(mm, vmaddr);
	if (!vma || is_vm_hugetlb_page(vma))
		return;

	/* Get pointer to the page table entry */
	ptep = get_locked_pte(mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	if (pte_swap(*ptep)) {
		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
		pte_clear(mm, vmaddr, ptep);
	}
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);

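/*
 * Illustrative sketch only, not part of this file's API: a hypothetical
 * caller (the name example_zap_guest_page is made up) that discards a
 * single page at a guest-backed userspace address, taking the mmap lock
 * as required by the Context note above.
 */
static void __maybe_unused example_zap_guest_page(struct mm_struct *mm,
						  unsigned long vmaddr)
{
	mmap_read_lock(mm);
	gmap_helper_zap_one_page(mm, vmaddr);
	mmap_read_unlock(mm);
}
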
/**
 * gmap_helper_discard() - discard user pages in the given range
 * @mm: the mm
 * @vmaddr: starting userspace address
 * @end: end address (first address outside the range)
 *
 * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);

	while (vmaddr < end) {
		vma = find_vma_intersection(mm, vmaddr, end);
		if (!vma)
			return;
		if (!is_vm_hugetlb_page(vma))
			zap_vma_range(vma, vmaddr, min(end, vma->vm_end) - vmaddr);
		vmaddr = vma->vm_end;
	}
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);

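/*
 * Illustrative sketch only: discarding a whole region. Since @end is the
 * first address outside the range, passing vmaddr + size discards exactly
 * size bytes worth of pages. The helper name below is made up for this
 * example.
 */
static void __maybe_unused example_discard_region(struct mm_struct *mm,
						  unsigned long vmaddr,
						  unsigned long size)
{
	mmap_read_lock(mm);
	gmap_helper_discard(mm, vmaddr, vmaddr + size);
	mmap_read_unlock(mm);
}
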
/**
 * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
 * @mm: the mm
 * @vmaddr: the userspace address whose pte is to be marked
 *
 * Mark the pte corresponding to the given address as unused. This will cause
 * the core mm code to simply drop this page instead of swapping it out.
 *
 * This function needs to be called with interrupts disabled (for example
 * while holding a spinlock), or while holding the mmap lock. Normally this
 * function is called as a result of an unmap operation, and thus KVM common
 * code will already hold kvm->mmu_lock in write mode.
 *
 * Context: Needs to be called while holding the mmap lock or with interrupts
 * disabled.
 */
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmdp, pmd, pmdval;
	pud_t *pudp, pud;
	p4d_t *p4dp, p4d;
	pgd_t *pgdp, pgd;
	spinlock_t *ptl;	/* Lock for the host (userspace) page table */
	pte_t *ptep;

	pgdp = pgd_offset(mm, vmaddr);
	pgd = pgdp_get(pgdp);
	if (pgd_none(pgd) || !pgd_present(pgd))
		return;

	p4dp = p4d_offset(pgdp, vmaddr);
	p4d = p4dp_get(p4dp);
	if (p4d_none(p4d) || !p4d_present(p4d))
		return;

	pudp = pud_offset(p4dp, vmaddr);
	pud = pudp_get(pudp);
	if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
		return;

	pmdp = pmd_offset(pudp, vmaddr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
		return;

	ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
	if (!ptep)
		return;

	/*
	 * Several paths exist that take the ptl lock and then call the
	 * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
	 * takes the mmu_lock in write mode first, and then potentially
	 * calls this function, which takes the ptl lock. This can lead to a
	 * deadlock.
	 * The unused page mechanism is only an optimization: if the
	 * _PAGE_UNUSED bit is not set, the unused page is swapped out as
	 * normal instead of being discarded.
	 * If the lock is contended, the bit is not set and the deadlock is
	 * avoided.
	 */
	if (spin_trylock(ptl)) {
		/*
		 * Make sure the pte we are touching is still the correct
		 * one. In theory this check should not be needed, but
		 * better safe than sorry.
		 * Disabling interrupts or holding the mmap lock is enough to
		 * guarantee that no concurrent updates to the page tables
		 * are possible.
		 */
		if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
			__atomic64_or(_PAGE_UNUSED, (long *)ptep);
		spin_unlock(ptl);
	}

	pte_unmap(ptep);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);

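/*
 * Illustrative sketch only: one of the two documented calling contexts,
 * using the mmap lock. On the KVM unmap path the caller would typically
 * already hold kvm->mmu_lock in write mode instead, as described above.
 * The helper name below is made up for this example.
 */
static void __maybe_unused example_mark_pte_unused(struct mm_struct *mm,
						   unsigned long vmaddr)
{
	mmap_read_lock(mm);
	gmap_helper_try_set_pte_unused(mm, vmaddr);
	mmap_read_unlock(mm);
}
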
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	/* Return 1 if the page is a zeropage. */
	if (is_zero_pfn(pte_pfn(*pte))) {
		/*
		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
		 * currently only works in COW mappings, which is also where
		 * mm_forbids_zeropage() is checked.
		 */
		if (!is_cow_mapping(walk->vma->vm_flags))
			return -EFAULT;

		*found_addr = addr;
		return 1;
	}
	return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry = find_zeropage_pte_entry,
	.walk_lock = PGWALK_WRLOCK,
};

/**
 * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
 * @mm: the mm whose zero pages are to be unshared
 *
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing has been set to 0,
 * to avoid future mappings of shared zeropages.
 *
 * mm contracts with s390 that, even if mm were to remove a page table,
 * causing a racing walk_page_range_vma() call to fail in
 * pte_offset_map_lock(), it will never insert a page table containing empty
 * zero pages once mm_forbids_zeropage(mm), i.e.
 * mm->context.allow_cow_sharing, is set to 0.
 */
static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	unsigned long addr;
	vm_fault_t fault;
	int rc;

	for_each_vma(vmi, vma) {
		/*
		 * We could only look at COW mappings, but it's more
		 * future-proof to catch unexpected zeropages in other
		 * mappings and fail.
		 */
		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
			continue;
		addr = vma->vm_start;

retry:
		rc = walk_page_range_vma(vma, addr, vma->vm_end,
					 &find_zeropage_ops, &addr);
		if (rc < 0)
			return rc;
		else if (!rc)
			continue;

		/* addr was updated by find_zeropage_pte_entry() */
		fault = handle_mm_fault(vma, addr,
					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
					NULL);
		if (fault & VM_FAULT_OOM)
			return -ENOMEM;
		/*
		 * See break_ksm(): even after handle_mm_fault() returned 0, we
		 * must start the lookup from the current address, because
		 * handle_mm_fault() may back out if there's any difficulty.
		 *
		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
		 * maybe they could trigger in the future on concurrent
		 * truncation. In that case, the shared zeropage would be gone
		 * and we can simply retry and make progress.
		 */
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 * gmap_helper_disable_cow_sharing() - disable all COW sharing
 *
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 */
int gmap_helper_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mmap_assert_write_locked(mm);

	if (!mm->context.allow_cow_sharing)
		return 0;

	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __gmap_helper_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
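
/*
 * Illustrative sketch only: gmap_helper_disable_cow_sharing() operates on
 * current->mm and asserts that the mmap lock is held in write mode, so a
 * hypothetical caller (the name below is made up) would wrap it like this:
 */
static int __maybe_unused example_disable_cow_sharing(void)
{
	int rc;

	mmap_write_lock(current->mm);
	rc = gmap_helper_disable_cow_sharing();
	mmap_write_unlock(current->mm);
	return rc;
}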