include/linux/pagewalk.h at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / include / linux / pagewalk.h
at master 7.8 kB view raw
  1/* SPDX-License-Identifier: GPL-2.0 */
  2#ifndef _LINUX_PAGEWALK_H
  3#define _LINUX_PAGEWALK_H
  4
  5#include <linux/mm.h>
  6
  7struct mm_walk;
  8
  9/*
 * Locking requirement during a page walk. A value of this type is carried
 * in the walk_lock member of struct mm_walk_ops.
 */
 10enum page_walk_lock {
 11	/* mmap_lock should be locked for read to stabilize the vma tree */
 12	PGWALK_RDLOCK = 0,
 13	/* vma will be write-locked during the walk */
 14	PGWALK_WRLOCK = 1,
 15	/* vma is expected to be already write-locked during the walk */
 16	PGWALK_WRLOCK_VERIFY = 2,
 17	/* vma is expected to be already read-locked during the walk */
 18	PGWALK_VMA_RDLOCK_VERIFY = 3,
 19};
 20
 21/**
 22 * struct mm_walk_ops - callbacks for walk_page_range
 23 * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
 24 * @p4d_entry:		if set, called for each non-empty P4D entry
 25 * @pud_entry:		if set, called for each non-empty PUD entry
 26 * @pmd_entry:		if set, called for each non-empty PMD entry
 27 *			this handler is required to be able to handle
 28 *			pmd_trans_huge() pmds.  They may simply choose to
 29 *			split_huge_page() instead of handling it explicitly.
 30 * @pte_entry:		if set, called for each PTE (lowest-level) entry
 31 *			including empty ones, except if @install_pte is set.
 32 *			If @install_pte is set, @pte_entry is called only for
 33 *			existing PTEs.
 34 * @pte_hole:		if set, called for each hole at all levels,
 35 *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
 36 *			Any folded depths (where PTRS_PER_P?D is equal to 1)
 37 *			are skipped. If @install_pte is specified, this will
 38 *			not trigger for any populated ranges.
 39 * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
 40 *			function is called with the vma lock held, in order to
 41 *			protect against a concurrent freeing of the pte_t* or
 42 *			the ptl. In some cases, the hook function needs to drop
 43 *			and retake the vma lock in order to avoid deadlocks
 44 *			while calling other functions. In such cases the hook
 45 *			function must either refrain from accessing the pte or
 46 *			ptl after dropping the vma lock, or else revalidate
 47 *			those items after re-acquiring the vma lock and before
 48 *			accessing them.
 49 * @test_walk:		caller specific callback function to determine whether
 50 *			we walk over the current vma or not. Returning 0 means
 51 *			"do page table walk over the current vma", returning
 52 *			a negative value means "abort current page table walk
 53 *			right now" and returning 1 means "skip the current vma".
 54 *			Note that this callback is not called when the caller
 55 *			passes in a single VMA as for walk_page_vma().
 56 * @pre_vma:		if set, called before starting walk on a non-null vma.
 57 * @post_vma:		if set, called after a walk on a non-null vma, provided
 58 *			that @pre_vma and the vma walk succeeded.
 59 * @install_pte:	if set, missing page table entries are installed and
 60 *			thus all levels are always walked in the specified
 61 *			range. This callback is then invoked at the PTE level
 62 *			(having split any THP pages prior), providing the PTE to
 63 *			install. If allocations fail, the walk is aborted. This
 64 *			operation is only available for userland memory. Not
 65 *			usable for hugetlb ranges.
 * @walk_lock:		locking requirement during the walk, one of the
 *			values of enum page_walk_lock.
 66 *
 67 * p?d_entry callbacks are called even if those levels are folded on a
 68 * particular architecture/configuration.
 69 */
 70struct mm_walk_ops {
 71	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
 72			 unsigned long next, struct mm_walk *walk);
 73	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
 74			 unsigned long next, struct mm_walk *walk);
 75	int (*pud_entry)(pud_t *pud, unsigned long addr,
 76			 unsigned long next, struct mm_walk *walk);
 77	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
 78			 unsigned long next, struct mm_walk *walk);
 79	int (*pte_entry)(pte_t *pte, unsigned long addr,
 80			 unsigned long next, struct mm_walk *walk);
 81	int (*pte_hole)(unsigned long addr, unsigned long next,
 82			int depth, struct mm_walk *walk);
 83	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
 84			     unsigned long addr, unsigned long next,
 85			     struct mm_walk *walk);
 86	int (*test_walk)(unsigned long addr, unsigned long next,
 87			struct mm_walk *walk);
 88	int (*pre_vma)(unsigned long start, unsigned long end,
 89		       struct mm_walk *walk);
 90	void (*post_vma)(struct mm_walk *walk);
 91	int (*install_pte)(unsigned long addr, unsigned long next,
 92			   pte_t *ptep, struct mm_walk *walk);
 93	enum page_walk_lock walk_lock;
 94};
 95
 96/*
 97 * Action for pud_entry / pmd_entry callbacks, held in the action member
 * of struct mm_walk ("next action to perform").
 98 * ACTION_SUBTREE is the default.
 99 */
100enum page_walk_action {
101	/* Descend to next level, splitting huge pages if needed and possible */
102	ACTION_SUBTREE = 0,
103	/* Continue to next entry at this level (ignoring any subtree) */
104	ACTION_CONTINUE = 1,
105	/* Call again for this entry */
106	ACTION_AGAIN = 2
107};
108
109/**
110 * struct mm_walk - walk_page_range data
111 * @ops:	operations (callbacks) to call during the walk
112 * @mm:		mm_struct representing the target process of page table walk
113 * @pgd:	pointer to PGD; only valid with @no_vma (otherwise set to NULL)
114 * @vma:	vma currently walked (NULL if walking outside vmas)
115 * @action:	next action to perform (see enum page_walk_action)
116 * @no_vma:	walk ignoring vmas (@vma will always be NULL)
117 * @private:	private data for the callbacks' use
118 *
119 * (see the comment on walk_page_range() for more details)
120 */
121struct mm_walk {
122	const struct mm_walk_ops *ops;
123	struct mm_struct *mm;
124	pgd_t *pgd;
125	struct vm_area_struct *vma;
126	enum page_walk_action action;
127	bool no_vma;
128	void *private;
129};
130
/*
 * Page table walk entry points. Every variant takes the callback table in
 * @ops and an opaque @private pointer; the two kernel page table variants
 * additionally take the @pgd to walk.
 * NOTE(review): @private is presumably what callbacks later see as
 * mm_walk.private -- confirm against the definitions.
 */
131int walk_page_range(struct mm_struct *mm, unsigned long start,
132		unsigned long end, const struct mm_walk_ops *ops,
133		void *private);
134int walk_kernel_page_table_range(unsigned long start,
135		unsigned long end, const struct mm_walk_ops *ops,
136		pgd_t *pgd, void *private);
137int walk_kernel_page_table_range_lockless(unsigned long start,
138		unsigned long end, const struct mm_walk_ops *ops,
139		pgd_t *pgd, void *private);
140int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
141			unsigned long end, const struct mm_walk_ops *ops,
142			void *private);
143int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
144		void *private);
145int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
146		      pgoff_t nr, const struct mm_walk_ops *ops,
147		      void *private);
148
/* Flag type taken by the flags argument of folio_walk_start(); see FW_*. */
149typedef int __bitwise folio_walk_flags_t;
150
151/*
152 * Walk migration entries as well. Careful: a large folio might get split
153 * concurrently.
154 */
155#define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0))
156
157/* Walk shared zeropages (small + huge) as well. */
158#define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1))
159
/*
 * Page table level of the entry described by a struct folio_walk (its
 * level member). folio_walk_end() only unmaps the ptep member when the
 * level is FW_LEVEL_PTE.
 */
160enum folio_walk_level {
161	FW_LEVEL_PTE,
162	FW_LEVEL_PMD,
163	FW_LEVEL_PUD,
164};
165
166/**
167 * struct folio_walk - folio_walk_start() / folio_walk_end() data
168 * @page:	exact folio page referenced (if applicable)
169 * @level:	page table level identifying the entry type
170 * @ptep:	pointer to the page table entry (FW_LEVEL_PTE).
171 * @pmdp:	pointer to the page table entry (FW_LEVEL_PMD).
172 * @pudp:	pointer to the page table entry (FW_LEVEL_PUD).
 * @pte:	page table entry value (FW_LEVEL_PTE).
 * @pmd:	page table entry value (FW_LEVEL_PMD).
 * @pud:	page table entry value (FW_LEVEL_PUD).
 *		NOTE(review): the value members presumably hold the entry as
 *		read through the matching pointer -- confirm in
 *		folio_walk_start().
 * @vma:	private to the walk implementation.
173 * @ptl:	pointer to the page table lock (private); unlocked by
 *		folio_walk_end().
174 *
175 * (see folio_walk_start() documentation for more details)
176 */
177struct folio_walk {
178	/* public */
179	struct page *page;
180	enum folio_walk_level level;
181	union {
182		pte_t *ptep;
183		pud_t *pudp;
184		pmd_t *pmdp;
185	};
186	union {
187		pte_t pte;
188		pud_t pud;
189		pmd_t pmd;
190	};
191	/* private */
192	struct vm_area_struct *vma;
193	spinlock_t *ptl;
194};
195
/*
 * Start a folio walk at @addr in @vma, filling in @fw; @flags is a mask of
 * FW_* values. The same @fw is later handed to folio_walk_end().
 * NOTE(review): return value and locking rules are documented at the
 * definition of folio_walk_start(), not visible here.
 */
196struct folio *folio_walk_start(struct folio_walk *fw,
197		struct vm_area_struct *vma, unsigned long addr,
198		folio_walk_flags_t flags);
199
/*
 * folio_walk_end() - finish a walk described by @__fw.
 *
 * Unlocks __fw->ptl, unmaps __fw->ptep when the level is FW_LEVEL_PTE, and
 * then calls vma_pgtable_walk_end() on @__vma.
 *
 * @__fw is expanded up to three times, so pass an expression without side
 * effects (e.g. the address of a local struct folio_walk).
 */
200#define folio_walk_end(__fw, __vma) do { \
201	spin_unlock((__fw)->ptl); \
202	if (likely((__fw)->level == FW_LEVEL_PTE)) \
203		pte_unmap((__fw)->ptep); \
204	vma_pgtable_walk_end(__vma); \
205} while (0)
206
207#endif /* _LINUX_PAGEWALK_H */