mm/mmu_gather.c at v5.18-rc7

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / mm / mmu_gather.c
at v5.18-rc7 347 lines 8.8 kB view raw
wrap content
  1#include <linux/gfp.h>
  2#include <linux/highmem.h>
  3#include <linux/kernel.h>
  4#include <linux/mmdebug.h>
  5#include <linux/mm_types.h>
  6#include <linux/mm_inline.h>
  7#include <linux/pagemap.h>
  8#include <linux/rcupdate.h>
  9#include <linux/smp.h>
 10#include <linux/swap.h>
 11
 12#include <asm/pgalloc.h>
 13#include <asm/tlb.h>
 14
 15#ifndef CONFIG_MMU_GATHER_NO_GATHER
 16
 17static bool tlb_next_batch(struct mmu_gather *tlb)
 18{
 19	struct mmu_gather_batch *batch;
 20
 21	batch = tlb->active;
 22	if (batch->next) {
 23		tlb->active = batch->next;
 24		return true;
 25	}
 26
 27	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
 28		return false;
 29
 30	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 31	if (!batch)
 32		return false;
 33
 34	tlb->batch_count++;
 35	batch->next = NULL;
 36	batch->nr   = 0;
 37	batch->max  = MAX_GATHER_BATCH;
 38
 39	tlb->active->next = batch;
 40	tlb->active = batch;
 41
 42	return true;
 43}
 44
 45static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 46{
 47	struct mmu_gather_batch *batch;
 48
 49	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
 50		free_pages_and_swap_cache(batch->pages, batch->nr);
 51		batch->nr = 0;
 52	}
 53	tlb->active = &tlb->local;
 54}
 55
 56static void tlb_batch_list_free(struct mmu_gather *tlb)
 57{
 58	struct mmu_gather_batch *batch, *next;
 59
 60	for (batch = tlb->local.next; batch; batch = next) {
 61		next = batch->next;
 62		free_pages((unsigned long)batch, 0);
 63	}
 64	tlb->local.next = NULL;
 65}
 66
 67bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
 68{
 69	struct mmu_gather_batch *batch;
 70
 71	VM_BUG_ON(!tlb->end);
 72
 73#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 74	VM_WARN_ON(tlb->page_size != page_size);
 75#endif
 76
 77	batch = tlb->active;
 78	/*
 79	 * Add the page and check if we are full. If so
 80	 * force a flush.
 81	 */
 82	batch->pages[batch->nr++] = page;
 83	if (batch->nr == batch->max) {
 84		if (!tlb_next_batch(tlb))
 85			return true;
 86		batch = tlb->active;
 87	}
 88	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 89
 90	return false;
 91}
 92
 93#endif /* MMU_GATHER_NO_GATHER */
 94
 95#ifdef CONFIG_MMU_GATHER_TABLE_FREE
 96
 97static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 98{
 99	int i;
100
101	for (i = 0; i < batch->nr; i++)
102		__tlb_remove_table(batch->tables[i]);
103
104	free_page((unsigned long)batch);
105}
106
107#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
108
109/*
110 * Semi RCU freeing of the page directories.
111 *
112 * This is needed by some architectures to implement software pagetable walkers.
113 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
118 *
119 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
120 * since we unlink the page, flush TLBs, free the page. Since the disabling of
121 * IRQs delays the completion of the TLB flush we can never observe an already
122 * freed page.
123 *
124 * Architectures that do not have this (PPC) need to delay the freeing by some
125 * other means, this is that means.
126 *
127 * What we do is batch the freed directory pages (tables) and RCU free them.
128 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
129 * holds off grace periods.
130 *
131 * However, in order to batch these pages we need to allocate storage, this
132 * allocation is deep inside the MM code and can thus easily fail on memory
133 * pressure. To guarantee progress we fall back to single table freeing, see
134 * the implementation of tlb_remove_table_one().
135 *
136 */
137
/*
 * Callback handed to smp_call_function() by tlb_remove_table_sync_one().
 * The body is intentionally empty and @arg is unused.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
142
/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function(), passing 1 as its last (wait) argument.
 */
static void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}
154
155static void tlb_remove_table_rcu(struct rcu_head *head)
156{
157	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
158}
159
/*
 * RCU variant: queue @batch with call_rcu(); the actual freeing is done
 * later by tlb_remove_table_rcu().
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}
164
165#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
166
/* Empty stub used when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set. */
static void tlb_remove_table_sync_one(void) { }
168
/*
 * Variant used when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set: free the
 * tables in @batch and the batch page immediately, without any deferral.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}
173
174#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
175
/*
 * Issue a TLB-only flush when tlb_needs_table_invalidate() reports that
 * tlb_remove_table() has to imply a TLB invalidate; do nothing otherwise.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (!tlb_needs_table_invalidate())
		return;

	/*
	 * Invalidate page-table caches used by hardware walkers. Then
	 * we still need to RCU-sched wait while freeing the pages
	 * because software walkers can still be in-flight.
	 */
	tlb_flush_mmu_tlbonly(tlb);
}
190
/*
 * Free a single page-table page without batching. tlb_remove_table() uses
 * this when it cannot allocate a batch page.
 */
static void tlb_remove_table_one(void *table)
{
	/* Synchronize first, and only then release the table. */
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}
196
197static void tlb_table_flush(struct mmu_gather *tlb)
198{
199	struct mmu_table_batch **batch = &tlb->batch;
200
201	if (*batch) {
202		tlb_table_invalidate(tlb);
203		tlb_remove_table_free(*batch);
204		*batch = NULL;
205	}
206}
207
208void tlb_remove_table(struct mmu_gather *tlb, void *table)
209{
210	struct mmu_table_batch **batch = &tlb->batch;
211
212	if (*batch == NULL) {
213		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
214		if (*batch == NULL) {
215			tlb_table_invalidate(tlb);
216			tlb_remove_table_one(table);
217			return;
218		}
219		(*batch)->nr = 0;
220	}
221
222	(*batch)->tables[(*batch)->nr++] = table;
223	if ((*batch)->nr == MAX_TABLE_BATCH)
224		tlb_table_flush(tlb);
225}
226
/* Start @tlb with no page-table batch; tlb_remove_table() allocates one on demand. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}
231
232#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
233
/* Empty stubs used when CONFIG_MMU_GATHER_TABLE_FREE is not set. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }
236
237#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
238
/*
 * Free what has been gathered on @tlb: the pending page-table batch first,
 * then, unless CONFIG_MMU_GATHER_NO_GATHER is set, the batched pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}
246
/*
 * Flush the TLB for @tlb and then free the gathered tables and pages.
 * The TLB flush is issued before anything is freed.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}
252
253static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
254			     bool fullmm)
255{
256	tlb->mm = mm;
257	tlb->fullmm = fullmm;
258
259#ifndef CONFIG_MMU_GATHER_NO_GATHER
260	tlb->need_flush_all = 0;
261	tlb->local.next = NULL;
262	tlb->local.nr   = 0;
263	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
264	tlb->active     = &tlb->local;
265	tlb->batch_count = 0;
266#endif
267
268	tlb_table_init(tlb);
269#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
270	tlb->page_size = 0;
271#endif
272
273	__tlb_reset_range(tlb);
274	inc_tlb_flush_pending(tlb->mm);
275}
276
/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 *
 * This is the non-fullmm variant: @tlb->fullmm is initialized to false.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}
289
/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm. @tlb->fullmm is initialized to true.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}
305
/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If parallel threads are doing PTE changes on the same range under a
	 * non-exclusive lock (e.g., mmap_lock read-side) but defer the TLB
	 * flush by batching, one thread may end up seeing inconsistent PTEs,
	 * resulting in stale TLB entries.  So flush the TLB forcefully if we
	 * detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables; this
	 * requires a forced flush of everything in the given range. Otherwise
	 * stale TLB entries may remain on architectures, e.g. aarch64, that
	 * can specify which TLB level to flush.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * aarch64 yields better performance with fullmm by
		 * avoiding multiple CPUs spamming TLBI messages at the
		 * same time.
		 *
		 * On x86, non-fullmm doesn't yield a significant difference
		 * compared to fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	/* Release the extra batch pages allocated by tlb_next_batch(). */
	tlb_batch_list_free(tlb);
#endif
	/* Pairs with the inc_tlb_flush_pending() done at gather time. */
	dec_tlb_flush_pending(tlb->mm);
}
Configure Feed

Configure Feed