include/linux/mm_types.h at v4.8 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / include / linux / mm_types.h
at v4.8 18 kB view raw
  1#ifndef _LINUX_MM_TYPES_H
  2#define _LINUX_MM_TYPES_H
  3
  4#include <linux/auxvec.h>
  5#include <linux/types.h>
  6#include <linux/threads.h>
  7#include <linux/list.h>
  8#include <linux/spinlock.h>
  9#include <linux/rbtree.h>
 10#include <linux/rwsem.h>
 11#include <linux/completion.h>
 12#include <linux/cpumask.h>
 13#include <linux/uprobes.h>
 14#include <linux/page-flags-layout.h>
 15#include <linux/workqueue.h>
 16#include <asm/page.h>
 17#include <asm/mmu.h>
 18
 /*
  * AT_VECTOR_SIZE_ARCH defaults to 0 when nothing included earlier has
  * defined it.  AT_VECTOR_SIZE is the number of unsigned longs reserved
  * for mm_struct::saved_auxv.
  * NOTE(review): the factor of 2 and the "+ 1" presumably allow two words
  * per auxv entry plus one terminating entry -- confirm against the code
  * that fills saved_auxv.
  */
 19#ifndef AT_VECTOR_SIZE_ARCH
 20#define AT_VECTOR_SIZE_ARCH 0
 21#endif
 22#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
 23
 24struct address_space;
 25struct mem_cgroup;
 26
 /*
  * Compile-time predicates for split page-table locks:
  *
  *  USE_SPLIT_PTE_PTLOCKS - NR_CPUS is at least CONFIG_SPLIT_PTLOCK_CPUS.
  *			     When true, struct page carries a "ptl" member.
  *  USE_SPLIT_PMD_PTLOCKS - the above, and the architecture enables
  *			     CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
  *  ALLOC_SPLIT_PTLOCKS   - a spinlock_t is larger than one long
  *			     (BITS_PER_LONG/8 bytes).  When true, struct page
  *			     holds a pointer to the lock instead of embedding
  *			     the lock itself.
  */
 27#define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 28#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
 29		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
 30#define ALLOC_SPLIT_PTLOCKS	(SPINLOCK_SIZE > BITS_PER_LONG/8)
 31
 32/*
 33 * Each physical page in the system has a struct page associated with
 34 * it to keep track of whatever it is we are using the page for at the
 35 * moment. Note that we have no way to track which tasks are using
 36 * a page, though if it is a pagecache page, rmap structures can tell us
 37 * who is mapping it.
 38 *
 39 * The objects in struct page are organized in double word blocks in
 40 * order to allow us to use atomic double word operations on portions
 41 * of struct page. That is currently only used by slub but the arrangement
 42 * allows the use of atomic double word operations on the flags/mapping
 43 * and lru list pointers also.
 44 */
 45struct page {
 46	/* First double word block */
 47	unsigned long flags;		/* Atomic flags, some possibly
 48					 * updated asynchronously */
 49	union {
 50		struct address_space *mapping;	/* If low bit clear, points to
 51						 * inode address_space, or NULL.
 52						 * If page mapped as anonymous
 53						 * memory, low bit is set, and
 54						 * it points to anon_vma object:
 55						 * see PAGE_MAPPING_ANON below.
 56						 */
 57		void *s_mem;			/* slab first object */
 58		atomic_t compound_mapcount;	/* first tail page */
 59		/* page_deferred_list().next	 -- second tail page */
 60	};
 61
 62	/* Second double word */
 63	union {
 64		pgoff_t index;		/* Our offset within mapping. */
 65		void *freelist;		/* sl[aou]b first free object */
 66		/* page_deferred_list().prev	-- second tail page */
 67	};
 68
	/*
	 * "counters" overlays the anonymous struct below, so the slab
	 * bookkeeping can be read or written as a single word.  Its width
	 * depends on the configuration: unsigned long when
	 * cmpxchg_double on an aligned struct page is available, plain
	 * unsigned otherwise.
	 */
 69	union {
 70#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 71	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 72		/* Used for cmpxchg_double in slub */
 73		unsigned long counters;
 74#else
 75		/*
 76		 * Keep _refcount separate from slub cmpxchg_double data.
 77		 * As the rest of the double word is protected by slab_lock
 78		 * but _refcount is not.
 79		 */
 80		unsigned counters;
 81#endif
 82		struct {
 83
 84			union {
 85				/*
 86				 * Count of ptes mapped in mms, to show when
 87				 * page is mapped & limit reverse map searches.
 88				 *
 89				 * Extra information about page type may be
 90				 * stored here for pages that are never mapped,
 91				 * in which case the value MUST BE <= -2.
 92				 * See page-flags.h for more details.
 93				 */
 94				atomic_t _mapcount;
 95
 96				unsigned int active;		/* SLAB */
 97				struct {			/* SLUB */
 98					unsigned inuse:16;
 99					unsigned objects:15;
100					unsigned frozen:1;
101				};
102				int units;			/* SLOB */
103			};
104			/*
105			 * Usage count, *USE WRAPPER FUNCTION* when manual
106			 * accounting. See page_ref.h
107			 */
108			atomic_t _refcount;
109		};
110	};
111
112	/*
113	 * Third double word block
114	 *
115	 * WARNING: bit 0 of the first word encodes PageTail(). That means
116	 * the other users of this storage space MUST NOT use the bit to
117	 * avoid collision and false-positive PageTail().
118	 */
119	union {
120		struct list_head lru;	/* Pageout list, eg. active_list
121					 * protected by zone_lru_lock !
122					 * Can be used as a generic list
123					 * by the page owner.
124					 */
125		struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
126					    * lru or handled by a slab
127					    * allocator, this points to the
128					    * hosting device page map.
129					    */
130		struct {		/* slub per cpu partial pages */
131			struct page *next;	/* Next partial slab */
132#ifdef CONFIG_64BIT
133			int pages;	/* Nr of partial slabs left */
134			int pobjects;	/* Approximate # of objects */
135#else
136			short int pages;
137			short int pobjects;
138#endif
139		};
140
141		struct rcu_head rcu_head;	/* Used by SLAB
142						 * when destroying via RCU
143						 */
144		/* Tail pages of compound page */
145		struct {
146			unsigned long compound_head; /* If bit zero is set */
147
148			/* First tail page only */
149#ifdef CONFIG_64BIT
150			/*
151			 * On 64 bit system we have enough space in struct page
152			 * to encode compound_dtor and compound_order with
153			 * unsigned int. It can help compiler generate better or
154			 * smaller code on some architectures.
155			 */
156			unsigned int compound_dtor;
157			unsigned int compound_order;
158#else
159			unsigned short int compound_dtor;
160			unsigned short int compound_order;
161#endif
162		};
163
164#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
165		struct {
166			unsigned long __pad;	/* do not overlay pmd_huge_pte
167						 * with compound_head to avoid
168						 * possible bit 0 collision.
169						 */
170			pgtable_t pmd_huge_pte; /* protected by page->ptl */
171		};
172#endif
173	};
174
175	/* Remainder is not double word aligned */
176	union {
177		unsigned long private;		/* Mapping-private opaque data:
178						 * usually used for buffer_heads
179						 * if PagePrivate set; used for
180						 * swp_entry_t if PageSwapCache;
181						 * indicates order in the buddy
182						 * system if PG_buddy is set.
183						 */
		/*
		 * Split page-table lock: embedded in the page when it fits,
		 * otherwise a pointer to a separately stored lock (see
		 * ALLOC_SPLIT_PTLOCKS).
		 */
184#if USE_SPLIT_PTE_PTLOCKS
185#if ALLOC_SPLIT_PTLOCKS
186		spinlock_t *ptl;
187#else
188		spinlock_t ptl;
189#endif
190#endif
191		struct kmem_cache *slab_cache;	/* SL[AU]B: Pointer to slab */
192	};
193
194#ifdef CONFIG_MEMCG
195	struct mem_cgroup *mem_cgroup;
196#endif
197
198	/*
199	 * On machines where all RAM is mapped into kernel address space,
200	 * we can simply calculate the virtual address. On machines with
201	 * highmem some memory is mapped into kernel virtual memory
202	 * dynamically, so we need a place to store that address.
203	 * Note that this field could be 16 bits on x86 ... ;)
204	 *
205	 * Architectures with slow multiplication can define
206	 * WANT_PAGE_VIRTUAL in asm/page.h
207	 */
208#if defined(WANT_PAGE_VIRTUAL)
209	void *virtual;			/* Kernel virtual address (NULL if
210					   not kmapped, ie. highmem) */
211#endif /* WANT_PAGE_VIRTUAL */
212
213#ifdef CONFIG_KMEMCHECK
214	/*
215	 * kmemcheck wants to track the status of each byte in a page; this
216	 * is a pointer to such a status block. NULL if not tracked.
217	 */
218	void *shadow;
219#endif
220
	/*
	 * NOTE(review): going by the macro name, this field exists only when
	 * the last-cpupid value is not stored in page->flags -- confirm in
	 * page-flags-layout.h.
	 */
221#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
222	int _last_cpupid;
223#endif
224}
225/*
226 * The struct page can be forced to be double word aligned so that atomic ops
227 * on double words work. The SLUB allocator can make use of such a feature.
228 */
229#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
230	__aligned(2 * sizeof(unsigned long))
231#endif
232;
233
/*
 * A fragment of a page: the page it lives in plus an offset and a size.
 * The two fields are 32 bits wide when longs are wider than 32 bits or
 * when PAGE_SIZE is at least 65536 (a value a 16-bit field cannot hold);
 * otherwise they are 16 bits wide.
 * NOTE(review): offset/size are presumably in bytes -- confirm with users.
 */
234struct page_frag {
235	struct page *page;
236#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
237	__u32 offset;
238	__u32 size;
239#else
240	__u16 offset;
241	__u16 size;
242#endif
243};
244
/*
 * Upper bound on the memory backing one page_frag_cache: 32768 rounded
 * with __ALIGN_MASK() against ~PAGE_MASK, and the allocation order that
 * get_order() reports for that size.
 */
245#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
246#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
247
/*
 * State for handing out fragments from one cached allocation.
 * When PAGE_SIZE is smaller than PAGE_FRAG_CACHE_MAX_SIZE the cache keeps
 * a 16-bit offset and a 16-bit size; otherwise it keeps only a 32-bit
 * offset and there is no "size" member at all.
 */
248struct page_frag_cache {
249	void * va;
250#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
251	__u16 offset;
252	__u16 size;
253#else
254	__u32 offset;
255#endif
256	/* We maintain a pagecount bias, so that we don't dirty the cache line
257	 * containing page->_refcount every time we allocate a fragment.
258	 */
259	unsigned int		pagecnt_bias;
260	bool pfmemalloc;
261};
262
/* Integer type for VMA flag words; used by vm_region::vm_flags below. */
263typedef unsigned long vm_flags_t;
264
265/*
266 * A region containing a mapping of a non-memory backed file under NOMMU
267 * conditions.  These are held in a global tree and are pinned by the VMAs that
268 * map parts of them.
269 *
 * The three addresses describe nested extents of the same region:
 * vm_start is where it begins, vm_end is how far it has been
 * initialised, and vm_top is how far it has been allocated.
 */
270struct vm_region {
271	struct rb_node	vm_rb;		/* link in global region tree */
272	vm_flags_t	vm_flags;	/* VMA vm_flags */
273	unsigned long	vm_start;	/* start address of region */
274	unsigned long	vm_end;		/* region initialised to here */
275	unsigned long	vm_top;		/* region allocated to here */
276	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
277	struct file	*vm_file;	/* the backing file or NULL */
278
279	int		vm_usage;	/* region usage count (access under nommu_region_sem) */
280	bool		vm_icache_flushed : 1; /* true if the icache has been flushed for
281						* this region */
282};
283
284#ifdef CONFIG_USERFAULTFD
/*
 * Per-VMA userfaultfd state, embedded in vm_area_struct.
 * With CONFIG_USERFAULTFD it wraps a single context pointer and
 * NULL_VM_UFFD_CTX is a compound literal whose pointer is NULL.
 */
285#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
286struct vm_userfaultfd_ctx {
287	struct userfaultfd_ctx *ctx;
288};
289#else /* CONFIG_USERFAULTFD */
/*
 * Without CONFIG_USERFAULTFD the struct has no members (an empty struct is
 * a compiler extension, not ISO C) and NULL_VM_UFFD_CTX is the matching
 * empty compound literal, so users need no #ifdef of their own.
 */
290#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
291struct vm_userfaultfd_ctx {};
292#endif /* CONFIG_USERFAULTFD */
293
294/*
295 * This struct describes a virtual memory area (VMA). There is one of these
296 * per VM-area/task.  A VM area is any part of the process virtual memory
297 * space that has a special rule for the page-fault handlers (ie a shared
298 * library, the executable area etc).
299 */
300struct vm_area_struct {
301	/* The first cache line has the info for VMA tree walking. */
302
303	unsigned long vm_start;		/* Our start address within vm_mm. */
304	unsigned long vm_end;		/* The first byte after our end address
305					   within vm_mm. */
306
307	/* linked list of VM areas per task, sorted by address */
308	struct vm_area_struct *vm_next, *vm_prev;
309
	/*
	 * NOTE(review): presumably our node in the rb-tree rooted at
	 * mm_struct::mm_rb -- confirm in mm/mmap.c.
	 */
310	struct rb_node vm_rb;
311
312	/*
313	 * Largest free memory gap in bytes to the left of this VMA.
314	 * Either between this VMA and vma->vm_prev, or between one of the
315	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
316	 * get_unmapped_area find a free area of the right size.
317	 */
318	unsigned long rb_subtree_gap;
319
320	/* Second cache line starts here. */
321
322	struct mm_struct *vm_mm;	/* The address space we belong to. */
323	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
324	unsigned long vm_flags;		/* Flags, see mm.h. */
325
326	/*
327	 * For areas with an address space and backing store,
328	 * linkage into the address_space->i_mmap interval tree.
329	 */
330	struct {
331		struct rb_node rb;
332		unsigned long rb_subtree_last;
333	} shared;
334
335	/*
336	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
337	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
338	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
339	 * or brk vma (with NULL file) can only be in an anon_vma list.
340	 */
341	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
342					  * page_table_lock */
343	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
344
345	/* Function pointers to deal with this struct. */
346	const struct vm_operations_struct *vm_ops;
347
348	/* Information about our backing store: */
349	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
350					   units */
351	struct file * vm_file;		/* File we map to (can be NULL). */
352	void * vm_private_data;		/* was vm_pte (shared mem) */
353
354#ifndef CONFIG_MMU
355	struct vm_region *vm_region;	/* NOMMU mapping region */
356#endif
357#ifdef CONFIG_NUMA
358	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
359#endif
	/* Empty struct unless CONFIG_USERFAULTFD is set, so always present. */
360	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
361};
362
/*
 * One node of a singly linked list of tasks (linked through ->next).
 * NOTE(review): presumably the threads taking part in a core dump, given
 * its use in struct core_state -- confirm in fs/coredump.c.
 */
363struct core_thread {
364	struct task_struct *task;
365	struct core_thread *next;
366};
367
/*
 * Core-dump bookkeeping, referenced from mm_struct::core_state:
 * an atomic thread count, an embedded core_thread for the dumper (the
 * head of the list above), and a completion named "startup".
 * NOTE(review): who waits on / completes "startup" is not visible here.
 */
368struct core_state {
369	atomic_t nr_threads;
370	struct core_thread dumper;
371	struct completion startup;
372};
373
/*
 * Indices into the count[] arrays of struct task_rss_stat and
 * struct mm_rss_stat.  NR_MM_COUNTERS is last and therefore equals the
 * number of counters; it is used as the array length.
 */
374enum {
375	MM_FILEPAGES,	/* Resident file mapping pages */
376	MM_ANONPAGES,	/* Resident anonymous pages */
377	MM_SWAPENTS,	/* Anonymous swap entries */
378	MM_SHMEMPAGES,	/* Resident shared memory pages */
379	NR_MM_COUNTERS
380};
381
382#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
383#define SPLIT_RSS_COUNTING
384/*
 * Per-thread cached RSS information; only defined (together with
 * SPLIT_RSS_COUNTING) when split PTE locks are in use on an MMU kernel.
 */
385struct task_rss_stat {
386	int events;	/* for synchronization threshold */
387	int count[NR_MM_COUNTERS];	/* one slot per MM_* counter index */
388};
389#endif /* USE_SPLIT_PTE_PTLOCKS && CONFIG_MMU */
390
/*
 * Per-mm counters, one atomic long per MM_* index; embedded in mm_struct
 * as rss_stat.
 */
391struct mm_rss_stat {
392	atomic_long_t count[NR_MM_COUNTERS];
393};
394
395struct kioctx_table;
/*
 * Describes one address space: its VMAs (as a list and an rb-tree), the
 * page-table root, reference counts, locks, accounting counters, the
 * boundaries of the code/data/heap/stack/argument areas, and a number of
 * optional, config-dependent members.
 */
396struct mm_struct {
397	struct vm_area_struct *mmap;		/* list of VMAs */
398	struct rb_root mm_rb;			/* NOTE(review): presumably the root for vm_area_struct::vm_rb -- confirm */
399	u32 vmacache_seqnum;                   /* per-thread vmacache */
400#ifdef CONFIG_MMU
401	unsigned long (*get_unmapped_area) (struct file *filp,
402				unsigned long addr, unsigned long len,
403				unsigned long pgoff, unsigned long flags);
404#endif
405	unsigned long mmap_base;		/* base of mmap area */
406	unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
407	unsigned long task_size;		/* size of task vm space */
408	unsigned long highest_vm_end;		/* highest vma end address */
409	pgd_t * pgd;
410	atomic_t mm_users;			/* How many users with user space? */
411	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
412	atomic_long_t nr_ptes;			/* PTE page table pages */
413#if CONFIG_PGTABLE_LEVELS > 2
414	atomic_long_t nr_pmds;			/* PMD page table pages */
415#endif
416	int map_count;				/* number of VMAs */
417
418	spinlock_t page_table_lock;		/* Protects page tables and some counters */
419	struct rw_semaphore mmap_sem;
420
421	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
422						 * together off init_mm.mmlist, and are protected
423						 * by mmlist_lock
424						 */
425
426
427	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
428	unsigned long hiwater_vm;	/* High-water virtual memory usage */
429
430	unsigned long total_vm;		/* Total pages mapped */
431	unsigned long locked_vm;	/* Pages that have PG_mlocked set */
432	unsigned long pinned_vm;	/* Refcount permanently increased */
433	unsigned long data_vm;		/* VM_WRITE & ~VM_SHARED & ~VM_STACK */
434	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE & ~VM_STACK */
435	unsigned long stack_vm;		/* VM_STACK */
436	unsigned long def_flags;
437	unsigned long start_code, end_code, start_data, end_data;
438	unsigned long start_brk, brk, start_stack;
439	unsigned long arg_start, arg_end, env_start, env_end;
440
441	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
442
443	/*
444	 * Special counters, in some configurations protected by the
445	 * page_table_lock, in other configurations by being atomic.
446	 */
447	struct mm_rss_stat rss_stat;
448
449	struct linux_binfmt *binfmt;
450
	/* Cleared by mm_init_cpumask(); handed out by mm_cpumask(). */
451	cpumask_var_t cpu_vm_mask_var;
452
453	/* Architecture-specific MM context */
454	mm_context_t context;
455
456	unsigned long flags; /* Must use atomic bitops to access the bits */
457
458	struct core_state *core_state; /* coredumping support */
459#ifdef CONFIG_AIO
460	spinlock_t			ioctx_lock;
461	struct kioctx_table __rcu	*ioctx_table;
462#endif
463#ifdef CONFIG_MEMCG
464	/*
465	 * "owner" points to a task that is regarded as the canonical
466	 * user/owner of this mm. All of the following must be true in
467	 * order for it to be changed:
468	 *
469	 * current == mm->owner
470	 * current->mm != mm
471	 * new_owner->mm == mm
472	 * new_owner->alloc_lock is held
473	 */
474	struct task_struct __rcu *owner;
475#endif
476
477	/* store ref to file /proc/<pid>/exe symlink points to */
478	struct file __rcu *exe_file;
479#ifdef CONFIG_MMU_NOTIFIER
480	struct mmu_notifier_mm *mmu_notifier_mm;
481#endif
482#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
483	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
484#endif
485#ifdef CONFIG_CPUMASK_OFFSTACK
	/* Storage that mm_init_cpumask() points cpu_vm_mask_var at. */
486	struct cpumask cpumask_allocation;
487#endif
488#ifdef CONFIG_NUMA_BALANCING
489	/*
490	 * numa_next_scan is the next time that the PTEs will be marked
491	 * pte_numa. NUMA hinting faults will gather statistics and migrate
492	 * pages to new nodes if necessary.
493	 */
494	unsigned long numa_next_scan;
495
496	/* Restart point for scanning and setting pte_numa */
497	unsigned long numa_scan_offset;
498
499	/* numa_scan_seq prevents two threads setting pte_numa */
500	int numa_scan_seq;
501#endif
502#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
503	/*
504	 * An operation with batched TLB flushing is going on. Anything that
505	 * can move process memory needs to flush the TLB when moving a
506	 * PROT_NONE or PROT_NUMA mapped page.
	 * Read and written through mm_tlb_flush_pending(),
	 * set_tlb_flush_pending() and clear_tlb_flush_pending().
507	 */
508	bool tlb_flush_pending;
509#endif
510	struct uprobes_state uprobes_state;
511#ifdef CONFIG_X86_INTEL_MPX
512	/* address of the bounds directory */
513	void __user *bd_addr;
514#endif
515#ifdef CONFIG_HUGETLB_PAGE
516	atomic_long_t hugetlb_usage;
517#endif
518#ifdef CONFIG_MMU
	/* NOTE(review): presumably defers the final mm put to a workqueue -- confirm in kernel/fork.c */
519	struct work_struct async_put_work;
520#endif
521};
522
/*
 * Initialise @mm's CPU mask.  With CONFIG_CPUMASK_OFFSTACK the mask
 * pointer is first aimed at the cpumask_allocation member embedded in
 * @mm; in every configuration the mask is then cleared.
 */
523static inline void mm_init_cpumask(struct mm_struct *mm)
524{
525#ifdef CONFIG_CPUMASK_OFFSTACK
526	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
527#endif
528	cpumask_clear(mm->cpu_vm_mask_var);
529}
530
531/*
 * Future-safe accessor for struct mm_struct's cpu_vm_mask.
 * Returns @mm's cpu_vm_mask_var as a cpumask_t pointer so that callers
 * need not touch the member directly.
 */
532static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
533{
534	return mm->cpu_vm_mask_var;
535}
536
537#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
538/*
539 * Memory barriers to keep this state in sync are graciously provided by
540 * the page table locks, outside of which no page table modifications happen.
541 * The barriers below prevent the compiler from re-ordering the instructions
542 * around the memory barriers that are already present in the code.
543 */
/* Return @mm's tlb_flush_pending flag; the read follows a compiler barrier. */
544static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
545{
546	barrier();
547	return mm->tlb_flush_pending;
548}
/* Mark @mm as having a batched TLB flush in progress. */
549static inline void set_tlb_flush_pending(struct mm_struct *mm)
550{
551	mm->tlb_flush_pending = true;
552
553	/*
554	 * Guarantee that the tlb_flush_pending store does not leak into the
555	 * critical section updating the page tables
556	 */
557	smp_mb__before_spinlock();
558}
559/* Clearing is done after a TLB flush, which also provides a barrier. */
560static inline void clear_tlb_flush_pending(struct mm_struct *mm)
561{
562	barrier();
563	mm->tlb_flush_pending = false;
564}
565#else
/*
 * Neither CONFIG_NUMA_BALANCING nor CONFIG_COMPACTION: mm_struct has no
 * tlb_flush_pending member, so the query always reports false and the
 * setters do nothing.
 */
566static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
567{
568	return false;
569}
570static inline void set_tlb_flush_pending(struct mm_struct *mm)
571{
572}
573static inline void clear_tlb_flush_pending(struct mm_struct *mm)
574{
575}
576#endif
577
578struct vm_fault;
579
/*
 * Describes a named special mapping (the field comment gives "[vdso]" as
 * an example).  Its contents come either from a fixed, NULL-terminated
 * page array or from a fault callback.
 */
580struct vm_special_mapping {
581	const char *name;	/* The name, e.g. "[vdso]". */
582
583	/*
584	 * If .fault is not provided, this points to a
585	 * NULL-terminated array of pages that back the special mapping.
586	 *
587	 * This must not be NULL unless .fault is provided.
588	 */
589	struct page **pages;
590
591	/*
592	 * If non-NULL, then this is called to resolve page faults
593	 * on the special mapping.  If used, .pages is not checked.
594	 */
595	int (*fault)(const struct vm_special_mapping *sm,
596		     struct vm_area_struct *vma,
597		     struct vm_fault *vmf);
598
	/*
	 * NOTE(review): presumably an optional hook invoked when the mapping
	 * is moved by mremap(), receiving the new VMA -- confirm in
	 * mm/mmap.c whether NULL is allowed and what the return value means.
	 */
599	int (*mremap)(const struct vm_special_mapping *sm,
600		     struct vm_area_struct *new_vma);
601};
602
/*
 * Reasons a TLB flush may be issued.  NR_TLB_FLUSH_REASONS is last and so
 * equals the number of real reasons.
 * NOTE(review): presumably used for tracing/statistics; the exact meaning
 * of each value is defined by its users, not by this header.
 */
603enum tlb_flush_reason {
604	TLB_FLUSH_ON_TASK_SWITCH,
605	TLB_REMOTE_SHOOTDOWN,
606	TLB_LOCAL_SHOOTDOWN,
607	TLB_LOCAL_MM_SHOOTDOWN,
608	TLB_REMOTE_SEND_IPI,
609	NR_TLB_FLUSH_REASONS,
610};
611
612 /*
613  * A swap entry has to fit into an "unsigned long", as the entry is hidden
614  * in the "index" field of the swapper address space.
  * It is wrapped in a struct so that it is a distinct type rather than a
  * bare integer.
615  */
616typedef struct {
617	unsigned long val;
618} swp_entry_t;
619
620#endif /* _LINUX_MM_TYPES_H */