include/linux/pagemap.h at v3.17 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / include / linux / pagemap.h
at v3.17 682 lines 20 kB view raw
  1#ifndef _LINUX_PAGEMAP_H
  2#define _LINUX_PAGEMAP_H
  3
  4/*
  5 * Copyright 1995 Linus Torvalds
  6 */
  7#include <linux/mm.h>
  8#include <linux/fs.h>
  9#include <linux/list.h>
 10#include <linux/highmem.h>
 11#include <linux/compiler.h>
 12#include <asm/uaccess.h>
 13#include <linux/gfp.h>
 14#include <linux/bitops.h>
 15#include <linux/hardirq.h> /* for in_interrupt() */
 16#include <linux/hugetlb_inline.h>
 17
 18/*
 19 * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
 20 * allocation mode flags.
 21 */
 22enum mapping_flags {
 23	AS_EIO		= __GFP_BITS_SHIFT + 0,	/* IO error on async write */
 24	AS_ENOSPC	= __GFP_BITS_SHIFT + 1,	/* ENOSPC on async write */
 25	AS_MM_ALL_LOCKS	= __GFP_BITS_SHIFT + 2,	/* under mm_take_all_locks() */
 26	AS_UNEVICTABLE	= __GFP_BITS_SHIFT + 3,	/* e.g., ramdisk, SHM_LOCK */
 27	AS_BALLOON_MAP  = __GFP_BITS_SHIFT + 4, /* balloon page special map */
 28	AS_EXITING	= __GFP_BITS_SHIFT + 5, /* final truncate in progress */
 29};
 30
 31static inline void mapping_set_error(struct address_space *mapping, int error)
 32{
 33	if (unlikely(error)) {
 34		if (error == -ENOSPC)
 35			set_bit(AS_ENOSPC, &mapping->flags);
 36		else
 37			set_bit(AS_EIO, &mapping->flags);
 38	}
 39}
 40
 41static inline void mapping_set_unevictable(struct address_space *mapping)
 42{
 43	set_bit(AS_UNEVICTABLE, &mapping->flags);
 44}
 45
 46static inline void mapping_clear_unevictable(struct address_space *mapping)
 47{
 48	clear_bit(AS_UNEVICTABLE, &mapping->flags);
 49}
 50
 51static inline int mapping_unevictable(struct address_space *mapping)
 52{
 53	if (mapping)
 54		return test_bit(AS_UNEVICTABLE, &mapping->flags);
 55	return !!mapping;
 56}
 57
 58static inline void mapping_set_balloon(struct address_space *mapping)
 59{
 60	set_bit(AS_BALLOON_MAP, &mapping->flags);
 61}
 62
 63static inline void mapping_clear_balloon(struct address_space *mapping)
 64{
 65	clear_bit(AS_BALLOON_MAP, &mapping->flags);
 66}
 67
 68static inline int mapping_balloon(struct address_space *mapping)
 69{
 70	return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags);
 71}
 72
 73static inline void mapping_set_exiting(struct address_space *mapping)
 74{
 75	set_bit(AS_EXITING, &mapping->flags);
 76}
 77
 78static inline int mapping_exiting(struct address_space *mapping)
 79{
 80	return test_bit(AS_EXITING, &mapping->flags);
 81}
 82
 83static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 84{
 85	return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
 86}
 87
 88/*
 89 * This is non-atomic.  Only to be used before the mapping is activated.
 90 * Probably needs a barrier...
 91 */
 92static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 93{
 94	m->flags = (m->flags & ~(__force unsigned long)__GFP_BITS_MASK) |
 95				(__force unsigned long)mask;
 96}
 97
 98/*
 99 * The page cache can done in larger chunks than
100 * one page, because it allows for more efficient
101 * throughput (it can then be mapped into user
102 * space in smaller chunks for same flexibility).
103 *
104 * Or rather, it _will_ be done in larger chunks.
105 */
106#define PAGE_CACHE_SHIFT	PAGE_SHIFT
107#define PAGE_CACHE_SIZE		PAGE_SIZE
108#define PAGE_CACHE_MASK		PAGE_MASK
109#define PAGE_CACHE_ALIGN(addr)	(((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
110
111#define page_cache_get(page)		get_page(page)
112#define page_cache_release(page)	put_page(page)
113void release_pages(struct page **pages, int nr, bool cold);
114
115/*
116 * speculatively take a reference to a page.
117 * If the page is free (_count == 0), then _count is untouched, and 0
118 * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
119 *
120 * This function must be called inside the same rcu_read_lock() section as has
121 * been used to lookup the page in the pagecache radix-tree (or page table):
122 * this allows allocators to use a synchronize_rcu() to stabilize _count.
123 *
124 * Unless an RCU grace period has passed, the count of all pages coming out
125 * of the allocator must be considered unstable. page_count may return higher
126 * than expected, and put_page must be able to do the right thing when the
127 * page has been finished with, no matter what it is subsequently allocated
128 * for (because put_page is what is used here to drop an invalid speculative
129 * reference).
130 *
131 * This is the interesting part of the lockless pagecache (and lockless
132 * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
133 * has the following pattern:
134 * 1. find page in radix tree
135 * 2. conditionally increment refcount
136 * 3. check the page is still in pagecache (if no, goto 1)
137 *
138 * Remove-side that cares about stability of _count (eg. reclaim) has the
139 * following (with tree_lock held for write):
140 * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
141 * B. remove page from pagecache
142 * C. free the page
143 *
144 * There are 2 critical interleavings that matter:
145 * - 2 runs before A: in this case, A sees elevated refcount and bails out
146 * - A runs before 2: in this case, 2 sees zero refcount and retries;
147 *   subsequently, B will complete and 1 will find no page, causing the
148 *   lookup to return NULL.
149 *
150 * It is possible that between 1 and 2, the page is removed then the exact same
151 * page is inserted into the same position in pagecache. That's OK: the
152 * old find_get_page using tree_lock could equally have run before or after
153 * such a re-insertion, depending on order that locks are granted.
154 *
155 * Lookups racing against pagecache insertion isn't a big problem: either 1
156 * will find the page or it will not. Likewise, the old find_get_page could run
157 * either before the insertion or afterwards, depending on timing.
158 */
static inline int page_cache_get_speculative(struct page *page)
{
	/* Returns 1 when a reference was taken, 0 when the caller must retry. */
	VM_BUG_ON(in_interrupt());

#ifndef CONFIG_TINY_RCU
	/*
	 * A zero refcount means the page has been freed or is about to be.
	 * Leave _count alone and report failure; the caller is expected to
	 * redo its lookup (see the protocol description above).
	 */
	if (unlikely(!get_page_unless_zero(page)))
		return 0;
#else
# ifdef CONFIG_PREEMPT_COUNT
	VM_BUG_ON(!in_atomic());
# endif
	/*
	 * Preempt must be disabled here - we rely on rcu_read_lock doing
	 * this for us.
	 *
	 * Pagecache won't be truncated from interrupt context, so a page
	 * found in the radix tree has its refcount pinned by the disabled
	 * preemption, and the "speculative get" that SMP requires is not
	 * needed: a plain increment is enough.
	 */
	VM_BUG_ON_PAGE(page_count(page) == 0, page);
	atomic_inc(&page->_count);
#endif
	VM_BUG_ON_PAGE(PageTail(page), page);

	return 1;
}
193
194/*
195 * Same as above, but add instead of inc (could just be merged)
196 */
197static inline int page_cache_add_speculative(struct page *page, int count)
198{
199	VM_BUG_ON(in_interrupt());
200
201#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
202# ifdef CONFIG_PREEMPT_COUNT
203	VM_BUG_ON(!in_atomic());
204# endif
205	VM_BUG_ON_PAGE(page_count(page) == 0, page);
206	atomic_add(count, &page->_count);
207
208#else
209	if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
210		return 0;
211#endif
212	VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
213
214	return 1;
215}
216
217static inline int page_freeze_refs(struct page *page, int count)
218{
219	return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
220}
221
222static inline void page_unfreeze_refs(struct page *page, int count)
223{
224	VM_BUG_ON_PAGE(page_count(page) != 0, page);
225	VM_BUG_ON(count == 0);
226
227	atomic_set(&page->_count, count);
228}
229
230#ifdef CONFIG_NUMA
231extern struct page *__page_cache_alloc(gfp_t gfp);
232#else
233static inline struct page *__page_cache_alloc(gfp_t gfp)
234{
235	return alloc_pages(gfp, 0);
236}
237#endif
238
239static inline struct page *page_cache_alloc(struct address_space *x)
240{
241	return __page_cache_alloc(mapping_gfp_mask(x));
242}
243
244static inline struct page *page_cache_alloc_cold(struct address_space *x)
245{
246	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
247}
248
249static inline struct page *page_cache_alloc_readahead(struct address_space *x)
250{
251	return __page_cache_alloc(mapping_gfp_mask(x) |
252				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN);
253}
254
255typedef int filler_t(void *, struct page *);
256
257pgoff_t page_cache_next_hole(struct address_space *mapping,
258			     pgoff_t index, unsigned long max_scan);
259pgoff_t page_cache_prev_hole(struct address_space *mapping,
260			     pgoff_t index, unsigned long max_scan);
261
262#define FGP_ACCESSED		0x00000001
263#define FGP_LOCK		0x00000002
264#define FGP_CREAT		0x00000004
265#define FGP_WRITE		0x00000008
266#define FGP_NOFS		0x00000010
267#define FGP_NOWAIT		0x00000020
268
269struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
270		int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
271
272/**
273 * find_get_page - find and get a page reference
274 * @mapping: the address_space to search
275 * @offset: the page index
276 *
277 * Looks up the page cache slot at @mapping & @offset.  If there is a
278 * page cache page, it is returned with an increased refcount.
279 *
280 * Otherwise, %NULL is returned.
281 */
282static inline struct page *find_get_page(struct address_space *mapping,
283					pgoff_t offset)
284{
285	return pagecache_get_page(mapping, offset, 0, 0, 0);
286}
287
288static inline struct page *find_get_page_flags(struct address_space *mapping,
289					pgoff_t offset, int fgp_flags)
290{
291	return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
292}
293
294/**
295 * find_lock_page - locate, pin and lock a pagecache page
296 * pagecache_get_page - find and get a page reference
297 * @mapping: the address_space to search
298 * @offset: the page index
299 *
300 * Looks up the page cache slot at @mapping & @offset.  If there is a
301 * page cache page, it is returned locked and with an increased
302 * refcount.
303 *
304 * Otherwise, %NULL is returned.
305 *
306 * find_lock_page() may sleep.
307 */
308static inline struct page *find_lock_page(struct address_space *mapping,
309					pgoff_t offset)
310{
311	return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
312}
313
314/**
315 * find_or_create_page - locate or add a pagecache page
316 * @mapping: the page's address_space
317 * @index: the page's index into the mapping
318 * @gfp_mask: page allocation mode
319 *
320 * Looks up the page cache slot at @mapping & @offset.  If there is a
321 * page cache page, it is returned locked and with an increased
322 * refcount.
323 *
324 * If the page is not present, a new page is allocated using @gfp_mask
325 * and added to the page cache and the VM's LRU list.  The page is
326 * returned locked and with an increased refcount.
327 *
328 * On memory exhaustion, %NULL is returned.
329 *
330 * find_or_create_page() may sleep, even if @gfp_flags specifies an
331 * atomic allocation!
332 */
333static inline struct page *find_or_create_page(struct address_space *mapping,
334					pgoff_t offset, gfp_t gfp_mask)
335{
336	return pagecache_get_page(mapping, offset,
337					FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
338					gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
339}
340
341/**
342 * grab_cache_page_nowait - returns locked page at given index in given cache
343 * @mapping: target address_space
344 * @index: the page index
345 *
346 * Same as grab_cache_page(), but do not wait if the page is unavailable.
347 * This is intended for speculative data generators, where the data can
348 * be regenerated if the page couldn't be grabbed.  This routine should
349 * be safe to call while holding the lock for another page.
350 *
351 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
352 * and deadlock against the caller's locked page.
353 */
354static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
355				pgoff_t index)
356{
357	return pagecache_get_page(mapping, index,
358			FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
359			mapping_gfp_mask(mapping),
360			GFP_NOFS);
361}
362
363struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
364struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
365unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
366			  unsigned int nr_entries, struct page **entries,
367			  pgoff_t *indices);
368unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
369			unsigned int nr_pages, struct page **pages);
370unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
371			       unsigned int nr_pages, struct page **pages);
372unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
373			int tag, unsigned int nr_pages, struct page **pages);
374
375struct page *grab_cache_page_write_begin(struct address_space *mapping,
376			pgoff_t index, unsigned flags);
377
378/*
379 * Returns locked page at given index in given cache, creating it if needed.
380 */
381static inline struct page *grab_cache_page(struct address_space *mapping,
382								pgoff_t index)
383{
384	return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
385}
386
387extern struct page * read_cache_page(struct address_space *mapping,
388				pgoff_t index, filler_t *filler, void *data);
389extern struct page * read_cache_page_gfp(struct address_space *mapping,
390				pgoff_t index, gfp_t gfp_mask);
391extern int read_cache_pages(struct address_space *mapping,
392		struct list_head *pages, filler_t *filler, void *data);
393
394static inline struct page *read_mapping_page(struct address_space *mapping,
395				pgoff_t index, void *data)
396{
397	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
398	return read_cache_page(mapping, index, filler, data);
399}
400
401/*
402 * Get the offset in PAGE_SIZE.
403 * (TODO: hugepage should have ->index in PAGE_SIZE)
404 */
405static inline pgoff_t page_to_pgoff(struct page *page)
406{
407	if (unlikely(PageHeadHuge(page)))
408		return page->index << compound_order(page);
409	else
410		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
411}
412
413/*
414 * Return byte-offset into filesystem object for page.
415 */
416static inline loff_t page_offset(struct page *page)
417{
418	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
419}
420
421static inline loff_t page_file_offset(struct page *page)
422{
423	return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
424}
425
426extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
427				     unsigned long address);
428
429static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
430					unsigned long address)
431{
432	pgoff_t pgoff;
433	if (unlikely(is_vm_hugetlb_page(vma)))
434		return linear_hugepage_index(vma, address);
435	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
436	pgoff += vma->vm_pgoff;
437	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
438}
439
/*
 * Out-of-line halves of the page lock helpers below; the inline wrappers only
 * call these when trylock_page() fails.  unlock_page() is also external.
 */
void __lock_page(struct page *page);
int __lock_page_killable(struct page *page);
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags);
void unlock_page(struct page *page);
445
446static inline void __set_page_locked(struct page *page)
447{
448	__set_bit(PG_locked, &page->flags);
449}
450
451static inline void __clear_page_locked(struct page *page)
452{
453	__clear_bit(PG_locked, &page->flags);
454}
455
456static inline int trylock_page(struct page *page)
457{
458	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
459}
460
/*
 * lock_page may only be called if we have the page's inode pinned.
 *
 * The uncontended case is handled by trylock_page(); only when that fails is
 * the out-of-line __lock_page() called.
 */
static inline void lock_page(struct page *page)
{
	might_sleep();

	if (trylock_page(page))
		return;

	__lock_page(page);
}
470
/*
 * lock_page_killable is like lock_page but can be interrupted by fatal
 * signals.  It returns 0 if it locked the page and -EINTR if it was
 * killed while waiting.
 */
static inline int lock_page_killable(struct page *page)
{
	might_sleep();

	if (trylock_page(page))
		return 0;

	return __lock_page_killable(page);
}
483
/*
 * lock_page_or_retry - Lock the page, unless this would block and the
 * caller indicated that it can handle a retry.
 *
 * Returns 1 when trylock_page() succeeds; otherwise the result of
 * __lock_page_or_retry() normalised to 0 or 1.  Return value and mmap_sem
 * implications depend on flags; see __lock_page_or_retry().
 */
static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
				     unsigned int flags)
{
	might_sleep();

	if (trylock_page(page))
		return 1;

	return __lock_page_or_retry(page, mm, flags) ? 1 : 0;
}
497
/*
 * This is exported only for wait_on_page_locked/wait_on_page_writeback.
 * Never use this directly!
 */
void wait_on_page_bit(struct page *page, int bit_nr);

/* Variant used by wait_on_page_locked_killable(); returns an int status. */
int wait_on_page_bit_killable(struct page *page, int bit_nr);
505
506static inline int wait_on_page_locked_killable(struct page *page)
507{
508	if (PageLocked(page))
509		return wait_on_page_bit_killable(page, PG_locked);
510	return 0;
511}
512
513/* 
514 * Wait for a page to be unlocked.
515 *
516 * This must be called with the caller "holding" the page,
517 * ie with increased "page->count" so that the page won't
518 * go away during the wait..
519 */
520static inline void wait_on_page_locked(struct page *page)
521{
522	if (PageLocked(page))
523		wait_on_page_bit(page, PG_locked);
524}
525
526/* 
527 * Wait for a page to complete writeback
528 */
529static inline void wait_on_page_writeback(struct page *page)
530{
531	if (PageWriteback(page))
532		wait_on_page_bit(page, PG_writeback);
533}
534
535extern void end_page_writeback(struct page *page);
536void wait_for_stable_page(struct page *page);
537
538void page_endio(struct page *page, int rw, int err);
539
540/*
541 * Add an arbitrary waiter to a page's wait queue
542 */
543extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
544
545/*
546 * Fault a userspace page into pagetables.  Return non-zero on a fault.
547 *
548 * This assumes that two userspace pages are always sufficient.  That's
549 * not true if PAGE_CACHE_SIZE > PAGE_SIZE.
550 */
551static inline int fault_in_pages_writeable(char __user *uaddr, int size)
552{
553	int ret;
554
555	if (unlikely(size == 0))
556		return 0;
557
558	/*
559	 * Writing zeroes into userspace here is OK, because we know that if
560	 * the zero gets there, we'll be overwriting it.
561	 */
562	ret = __put_user(0, uaddr);
563	if (ret == 0) {
564		char __user *end = uaddr + size - 1;
565
566		/*
567		 * If the page was already mapped, this will get a cache miss
568		 * for sure, so try to avoid doing it.
569		 */
570		if (((unsigned long)uaddr & PAGE_MASK) !=
571				((unsigned long)end & PAGE_MASK))
572			ret = __put_user(0, end);
573	}
574	return ret;
575}
576
577static inline int fault_in_pages_readable(const char __user *uaddr, int size)
578{
579	volatile char c;
580	int ret;
581
582	if (unlikely(size == 0))
583		return 0;
584
585	ret = __get_user(c, uaddr);
586	if (ret == 0) {
587		const char __user *end = uaddr + size - 1;
588
589		if (((unsigned long)uaddr & PAGE_MASK) !=
590				((unsigned long)end & PAGE_MASK)) {
591			ret = __get_user(c, end);
592			(void)c;
593		}
594	}
595	return ret;
596}
597
598/*
599 * Multipage variants of the above prefault helpers, useful if more than
600 * PAGE_SIZE of data needs to be prefaulted. These are separate from the above
601 * functions (which only handle up to PAGE_SIZE) to avoid clobbering the
602 * filemap.c hotpaths.
603 */
604static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
605{
606	int ret = 0;
607	char __user *end = uaddr + size - 1;
608
609	if (unlikely(size == 0))
610		return ret;
611
612	/*
613	 * Writing zeroes into userspace here is OK, because we know that if
614	 * the zero gets there, we'll be overwriting it.
615	 */
616	while (uaddr <= end) {
617		ret = __put_user(0, uaddr);
618		if (ret != 0)
619			return ret;
620		uaddr += PAGE_SIZE;
621	}
622
623	/* Check whether the range spilled into the next page. */
624	if (((unsigned long)uaddr & PAGE_MASK) ==
625			((unsigned long)end & PAGE_MASK))
626		ret = __put_user(0, end);
627
628	return ret;
629}
630
631static inline int fault_in_multipages_readable(const char __user *uaddr,
632					       int size)
633{
634	volatile char c;
635	int ret = 0;
636	const char __user *end = uaddr + size - 1;
637
638	if (unlikely(size == 0))
639		return ret;
640
641	while (uaddr <= end) {
642		ret = __get_user(c, uaddr);
643		if (ret != 0)
644			return ret;
645		uaddr += PAGE_SIZE;
646	}
647
648	/* Check whether the range spilled into the next page. */
649	if (((unsigned long)uaddr & PAGE_MASK) ==
650			((unsigned long)end & PAGE_MASK)) {
651		ret = __get_user(c, end);
652		(void)c;
653	}
654
655	return ret;
656}
657
658int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
659				pgoff_t index, gfp_t gfp_mask);
660int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
661				pgoff_t index, gfp_t gfp_mask);
662extern void delete_from_page_cache(struct page *page);
663extern void __delete_from_page_cache(struct page *page, void *shadow);
664int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
665
666/*
667 * Like add_to_page_cache_locked, but used to add newly allocated pages:
668 * the page is new, so we can just run __set_page_locked() against it.
669 */
670static inline int add_to_page_cache(struct page *page,
671		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
672{
673	int error;
674
675	__set_page_locked(page);
676	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
677	if (unlikely(error))
678		__clear_page_locked(page);
679	return error;
680}
681
682#endif /* _LINUX_PAGEMAP_H */