Merge branch 'page-refs' (page ref overflow)

Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with
sufficient memory (and a filesystem that is intentionally extremely
slow).

Admittedly it's not exactly easy. To have more than four billion
references to a page requires a minimum of 32GB of kernel memory just
for the pointers to the pages, not counting any metadata to keep
track of those pointers. Jann needed a total of 140GB of memory and a
specially crafted filesystem that leaves all reads pending (so that
the page references are never dropped and just keep accumulating).
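
For reference, the 32GB figure falls straight out of the counter
width and the pointer size. A back-of-the-envelope check (just a
sketch, not part of the patches):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* page->_refcount is a 32-bit counter, so wrapping it takes
         * on the order of 2^32 references */
        uint64_t refs = 1ULL << 32;

        /* each reference taken via get_user_pages() costs at least
         * one 8-byte 'struct page *' on a 64-bit kernel */
        uint64_t bytes = refs * sizeof(void *);

        printf("%llu GiB\n", (unsigned long long)(bytes >> 30));
        return 0;    /* prints "32 GiB" */
    }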

Still, we have a fairly straightforward way to limit the two obvious
user-controllable sources of page references: direct-IO-style page
references taken through get_user_pages(), and the page duplication
done by the splice pipe code. So let's just do that.
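
The shape of both fixes is the same: replace an unconditional
reference bump with one that can fail once the count stops looking
sane, and make every caller handle the failure. A minimal sketch of
that pattern (hypothetical names, userspace atomics standing in for
the kernel refcount; the check is deliberately simple and slightly
racy, which is fine for illustration):

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int refcount = 1;

    /* old style: blind increment -- enough of these wrap the counter
     * negative and eventually back through zero */
    static void ref_get(void)
    {
        atomic_fetch_add(&refcount, 1);
    }

    /* new style: refuse when the counter looks implausible, so a
     * malicious caller can pin memory but never wrap the count */
    static bool ref_try_get(void)
    {
        int old = atomic_load(&refcount);

        if (old <= 0 || old > INT_MAX / 2)
            return false;
        atomic_fetch_add(&refcount, 1);
        return true;
    }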

* branch page-refs:
  fs: prevent page refcount overflow in pipe_buf_get
  mm: prevent get_user_pages() from overflowing page refcount
  mm: add 'try_get_page()' helper function
  mm: make page ref count overflow check tighter and more explicit

Changed files (+92 -28):

fs/fuse/dev.c (+6 -6)

···
         rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;

     ret = -EINVAL;
-    if (rem < len) {
-        pipe_unlock(pipe);
-        goto out;
-    }
+    if (rem < len)
+        goto out_free;

     rem = len;
     while (rem) {
···
             pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
             pipe->nrbufs--;
         } else {
-            pipe_buf_get(pipe, ibuf);
+            if (!pipe_buf_get(pipe, ibuf))
+                goto out_free;
+
             *obuf = *ibuf;
             obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
             obuf->len = rem;
···
     ret = fuse_dev_do_write(fud, &cs, len);

     pipe_lock(pipe);
+out_free:
     for (idx = 0; idx < nbuf; idx++)
         pipe_buf_release(pipe, &bufs[idx]);
     pipe_unlock(pipe);

-out:
     kvfree(bufs);
     return ret;
 }
···
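
The goto restructuring in the fuse patch is what makes the failure
path safe: a pipe_buf_get() that fails mid-loop now lands on
out_free, which drops the references already taken on the earlier
buffers before unlocking, instead of leaking them on the way out.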

fs/pipe.c (+2 -2)

···
  * in the tee() system call, when we duplicate the buffers in one
  * pipe into another.
  */
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-    get_page(buf->page);
+    return try_get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
···

fs/splice.c (+10 -2)

···
         /*
          * Get a reference to this pipe buffer,
          * so we can copy the contents over.
          */
-        pipe_buf_get(ipipe, ibuf);
+        if (!pipe_buf_get(ipipe, ibuf)) {
+            if (ret == 0)
+                ret = -EFAULT;
+            break;
+        }
         *obuf = *ibuf;
···
         /*
          * Get a reference to this pipe buffer,
          * so we can copy the contents over.
          */
-        pipe_buf_get(ipipe, ibuf);
+        if (!pipe_buf_get(ipipe, ibuf)) {
+            if (ret == 0)
+                ret = -EFAULT;
+            break;
+        }

         obuf = opipe->bufs + nbuf;
         *obuf = *ibuf;
···

include/linux/mm.h (+14 -1)

···
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */

+/* 127: arbitrary random number, small enough to assemble well */
+#define page_ref_zero_or_close_to_overflow(page) \
+    ((unsigned int) page_ref_count(page) + 127u <= 127u)
+
 static inline void get_page(struct page *page)
 {
     page = compound_head(page);
···
      * Getting a normal page or the head of a compound page
      * requires to already have an elevated page->_refcount.
      */
-    VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+    VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
     page_ref_inc(page);
+}
+
+static inline __must_check bool try_get_page(struct page *page)
+{
+    page = compound_head(page);
+    if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+        return false;
+    page_ref_inc(page);
+    return true;
 }

 static inline void put_page(struct page *page)
···
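
The unsigned trick in page_ref_zero_or_close_to_overflow() deserves a
second look: adding 127u lets one comparison catch both a zero count
and a count that has crept into the small negative range, i.e. a
counter about to wrap back past zero. A standalone demonstration of
the same arithmetic (plain int standing in for page_ref_count()):

    #include <stdio.h>

    static int zero_or_close_to_overflow(int count)
    {
        /* identical expression to the new macro */
        return (unsigned int)count + 127u <= 127u;
    }

    int main(void)
    {
        printf("%d\n", zero_or_close_to_overflow(1));    /* 0: healthy */
        printf("%d\n", zero_or_close_to_overflow(0));    /* 1: freed */
        printf("%d\n", zero_or_close_to_overflow(-1));   /* 1: wrapped */
        printf("%d\n", zero_or_close_to_overflow(-127)); /* 1: wrapped */
        printf("%d\n", zero_or_close_to_overflow(-128)); /* 0: beyond the window */
        return 0;
    }

Keep in mind that VM_BUG_ON_PAGE() compiles away without
CONFIG_DEBUG_VM, so the tightened get_page() check is a debug-build
tripwire; the runtime guarantee comes from try_get_page() and the
callers converted below.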

include/linux/pipe_fs_i.h (+6 -4)

···
     /*
      * Get a reference to the pipe buffer.
      */
-    void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+    bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };

 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe: the pipe that the buffer belongs to
  * @buf: the buffer to get a reference to
+ *
+ * Return: %true if the reference was successfully obtained.
  */
-static inline void pipe_buf_get(struct pipe_inode_info *pipe,
+static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
                 struct pipe_buffer *buf)
 {
-    buf->ops->get(pipe, buf);
+    return buf->ops->get(pipe, buf);
 }
···
 void free_pipe_info(struct pipe_inode_info *);

 /* Generic pipe buffer ops functions */
-void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
+bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
···

kernel/trace/trace.c (+5 -1)

···
     buf->private = 0;
 }

-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
                 struct pipe_buffer *buf)
 {
     struct buffer_ref *ref = (struct buffer_ref *)buf->private;

+    if (ref->ref > INT_MAX/2)
+        return false;
+
     ref->ref++;
+    return true;
 }

 /* Pipe buffer operations for a buffer. */
···
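
Note that this path never touches the struct page count at all:
buffer_pipe_buf_get() bumps the private buffer_ref::ref counter,
which is a plain int and just as overflowable through repeated
splicing, hence its own conservative INT_MAX/2 cutoff instead of the
page-level check.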

mm/gup.c (+36 -12)

···
         goto retry;
     }

-    if (flags & FOLL_GET)
-        get_page(page);
+    if (flags & FOLL_GET) {
+        if (unlikely(!try_get_page(page))) {
+            page = ERR_PTR(-ENOMEM);
+            goto out;
+        }
+    }
     if (flags & FOLL_TOUCH) {
         if ((flags & FOLL_WRITE) &&
             !pte_dirty(pte) && !PageDirty(page))
···
         if (pmd_trans_unstable(pmd))
             ret = -EBUSY;
     } else {
-        get_page(page);
+        if (unlikely(!try_get_page(page))) {
+            spin_unlock(ptl);
+            return ERR_PTR(-ENOMEM);
+        }
         spin_unlock(ptl);
         lock_page(page);
         ret = split_huge_page(page);
···
         if (is_device_public_page(*page))
             goto unmap;
     }
-    get_page(*page);
+    if (unlikely(!try_get_page(*page))) {
+        ret = -ENOMEM;
+        goto unmap;
+    }
 out:
     ret = 0;
 unmap:
···
     }
 }

+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+    struct page *head = compound_head(page);
+    if (WARN_ON_ONCE(page_ref_count(head) < 0))
+        return NULL;
+    if (unlikely(!page_cache_add_speculative(head, refs)))
+        return NULL;
+    return head;
+}
+
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
              int write, struct page **pages, int *nr)
···
     VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
     page = pte_page(pte);
-    head = compound_head(page);

-    if (!page_cache_get_speculative(head))
+    head = try_get_compound_head(page, 1);
+    if (!head)
         goto pte_unmap;

     if (unlikely(pte_val(pte) != pte_val(*ptep))) {
···
         refs++;
     } while (addr += PAGE_SIZE, addr != end);

-    head = compound_head(pmd_page(orig));
-    if (!page_cache_add_speculative(head, refs)) {
+    head = try_get_compound_head(pmd_page(orig), refs);
+    if (!head) {
         *nr -= refs;
         return 0;
     }
···
         refs++;
     } while (addr += PAGE_SIZE, addr != end);

-    head = compound_head(pud_page(orig));
-    if (!page_cache_add_speculative(head, refs)) {
+    head = try_get_compound_head(pud_page(orig), refs);
+    if (!head) {
         *nr -= refs;
         return 0;
     }
···
         refs++;
     } while (addr += PAGE_SIZE, addr != end);

-    head = compound_head(pgd_page(orig));
-    if (!page_cache_add_speculative(head, refs)) {
+    head = try_get_compound_head(pgd_page(orig), refs);
+    if (!head) {
         *nr -= refs;
         return 0;
     }
···
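
The fast-path helper has to be more careful than try_get_page()
because gup_fast runs without the page table lock: it rejects a head
page whose count has already gone negative, then takes all 'refs'
references in one speculative atomic add. Roughly, in userspace terms
(a simplified analogue only; page_cache_add_speculative() hides more
machinery than shown here):

    #include <stdatomic.h>
    #include <stddef.h>

    struct fake_page {
        atomic_int refcount;
    };

    static struct fake_page *try_get_head(struct fake_page *head, int refs)
    {
        /* counter already overflowed into negative territory: refuse */
        if (atomic_load(&head->refcount) < 0)
            return NULL;

        /* grab all the references at once, then back out if the page
         * turned out to be free (old count was zero or below) */
        if (atomic_fetch_add(&head->refcount, refs) <= 0) {
            atomic_fetch_sub(&head->refcount, refs);
            return NULL;
        }
        return head;
    }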

mm/hugetlb.c (+13 -0)

···
         pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
         page = pte_page(huge_ptep_get(pte));
+
+        /*
+         * Instead of doing 'try_get_page()' below in the same_page
+         * loop, just check the count once here.
+         */
+        if (unlikely(page_count(page) <= 0)) {
+            if (pages) {
+                spin_unlock(ptl);
+                remainder = 0;
+                err = -ENOMEM;
+                break;
+            }
+        }
 same_page:
         if (pages) {
             pages[i] = mem_map_offset(page, pfn_offset);
···
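
Checking once before the same_page loop works because every
get_page() taken inside it lands on the same compound head page, so a
single page_count() test done under the page table lock covers the
whole run without paying for a per-page try_get_page().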