Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mempolicy: alloc_pages_mpol() for NUMA policy without vma

Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).

vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".

It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.

Hence "ilx" throughout, the "interleave index". Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided. And how ilx is applied depends also on page order.

The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order). vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.

get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it; e.g. swapoff and madvise
used to pass a NULL vma to read_swap_cache_async(), but now know the vma.

[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Hugh Dickins and committed by
Andrew Morton
ddc1a5cb 23e48832

+309 -324
+3 -2
fs/proc/task_mmu.c
··· 2673 2673 struct numa_maps *md = &numa_priv->md; 2674 2674 struct file *file = vma->vm_file; 2675 2675 struct mm_struct *mm = vma->vm_mm; 2676 - struct mempolicy *pol; 2677 2676 char buffer[64]; 2677 + struct mempolicy *pol; 2678 + pgoff_t ilx; 2678 2679 int nid; 2679 2680 2680 2681 if (!mm) ··· 2684 2683 /* Ensure we start with an empty set of numa_maps statistics. */ 2685 2684 memset(md, 0, sizeof(*md)); 2686 2685 2687 - pol = __get_vma_policy(vma, vma->vm_start); 2686 + pol = __get_vma_policy(vma, vma->vm_start, &ilx); 2688 2687 if (pol) { 2689 2688 mpol_to_str(buffer, sizeof(buffer), pol); 2690 2689 mpol_cond_put(pol);
+9 -1
include/linux/gfp.h
··· 8 8 #include <linux/topology.h> 9 9 10 10 struct vm_area_struct; 11 + struct mempolicy; 11 12 12 13 /* Convert GFP flags to their corresponding migrate type */ 13 14 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) ··· 263 262 264 263 #ifdef CONFIG_NUMA 265 264 struct page *alloc_pages(gfp_t gfp, unsigned int order); 266 - struct folio *folio_alloc(gfp_t gfp, unsigned order); 265 + struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 266 + struct mempolicy *mpol, pgoff_t ilx, int nid); 267 + struct folio *folio_alloc(gfp_t gfp, unsigned int order); 267 268 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, 268 269 unsigned long addr, bool hugepage); 269 270 #else 270 271 static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) 271 272 { 272 273 return alloc_pages_node(numa_node_id(), gfp_mask, order); 274 + } 275 + static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 276 + struct mempolicy *mpol, pgoff_t ilx, int nid) 277 + { 278 + return alloc_pages(gfp, order); 273 279 } 274 280 static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) 275 281 {
+17 -3
include/linux/mempolicy.h
··· 17 17 18 18 struct mm_struct; 19 19 20 + #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ 21 + 20 22 #ifdef CONFIG_NUMA 21 23 22 24 /* ··· 128 126 129 127 struct mempolicy *get_task_policy(struct task_struct *p); 130 128 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 131 - unsigned long addr); 129 + unsigned long addr, pgoff_t *ilx); 130 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 131 + unsigned long addr, int order, pgoff_t *ilx); 132 132 bool vma_policy_mof(struct vm_area_struct *vma); 133 133 134 134 extern void numa_default_policy(void); ··· 144 140 extern bool init_nodemask_of_mempolicy(nodemask_t *mask); 145 141 extern bool mempolicy_in_oom_domain(struct task_struct *tsk, 146 142 const nodemask_t *mask); 147 - extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); 148 - 149 143 extern unsigned int mempolicy_slab_node(void); 150 144 151 145 extern enum zone_type policy_zone; ··· 181 179 182 180 struct mempolicy {}; 183 181 182 + static inline struct mempolicy *get_task_policy(struct task_struct *p) 183 + { 184 + return NULL; 185 + } 186 + 184 187 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) 185 188 { 186 189 return true; ··· 217 210 static inline struct mempolicy * 218 211 mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) 219 212 { 213 + return NULL; 214 + } 215 + 216 + static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 217 + unsigned long addr, int order, pgoff_t *ilx) 218 + { 219 + *ilx = 0; 220 220 return NULL; 221 221 } 222 222
+1 -1
include/linux/mm.h
··· 619 619 * policy. 620 620 */ 621 621 struct mempolicy *(*get_policy)(struct vm_area_struct *vma, 622 - unsigned long addr); 622 + unsigned long addr, pgoff_t *ilx); 623 623 #endif 624 624 /* 625 625 * Called by vm_normal_page() for special PTEs to find the
+8 -13
ipc/shm.c
··· 562 562 } 563 563 564 564 #ifdef CONFIG_NUMA 565 - static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 565 + static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 566 566 { 567 - struct file *file = vma->vm_file; 568 - struct shm_file_data *sfd = shm_file_data(file); 567 + struct shm_file_data *sfd = shm_file_data(vma->vm_file); 569 568 int err = 0; 570 569 571 570 if (sfd->vm_ops->set_policy) 572 - err = sfd->vm_ops->set_policy(vma, new); 571 + err = sfd->vm_ops->set_policy(vma, mpol); 573 572 return err; 574 573 } 575 574 576 575 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, 577 - unsigned long addr) 576 + unsigned long addr, pgoff_t *ilx) 578 577 { 579 - struct file *file = vma->vm_file; 580 - struct shm_file_data *sfd = shm_file_data(file); 581 - struct mempolicy *pol = NULL; 578 + struct shm_file_data *sfd = shm_file_data(vma->vm_file); 579 + struct mempolicy *mpol = vma->vm_policy; 582 580 583 581 if (sfd->vm_ops->get_policy) 584 - pol = sfd->vm_ops->get_policy(vma, addr); 585 - else if (vma->vm_policy) 586 - pol = vma->vm_policy; 587 - 588 - return pol; 582 + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); 583 + return mpol; 589 584 } 590 585 #endif 591 586
+156 -225
mm/mempolicy.c
··· 898 898 } 899 899 900 900 if (flags & MPOL_F_ADDR) { 901 + pgoff_t ilx; /* ignored here */ 901 902 /* 902 903 * Do NOT fall back to task policy if the 903 904 * vma/shared policy at addr is NULL. We ··· 910 909 mmap_read_unlock(mm); 911 910 return -EFAULT; 912 911 } 913 - if (vma->vm_ops && vma->vm_ops->get_policy) 914 - pol = vma->vm_ops->get_policy(vma, addr); 915 - else 916 - pol = vma->vm_policy; 912 + pol = __get_vma_policy(vma, addr, &ilx); 917 913 } else if (addr) 918 914 return -EINVAL; 919 915 ··· 1168 1170 break; 1169 1171 } 1170 1172 1173 + /* 1174 + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL 1175 + * when the page can no longer be located in a vma: that is not ideal 1176 + * (migrate_pages() will give up early, presuming ENOMEM), but good 1177 + * enough to avoid a crash by syzkaller or concurrent holepunch. 1178 + */ 1179 + if (!vma) 1180 + return NULL; 1181 + 1171 1182 if (folio_test_hugetlb(src)) { 1172 1183 return alloc_hugetlb_folio_vma(folio_hstate(src), 1173 1184 vma, address); ··· 1185 1178 if (folio_test_large(src)) 1186 1179 gfp = GFP_TRANSHUGE; 1187 1180 1188 - /* 1189 - * if !vma, vma_alloc_folio() will use task or system default policy 1190 - */ 1191 1181 return vma_alloc_folio(gfp, folio_order(src), vma, address, 1192 1182 folio_test_large(src)); 1193 1183 } ··· 1694 1690 } 1695 1691 1696 1692 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, 1697 - unsigned long addr) 1693 + unsigned long addr, pgoff_t *ilx) 1698 1694 { 1699 - struct mempolicy *pol = NULL; 1700 - 1701 - if (vma) { 1702 - if (vma->vm_ops && vma->vm_ops->get_policy) { 1703 - pol = vma->vm_ops->get_policy(vma, addr); 1704 - } else if (vma->vm_policy) { 1705 - pol = vma->vm_policy; 1706 - 1707 - /* 1708 - * shmem_alloc_page() passes MPOL_F_SHARED policy with 1709 - * a pseudo vma whose vma->vm_ops=NULL. 
Take a reference 1710 - * count on these policies which will be dropped by 1711 - * mpol_cond_put() later 1712 - */ 1713 - if (mpol_needs_cond_ref(pol)) 1714 - mpol_get(pol); 1715 - } 1716 - } 1717 - 1718 - return pol; 1695 + *ilx = 0; 1696 + return (vma->vm_ops && vma->vm_ops->get_policy) ? 1697 + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; 1719 1698 } 1720 1699 1721 1700 /* 1722 - * get_vma_policy(@vma, @addr) 1701 + * get_vma_policy(@vma, @addr, @order, @ilx) 1723 1702 * @vma: virtual memory area whose policy is sought 1724 1703 * @addr: address in @vma for shared policy lookup 1704 + * @order: 0, or appropriate huge_page_order for interleaving 1705 + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE 1725 1706 * 1726 1707 * Returns effective policy for a VMA at specified address. 1727 1708 * Falls back to current->mempolicy or system default policy, as necessary. ··· 1715 1726 * freeing by another task. It is the caller's responsibility to free the 1716 1727 * extra reference for shared policies. 
1717 1728 */ 1718 - static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1719 - unsigned long addr) 1729 + struct mempolicy *get_vma_policy(struct vm_area_struct *vma, 1730 + unsigned long addr, int order, pgoff_t *ilx) 1720 1731 { 1721 - struct mempolicy *pol = __get_vma_policy(vma, addr); 1732 + struct mempolicy *pol; 1722 1733 1734 + pol = __get_vma_policy(vma, addr, ilx); 1723 1735 if (!pol) 1724 1736 pol = get_task_policy(current); 1725 - 1737 + if (pol->mode == MPOL_INTERLEAVE) { 1738 + *ilx += vma->vm_pgoff >> order; 1739 + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); 1740 + } 1726 1741 return pol; 1727 1742 } 1728 1743 ··· 1736 1743 1737 1744 if (vma->vm_ops && vma->vm_ops->get_policy) { 1738 1745 bool ret = false; 1746 + pgoff_t ilx; /* ignored here */ 1739 1747 1740 - pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1748 + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); 1741 1749 if (pol && (pol->flags & MPOL_F_MOF)) 1742 1750 ret = true; 1743 1751 mpol_cond_put(pol); ··· 1771 1777 dynamic_policy_zone = ZONE_MOVABLE; 1772 1778 1773 1779 return zone >= dynamic_policy_zone; 1774 - } 1775 - 1776 - /* 1777 - * Return a nodemask representing a mempolicy for filtering nodes for 1778 - * page allocation 1779 - */ 1780 - nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) 1781 - { 1782 - int mode = policy->mode; 1783 - 1784 - /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1785 - if (unlikely(mode == MPOL_BIND) && 1786 - apply_policy_zone(policy, gfp_zone(gfp)) && 1787 - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) 1788 - return &policy->nodes; 1789 - 1790 - if (mode == MPOL_PREFERRED_MANY) 1791 - return &policy->nodes; 1792 - 1793 - return NULL; 1794 - } 1795 - 1796 - /* 1797 - * Return the preferred node id for 'prefer' mempolicy, and return 1798 - * the given id for all other policies. 
1799 - * 1800 - * policy_node() is always coupled with policy_nodemask(), which 1801 - * secures the nodemask limit for 'bind' and 'prefer-many' policy. 1802 - */ 1803 - static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) 1804 - { 1805 - if (policy->mode == MPOL_PREFERRED) { 1806 - nid = first_node(policy->nodes); 1807 - } else { 1808 - /* 1809 - * __GFP_THISNODE shouldn't even be used with the bind policy 1810 - * because we might easily break the expectation to stay on the 1811 - * requested node and not break the policy. 1812 - */ 1813 - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); 1814 - } 1815 - 1816 - if ((policy->mode == MPOL_BIND || 1817 - policy->mode == MPOL_PREFERRED_MANY) && 1818 - policy->home_node != NUMA_NO_NODE) 1819 - return policy->home_node; 1820 - 1821 - return nid; 1822 1780 } 1823 1781 1824 1782 /* Do dynamic interleaving for a process */ ··· 1832 1886 } 1833 1887 1834 1888 /* 1835 - * Do static interleaving for a VMA with known offset @n. Returns the n'th 1836 - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the 1837 - * number of present nodes. 1889 + * Do static interleaving for interleave index @ilx. Returns the ilx'th 1890 + * node in pol->nodes (starting from ilx=0), wrapping around if ilx 1891 + * exceeds the number of present nodes. 
1838 1892 */ 1839 - static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) 1893 + static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) 1840 1894 { 1841 1895 nodemask_t nodemask = pol->nodes; 1842 1896 unsigned int target, nnodes; ··· 1854 1908 nnodes = nodes_weight(nodemask); 1855 1909 if (!nnodes) 1856 1910 return numa_node_id(); 1857 - target = (unsigned int)n % nnodes; 1911 + target = ilx % nnodes; 1858 1912 nid = first_node(nodemask); 1859 1913 for (i = 0; i < target; i++) 1860 1914 nid = next_node(nid, nodemask); 1861 1915 return nid; 1862 1916 } 1863 1917 1864 - /* Determine a node number for interleave */ 1865 - static inline unsigned interleave_nid(struct mempolicy *pol, 1866 - struct vm_area_struct *vma, unsigned long addr, int shift) 1918 + /* 1919 + * Return a nodemask representing a mempolicy for filtering nodes for 1920 + * page allocation, together with preferred node id (or the input node id). 1921 + */ 1922 + static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, 1923 + pgoff_t ilx, int *nid) 1867 1924 { 1868 - if (vma) { 1869 - unsigned long off; 1925 + nodemask_t *nodemask = NULL; 1870 1926 1927 + switch (pol->mode) { 1928 + case MPOL_PREFERRED: 1929 + /* Override input node id */ 1930 + *nid = first_node(pol->nodes); 1931 + break; 1932 + case MPOL_PREFERRED_MANY: 1933 + nodemask = &pol->nodes; 1934 + if (pol->home_node != NUMA_NO_NODE) 1935 + *nid = pol->home_node; 1936 + break; 1937 + case MPOL_BIND: 1938 + /* Restrict to nodemask (but not on lower zones) */ 1939 + if (apply_policy_zone(pol, gfp_zone(gfp)) && 1940 + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) 1941 + nodemask = &pol->nodes; 1942 + if (pol->home_node != NUMA_NO_NODE) 1943 + *nid = pol->home_node; 1871 1944 /* 1872 - * for small pages, there is no difference between 1873 - * shift and PAGE_SHIFT, so the bit-shift is safe. 
1874 - * for huge pages, since vm_pgoff is in units of small 1875 - * pages, we need to shift off the always 0 bits to get 1876 - * a useful offset. 1945 + * __GFP_THISNODE shouldn't even be used with the bind policy 1946 + * because we might easily break the expectation to stay on the 1947 + * requested node and not break the policy. 1877 1948 */ 1878 - BUG_ON(shift < PAGE_SHIFT); 1879 - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 1880 - off += (addr - vma->vm_start) >> shift; 1881 - return offset_il_node(pol, off); 1882 - } else 1883 - return interleave_nodes(pol); 1949 + WARN_ON_ONCE(gfp & __GFP_THISNODE); 1950 + break; 1951 + case MPOL_INTERLEAVE: 1952 + /* Override input node id */ 1953 + *nid = (ilx == NO_INTERLEAVE_INDEX) ? 1954 + interleave_nodes(pol) : interleave_nid(pol, ilx); 1955 + break; 1956 + } 1957 + 1958 + return nodemask; 1884 1959 } 1885 1960 1886 1961 #ifdef CONFIG_HUGETLBFS ··· 1917 1950 * to the struct mempolicy for conditional unref after allocation. 1918 1951 * If the effective policy is 'bind' or 'prefer-many', returns a pointer 1919 1952 * to the mempolicy's @nodemask for filtering the zonelist. 
1920 - * 1921 - * Must be protected by read_mems_allowed_begin() 1922 1953 */ 1923 1954 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 1924 - struct mempolicy **mpol, nodemask_t **nodemask) 1955 + struct mempolicy **mpol, nodemask_t **nodemask) 1925 1956 { 1957 + pgoff_t ilx; 1926 1958 int nid; 1927 - int mode; 1928 1959 1929 - *mpol = get_vma_policy(vma, addr); 1930 - *nodemask = NULL; 1931 - mode = (*mpol)->mode; 1932 - 1933 - if (unlikely(mode == MPOL_INTERLEAVE)) { 1934 - nid = interleave_nid(*mpol, vma, addr, 1935 - huge_page_shift(hstate_vma(vma))); 1936 - } else { 1937 - nid = policy_node(gfp_flags, *mpol, numa_node_id()); 1938 - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) 1939 - *nodemask = &(*mpol)->nodes; 1940 - } 1960 + nid = numa_node_id(); 1961 + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); 1962 + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); 1941 1963 return nid; 1942 1964 } 1943 1965 ··· 2004 2048 return ret; 2005 2049 } 2006 2050 2007 - /* Allocate a page in interleaved policy. 2008 - Own path because it needs to do special accounting. 
*/ 2009 - static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 2010 - unsigned nid) 2011 - { 2012 - struct page *page; 2013 - 2014 - page = __alloc_pages(gfp, order, nid, NULL); 2015 - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ 2016 - if (!static_branch_likely(&vm_numa_stat_key)) 2017 - return page; 2018 - if (page && page_to_nid(page) == nid) { 2019 - preempt_disable(); 2020 - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2021 - preempt_enable(); 2022 - } 2023 - return page; 2024 - } 2025 - 2026 2051 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 2027 - int nid, struct mempolicy *pol) 2052 + int nid, nodemask_t *nodemask) 2028 2053 { 2029 2054 struct page *page; 2030 2055 gfp_t preferred_gfp; ··· 2018 2081 */ 2019 2082 preferred_gfp = gfp | __GFP_NOWARN; 2020 2083 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 2021 - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); 2084 + page = __alloc_pages(preferred_gfp, order, nid, nodemask); 2022 2085 if (!page) 2023 2086 page = __alloc_pages(gfp, order, nid, NULL); 2024 2087 ··· 2026 2089 } 2027 2090 2028 2091 /** 2029 - * vma_alloc_folio - Allocate a folio for a VMA. 2092 + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. 2030 2093 * @gfp: GFP flags. 2031 - * @order: Order of the folio. 2032 - * @vma: Pointer to VMA or NULL if not available. 2033 - * @addr: Virtual address of the allocation. Must be inside @vma. 2034 - * @hugepage: For hugepages try only the preferred node if possible. 2094 + * @order: Order of the page allocation. 2095 + * @pol: Pointer to the NUMA mempolicy. 2096 + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). 2097 + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). 2035 2098 * 2036 - * Allocate a folio for a specific address in @vma, using the appropriate 2037 - * NUMA policy. 
When @vma is not NULL the caller must hold the mmap_lock 2038 - * of the mm_struct of the VMA to prevent it from going away. Should be 2039 - * used for all allocations for folios that will be mapped into user space. 2040 - * 2041 - * Return: The folio on success or NULL if allocation fails. 2099 + * Return: The page on success or NULL if allocation fails. 2042 2100 */ 2043 - struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, 2044 - unsigned long addr, bool hugepage) 2101 + struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, 2102 + struct mempolicy *pol, pgoff_t ilx, int nid) 2045 2103 { 2046 - struct mempolicy *pol; 2047 - int node = numa_node_id(); 2048 - struct folio *folio; 2049 - int preferred_nid; 2050 - nodemask_t *nmask; 2104 + nodemask_t *nodemask; 2105 + struct page *page; 2051 2106 2052 - pol = get_vma_policy(vma, addr); 2107 + nodemask = policy_nodemask(gfp, pol, ilx, &nid); 2053 2108 2054 - if (pol->mode == MPOL_INTERLEAVE) { 2055 - struct page *page; 2056 - unsigned nid; 2109 + if (pol->mode == MPOL_PREFERRED_MANY) 2110 + return alloc_pages_preferred_many(gfp, order, nid, nodemask); 2057 2111 2058 - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 2059 - mpol_cond_put(pol); 2060 - gfp |= __GFP_COMP; 2061 - page = alloc_page_interleave(gfp, order, nid); 2062 - return page_rmappable_folio(page); 2063 - } 2064 - 2065 - if (pol->mode == MPOL_PREFERRED_MANY) { 2066 - struct page *page; 2067 - 2068 - node = policy_node(gfp, pol, node); 2069 - gfp |= __GFP_COMP; 2070 - page = alloc_pages_preferred_many(gfp, order, node, pol); 2071 - mpol_cond_put(pol); 2072 - return page_rmappable_folio(page); 2073 - } 2074 - 2075 - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { 2076 - int hpage_node = node; 2077 - 2112 + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 2113 + /* filter "hugepage" allocation, unless from alloc_pages() */ 2114 + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { 2078 
2115 /* 2079 2116 * For hugepage allocation and non-interleave policy which 2080 2117 * allows the current node (or other explicitly preferred ··· 2059 2148 * If the policy is interleave or does not allow the current 2060 2149 * node in its nodemask, we allocate the standard way. 2061 2150 */ 2062 - if (pol->mode == MPOL_PREFERRED) 2063 - hpage_node = first_node(pol->nodes); 2064 - 2065 - nmask = policy_nodemask(gfp, pol); 2066 - if (!nmask || node_isset(hpage_node, *nmask)) { 2067 - mpol_cond_put(pol); 2151 + if (pol->mode != MPOL_INTERLEAVE && 2152 + (!nodemask || node_isset(nid, *nodemask))) { 2068 2153 /* 2069 2154 * First, try to allocate THP only on local node, but 2070 2155 * don't reclaim unnecessarily, just compact. 2071 2156 */ 2072 - folio = __folio_alloc_node(gfp | __GFP_THISNODE | 2073 - __GFP_NORETRY, order, hpage_node); 2074 - 2157 + page = __alloc_pages_node(nid, 2158 + gfp | __GFP_THISNODE | __GFP_NORETRY, order); 2159 + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) 2160 + return page; 2075 2161 /* 2076 2162 * If hugepage allocations are configured to always 2077 2163 * synchronous compact or the vma has been madvised 2078 2164 * to prefer hugepage backing, retry allowing remote 2079 2165 * memory with both reclaim and compact as well. 
2080 2166 */ 2081 - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) 2082 - folio = __folio_alloc(gfp, order, hpage_node, 2083 - nmask); 2084 - 2085 - goto out; 2086 2167 } 2087 2168 } 2088 2169 2089 - nmask = policy_nodemask(gfp, pol); 2090 - preferred_nid = policy_node(gfp, pol, node); 2091 - folio = __folio_alloc(gfp, order, preferred_nid, nmask); 2170 + page = __alloc_pages(gfp, order, nid, nodemask); 2171 + 2172 + if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { 2173 + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ 2174 + if (static_branch_likely(&vm_numa_stat_key) && 2175 + page_to_nid(page) == nid) { 2176 + preempt_disable(); 2177 + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 2178 + preempt_enable(); 2179 + } 2180 + } 2181 + 2182 + return page; 2183 + } 2184 + 2185 + /** 2186 + * vma_alloc_folio - Allocate a folio for a VMA. 2187 + * @gfp: GFP flags. 2188 + * @order: Order of the folio. 2189 + * @vma: Pointer to VMA. 2190 + * @addr: Virtual address of the allocation. Must be inside @vma. 2191 + * @hugepage: Unused (was: For hugepages try only preferred node if possible). 2192 + * 2193 + * Allocate a folio for a specific address in @vma, using the appropriate 2194 + * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the 2195 + * VMA to prevent it from going away. Should be used for all allocations 2196 + * for folios that will be mapped into user space, excepting hugetlbfs, and 2197 + * excepting where direct use of alloc_pages_mpol() is more appropriate. 2198 + * 2199 + * Return: The folio on success or NULL if allocation fails. 
2200 + */ 2201 + struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, 2202 + unsigned long addr, bool hugepage) 2203 + { 2204 + struct mempolicy *pol; 2205 + pgoff_t ilx; 2206 + struct page *page; 2207 + 2208 + pol = get_vma_policy(vma, addr, order, &ilx); 2209 + page = alloc_pages_mpol(gfp | __GFP_COMP, order, 2210 + pol, ilx, numa_node_id()); 2092 2211 mpol_cond_put(pol); 2093 - out: 2094 - return folio; 2212 + return page_rmappable_folio(page); 2095 2213 } 2096 2214 EXPORT_SYMBOL(vma_alloc_folio); 2097 2215 ··· 2138 2198 * flags are used. 2139 2199 * Return: The page on success or NULL if allocation fails. 2140 2200 */ 2141 - struct page *alloc_pages(gfp_t gfp, unsigned order) 2201 + struct page *alloc_pages(gfp_t gfp, unsigned int order) 2142 2202 { 2143 2203 struct mempolicy *pol = &default_policy; 2144 - struct page *page; 2145 - 2146 - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2147 - pol = get_task_policy(current); 2148 2204 2149 2205 /* 2150 2206 * No reference counting needed for current->mempolicy 2151 2207 * nor system default_policy 2152 2208 */ 2153 - if (pol->mode == MPOL_INTERLEAVE) 2154 - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 2155 - else if (pol->mode == MPOL_PREFERRED_MANY) 2156 - page = alloc_pages_preferred_many(gfp, order, 2157 - policy_node(gfp, pol, numa_node_id()), pol); 2158 - else 2159 - page = __alloc_pages(gfp, order, 2160 - policy_node(gfp, pol, numa_node_id()), 2161 - policy_nodemask(gfp, pol)); 2209 + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2210 + pol = get_task_policy(current); 2162 2211 2163 - return page; 2212 + return alloc_pages_mpol(gfp, order, 2213 + pol, NO_INTERLEAVE_INDEX, numa_node_id()); 2164 2214 } 2165 2215 EXPORT_SYMBOL(alloc_pages); 2166 2216 2167 - struct folio *folio_alloc(gfp_t gfp, unsigned order) 2217 + struct folio *folio_alloc(gfp_t gfp, unsigned int order) 2168 2218 { 2169 2219 return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); 
2170 2220 } ··· 2225 2295 unsigned long nr_pages, struct page **page_array) 2226 2296 { 2227 2297 struct mempolicy *pol = &default_policy; 2298 + nodemask_t *nodemask; 2299 + int nid; 2228 2300 2229 2301 if (!in_interrupt() && !(gfp & __GFP_THISNODE)) 2230 2302 pol = get_task_policy(current); ··· 2239 2307 return alloc_pages_bulk_array_preferred_many(gfp, 2240 2308 numa_node_id(), pol, nr_pages, page_array); 2241 2309 2242 - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), 2243 - policy_nodemask(gfp, pol), nr_pages, NULL, 2244 - page_array); 2310 + nid = numa_node_id(); 2311 + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); 2312 + return __alloc_pages_bulk(gfp, nid, nodemask, 2313 + nr_pages, NULL, page_array); 2245 2314 } 2246 2315 2247 2316 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) ··· 2429 2496 unsigned long addr) 2430 2497 { 2431 2498 struct mempolicy *pol; 2499 + pgoff_t ilx; 2432 2500 struct zoneref *z; 2433 2501 int curnid = folio_nid(folio); 2434 - unsigned long pgoff; 2435 2502 int thiscpu = raw_smp_processor_id(); 2436 2503 int thisnid = cpu_to_node(thiscpu); 2437 2504 int polnid = NUMA_NO_NODE; 2438 2505 int ret = NUMA_NO_NODE; 2439 2506 2440 - pol = get_vma_policy(vma, addr); 2507 + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); 2441 2508 if (!(pol->flags & MPOL_F_MOF)) 2442 2509 goto out; 2443 2510 2444 2511 switch (pol->mode) { 2445 2512 case MPOL_INTERLEAVE: 2446 - pgoff = vma->vm_pgoff; 2447 - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; 2448 - polnid = offset_il_node(pol, pgoff); 2513 + polnid = interleave_nid(pol, ilx); 2449 2514 break; 2450 2515 2451 2516 case MPOL_PREFERRED:
+51 -41
mm/shmem.c
··· 1544 1544 return NULL; 1545 1545 } 1546 1546 #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 1547 - #ifndef CONFIG_NUMA 1548 - #define vm_policy vm_private_data 1549 - #endif 1550 1547 1551 - static void shmem_pseudo_vma_init(struct vm_area_struct *vma, 1552 - struct shmem_inode_info *info, pgoff_t index) 1553 - { 1554 - /* Create a pseudo vma that just contains the policy */ 1555 - vma_init(vma, NULL); 1556 - /* Bias interleave by inode number to distribute better across nodes */ 1557 - vma->vm_pgoff = index + info->vfs_inode.i_ino; 1558 - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1559 - } 1548 + static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 1549 + pgoff_t index, unsigned int order, pgoff_t *ilx); 1560 1550 1561 - static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) 1562 - { 1563 - /* Drop reference taken by mpol_shared_policy_lookup() */ 1564 - mpol_cond_put(vma->vm_policy); 1565 - } 1566 - 1567 - static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, 1551 + static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, 1568 1552 struct shmem_inode_info *info, pgoff_t index) 1569 1553 { 1570 - struct vm_area_struct pvma; 1554 + struct mempolicy *mpol; 1555 + pgoff_t ilx; 1571 1556 struct page *page; 1572 - struct vm_fault vmf = { 1573 - .vma = &pvma, 1574 - }; 1575 1557 1576 - shmem_pseudo_vma_init(&pvma, info, index); 1577 - page = swap_cluster_readahead(swap, gfp, &vmf); 1578 - shmem_pseudo_vma_destroy(&pvma); 1558 + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); 1559 + page = swap_cluster_readahead(swap, gfp, mpol, ilx); 1560 + mpol_cond_put(mpol); 1579 1561 1580 1562 if (!page) 1581 1563 return NULL; ··· 1591 1609 static struct folio *shmem_alloc_hugefolio(gfp_t gfp, 1592 1610 struct shmem_inode_info *info, pgoff_t index) 1593 1611 { 1594 - struct vm_area_struct pvma; 1595 - struct folio *folio; 1612 + struct mempolicy *mpol; 1613 + pgoff_t ilx; 1614 + struct page *page; 
1596 1615 1597 - shmem_pseudo_vma_init(&pvma, info, index); 1598 - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); 1599 - shmem_pseudo_vma_destroy(&pvma); 1616 + mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); 1617 + page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); 1618 + mpol_cond_put(mpol); 1600 1619 1601 - return folio; 1620 + return page_rmappable_folio(page); 1602 1621 } 1603 1622 1604 1623 static struct folio *shmem_alloc_folio(gfp_t gfp, 1605 1624 struct shmem_inode_info *info, pgoff_t index) 1606 1625 { 1607 - struct vm_area_struct pvma; 1608 - struct folio *folio; 1626 + struct mempolicy *mpol; 1627 + pgoff_t ilx; 1628 + struct page *page; 1609 1629 1610 - shmem_pseudo_vma_init(&pvma, info, index); 1611 - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); 1612 - shmem_pseudo_vma_destroy(&pvma); 1630 + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); 1631 + page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); 1632 + mpol_cond_put(mpol); 1613 1633 1614 - return folio; 1634 + return (struct folio *)page; 1615 1635 } 1616 1636 1617 1637 static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, ··· 1867 1883 count_memcg_event_mm(fault_mm, PGMAJFAULT); 1868 1884 } 1869 1885 /* Here we actually start the io */ 1870 - folio = shmem_swapin(swap, gfp, info, index); 1886 + folio = shmem_swapin_cluster(swap, gfp, info, index); 1871 1887 if (!folio) { 1872 1888 error = -ENOMEM; 1873 1889 goto failed; ··· 2318 2334 } 2319 2335 2320 2336 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2321 - unsigned long addr) 2337 + unsigned long addr, pgoff_t *ilx) 2322 2338 { 2323 2339 struct inode *inode = file_inode(vma->vm_file); 2324 2340 pgoff_t index; 2325 2341 2342 + /* 2343 + * Bias interleave by inode number to distribute better across nodes; 2344 + * but this interface is independent of which page order is used, so 2345 + * supplies only that bias, letting caller apply the offset 
(adjusted 2346 + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). 2347 + */ 2348 + *ilx = inode->i_ino; 2326 2349 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2327 2350 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2328 2351 } 2329 - #endif 2352 + 2353 + static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2354 + pgoff_t index, unsigned int order, pgoff_t *ilx) 2355 + { 2356 + struct mempolicy *mpol; 2357 + 2358 + /* Bias interleave by inode number to distribute better across nodes */ 2359 + *ilx = info->vfs_inode.i_ino + (index >> order); 2360 + 2361 + mpol = mpol_shared_policy_lookup(&info->policy, index); 2362 + return mpol ? mpol : get_task_policy(current); 2363 + } 2364 + #else 2365 + static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2366 + pgoff_t index, unsigned int order, pgoff_t *ilx) 2367 + { 2368 + *ilx = 0; 2369 + return NULL; 2370 + } 2371 + #endif /* CONFIG_NUMA */ 2330 2372 2331 2373 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2332 2374 {
+5 -4
mm/swap.h
··· 2 2 #ifndef _MM_SWAP_H 3 3 #define _MM_SWAP_H 4 4 5 + struct mempolicy; 6 + 5 7 #ifdef CONFIG_SWAP 6 8 #include <linux/blk_types.h> /* for bio_end_io_t */ 7 9 ··· 50 48 unsigned long addr, 51 49 struct swap_iocb **plug); 52 50 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 53 - struct vm_area_struct *vma, 54 - unsigned long addr, 51 + struct mempolicy *mpol, pgoff_t ilx, 55 52 bool *new_page_allocated); 56 53 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, 57 - struct vm_fault *vmf); 54 + struct mempolicy *mpol, pgoff_t ilx); 58 55 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, 59 56 struct vm_fault *vmf); 60 57 ··· 81 80 } 82 81 83 82 static inline struct page *swap_cluster_readahead(swp_entry_t entry, 84 - gfp_t gfp_mask, struct vm_fault *vmf) 83 + gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) 85 84 { 86 85 return NULL; 87 86 }
+54 -32
mm/swap_state.c
··· 10 10 #include <linux/mm.h> 11 11 #include <linux/gfp.h> 12 12 #include <linux/kernel_stat.h> 13 + #include <linux/mempolicy.h> 13 14 #include <linux/swap.h> 14 15 #include <linux/swapops.h> 15 16 #include <linux/init.h> ··· 411 410 } 412 411 413 412 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 414 - struct vm_area_struct *vma, unsigned long addr, 415 - bool *new_page_allocated) 413 + struct mempolicy *mpol, pgoff_t ilx, 414 + bool *new_page_allocated) 416 415 { 417 416 struct swap_info_struct *si; 418 417 struct folio *folio; ··· 454 453 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will 455 454 * cause any racers to loop around until we add it to cache. 456 455 */ 457 - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); 456 + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, 457 + mpol, ilx, numa_node_id()); 458 458 if (!folio) 459 459 goto fail_put_swap; 460 460 ··· 530 528 struct vm_area_struct *vma, 531 529 unsigned long addr, struct swap_iocb **plug) 532 530 { 533 - bool page_was_allocated; 534 - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, 535 - vma, addr, &page_was_allocated); 531 + bool page_allocated; 532 + struct mempolicy *mpol; 533 + pgoff_t ilx; 534 + struct page *page; 536 535 537 - if (page_was_allocated) 538 - swap_readpage(retpage, false, plug); 536 + mpol = get_vma_policy(vma, addr, 0, &ilx); 537 + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, 538 + &page_allocated); 539 + mpol_cond_put(mpol); 539 540 540 - return retpage; 541 + if (page_allocated) 542 + swap_readpage(page, false, plug); 543 + return page; 541 544 } 542 545 543 546 static unsigned int __swapin_nr_pages(unsigned long prev_offset, ··· 610 603 * swap_cluster_readahead - swap in pages in hope we need them soon 611 604 * @entry: swap entry of this memory 612 605 * @gfp_mask: memory allocation flags 613 - * @vmf: fault information 606 + * @mpol: NUMA memory allocation policy to be applied 607 + * @ilx: 
NUMA interleave index, for use only when MPOL_INTERLEAVE 614 608 * 615 609 * Returns the struct page for entry and addr, after queueing swapin. 616 610 * ··· 620 612 * because it doesn't cost us any seek time. We also make sure to queue 621 613 * the 'original' request together with the readahead ones... 622 614 * 623 - * This has been extended to use the NUMA policies from the mm triggering 624 - * the readahead. 625 - * 626 - * Caller must hold read mmap_lock if vmf->vma is not NULL. 615 + * Note: it is intentional that the same NUMA policy and interleave index 616 + * are used for every page of the readahead: neighbouring pages on swap 617 + * are fairly likely to have been swapped out from the same node. 627 618 */ 628 619 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 629 - struct vm_fault *vmf) 620 + struct mempolicy *mpol, pgoff_t ilx) 630 621 { 631 622 struct page *page; 632 623 unsigned long entry_offset = swp_offset(entry); ··· 636 629 struct blk_plug plug; 637 630 struct swap_iocb *splug = NULL; 638 631 bool page_allocated; 639 - struct vm_area_struct *vma = vmf->vma; 640 - unsigned long addr = vmf->address; 641 632 642 633 mask = swapin_nr_pages(offset) - 1; 643 634 if (!mask) ··· 653 648 for (offset = start_offset; offset <= end_offset ; offset++) { 654 649 /* Ok, do the async read-ahead now */ 655 650 page = __read_swap_cache_async( 656 - swp_entry(swp_type(entry), offset), 657 - gfp_mask, vma, addr, &page_allocated); 651 + swp_entry(swp_type(entry), offset), 652 + gfp_mask, mpol, ilx, &page_allocated); 658 653 if (!page) 659 654 continue; 660 655 if (page_allocated) { ··· 668 663 } 669 664 blk_finish_plug(&plug); 670 665 swap_read_unplug(splug); 671 - 672 666 lru_add_drain(); /* Push any new pages onto the LRU now */ 673 667 skip: 674 668 /* The page was likely read above, so no need for plugging here */ 675 - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); 669 + page = __read_swap_cache_async(entry, gfp_mask, 
mpol, ilx, 670 + &page_allocated); 671 + if (unlikely(page_allocated)) 672 + swap_readpage(page, false, NULL); 673 + return page; 676 674 } 677 675 678 676 int init_swap_address_space(unsigned int type, unsigned long nr_pages) ··· 773 765 774 766 /** 775 767 * swap_vma_readahead - swap in pages in hope we need them soon 776 - * @fentry: swap entry of this memory 768 + * @targ_entry: swap entry of the targeted memory 777 769 * @gfp_mask: memory allocation flags 770 + * @mpol: NUMA memory allocation policy to be applied 771 + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 778 772 * @vmf: fault information 779 773 * 780 774 * Returns the struct page for entry and addr, after queueing swapin. ··· 787 777 * Caller must hold read mmap_lock if vmf->vma is not NULL. 788 778 * 789 779 */ 790 - static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, 780 + static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, 781 + struct mempolicy *mpol, pgoff_t targ_ilx, 791 782 struct vm_fault *vmf) 792 783 { 793 784 struct blk_plug plug; 794 785 struct swap_iocb *splug = NULL; 795 - struct vm_area_struct *vma = vmf->vma; 796 786 struct page *page; 797 787 pte_t *pte = NULL, pentry; 798 788 unsigned long addr; 799 789 swp_entry_t entry; 790 + pgoff_t ilx; 800 791 unsigned int i; 801 792 bool page_allocated; 802 793 struct vma_swap_readahead ra_info = { ··· 809 798 goto skip; 810 799 811 800 addr = vmf->address - (ra_info.offset * PAGE_SIZE); 801 + ilx = targ_ilx - ra_info.offset; 812 802 813 803 blk_start_plug(&plug); 814 - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { 804 + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { 815 805 if (!pte++) { 816 806 pte = pte_offset_map(vmf->pmd, addr); 817 807 if (!pte) ··· 826 814 continue; 827 815 pte_unmap(pte); 828 816 pte = NULL; 829 - page = __read_swap_cache_async(entry, gfp_mask, vma, 830 - addr, &page_allocated); 817 + page = 
__read_swap_cache_async(entry, gfp_mask, mpol, ilx, 818 + &page_allocated); 831 819 if (!page) 832 820 continue; 833 821 if (page_allocated) { ··· 846 834 lru_add_drain(); 847 835 skip: 848 836 /* The page was likely read above, so no need for plugging here */ 849 - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, 850 - NULL); 837 + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, 838 + &page_allocated); 839 + if (unlikely(page_allocated)) 840 + swap_readpage(page, false, NULL); 841 + return page; 851 842 } 852 843 853 844 /** ··· 868 853 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 869 854 struct vm_fault *vmf) 870 855 { 871 - return swap_use_vma_readahead() ? 872 - swap_vma_readahead(entry, gfp_mask, vmf) : 873 - swap_cluster_readahead(entry, gfp_mask, vmf); 856 + struct mempolicy *mpol; 857 + pgoff_t ilx; 858 + struct page *page; 859 + 860 + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); 861 + page = swap_use_vma_readahead() ? 862 + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : 863 + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); 864 + mpol_cond_put(mpol); 865 + return page; 874 866 } 875 867 876 868 #ifdef CONFIG_SYSFS
+5 -2
mm/zswap.c
··· 24 24 #include <linux/swap.h> 25 25 #include <linux/crypto.h> 26 26 #include <linux/scatterlist.h> 27 + #include <linux/mempolicy.h> 27 28 #include <linux/mempool.h> 28 29 #include <linux/zpool.h> 29 30 #include <crypto/acompress.h> ··· 1058 1057 { 1059 1058 swp_entry_t swpentry = entry->swpentry; 1060 1059 struct page *page; 1060 + struct mempolicy *mpol; 1061 1061 struct scatterlist input, output; 1062 1062 struct crypto_acomp_ctx *acomp_ctx; 1063 1063 struct zpool *pool = zswap_find_zpool(entry); ··· 1077 1075 } 1078 1076 1079 1077 /* try to allocate swap cache page */ 1080 - page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, 1081 - &page_was_allocated); 1078 + mpol = get_task_policy(current); 1079 + page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, 1080 + NO_INTERLEAVE_INDEX, &page_was_allocated); 1082 1081 if (!page) { 1083 1082 ret = -ENOMEM; 1084 1083 goto fail;