[PATCH] Add tmpfs options for memory placement policies

Anything that writes into a tmpfs filesystem is liable to disproportionately
decrease the available memory on a particular node. Since there's no telling
what sort of application (e.g. dd/cp/cat) might be dropping large files
there, this patch lets the admin choose the appropriate default behavior for
their site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and
a second option to specify the nodelist for that policy. With the default
policy, tmpfs will behave as it does today. This patch adds support for
preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node
which is doing the writing. Some jobs expect a single process to create
and manage the tmpfs files. This results in that process's node having a
significantly reduced number of free pages.

With this patch, the administrator can specify the policy, and the nodes
for that policy, that they would prefer tmpfs allocations to follow.

This patch was originally written by Brent Casavant and Hugh Dickins. I
added support for the bind and preferred policies and the mpol_nodelist
mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Robin Holt and committed by Linus Torvalds 7339ff83 852cf918

+75 -15
+12
Documentation/filesystems/tmpfs.txt
··· 78 78 that instance in a system with many cpus making intensive use of it. 79 79 80 80 81 + tmpfs has a mount option to set the NUMA memory allocation policy for 82 + all files in that instance: 83 + mpol=interleave prefers to allocate memory from each node in turn 84 + mpol=default prefers to allocate memory from the local node 85 + mpol=bind prefers to allocate from mpol_nodelist 86 + mpol=preferred prefers to allocate from first node in mpol_nodelist 87 + 88 + The following mount option is used in conjunction with mpol=interleave, 89 + mpol=bind or mpol=preferred: 90 + mpol_nodelist: nodelist suitable for parsing with nodelist_parse. 91 + 92 + 81 93 To specify the initial root directory you can use the following mount 82 94 options: 83 95
+1 -1
fs/hugetlbfs/inode.c
··· 402 402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 403 403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 404 404 info = HUGETLBFS_I(inode); 405 - mpol_shared_policy_init(&info->policy); 405 + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL); 406 406 switch (mode & S_IFMT) { 407 407 default: 408 408 init_special_inode(inode, mode, dev);
+4 -7
include/linux/mempolicy.h
··· 132 132 spinlock_t lock; 133 133 }; 134 134 135 - static inline void mpol_shared_policy_init(struct shared_policy *info) 136 - { 137 - info->root = RB_ROOT; 138 - spin_lock_init(&info->lock); 139 - } 140 - 135 + void mpol_shared_policy_init(struct shared_policy *info, int policy, 136 + nodemask_t *nodes); 141 137 int mpol_set_shared_policy(struct shared_policy *info, 142 138 struct vm_area_struct *vma, 143 139 struct mempolicy *new); ··· 207 211 return -EINVAL; 208 212 } 209 213 210 - static inline void mpol_shared_policy_init(struct shared_policy *info) 214 + static inline void mpol_shared_policy_init(struct shared_policy *info, 215 + int policy, nodemask_t *nodes) 211 216 { 212 217 } 213 218
+2
include/linux/shmem_fs.h
··· 26 26 unsigned long free_blocks; /* How many are left for allocation */ 27 27 unsigned long max_inodes; /* How many inodes are allowed */ 28 28 unsigned long free_inodes; /* How many are left for allocation */ 29 + int policy; /* Default NUMA memory alloc policy */ 30 + nodemask_t policy_nodes; /* nodemask for preferred and bind */ 29 31 spinlock_t stat_lock; 30 32 }; 31 33
+24
mm/mempolicy.c
··· 1359 1359 return 0; 1360 1360 } 1361 1361 1362 + void mpol_shared_policy_init(struct shared_policy *info, int policy, 1363 + nodemask_t *policy_nodes) 1364 + { 1365 + info->root = RB_ROOT; 1366 + spin_lock_init(&info->lock); 1367 + 1368 + if (policy != MPOL_DEFAULT) { 1369 + struct mempolicy *newpol; 1370 + 1371 + /* Falls back to MPOL_DEFAULT on any error */ 1372 + newpol = mpol_new(policy, policy_nodes); 1373 + if (!IS_ERR(newpol)) { 1374 + /* Create pseudo-vma that contains just the policy */ 1375 + struct vm_area_struct pvma; 1376 + 1377 + memset(&pvma, 0, sizeof(struct vm_area_struct)); 1378 + /* Policy covers entire file */ 1379 + pvma.vm_end = TASK_SIZE; 1380 + mpol_set_shared_policy(info, &pvma, newpol); 1381 + mpol_free(newpol); 1382 + } 1383 + } 1384 + } 1385 + 1362 1386 int mpol_set_shared_policy(struct shared_policy *info, 1363 1387 struct vm_area_struct *vma, struct mempolicy *npol) 1364 1388 {
+32 -7
mm/shmem.c
··· 1316 1316 case S_IFREG: 1317 1317 inode->i_op = &shmem_inode_operations; 1318 1318 inode->i_fop = &shmem_file_operations; 1319 - mpol_shared_policy_init(&info->policy); 1319 + mpol_shared_policy_init(&info->policy, sbinfo->policy, 1320 + &sbinfo->policy_nodes); 1320 1321 break; 1321 1322 case S_IFDIR: 1322 1323 inode->i_nlink++; ··· 1331 1330 * Must not load anything in the rbtree, 1332 1331 * mpol_free_shared_policy will not be called. 1333 1332 */ 1334 - mpol_shared_policy_init(&info->policy); 1333 + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 1334 + NULL); 1335 1335 break; 1336 1336 } 1337 1337 } else if (sbinfo->max_inodes) { ··· 1845 1843 .put_link = shmem_put_link, 1846 1844 }; 1847 1845 1848 - static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) 1846 + static int shmem_parse_options(char *options, int *mode, uid_t *uid, 1847 + gid_t *gid, unsigned long *blocks, unsigned long *inodes, 1848 + int *policy, nodemask_t *policy_nodes) 1849 1849 { 1850 1850 char *this_char, *value, *rest; 1851 1851 ··· 1901 1897 *gid = simple_strtoul(value,&rest,0); 1902 1898 if (*rest) 1903 1899 goto bad_val; 1900 + } else if (!strcmp(this_char,"mpol")) { 1901 + if (!strcmp(value,"default")) 1902 + *policy = MPOL_DEFAULT; 1903 + else if (!strcmp(value,"preferred")) 1904 + *policy = MPOL_PREFERRED; 1905 + else if (!strcmp(value,"bind")) 1906 + *policy = MPOL_BIND; 1907 + else if (!strcmp(value,"interleave")) 1908 + *policy = MPOL_INTERLEAVE; 1909 + else 1910 + goto bad_val; 1911 + } else if (!strcmp(this_char,"mpol_nodelist")) { 1912 + nodelist_parse(value, *policy_nodes); 1904 1913 } else { 1905 1914 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1906 1915 this_char); ··· 1934 1917 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1935 1918 unsigned long max_blocks = sbinfo->max_blocks; 1936 1919 unsigned long max_inodes = sbinfo->max_inodes; 1920 + int policy = sbinfo->policy; 1921 + nodemask_t 
policy_nodes = sbinfo->policy_nodes; 1937 1922 unsigned long blocks; 1938 1923 unsigned long inodes; 1939 1924 int error = -EINVAL; 1940 1925 1941 - if (shmem_parse_options(data, NULL, NULL, NULL, 1942 - &max_blocks, &max_inodes)) 1926 + if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, 1927 + &max_inodes, &policy, &policy_nodes)) 1943 1928 return error; 1944 1929 1945 1930 spin_lock(&sbinfo->stat_lock); ··· 1967 1948 sbinfo->free_blocks = max_blocks - blocks; 1968 1949 sbinfo->max_inodes = max_inodes; 1969 1950 sbinfo->free_inodes = max_inodes - inodes; 1951 + sbinfo->policy = policy; 1952 + sbinfo->policy_nodes = policy_nodes; 1970 1953 out: 1971 1954 spin_unlock(&sbinfo->stat_lock); 1972 1955 return error; ··· 1993 1972 struct shmem_sb_info *sbinfo; 1994 1973 unsigned long blocks = 0; 1995 1974 unsigned long inodes = 0; 1975 + int policy = MPOL_DEFAULT; 1976 + nodemask_t policy_nodes = node_online_map; 1996 1977 1997 1978 #ifdef CONFIG_TMPFS 1998 1979 /* ··· 2007 1984 inodes = totalram_pages - totalhigh_pages; 2008 1985 if (inodes > blocks) 2009 1986 inodes = blocks; 2010 - if (shmem_parse_options(data, &mode, &uid, &gid, 2011 - &blocks, &inodes)) 1987 + if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, 1988 + &inodes, &policy, &policy_nodes)) 2012 1989 return -EINVAL; 2013 1990 } 2014 1991 #else ··· 2026 2003 sbinfo->free_blocks = blocks; 2027 2004 sbinfo->max_inodes = inodes; 2028 2005 sbinfo->free_inodes = inodes; 2006 + sbinfo->policy = policy; 2007 + sbinfo->policy_nodes = policy_nodes; 2029 2008 2030 2009 sb->s_fs_info = sbinfo; 2031 2010 sb->s_maxbytes = SHMEM_MAX_BYTES;