[PATCH] Add tmpfs options for memory placement policies

Anything that writes into a tmpfs filesystem is liable to disproportionately
decrease the available memory on a particular node. Since there's no telling
what sort of application (e.g. dd/cp/cat) might be dropping large files
there, this lets the admin choose the appropriate default behavior for their
site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and
a second option to specify the nodelist for that policy. With the default
policy, tmpfs will behave as it does today. This patch adds support for
preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node
which is doing the writing. Some jobs expect a single process to create
and manage the tmpfs files. This results in a node which has a
significantly reduced number of free pages.

With this patch, the administrator can specify the policy and nodes for
that policy where they would prefer allocations.

This patch was originally written by Brent Casavant and Hugh Dickins. I
added support for the bind and preferred policies and the mpol_nodelist
mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Robin Holt and committed by Linus Torvalds (commit 7339ff83, parent 852cf918).

Overall diffstat: +75 -15
+12
Documentation/filesystems/tmpfs.txt
··· 78 that instance in a system with many cpus making intensive use of it. 79 80 81 To specify the initial root directory you can use the following mount 82 options: 83
··· 78 that instance in a system with many cpus making intensive use of it. 79 80 81 + tmpfs has a mount option to set the NUMA memory allocation policy for 82 + all files in that instance: 83 + mpol=interleave prefers to allocate memory from each node in turn 84 + mpol=default prefers to allocate memory from the local node 85 + mpol=bind prefers to allocate from mpol_nodelist 86 + mpol=preferred prefers to allocate from first node in mpol_nodelist 87 + 88 + The following mount option is used in conjunction with mpol=interleave, 89 + mpol=bind or mpol=preferred: 90 + mpol_nodelist: nodelist suitable for parsing with nodelist_parse. 91 + 92 + 93 To specify the initial root directory you can use the following mount 94 options: 95
+1 -1
fs/hugetlbfs/inode.c
··· 402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 404 info = HUGETLBFS_I(inode); 405 - mpol_shared_policy_init(&info->policy); 406 switch (mode & S_IFMT) { 407 default: 408 init_special_inode(inode, mode, dev);
··· 402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 404 info = HUGETLBFS_I(inode); 405 + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL); 406 switch (mode & S_IFMT) { 407 default: 408 init_special_inode(inode, mode, dev);
+4 -7
include/linux/mempolicy.h
··· 132 spinlock_t lock; 133 }; 134 135 - static inline void mpol_shared_policy_init(struct shared_policy *info) 136 - { 137 - info->root = RB_ROOT; 138 - spin_lock_init(&info->lock); 139 - } 140 - 141 int mpol_set_shared_policy(struct shared_policy *info, 142 struct vm_area_struct *vma, 143 struct mempolicy *new); ··· 207 return -EINVAL; 208 } 209 210 - static inline void mpol_shared_policy_init(struct shared_policy *info) 211 { 212 } 213
··· 132 spinlock_t lock; 133 }; 134 135 + void mpol_shared_policy_init(struct shared_policy *info, int policy, 136 + nodemask_t *nodes); 137 int mpol_set_shared_policy(struct shared_policy *info, 138 struct vm_area_struct *vma, 139 struct mempolicy *new); ··· 211 return -EINVAL; 212 } 213 214 + static inline void mpol_shared_policy_init(struct shared_policy *info, 215 + int policy, nodemask_t *nodes) 216 { 217 } 218
+2
include/linux/shmem_fs.h
··· 26 unsigned long free_blocks; /* How many are left for allocation */ 27 unsigned long max_inodes; /* How many inodes are allowed */ 28 unsigned long free_inodes; /* How many are left for allocation */ 29 spinlock_t stat_lock; 30 }; 31
··· 26 unsigned long free_blocks; /* How many are left for allocation */ 27 unsigned long max_inodes; /* How many inodes are allowed */ 28 unsigned long free_inodes; /* How many are left for allocation */ 29 + int policy; /* Default NUMA memory alloc policy */ 30 + nodemask_t policy_nodes; /* nodemask for preferred and bind */ 31 spinlock_t stat_lock; 32 }; 33
+24
mm/mempolicy.c
··· 1359 return 0; 1360 } 1361 1362 int mpol_set_shared_policy(struct shared_policy *info, 1363 struct vm_area_struct *vma, struct mempolicy *npol) 1364 {
··· 1359 return 0; 1360 } 1361 1362 + void mpol_shared_policy_init(struct shared_policy *info, int policy, 1363 + nodemask_t *policy_nodes) 1364 + { 1365 + info->root = RB_ROOT; 1366 + spin_lock_init(&info->lock); 1367 + 1368 + if (policy != MPOL_DEFAULT) { 1369 + struct mempolicy *newpol; 1370 + 1371 + /* Falls back to MPOL_DEFAULT on any error */ 1372 + newpol = mpol_new(policy, policy_nodes); 1373 + if (!IS_ERR(newpol)) { 1374 + /* Create pseudo-vma that contains just the policy */ 1375 + struct vm_area_struct pvma; 1376 + 1377 + memset(&pvma, 0, sizeof(struct vm_area_struct)); 1378 + /* Policy covers entire file */ 1379 + pvma.vm_end = TASK_SIZE; 1380 + mpol_set_shared_policy(info, &pvma, newpol); 1381 + mpol_free(newpol); 1382 + } 1383 + } 1384 + } 1385 + 1386 int mpol_set_shared_policy(struct shared_policy *info, 1387 struct vm_area_struct *vma, struct mempolicy *npol) 1388 {
+32 -7
mm/shmem.c
··· 1316 case S_IFREG: 1317 inode->i_op = &shmem_inode_operations; 1318 inode->i_fop = &shmem_file_operations; 1319 - mpol_shared_policy_init(&info->policy); 1320 break; 1321 case S_IFDIR: 1322 inode->i_nlink++; ··· 1331 * Must not load anything in the rbtree, 1332 * mpol_free_shared_policy will not be called. 1333 */ 1334 - mpol_shared_policy_init(&info->policy); 1335 break; 1336 } 1337 } else if (sbinfo->max_inodes) { ··· 1845 .put_link = shmem_put_link, 1846 }; 1847 1848 - static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) 1849 { 1850 char *this_char, *value, *rest; 1851 ··· 1901 *gid = simple_strtoul(value,&rest,0); 1902 if (*rest) 1903 goto bad_val; 1904 } else { 1905 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1906 this_char); ··· 1934 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1935 unsigned long max_blocks = sbinfo->max_blocks; 1936 unsigned long max_inodes = sbinfo->max_inodes; 1937 unsigned long blocks; 1938 unsigned long inodes; 1939 int error = -EINVAL; 1940 1941 - if (shmem_parse_options(data, NULL, NULL, NULL, 1942 - &max_blocks, &max_inodes)) 1943 return error; 1944 1945 spin_lock(&sbinfo->stat_lock); ··· 1967 sbinfo->free_blocks = max_blocks - blocks; 1968 sbinfo->max_inodes = max_inodes; 1969 sbinfo->free_inodes = max_inodes - inodes; 1970 out: 1971 spin_unlock(&sbinfo->stat_lock); 1972 return error; ··· 1993 struct shmem_sb_info *sbinfo; 1994 unsigned long blocks = 0; 1995 unsigned long inodes = 0; 1996 1997 #ifdef CONFIG_TMPFS 1998 /* ··· 2007 inodes = totalram_pages - totalhigh_pages; 2008 if (inodes > blocks) 2009 inodes = blocks; 2010 - if (shmem_parse_options(data, &mode, &uid, &gid, 2011 - &blocks, &inodes)) 2012 return -EINVAL; 2013 } 2014 #else ··· 2026 sbinfo->free_blocks = blocks; 2027 sbinfo->max_inodes = inodes; 2028 sbinfo->free_inodes = inodes; 2029 2030 sb->s_fs_info = sbinfo; 2031 sb->s_maxbytes = SHMEM_MAX_BYTES;
··· 1316 case S_IFREG: 1317 inode->i_op = &shmem_inode_operations; 1318 inode->i_fop = &shmem_file_operations; 1319 + mpol_shared_policy_init(&info->policy, sbinfo->policy, 1320 + &sbinfo->policy_nodes); 1321 break; 1322 case S_IFDIR: 1323 inode->i_nlink++; ··· 1330 * Must not load anything in the rbtree, 1331 * mpol_free_shared_policy will not be called. 1332 */ 1333 + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 1334 + NULL); 1335 break; 1336 } 1337 } else if (sbinfo->max_inodes) { ··· 1843 .put_link = shmem_put_link, 1844 }; 1845 1846 + static int shmem_parse_options(char *options, int *mode, uid_t *uid, 1847 + gid_t *gid, unsigned long *blocks, unsigned long *inodes, 1848 + int *policy, nodemask_t *policy_nodes) 1849 { 1850 char *this_char, *value, *rest; 1851 ··· 1897 *gid = simple_strtoul(value,&rest,0); 1898 if (*rest) 1899 goto bad_val; 1900 + } else if (!strcmp(this_char,"mpol")) { 1901 + if (!strcmp(value,"default")) 1902 + *policy = MPOL_DEFAULT; 1903 + else if (!strcmp(value,"preferred")) 1904 + *policy = MPOL_PREFERRED; 1905 + else if (!strcmp(value,"bind")) 1906 + *policy = MPOL_BIND; 1907 + else if (!strcmp(value,"interleave")) 1908 + *policy = MPOL_INTERLEAVE; 1909 + else 1910 + goto bad_val; 1911 + } else if (!strcmp(this_char,"mpol_nodelist")) { 1912 + nodelist_parse(value, *policy_nodes); 1913 } else { 1914 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1915 this_char); ··· 1917 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1918 unsigned long max_blocks = sbinfo->max_blocks; 1919 unsigned long max_inodes = sbinfo->max_inodes; 1920 + int policy = sbinfo->policy; 1921 + nodemask_t policy_nodes = sbinfo->policy_nodes; 1922 unsigned long blocks; 1923 unsigned long inodes; 1924 int error = -EINVAL; 1925 1926 + if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, 1927 + &max_inodes, &policy, &policy_nodes)) 1928 return error; 1929 1930 spin_lock(&sbinfo->stat_lock); ··· 1948 sbinfo->free_blocks = max_blocks - blocks; 1949 sbinfo->max_inodes = max_inodes; 1950 sbinfo->free_inodes = max_inodes - inodes; 1951 + sbinfo->policy = policy; 1952 + sbinfo->policy_nodes = policy_nodes; 1953 out: 1954 spin_unlock(&sbinfo->stat_lock); 1955 return error; ··· 1972 struct shmem_sb_info *sbinfo; 1973 unsigned long blocks = 0; 1974 unsigned long inodes = 0; 1975 + int policy = MPOL_DEFAULT; 1976 + nodemask_t policy_nodes = node_online_map; 1977 1978 #ifdef CONFIG_TMPFS 1979 /* ··· 1984 inodes = totalram_pages - totalhigh_pages; 1985 if (inodes > blocks) 1986 inodes = blocks; 1987 + if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, 1988 + &inodes, &policy, &policy_nodes)) 1989 return -EINVAL; 1990 } 1991 #else ··· 2003 sbinfo->free_blocks = blocks; 2004 sbinfo->max_inodes = inodes; 2005 sbinfo->free_inodes = inodes; 2006 + sbinfo->policy = policy; 2007 + sbinfo->policy_nodes = policy_nodes; 2008 2009 sb->s_fs_info = sbinfo; 2010 sb->s_maxbytes = SHMEM_MAX_BYTES;