Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB

There was some desire in large applications using MAP_HUGETLB or
SHM_HUGETLB to use 1GB huge pages on some mappings, and stay with 2MB on
others. This is useful together with NUMA policy: use 2MB interleaving
on some mappings, but 1GB on local mappings.

This patch extends the IPC/SHM syscall interfaces slightly to allow
specifying the page size.

It borrows some upper bits in the existing flag arguments and allows
encoding the log of the desired page size in addition to the *_HUGETLB
flag. When 0 is specified the default size is used; this makes the
change fully compatible.

Extending the internal hugetlb code to handle this is straightforward.
Instead of a single mount it just keeps an array of them and selects the
right mount based on the specified page size. When no page size is
specified it uses the mount of the default page size.

The change is not visible in /proc/mounts because internal mounts don't
appear there. It also has very little overhead: the additional mounts
just consume a super block, but not more memory when not used.

I also exported the new flags to the user headers (they were previously
under __KERNEL__). Right now only symbols for x86 and some other
architectures for 1GB and 2MB are defined. The interface should already
work for all other architectures though. Only architectures that define
multiple hugetlb sizes actually need it (that is currently x86, tile,
powerpc). However tile and powerpc have user configurable hugetlb
sizes, so it's not easy to add defines. A program on those
architectures would need to query sysfs and use the appropriate log2.

[akpm@linux-foundation.org: cleanups]
[rientjes@google.com: fix build]
[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hillf Danton <dhillf@gmail.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Andi Kleen and committed by
Linus Torvalds
42d7395f ff604cf6

+135 -18
+11
arch/alpha/include/asm/mman.h
··· 63 63 /* compatibility flags */ 64 64 #define MAP_FILE 0 65 65 66 + /* 67 + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 68 + * This gives us 6 bits, which is enough until someone invents 128 bit address 69 + * spaces. 70 + * 71 + * Assume these are all power of twos. 72 + * When 0 use the default page size. 73 + */ 74 + #define MAP_HUGE_SHIFT 26 75 + #define MAP_HUGE_MASK 0x3f 76 + 66 77 #endif /* __ALPHA_MMAN_H__ */
+11
arch/mips/include/uapi/asm/mman.h
··· 87 87 /* compatibility flags */ 88 88 #define MAP_FILE 0 89 89 90 + /* 91 + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 92 + * This gives us 6 bits, which is enough until someone invents 128 bit address 93 + * spaces. 94 + * 95 + * Assume these are all power of twos. 96 + * When 0 use the default page size. 97 + */ 98 + #define MAP_HUGE_SHIFT 26 99 + #define MAP_HUGE_MASK 0x3f 100 + 90 101 #endif /* _ASM_MMAN_H */
+11
arch/parisc/include/uapi/asm/mman.h
··· 70 70 #define MAP_FILE 0 71 71 #define MAP_VARIABLE 0 72 72 73 + /* 74 + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 75 + * This gives us 6 bits, which is enough until someone invents 128 bit address 76 + * spaces. 77 + * 78 + * Assume these are all power of twos. 79 + * When 0 use the default page size. 80 + */ 81 + #define MAP_HUGE_SHIFT 26 82 + #define MAP_HUGE_MASK 0x3f 83 + 73 84 #endif /* __PARISC_MMAN_H__ */
+3
arch/x86/include/asm/mman.h
··· 3 3 4 4 #define MAP_32BIT 0x40 /* only give out 32bit addresses */ 5 5 6 + #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) 7 + #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) 8 + 6 9 #include <asm-generic/mman.h> 7 10 8 11 #endif /* _ASM_X86_MMAN_H */
+11
arch/xtensa/include/uapi/asm/mman.h
··· 93 93 /* compatibility flags */ 94 94 #define MAP_FILE 0 95 95 96 + /* 97 + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 98 + * This gives us 6 bits, which is enough until someone invents 128 bit address 99 + * spaces. 100 + * 101 + * Assume these are all power of twos. 102 + * When 0 use the default page size. 103 + */ 104 + #define MAP_HUGE_SHIFT 26 105 + #define MAP_HUGE_MASK 0x3f 106 + 96 107 #endif /* _XTENSA_MMAN_H */
+50 -13
fs/hugetlbfs/inode.c
··· 923 923 .kill_sb = kill_litter_super, 924 924 }; 925 925 926 - static struct vfsmount *hugetlbfs_vfsmount; 926 + static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 927 927 928 928 static int can_do_hugetlb_shm(void) 929 929 { ··· 932 932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 933 933 } 934 934 935 + static int get_hstate_idx(int page_size_log) 936 + { 937 + struct hstate *h; 938 + 939 + if (!page_size_log) 940 + return default_hstate_idx; 941 + h = size_to_hstate(1 << page_size_log); 942 + if (!h) 943 + return -1; 944 + return h - hstates; 945 + } 946 + 935 947 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 936 948 size_t size, vm_flags_t acctflag, 937 - struct user_struct **user, int creat_flags) 949 + struct user_struct **user, 950 + int creat_flags, int page_size_log) 938 951 { 939 952 int error = -ENOMEM; 940 953 struct file *file; ··· 957 944 struct qstr quick_string; 958 945 struct hstate *hstate; 959 946 unsigned long num_pages; 947 + int hstate_idx; 948 + 949 + hstate_idx = get_hstate_idx(page_size_log); 950 + if (hstate_idx < 0) 951 + return ERR_PTR(-ENODEV); 960 952 961 953 *user = NULL; 962 - if (!hugetlbfs_vfsmount) 954 + if (!hugetlbfs_vfsmount[hstate_idx]) 963 955 return ERR_PTR(-ENOENT); 964 956 965 957 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { ··· 981 963 } 982 964 } 983 965 984 - root = hugetlbfs_vfsmount->mnt_root; 966 + root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; 985 967 quick_string.name = name; 986 968 quick_string.len = strlen(quick_string.name); 987 969 quick_string.hash = 0; ··· 989 971 if (!path.dentry) 990 972 goto out_shm_unlock; 991 973 992 - path.mnt = mntget(hugetlbfs_vfsmount); 974 + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); 993 975 error = -ENOSPC; 994 976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 995 977 if (!inode) ··· 1029 1011 1030 1012 static int __init init_hugetlbfs_fs(void) 1031 1013 { 1014 + struct hstate 
*h; 1032 1015 int error; 1033 - struct vfsmount *vfsmount; 1016 + int i; 1034 1017 1035 1018 error = bdi_init(&hugetlbfs_backing_dev_info); 1036 1019 if (error) ··· 1048 1029 if (error) 1049 1030 goto out; 1050 1031 1051 - vfsmount = kern_mount(&hugetlbfs_fs_type); 1032 + i = 0; 1033 + for_each_hstate(h) { 1034 + char buf[50]; 1035 + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); 1052 1036 1053 - if (!IS_ERR(vfsmount)) { 1054 - hugetlbfs_vfsmount = vfsmount; 1055 - return 0; 1037 + snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); 1038 + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, 1039 + buf); 1040 + 1041 + if (IS_ERR(hugetlbfs_vfsmount[i])) { 1042 + pr_err("hugetlb: Cannot mount internal hugetlbfs for " 1043 + "page size %uK", ps_kb); 1044 + error = PTR_ERR(hugetlbfs_vfsmount[i]); 1045 + hugetlbfs_vfsmount[i] = NULL; 1046 + } 1047 + i++; 1056 1048 } 1057 - 1058 - error = PTR_ERR(vfsmount); 1049 + /* Non default hstates are optional */ 1050 + if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) 1051 + return 0; 1059 1052 1060 1053 out: 1061 1054 kmem_cache_destroy(hugetlbfs_inode_cachep); ··· 1078 1047 1079 1048 static void __exit exit_hugetlbfs_fs(void) 1080 1049 { 1050 + struct hstate *h; 1051 + int i; 1052 + 1053 + 1081 1054 /* 1082 1055 * Make sure all delayed rcu free inodes are flushed before we 1083 1056 * destroy cache. 1084 1057 */ 1085 1058 rcu_barrier(); 1086 1059 kmem_cache_destroy(hugetlbfs_inode_cachep); 1087 - kern_unmount(hugetlbfs_vfsmount); 1060 + i = 0; 1061 + for_each_hstate(h) 1062 + kern_unmount(hugetlbfs_vfsmount[i++]); 1088 1063 unregister_filesystem(&hugetlbfs_fs_type); 1089 1064 bdi_destroy(&hugetlbfs_backing_dev_info); 1090 1065 }
+5 -2
include/linux/hugetlb.h
··· 183 183 extern const struct vm_operations_struct hugetlb_vm_ops; 184 184 struct file *hugetlb_file_setup(const char *name, unsigned long addr, 185 185 size_t size, vm_flags_t acct, 186 - struct user_struct **user, int creat_flags); 186 + struct user_struct **user, int creat_flags, 187 + int page_size_log); 187 188 188 189 static inline int is_file_hugepages(struct file *file) 189 190 { ··· 196 195 return 0; 197 196 } 198 197 198 + 199 199 #else /* !CONFIG_HUGETLBFS */ 200 200 201 201 #define is_file_hugepages(file) 0 202 202 static inline struct file * 203 203 hugetlb_file_setup(const char *name, unsigned long addr, size_t size, 204 - vm_flags_t acctflag, struct user_struct **user, int creat_flags) 204 + vm_flags_t acctflag, struct user_struct **user, int creat_flags, 205 + int page_size_log) 205 206 { 206 207 return ERR_PTR(-ENOSYS); 207 208 }
+15
include/linux/shm.h
··· 29 29 #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ 30 30 #define SHM_NORESERVE 010000 /* don't check for reservations */ 31 31 32 + /* Bits [26:31] are reserved */ 33 + 34 + /* 35 + * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 36 + * This gives us 6 bits, which is enough until someone invents 128 bit address 37 + * spaces. 38 + * 39 + * Assume these are all power of twos. 40 + * When 0 use the default page size. 41 + */ 42 + #define SHM_HUGE_SHIFT 26 43 + #define SHM_HUGE_MASK 0x3f 44 + #define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) 45 + #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) 46 + 32 47 #ifdef CONFIG_SYSVIPC 33 48 long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, 34 49 unsigned long shmlba);
+11
include/uapi/asm-generic/mman-common.h
··· 55 55 /* compatibility flags */ 56 56 #define MAP_FILE 0 57 57 58 + /* 59 + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. 60 + * This gives us 6 bits, which is enough until someone invents 128 bit address 61 + * spaces. 62 + * 63 + * Assume these are all power of twos. 64 + * When 0 use the default page size. 65 + */ 66 + #define MAP_HUGE_SHIFT 26 67 + #define MAP_HUGE_MASK 0x3f 68 + 58 69 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
+2
include/uapi/asm-generic/mman.h
··· 13 13 #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ 14 14 #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ 15 15 16 + /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ 17 + 16 18 #define MCL_CURRENT 1 /* lock all current mappings */ 17 19 #define MCL_FUTURE 2 /* lock all future mappings */ 18 20
+2 -1
ipc/shm.c
··· 495 495 if (shmflg & SHM_NORESERVE) 496 496 acctflag = VM_NORESERVE; 497 497 file = hugetlb_file_setup(name, 0, size, acctflag, 498 - &shp->mlock_user, HUGETLB_SHMFS_INODE); 498 + &shp->mlock_user, HUGETLB_SHMFS_INODE, 499 + (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); 499 500 } else { 500 501 /* 501 502 * Do not allow no accounting for OVERCOMMIT_NEVER, even
+3 -2
mm/mmap.c
··· 1153 1153 * memory so no accounting is necessary 1154 1154 */ 1155 1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1156 - VM_NORESERVE, &user, 1157 - HUGETLB_ANONHUGE_INODE); 1156 + VM_NORESERVE, 1157 + &user, HUGETLB_ANONHUGE_INODE, 1158 + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1158 1159 if (IS_ERR(file)) 1159 1160 return PTR_ERR(file); 1160 1161 }