Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipc/msg: increase MSGMNI, remove scaling

SysV IPC can be abused to allocate locked kernel memory. For most systems, a
small limit doesn't make sense; see the discussion with regard to SHMMAX.

Therefore: increase MSGMNI to the maximum supported.

And: If we ignore the risk of locking too much memory, then an automatic
scaling of MSGMNI doesn't make sense. Therefore the logic can be removed.

The code preserves auto_msgmni to avoid breaking any user space applications
that expect that the value exists.

Notes:
1) If an administrator must limit the memory allocations, then he can set
MSGMNI as necessary.

Or he can disable sysv entirely (as e.g. done by Android).

2) MSGMAX and MSGMNB are intentionally not increased, as these values are used
to control latency vs. throughput:
If MSGMNB is large, then msgsnd() just returns and more messages can be queued
before a task switch to a task that calls msgrcv() is forced.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Manfred Spraul and committed by
Linus Torvalds
0050ee05 e843e7d2

+45 -298
+6 -4
Documentation/sysctl/kernel.txt
··· 116 116 117 117 auto_msgmni: 118 118 119 - Enables/Disables automatic recomputing of msgmni upon memory add/remove 120 - or upon ipc namespace creation/removal (see the msgmni description 121 - above). Echoing "1" into this file enables msgmni automatic recomputing. 122 - Echoing "0" turns it off. auto_msgmni default value is 1. 119 + This variable has no effect and may be removed in future kernel 120 + releases. Reading it always returns 0. 121 + Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni 122 + upon memory add/remove or upon ipc namespace creation/removal. 123 + Echoing "1" into this file enabled msgmni automatic recomputing. 124 + Echoing "0" turned it off. auto_msgmni default value was 1. 123 125 124 126 125 127 ==============================================================
-20
include/linux/ipc_namespace.h
··· 7 7 #include <linux/notifier.h> 8 8 #include <linux/nsproxy.h> 9 9 10 - /* 11 - * ipc namespace events 12 - */ 13 - #define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */ 14 - #define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */ 15 - #define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */ 16 - 17 - #define IPCNS_CALLBACK_PRI 0 18 - 19 10 struct user_namespace; 20 11 21 12 struct ipc_ids { ··· 29 38 unsigned int msg_ctlmni; 30 39 atomic_t msg_bytes; 31 40 atomic_t msg_hdrs; 32 - int auto_msgmni; 33 41 34 42 size_t shm_ctlmax; 35 43 size_t shm_ctlall; ··· 67 77 extern spinlock_t mq_lock; 68 78 69 79 #ifdef CONFIG_SYSVIPC 70 - extern int register_ipcns_notifier(struct ipc_namespace *); 71 - extern int cond_register_ipcns_notifier(struct ipc_namespace *); 72 - extern void unregister_ipcns_notifier(struct ipc_namespace *); 73 - extern int ipcns_notify(unsigned long); 74 80 extern void shm_destroy_orphaned(struct ipc_namespace *ns); 75 81 #else /* CONFIG_SYSVIPC */ 76 - static inline int register_ipcns_notifier(struct ipc_namespace *ns) 77 - { return 0; } 78 - static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) 79 - { return 0; } 80 - static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } 81 - static inline int ipcns_notify(unsigned long l) { return 0; } 82 82 static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} 83 83 #endif /* CONFIG_SYSVIPC */ 84 84
+20 -8
include/uapi/linux/msg.h
··· 51 51 }; 52 52 53 53 /* 54 - * Scaling factor to compute msgmni: 55 - * the memory dedicated to msg queues (msgmni * msgmnb) should occupy 56 - * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): 57 - * up to 8MB : msgmni = 16 (MSGMNI) 58 - * 4 GB : msgmni = 8K 59 - * more than 16 GB : msgmni = 32K (IPCMNI) 54 + * MSGMNI, MSGMAX and MSGMNB are default values which can be 55 + * modified by sysctl. 56 + * 57 + * MSGMNI is the upper limit for the number of message queues per 58 + * namespace. 59 + * It has been chosen to be as large as possible without facilitating 60 + * scenarios where userspace causes overflows when adjusting the limits via 61 + * operations of the form "retrieve current limit; add X; update limit". 62 + * 63 + * MSGMNB is the default size of a new message queue. Non-root tasks can 64 + * decrease the size with msgctl(IPC_SET), root tasks 65 + * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue 66 + * size. The optimal value is application dependent. 67 + * 16384 is used because it was always used (since 0.99.10) 68 + * 69 + * MSGMAX is the maximum size of an individual message, it's a global 70 + * (per-namespace) limit that applies for all message queues. 71 + * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into 72 + * the queue. This is also an arbitrary choice (since 2.6.0). 60 73 */ 61 - #define MSG_MEM_SCALE 32 62 74 63 - #define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ 75 + #define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */ 64 76 #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ 65 77 #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ 66 78
+1 -1
ipc/Makefile
··· 3 3 # 4 4 5 5 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o 6 - obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o 6 + obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o 7 7 obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o 8 8 obj_mq-$(CONFIG_COMPAT) += compat_mq.o 9 9 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
+17 -76
ipc/ipc_sysctl.c
··· 62 62 return err; 63 63 } 64 64 65 - static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, 66 - void __user *buffer, size_t *lenp, loff_t *ppos) 67 - { 68 - struct ctl_table ipc_table; 69 - size_t lenp_bef = *lenp; 70 - int rc; 71 - 72 - memcpy(&ipc_table, table, sizeof(ipc_table)); 73 - ipc_table.data = get_ipc(table); 74 - 75 - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); 76 - 77 - if (write && !rc && lenp_bef == *lenp) 78 - /* 79 - * Tunable has successfully been changed by hand. Disable its 80 - * automatic adjustment. This simply requires unregistering 81 - * the notifiers that trigger recalculation. 82 - */ 83 - unregister_ipcns_notifier(current->nsproxy->ipc_ns); 84 - 85 - return rc; 86 - } 87 - 88 65 static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, 89 66 void __user *buffer, size_t *lenp, loff_t *ppos) 90 67 { ··· 73 96 lenp, ppos); 74 97 } 75 98 76 - /* 77 - * Routine that is called when the file "auto_msgmni" has successfully been 78 - * written. 79 - * Two values are allowed: 80 - * 0: unregister msgmni's callback routine from the ipc namespace notifier 81 - * chain. This means that msgmni won't be recomputed anymore upon memory 82 - * add/remove or ipc namespace creation/removal. 83 - * 1: register back the callback routine. 84 - */ 85 - static void ipc_auto_callback(int val) 86 - { 87 - if (!val) 88 - unregister_ipcns_notifier(current->nsproxy->ipc_ns); 89 - else { 90 - /* 91 - * Re-enable automatic recomputing only if not already 92 - * enabled. 
93 - */ 94 - recompute_msgmni(current->nsproxy->ipc_ns); 95 - cond_register_ipcns_notifier(current->nsproxy->ipc_ns); 96 - } 97 - } 98 - 99 - static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, 99 + static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, 100 100 void __user *buffer, size_t *lenp, loff_t *ppos) 101 101 { 102 102 struct ctl_table ipc_table; 103 - int oldval; 104 - int rc; 103 + int dummy = 0; 105 104 106 105 memcpy(&ipc_table, table, sizeof(ipc_table)); 107 - ipc_table.data = get_ipc(table); 108 - oldval = *((int *)(ipc_table.data)); 106 + ipc_table.data = &dummy; 109 107 110 - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); 108 + if (write) 109 + pr_info_once("writing to auto_msgmni has no effect"); 111 110 112 - if (write && !rc) { 113 - int newval = *((int *)(ipc_table.data)); 114 - /* 115 - * The file "auto_msgmni" has correctly been set. 116 - * React by (un)registering the corresponding tunable, if the 117 - * value has changed. 
118 - */ 119 - if (newval != oldval) 120 - ipc_auto_callback(newval); 121 - } 122 - 123 - return rc; 111 + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); 124 112 } 125 113 126 114 #else ··· 93 151 #define proc_ipc_dointvec NULL 94 152 #define proc_ipc_dointvec_minmax NULL 95 153 #define proc_ipc_dointvec_minmax_orphans NULL 96 - #define proc_ipc_callback_dointvec_minmax NULL 97 - #define proc_ipcauto_dointvec_minmax NULL 154 + #define proc_ipc_auto_msgmni NULL 98 155 #endif 99 156 100 157 static int zero; ··· 145 204 .data = &init_ipc_ns.msg_ctlmni, 146 205 .maxlen = sizeof(init_ipc_ns.msg_ctlmni), 147 206 .mode = 0644, 148 - .proc_handler = proc_ipc_callback_dointvec_minmax, 207 + .proc_handler = proc_ipc_dointvec_minmax, 149 208 .extra1 = &zero, 150 209 .extra2 = &int_max, 210 + }, 211 + { 212 + .procname = "auto_msgmni", 213 + .data = NULL, 214 + .maxlen = sizeof(int), 215 + .mode = 0644, 216 + .proc_handler = proc_ipc_auto_msgmni, 217 + .extra1 = &zero, 218 + .extra2 = &one, 151 219 }, 152 220 { 153 221 .procname = "msgmnb", ··· 173 223 .maxlen = 4*sizeof(int), 174 224 .mode = 0644, 175 225 .proc_handler = proc_ipc_dointvec, 176 - }, 177 - { 178 - .procname = "auto_msgmni", 179 - .data = &init_ipc_ns.auto_msgmni, 180 - .maxlen = sizeof(int), 181 - .mode = 0644, 182 - .proc_handler = proc_ipcauto_dointvec_minmax, 183 - .extra1 = &zero, 184 - .extra2 = &one, 185 226 }, 186 227 #ifdef CONFIG_CHECKPOINT_RESTORE 187 228 {
-92
ipc/ipcns_notifier.c
··· 1 - /* 2 - * linux/ipc/ipcns_notifier.c 3 - * Copyright (C) 2007 BULL SA. Nadia Derbey 4 - * 5 - * Notification mechanism for ipc namespaces: 6 - * The callback routine registered in the memory chain invokes the ipcns 7 - * notifier chain with the IPCNS_MEMCHANGED event. 8 - * Each callback routine registered in the ipcns namespace recomputes msgmni 9 - * for the owning namespace. 10 - */ 11 - 12 - #include <linux/msg.h> 13 - #include <linux/rcupdate.h> 14 - #include <linux/notifier.h> 15 - #include <linux/nsproxy.h> 16 - #include <linux/ipc_namespace.h> 17 - 18 - #include "util.h" 19 - 20 - 21 - 22 - static BLOCKING_NOTIFIER_HEAD(ipcns_chain); 23 - 24 - 25 - static int ipcns_callback(struct notifier_block *self, 26 - unsigned long action, void *arg) 27 - { 28 - struct ipc_namespace *ns; 29 - 30 - switch (action) { 31 - case IPCNS_MEMCHANGED: /* amount of lowmem has changed */ 32 - case IPCNS_CREATED: 33 - case IPCNS_REMOVED: 34 - /* 35 - * It's time to recompute msgmni 36 - */ 37 - ns = container_of(self, struct ipc_namespace, ipcns_nb); 38 - /* 39 - * No need to get a reference on the ns: the 1st job of 40 - * free_ipc_ns() is to unregister the callback routine. 41 - * blocking_notifier_chain_unregister takes the wr lock to do 42 - * it. 43 - * When this callback routine is called the rd lock is held by 44 - * blocking_notifier_call_chain. 45 - * So the ipc ns cannot be freed while we are here. 
46 - */ 47 - recompute_msgmni(ns); 48 - break; 49 - default: 50 - break; 51 - } 52 - 53 - return NOTIFY_OK; 54 - } 55 - 56 - int register_ipcns_notifier(struct ipc_namespace *ns) 57 - { 58 - int rc; 59 - 60 - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); 61 - ns->ipcns_nb.notifier_call = ipcns_callback; 62 - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; 63 - rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); 64 - if (!rc) 65 - ns->auto_msgmni = 1; 66 - return rc; 67 - } 68 - 69 - int cond_register_ipcns_notifier(struct ipc_namespace *ns) 70 - { 71 - int rc; 72 - 73 - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); 74 - ns->ipcns_nb.notifier_call = ipcns_callback; 75 - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; 76 - rc = blocking_notifier_chain_cond_register(&ipcns_chain, 77 - &ns->ipcns_nb); 78 - if (!rc) 79 - ns->auto_msgmni = 1; 80 - return rc; 81 - } 82 - 83 - void unregister_ipcns_notifier(struct ipc_namespace *ns) 84 - { 85 - blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); 86 - ns->auto_msgmni = 0; 87 - } 88 - 89 - int ipcns_notify(unsigned long val) 90 - { 91 - return blocking_notifier_call_chain(&ipcns_chain, val, NULL); 92 - }
+1 -35
ipc/msg.c
··· 989 989 return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); 990 990 } 991 991 992 - /* 993 - * Scale msgmni with the available lowmem size: the memory dedicated to msg 994 - * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. 995 - * Also take into account the number of nsproxies created so far. 996 - * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. 997 - */ 998 - void recompute_msgmni(struct ipc_namespace *ns) 999 - { 1000 - struct sysinfo i; 1001 - unsigned long allowed; 1002 - int nb_ns; 1003 - 1004 - si_meminfo(&i); 1005 - allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) 1006 - / MSGMNB; 1007 - nb_ns = atomic_read(&nr_ipc_ns); 1008 - allowed /= nb_ns; 1009 - 1010 - if (allowed < MSGMNI) { 1011 - ns->msg_ctlmni = MSGMNI; 1012 - return; 1013 - } 1014 - 1015 - if (allowed > IPCMNI / nb_ns) { 1016 - ns->msg_ctlmni = IPCMNI / nb_ns; 1017 - return; 1018 - } 1019 - 1020 - ns->msg_ctlmni = allowed; 1021 - } 1022 992 1023 993 void msg_init_ns(struct ipc_namespace *ns) 1024 994 { 1025 995 ns->msg_ctlmax = MSGMAX; 1026 996 ns->msg_ctlmnb = MSGMNB; 1027 - 1028 - recompute_msgmni(ns); 997 + ns->msg_ctlmni = MSGMNI; 1029 998 1030 999 atomic_set(&ns->msg_bytes, 0); 1031 1000 atomic_set(&ns->msg_hdrs, 0); ··· 1037 1068 void __init msg_init(void) 1038 1069 { 1039 1070 msg_init_ns(&init_ipc_ns); 1040 - 1041 - printk(KERN_INFO "msgmni has been set to %d\n", 1042 - init_ipc_ns.msg_ctlmni); 1043 1071 1044 1072 ipc_init_proc_interface("sysvipc/msg", 1045 1073 " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
-22
ipc/namespace.c
··· 45 45 msg_init_ns(ns); 46 46 shm_init_ns(ns); 47 47 48 - /* 49 - * msgmni has already been computed for the new ipc ns. 50 - * Thus, do the ipcns creation notification before registering that 51 - * new ipcns in the chain. 52 - */ 53 - ipcns_notify(IPCNS_CREATED); 54 - register_ipcns_notifier(ns); 55 - 56 48 ns->user_ns = get_user_ns(user_ns); 57 49 58 50 return ns; ··· 91 99 92 100 static void free_ipc_ns(struct ipc_namespace *ns) 93 101 { 94 - /* 95 - * Unregistering the hotplug notifier at the beginning guarantees 96 - * that the ipc namespace won't be freed while we are inside the 97 - * callback routine. Since the blocking_notifier_chain_XXX routines 98 - * hold a rw lock on the notifier list, unregister_ipcns_notifier() 99 - * won't take the rw lock before blocking_notifier_call_chain() has 100 - * released the rd lock. 101 - */ 102 - unregister_ipcns_notifier(ns); 103 102 sem_exit_ns(ns); 104 103 msg_exit_ns(ns); 105 104 shm_exit_ns(ns); 106 105 atomic_dec(&nr_ipc_ns); 107 106 108 - /* 109 - * Do the ipcns removal notification after decrementing nr_ipc_ns in 110 - * order to have a correct value when recomputing msgmni. 111 - */ 112 - ipcns_notify(IPCNS_REMOVED); 113 107 put_user_ns(ns->user_ns); 114 108 proc_free_inum(ns->proc_inum); 115 109 kfree(ns);
-40
ipc/util.c
··· 71 71 int (*show)(struct seq_file *, void *); 72 72 }; 73 73 74 - static void ipc_memory_notifier(struct work_struct *work) 75 - { 76 - ipcns_notify(IPCNS_MEMCHANGED); 77 - } 78 - 79 - static int ipc_memory_callback(struct notifier_block *self, 80 - unsigned long action, void *arg) 81 - { 82 - static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier); 83 - 84 - switch (action) { 85 - case MEM_ONLINE: /* memory successfully brought online */ 86 - case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */ 87 - /* 88 - * This is done by invoking the ipcns notifier chain with the 89 - * IPC_MEMCHANGED event. 90 - * In order not to keep the lock on the hotplug memory chain 91 - * for too long, queue a work item that will, when waken up, 92 - * activate the ipcns notification chain. 93 - */ 94 - schedule_work(&ipc_memory_wq); 95 - break; 96 - case MEM_GOING_ONLINE: 97 - case MEM_GOING_OFFLINE: 98 - case MEM_CANCEL_ONLINE: 99 - case MEM_CANCEL_OFFLINE: 100 - default: 101 - break; 102 - } 103 - 104 - return NOTIFY_OK; 105 - } 106 - 107 - static struct notifier_block ipc_memory_nb = { 108 - .notifier_call = ipc_memory_callback, 109 - .priority = IPC_CALLBACK_PRI, 110 - }; 111 - 112 74 /** 113 75 * ipc_init - initialise ipc subsystem 114 76 * ··· 86 124 sem_init(); 87 125 msg_init(); 88 126 shm_init(); 89 - register_hotmemory_notifier(&ipc_memory_nb); 90 - register_ipcns_notifier(&init_ipc_ns); 91 127 return 0; 92 128 } 93 129 device_initcall(ipc_init);