Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cgroup: Merge branch 'memcg_event' into for-3.14

Merge the v3.12-based patch series that moves the cgroup_event
implementation to memcg into for-3.14. The following two commits cause
a conflict in kernel/cgroup.c:

2ff2a7d03bbe4 ("cgroup: kill css_id")
79bd9814e5ec9 ("cgroup, memcg: move cgroup_event implementation to memcg")

Each patch removes a struct definition from kernel/cgroup.c. As the
two are adjacent, they cause a context conflict, which is easily
resolved by removing both structs.

Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo edab9510 e5fca243

+335 -360
-20
Documentation/cgroups/cgroups.txt
··· 24 24 2.1 Basic Usage 25 25 2.2 Attaching processes 26 26 2.3 Mounting hierarchies by name 27 - 2.4 Notification API 28 27 3. Kernel API 29 28 3.1 Overview 30 29 3.2 Synchronization ··· 471 472 The name of the subsystem appears as part of the hierarchy description 472 473 in /proc/mounts and /proc/<pid>/cgroups. 473 474 474 - 2.4 Notification API 475 - -------------------- 476 - 477 - There is mechanism which allows to get notifications about changing 478 - status of a cgroup. 479 - 480 - To register a new notification handler you need to: 481 - - create a file descriptor for event notification using eventfd(2); 482 - - open a control file to be monitored (e.g. memory.usage_in_bytes); 483 - - write "<event_fd> <control_fd> <args>" to cgroup.event_control. 484 - Interpretation of args is defined by control file implementation; 485 - 486 - eventfd will be woken up by control file implementation or when the 487 - cgroup is removed. 488 - 489 - To unregister a notification handler just close eventfd. 490 - 491 - NOTE: Support of notifications should be implemented for the control 492 - file. See documentation for the subsystem. 493 475 494 476 3. Kernel API 495 477 =============
-24
include/linux/cgroup.h
··· 29 29 struct inode; 30 30 struct cgroup; 31 31 struct css_id; 32 - struct eventfd_ctx; 33 32 34 33 extern int cgroup_init_early(void); 35 34 extern int cgroup_init(void); ··· 237 238 /* For css percpu_ref killing and RCU-protected deletion */ 238 239 struct rcu_head rcu_head; 239 240 struct work_struct destroy_work; 240 - 241 - /* List of events which userspace want to receive */ 242 - struct list_head event_list; 243 - spinlock_t event_list_lock; 244 241 245 242 /* directory xattrs */ 246 243 struct simple_xattrs xattrs; ··· 501 506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 502 507 503 508 int (*release)(struct inode *inode, struct file *file); 504 - 505 - /* 506 - * register_event() callback will be used to add new userspace 507 - * waiter for changes related to the cftype. Implement it if 508 - * you want to provide this functionality. Use eventfd_signal() 509 - * on eventfd to send notification to userspace. 510 - */ 511 - int (*register_event)(struct cgroup_subsys_state *css, 512 - struct cftype *cft, struct eventfd_ctx *eventfd, 513 - const char *args); 514 - /* 515 - * unregister_event() callback will be called when userspace 516 - * closes the eventfd or on cgroup removing. 517 - * This callback must be implemented, if you want provide 518 - * notification functionality. 519 - */ 520 - void (*unregister_event)(struct cgroup_subsys_state *css, 521 - struct cftype *cft, 522 - struct eventfd_ctx *eventfd); 523 509 }; 524 510 525 511 /*
+3 -5
include/linux/vmpressure.h
··· 7 7 #include <linux/gfp.h> 8 8 #include <linux/types.h> 9 9 #include <linux/cgroup.h> 10 + #include <linux/eventfd.h> 10 11 11 12 struct vmpressure { 12 13 unsigned long scanned; ··· 34 33 extern void vmpressure_cleanup(struct vmpressure *vmpr); 35 34 extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 36 35 extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 37 - extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 38 - extern int vmpressure_register_event(struct cgroup_subsys_state *css, 39 - struct cftype *cft, 36 + extern int vmpressure_register_event(struct mem_cgroup *memcg, 40 37 struct eventfd_ctx *eventfd, 41 38 const char *args); 42 - extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, 43 - struct cftype *cft, 39 + extern void vmpressure_unregister_event(struct mem_cgroup *memcg, 44 40 struct eventfd_ctx *eventfd); 45 41 #else 46 42 static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+1 -2
init/Kconfig
··· 848 848 849 849 menuconfig CGROUPS 850 850 boolean "Control Group support" 851 - depends on EVENTFD 852 851 help 853 852 This option adds support for grouping sets of processes together, for 854 853 use with process control subsystems such as Cpusets, CFS, memory ··· 914 915 bool "Memory Resource Controller for Control Groups" 915 916 depends on RESOURCE_COUNTERS 916 917 select MM_OWNER 918 + select EVENTFD 917 919 help 918 920 Provides a memory resource controller that manages both anonymous 919 921 memory and page cache. (See Documentation/cgroups/memory.txt) ··· 1154 1154 1155 1155 config SCHED_AUTOGROUP 1156 1156 bool "Automatic process group scheduling" 1157 - select EVENTFD 1158 1157 select CGROUPS 1159 1158 select CGROUP_SCHED 1160 1159 select FAIR_GROUP_SCHED
-259
kernel/cgroup.c
··· 56 56 #include <linux/pid_namespace.h> 57 57 #include <linux/idr.h> 58 58 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59 - #include <linux/eventfd.h> 60 - #include <linux/poll.h> 61 59 #include <linux/flex_array.h> /* used in cgroup_attach_task */ 62 60 #include <linux/kthread.h> 63 - #include <linux/file.h> 64 61 65 62 #include <linux/atomic.h> 66 63 ··· 127 130 128 131 /* file xattrs */ 129 132 struct simple_xattrs xattrs; 130 - }; 131 - 132 - /* 133 - * cgroup_event represents events which userspace want to receive. 134 - */ 135 - struct cgroup_event { 136 - /* 137 - * css which the event belongs to. 138 - */ 139 - struct cgroup_subsys_state *css; 140 - /* 141 - * Control file which the event associated. 142 - */ 143 - struct cftype *cft; 144 - /* 145 - * eventfd to signal userspace about the event. 146 - */ 147 - struct eventfd_ctx *eventfd; 148 - /* 149 - * Each of these stored in a list by the cgroup. 150 - */ 151 - struct list_head list; 152 - /* 153 - * All fields below needed to unregister event when 154 - * userspace closes eventfd. 
155 - */ 156 - poll_table pt; 157 - wait_queue_head_t *wqh; 158 - wait_queue_t wait; 159 - struct work_struct remove; 160 133 }; 161 134 162 135 /* The list of hierarchy roots */ ··· 1318 1351 INIT_LIST_HEAD(&cgrp->pidlists); 1319 1352 mutex_init(&cgrp->pidlist_mutex); 1320 1353 cgrp->dummy_css.cgroup = cgrp; 1321 - INIT_LIST_HEAD(&cgrp->event_list); 1322 - spin_lock_init(&cgrp->event_list_lock); 1323 1354 simple_xattrs_init(&cgrp->xattrs); 1324 1355 } 1325 1356 ··· 2590 2625 .listxattr = cgroup_listxattr, 2591 2626 .removexattr = cgroup_removexattr, 2592 2627 }; 2593 - 2594 - /* 2595 - * Check if a file is a control file 2596 - */ 2597 - static inline struct cftype *__file_cft(struct file *file) 2598 - { 2599 - if (file_inode(file)->i_fop != &cgroup_file_operations) 2600 - return ERR_PTR(-EINVAL); 2601 - return __d_cft(file->f_dentry); 2602 - } 2603 2628 2604 2629 static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2605 2630 struct super_block *sb) ··· 3870 3915 deactivate_super(sb); 3871 3916 } 3872 3917 3873 - /* 3874 - * Unregister event and free resources. 3875 - * 3876 - * Gets called from workqueue. 3877 - */ 3878 - static void cgroup_event_remove(struct work_struct *work) 3879 - { 3880 - struct cgroup_event *event = container_of(work, struct cgroup_event, 3881 - remove); 3882 - struct cgroup_subsys_state *css = event->css; 3883 - 3884 - remove_wait_queue(event->wqh, &event->wait); 3885 - 3886 - event->cft->unregister_event(css, event->cft, event->eventfd); 3887 - 3888 - /* Notify userspace the event is going away. */ 3889 - eventfd_signal(event->eventfd, 1); 3890 - 3891 - eventfd_ctx_put(event->eventfd); 3892 - kfree(event); 3893 - css_put(css); 3894 - } 3895 - 3896 - /* 3897 - * Gets called on POLLHUP on eventfd when user closes it. 3898 - * 3899 - * Called with wqh->lock held and interrupts disabled. 
3900 - */ 3901 - static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 3902 - int sync, void *key) 3903 - { 3904 - struct cgroup_event *event = container_of(wait, 3905 - struct cgroup_event, wait); 3906 - struct cgroup *cgrp = event->css->cgroup; 3907 - unsigned long flags = (unsigned long)key; 3908 - 3909 - if (flags & POLLHUP) { 3910 - /* 3911 - * If the event has been detached at cgroup removal, we 3912 - * can simply return knowing the other side will cleanup 3913 - * for us. 3914 - * 3915 - * We can't race against event freeing since the other 3916 - * side will require wqh->lock via remove_wait_queue(), 3917 - * which we hold. 3918 - */ 3919 - spin_lock(&cgrp->event_list_lock); 3920 - if (!list_empty(&event->list)) { 3921 - list_del_init(&event->list); 3922 - /* 3923 - * We are in atomic context, but cgroup_event_remove() 3924 - * may sleep, so we have to call it in workqueue. 3925 - */ 3926 - schedule_work(&event->remove); 3927 - } 3928 - spin_unlock(&cgrp->event_list_lock); 3929 - } 3930 - 3931 - return 0; 3932 - } 3933 - 3934 - static void cgroup_event_ptable_queue_proc(struct file *file, 3935 - wait_queue_head_t *wqh, poll_table *pt) 3936 - { 3937 - struct cgroup_event *event = container_of(pt, 3938 - struct cgroup_event, pt); 3939 - 3940 - event->wqh = wqh; 3941 - add_wait_queue(wqh, &event->wait); 3942 - } 3943 - 3944 - /* 3945 - * Parse input and register new cgroup event handler. 3946 - * 3947 - * Input must be in format '<event_fd> <control_fd> <args>'. 3948 - * Interpretation of args is defined by control file implementation. 
3949 - */ 3950 - static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, 3951 - struct cftype *cft, const char *buffer) 3952 - { 3953 - struct cgroup *cgrp = dummy_css->cgroup; 3954 - struct cgroup_event *event; 3955 - struct cgroup_subsys_state *cfile_css; 3956 - unsigned int efd, cfd; 3957 - struct fd efile; 3958 - struct fd cfile; 3959 - char *endp; 3960 - int ret; 3961 - 3962 - efd = simple_strtoul(buffer, &endp, 10); 3963 - if (*endp != ' ') 3964 - return -EINVAL; 3965 - buffer = endp + 1; 3966 - 3967 - cfd = simple_strtoul(buffer, &endp, 10); 3968 - if ((*endp != ' ') && (*endp != '\0')) 3969 - return -EINVAL; 3970 - buffer = endp + 1; 3971 - 3972 - event = kzalloc(sizeof(*event), GFP_KERNEL); 3973 - if (!event) 3974 - return -ENOMEM; 3975 - 3976 - INIT_LIST_HEAD(&event->list); 3977 - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 3978 - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 3979 - INIT_WORK(&event->remove, cgroup_event_remove); 3980 - 3981 - efile = fdget(efd); 3982 - if (!efile.file) { 3983 - ret = -EBADF; 3984 - goto out_kfree; 3985 - } 3986 - 3987 - event->eventfd = eventfd_ctx_fileget(efile.file); 3988 - if (IS_ERR(event->eventfd)) { 3989 - ret = PTR_ERR(event->eventfd); 3990 - goto out_put_efile; 3991 - } 3992 - 3993 - cfile = fdget(cfd); 3994 - if (!cfile.file) { 3995 - ret = -EBADF; 3996 - goto out_put_eventfd; 3997 - } 3998 - 3999 - /* the process need read permission on control file */ 4000 - /* AV: shouldn't we check that it's been opened for read instead? 
*/ 4001 - ret = inode_permission(file_inode(cfile.file), MAY_READ); 4002 - if (ret < 0) 4003 - goto out_put_cfile; 4004 - 4005 - event->cft = __file_cft(cfile.file); 4006 - if (IS_ERR(event->cft)) { 4007 - ret = PTR_ERR(event->cft); 4008 - goto out_put_cfile; 4009 - } 4010 - 4011 - if (!event->cft->ss) { 4012 - ret = -EBADF; 4013 - goto out_put_cfile; 4014 - } 4015 - 4016 - /* 4017 - * Determine the css of @cfile, verify it belongs to the same 4018 - * cgroup as cgroup.event_control, and associate @event with it. 4019 - * Remaining events are automatically removed on cgroup destruction 4020 - * but the removal is asynchronous, so take an extra ref. 4021 - */ 4022 - rcu_read_lock(); 4023 - 4024 - ret = -EINVAL; 4025 - event->css = cgroup_css(cgrp, event->cft->ss); 4026 - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); 4027 - if (event->css && event->css == cfile_css && css_tryget(event->css)) 4028 - ret = 0; 4029 - 4030 - rcu_read_unlock(); 4031 - if (ret) 4032 - goto out_put_cfile; 4033 - 4034 - if (!event->cft->register_event || !event->cft->unregister_event) { 4035 - ret = -EINVAL; 4036 - goto out_put_css; 4037 - } 4038 - 4039 - ret = event->cft->register_event(event->css, event->cft, 4040 - event->eventfd, buffer); 4041 - if (ret) 4042 - goto out_put_css; 4043 - 4044 - efile.file->f_op->poll(efile.file, &event->pt); 4045 - 4046 - spin_lock(&cgrp->event_list_lock); 4047 - list_add(&event->list, &cgrp->event_list); 4048 - spin_unlock(&cgrp->event_list_lock); 4049 - 4050 - fdput(cfile); 4051 - fdput(efile); 4052 - 4053 - return 0; 4054 - 4055 - out_put_css: 4056 - css_put(event->css); 4057 - out_put_cfile: 4058 - fdput(cfile); 4059 - out_put_eventfd: 4060 - eventfd_ctx_put(event->eventfd); 4061 - out_put_efile: 4062 - fdput(efile); 4063 - out_kfree: 4064 - kfree(event); 4065 - 4066 - return ret; 4067 - } 4068 - 4069 3918 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 4070 3919 struct cftype *cft) 4071 3920 { ··· 
3893 4134 .write_u64 = cgroup_procs_write, 3894 4135 .release = cgroup_pidlist_release, 3895 4136 .mode = S_IRUGO | S_IWUSR, 3896 - }, 3897 - { 3898 - .name = "cgroup.event_control", 3899 - .write_string = cgroup_write_event_control, 3900 - .mode = S_IWUGO, 3901 4137 }, 3902 4138 { 3903 4139 .name = "cgroup.clone_children", ··· 4364 4610 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4365 4611 { 4366 4612 struct dentry *d = cgrp->dentry; 4367 - struct cgroup_event *event, *tmp; 4368 4613 struct cgroup_subsys *ss; 4369 4614 struct cgroup *child; 4370 4615 bool empty; ··· 4437 4684 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4438 4685 dget(d); 4439 4686 cgroup_d_remove_dir(d); 4440 - 4441 - /* 4442 - * Unregister events and notify userspace. 4443 - * Notify userspace about cgroup removing only after rmdir of cgroup 4444 - * directory to avoid race between userspace and kernelspace. 4445 - */ 4446 - spin_lock(&cgrp->event_list_lock); 4447 - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4448 - list_del_init(&event->list); 4449 - schedule_work(&event->remove); 4450 - } 4451 - spin_unlock(&cgrp->event_list_lock); 4452 4687 4453 4688 return 0; 4454 4689 };
+322 -33
mm/memcontrol.c
··· 45 45 #include <linux/swapops.h> 46 46 #include <linux/spinlock.h> 47 47 #include <linux/eventfd.h> 48 + #include <linux/poll.h> 48 49 #include <linux/sort.h> 49 50 #include <linux/fs.h> 50 51 #include <linux/seq_file.h> ··· 56 55 #include <linux/cpu.h> 57 56 #include <linux/oom.h> 58 57 #include <linux/lockdep.h> 58 + #include <linux/file.h> 59 59 #include "internal.h" 60 60 #include <net/sock.h> 61 61 #include <net/ip.h> ··· 229 227 struct eventfd_ctx *eventfd; 230 228 }; 231 229 230 + /* 231 + * cgroup_event represents events which userspace want to receive. 232 + */ 233 + struct mem_cgroup_event { 234 + /* 235 + * memcg which the event belongs to. 236 + */ 237 + struct mem_cgroup *memcg; 238 + /* 239 + * eventfd to signal userspace about the event. 240 + */ 241 + struct eventfd_ctx *eventfd; 242 + /* 243 + * Each of these stored in a list by the cgroup. 244 + */ 245 + struct list_head list; 246 + /* 247 + * register_event() callback will be used to add new userspace 248 + * waiter for changes related to this event. Use eventfd_signal() 249 + * on eventfd to send notification to userspace. 250 + */ 251 + int (*register_event)(struct mem_cgroup *memcg, 252 + struct eventfd_ctx *eventfd, const char *args); 253 + /* 254 + * unregister_event() callback will be called when userspace closes 255 + * the eventfd or on cgroup removing. This callback must be set, 256 + * if you want provide notification functionality. 257 + */ 258 + void (*unregister_event)(struct mem_cgroup *memcg, 259 + struct eventfd_ctx *eventfd); 260 + /* 261 + * All fields below needed to unregister event when 262 + * userspace closes eventfd. 
263 + */ 264 + poll_table pt; 265 + wait_queue_head_t *wqh; 266 + wait_queue_t wait; 267 + struct work_struct remove; 268 + }; 269 + 232 270 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 233 271 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 234 272 ··· 372 330 atomic_t numainfo_events; 373 331 atomic_t numainfo_updating; 374 332 #endif 333 + 334 + /* List of events which userspace want to receive */ 335 + struct list_head event_list; 336 + spinlock_t event_list_lock; 375 337 376 338 struct mem_cgroup_per_node *nodeinfo[0]; 377 339 /* WARNING: nodeinfo must be the last member here */ ··· 534 488 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 535 489 { 536 490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 537 - } 538 - 539 - struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) 540 - { 541 - return &mem_cgroup_from_css(css)->vmpressure; 542 491 } 543 492 544 493 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) ··· 5689 5648 mem_cgroup_oom_notify_cb(iter); 5690 5649 } 5691 5650 5692 - static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5693 - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5651 + static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5652 + struct eventfd_ctx *eventfd, const char *args, enum res_type type) 5694 5653 { 5695 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5696 5654 struct mem_cgroup_thresholds *thresholds; 5697 5655 struct mem_cgroup_threshold_ary *new; 5698 - enum res_type type = MEMFILE_TYPE(cft->private); 5699 5656 u64 threshold, usage; 5700 5657 int i, size, ret; 5701 5658 ··· 5770 5731 return ret; 5771 5732 } 5772 5733 5773 - static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5774 - struct cftype *cft, struct eventfd_ctx *eventfd) 5734 + static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5735 + struct eventfd_ctx 
*eventfd, const char *args) 5775 5736 { 5776 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5737 + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 5738 + } 5739 + 5740 + static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 5741 + struct eventfd_ctx *eventfd, const char *args) 5742 + { 5743 + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 5744 + } 5745 + 5746 + static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5747 + struct eventfd_ctx *eventfd, enum res_type type) 5748 + { 5777 5749 struct mem_cgroup_thresholds *thresholds; 5778 5750 struct mem_cgroup_threshold_ary *new; 5779 - enum res_type type = MEMFILE_TYPE(cft->private); 5780 5751 u64 usage; 5781 5752 int i, j, size; 5782 5753 ··· 5859 5810 mutex_unlock(&memcg->thresholds_lock); 5860 5811 } 5861 5812 5862 - static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5863 - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5813 + static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5814 + struct eventfd_ctx *eventfd) 5864 5815 { 5865 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5866 - struct mem_cgroup_eventfd_list *event; 5867 - enum res_type type = MEMFILE_TYPE(cft->private); 5816 + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 5817 + } 5868 5818 5869 - BUG_ON(type != _OOM_TYPE); 5819 + static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5820 + struct eventfd_ctx *eventfd) 5821 + { 5822 + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 5823 + } 5824 + 5825 + static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 5826 + struct eventfd_ctx *eventfd, const char *args) 5827 + { 5828 + struct mem_cgroup_eventfd_list *event; 5829 + 5870 5830 event = kmalloc(sizeof(*event), GFP_KERNEL); 5871 5831 if (!event) 5872 5832 return -ENOMEM; ··· 5893 5835 return 0; 5894 5836 } 5895 5837 5896 - 
static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5897 - struct cftype *cft, struct eventfd_ctx *eventfd) 5838 + static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 5839 + struct eventfd_ctx *eventfd) 5898 5840 { 5899 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5900 5841 struct mem_cgroup_eventfd_list *ev, *tmp; 5901 - enum res_type type = MEMFILE_TYPE(cft->private); 5902 - 5903 - BUG_ON(type != _OOM_TYPE); 5904 5842 5905 5843 spin_lock(&memcg_oom_lock); 5906 5844 ··· 6013 5959 } 6014 5960 #endif 6015 5961 5962 + /* 5963 + * DO NOT USE IN NEW FILES. 5964 + * 5965 + * "cgroup.event_control" implementation. 5966 + * 5967 + * This is way over-engineered. It tries to support fully configurable 5968 + * events for each user. Such level of flexibility is completely 5969 + * unnecessary especially in the light of the planned unified hierarchy. 5970 + * 5971 + * Please deprecate this and replace with something simpler if at all 5972 + * possible. 5973 + */ 5974 + 5975 + /* 5976 + * Unregister event and free resources. 5977 + * 5978 + * Gets called from workqueue. 5979 + */ 5980 + static void memcg_event_remove(struct work_struct *work) 5981 + { 5982 + struct mem_cgroup_event *event = 5983 + container_of(work, struct mem_cgroup_event, remove); 5984 + struct mem_cgroup *memcg = event->memcg; 5985 + 5986 + remove_wait_queue(event->wqh, &event->wait); 5987 + 5988 + event->unregister_event(memcg, event->eventfd); 5989 + 5990 + /* Notify userspace the event is going away. */ 5991 + eventfd_signal(event->eventfd, 1); 5992 + 5993 + eventfd_ctx_put(event->eventfd); 5994 + kfree(event); 5995 + css_put(&memcg->css); 5996 + } 5997 + 5998 + /* 5999 + * Gets called on POLLHUP on eventfd when user closes it. 6000 + * 6001 + * Called with wqh->lock held and interrupts disabled. 
6002 + */ 6003 + static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 6004 + int sync, void *key) 6005 + { 6006 + struct mem_cgroup_event *event = 6007 + container_of(wait, struct mem_cgroup_event, wait); 6008 + struct mem_cgroup *memcg = event->memcg; 6009 + unsigned long flags = (unsigned long)key; 6010 + 6011 + if (flags & POLLHUP) { 6012 + /* 6013 + * If the event has been detached at cgroup removal, we 6014 + * can simply return knowing the other side will cleanup 6015 + * for us. 6016 + * 6017 + * We can't race against event freeing since the other 6018 + * side will require wqh->lock via remove_wait_queue(), 6019 + * which we hold. 6020 + */ 6021 + spin_lock(&memcg->event_list_lock); 6022 + if (!list_empty(&event->list)) { 6023 + list_del_init(&event->list); 6024 + /* 6025 + * We are in atomic context, but cgroup_event_remove() 6026 + * may sleep, so we have to call it in workqueue. 6027 + */ 6028 + schedule_work(&event->remove); 6029 + } 6030 + spin_unlock(&memcg->event_list_lock); 6031 + } 6032 + 6033 + return 0; 6034 + } 6035 + 6036 + static void memcg_event_ptable_queue_proc(struct file *file, 6037 + wait_queue_head_t *wqh, poll_table *pt) 6038 + { 6039 + struct mem_cgroup_event *event = 6040 + container_of(pt, struct mem_cgroup_event, pt); 6041 + 6042 + event->wqh = wqh; 6043 + add_wait_queue(wqh, &event->wait); 6044 + } 6045 + 6046 + /* 6047 + * DO NOT USE IN NEW FILES. 6048 + * 6049 + * Parse input and register new cgroup event handler. 6050 + * 6051 + * Input must be in format '<event_fd> <control_fd> <args>'. 6052 + * Interpretation of args is defined by control file implementation. 
6053 + */ 6054 + static int memcg_write_event_control(struct cgroup_subsys_state *css, 6055 + struct cftype *cft, const char *buffer) 6056 + { 6057 + struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6058 + struct mem_cgroup_event *event; 6059 + struct cgroup_subsys_state *cfile_css; 6060 + unsigned int efd, cfd; 6061 + struct fd efile; 6062 + struct fd cfile; 6063 + const char *name; 6064 + char *endp; 6065 + int ret; 6066 + 6067 + efd = simple_strtoul(buffer, &endp, 10); 6068 + if (*endp != ' ') 6069 + return -EINVAL; 6070 + buffer = endp + 1; 6071 + 6072 + cfd = simple_strtoul(buffer, &endp, 10); 6073 + if ((*endp != ' ') && (*endp != '\0')) 6074 + return -EINVAL; 6075 + buffer = endp + 1; 6076 + 6077 + event = kzalloc(sizeof(*event), GFP_KERNEL); 6078 + if (!event) 6079 + return -ENOMEM; 6080 + 6081 + event->memcg = memcg; 6082 + INIT_LIST_HEAD(&event->list); 6083 + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 6084 + init_waitqueue_func_entry(&event->wait, memcg_event_wake); 6085 + INIT_WORK(&event->remove, memcg_event_remove); 6086 + 6087 + efile = fdget(efd); 6088 + if (!efile.file) { 6089 + ret = -EBADF; 6090 + goto out_kfree; 6091 + } 6092 + 6093 + event->eventfd = eventfd_ctx_fileget(efile.file); 6094 + if (IS_ERR(event->eventfd)) { 6095 + ret = PTR_ERR(event->eventfd); 6096 + goto out_put_efile; 6097 + } 6098 + 6099 + cfile = fdget(cfd); 6100 + if (!cfile.file) { 6101 + ret = -EBADF; 6102 + goto out_put_eventfd; 6103 + } 6104 + 6105 + /* the process need read permission on control file */ 6106 + /* AV: shouldn't we check that it's been opened for read instead? */ 6107 + ret = inode_permission(file_inode(cfile.file), MAY_READ); 6108 + if (ret < 0) 6109 + goto out_put_cfile; 6110 + 6111 + /* 6112 + * Determine the event callbacks and set them in @event. This used 6113 + * to be done via struct cftype but cgroup core no longer knows 6114 + * about these events. 
The following is crude but the whole thing 6115 + * is for compatibility anyway. 6116 + * 6117 + * DO NOT ADD NEW FILES. 6118 + */ 6119 + name = cfile.file->f_dentry->d_name.name; 6120 + 6121 + if (!strcmp(name, "memory.usage_in_bytes")) { 6122 + event->register_event = mem_cgroup_usage_register_event; 6123 + event->unregister_event = mem_cgroup_usage_unregister_event; 6124 + } else if (!strcmp(name, "memory.oom_control")) { 6125 + event->register_event = mem_cgroup_oom_register_event; 6126 + event->unregister_event = mem_cgroup_oom_unregister_event; 6127 + } else if (!strcmp(name, "memory.pressure_level")) { 6128 + event->register_event = vmpressure_register_event; 6129 + event->unregister_event = vmpressure_unregister_event; 6130 + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 6131 + event->register_event = memsw_cgroup_usage_register_event; 6132 + event->unregister_event = memsw_cgroup_usage_unregister_event; 6133 + } else { 6134 + ret = -EINVAL; 6135 + goto out_put_cfile; 6136 + } 6137 + 6138 + /* 6139 + * Verify @cfile should belong to @css. Also, remaining events are 6140 + * automatically removed on cgroup destruction but the removal is 6141 + * asynchronous, so take an extra ref on @css. 
6142 + */ 6143 + rcu_read_lock(); 6144 + 6145 + ret = -EINVAL; 6146 + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, 6147 + &mem_cgroup_subsys); 6148 + if (cfile_css == css && css_tryget(css)) 6149 + ret = 0; 6150 + 6151 + rcu_read_unlock(); 6152 + if (ret) 6153 + goto out_put_cfile; 6154 + 6155 + ret = event->register_event(memcg, event->eventfd, buffer); 6156 + if (ret) 6157 + goto out_put_css; 6158 + 6159 + efile.file->f_op->poll(efile.file, &event->pt); 6160 + 6161 + spin_lock(&memcg->event_list_lock); 6162 + list_add(&event->list, &memcg->event_list); 6163 + spin_unlock(&memcg->event_list_lock); 6164 + 6165 + fdput(cfile); 6166 + fdput(efile); 6167 + 6168 + return 0; 6169 + 6170 + out_put_css: 6171 + css_put(css); 6172 + out_put_cfile: 6173 + fdput(cfile); 6174 + out_put_eventfd: 6175 + eventfd_ctx_put(event->eventfd); 6176 + out_put_efile: 6177 + fdput(efile); 6178 + out_kfree: 6179 + kfree(event); 6180 + 6181 + return ret; 6182 + } 6183 + 6016 6184 static struct cftype mem_cgroup_files[] = { 6017 6185 { 6018 6186 .name = "usage_in_bytes", 6019 6187 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6020 6188 .read = mem_cgroup_read, 6021 - .register_event = mem_cgroup_usage_register_event, 6022 - .unregister_event = mem_cgroup_usage_unregister_event, 6023 6189 }, 6024 6190 { 6025 6191 .name = "max_usage_in_bytes", ··· 6280 6006 .read_u64 = mem_cgroup_hierarchy_read, 6281 6007 }, 6282 6008 { 6009 + .name = "cgroup.event_control", /* XXX: for compat */ 6010 + .write_string = memcg_write_event_control, 6011 + .flags = CFTYPE_NO_PREFIX, 6012 + .mode = S_IWUGO, 6013 + }, 6014 + { 6283 6015 .name = "swappiness", 6284 6016 .read_u64 = mem_cgroup_swappiness_read, 6285 6017 .write_u64 = mem_cgroup_swappiness_write, ··· 6299 6019 .name = "oom_control", 6300 6020 .read_map = mem_cgroup_oom_control_read, 6301 6021 .write_u64 = mem_cgroup_oom_control_write, 6302 - .register_event = mem_cgroup_oom_register_event, 6303 - .unregister_event = 
mem_cgroup_oom_unregister_event, 6304 6022 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6305 6023 }, 6306 6024 { 6307 6025 .name = "pressure_level", 6308 - .register_event = vmpressure_register_event, 6309 - .unregister_event = vmpressure_unregister_event, 6310 6026 }, 6311 6027 #ifdef CONFIG_NUMA 6312 6028 { ··· 6350 6074 .name = "memsw.usage_in_bytes", 6351 6075 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6352 6076 .read = mem_cgroup_read, 6353 - .register_event = mem_cgroup_usage_register_event, 6354 - .unregister_event = mem_cgroup_usage_unregister_event, 6355 6077 }, 6356 6078 { 6357 6079 .name = "memsw.max_usage_in_bytes", ··· 6539 6265 mutex_init(&memcg->thresholds_lock); 6540 6266 spin_lock_init(&memcg->move_lock); 6541 6267 vmpressure_init(&memcg->vmpressure); 6268 + INIT_LIST_HEAD(&memcg->event_list); 6269 + spin_lock_init(&memcg->event_list_lock); 6542 6270 6543 6271 return &memcg->css; 6544 6272 ··· 6616 6340 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6617 6341 { 6618 6342 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6343 + struct mem_cgroup_event *event, *tmp; 6344 + 6345 + /* 6346 + * Unregister events and notify userspace. 6347 + * Notify userspace about cgroup removing only after rmdir of cgroup 6348 + * directory to avoid race between userspace and kernelspace. 6349 + */ 6350 + spin_lock(&memcg->event_list_lock); 6351 + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 6352 + list_del_init(&event->list); 6353 + schedule_work(&event->remove); 6354 + } 6355 + spin_unlock(&memcg->event_list_lock); 6619 6356 6620 6357 kmem_cgroup_css_offline(memcg); 6621 6358
+9 -17
mm/vmpressure.c
··· 278 278 279 279 /** 280 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 281 - * @css: css that is interested in vmpressure notifications 282 - * @cft: cgroup control files handle 281 + * @memcg: memcg that is interested in vmpressure notifications 283 282 * @eventfd: eventfd context to link notifications with 284 283 * @args: event arguments (used to set up a pressure level threshold) 285 284 * ··· 288 289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 289 290 * "critical"). 290 291 * 291 - * This function should not be used directly, just pass it to (struct 292 - * cftype).register_event, and then cgroup core will handle everything by 293 - * itself. 292 + * To be used as memcg event method. 294 293 */ 295 - int vmpressure_register_event(struct cgroup_subsys_state *css, 296 - struct cftype *cft, struct eventfd_ctx *eventfd, 297 - const char *args) 294 + int vmpressure_register_event(struct mem_cgroup *memcg, 295 + struct eventfd_ctx *eventfd, const char *args) 298 296 { 299 - struct vmpressure *vmpr = css_to_vmpressure(css); 297 + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 300 298 struct vmpressure_event *ev; 301 299 int level; 302 300 ··· 321 325 322 326 /** 323 327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 324 - * @css: css handle 325 - * @cft: cgroup control files handle 328 + * @memcg: memcg handle 326 329 * @eventfd: eventfd context that was used to link vmpressure with the @cg 327 330 * 328 331 * This function does internal manipulations to detach the @eventfd from 329 332 * the vmpressure notifications, and then frees internal resources 330 333 * associated with the @eventfd (but the @eventfd itself is not freed). 331 334 * 332 - * This function should not be used directly, just pass it to (struct 333 - * cftype).unregister_event, and then cgroup core will handle everything 334 - * by itself. 335 + * To be used as memcg event method. 
335 336 */ 336 - void vmpressure_unregister_event(struct cgroup_subsys_state *css, 337 - struct cftype *cft, 337 + void vmpressure_unregister_event(struct mem_cgroup *memcg, 338 338 struct eventfd_ctx *eventfd) 339 339 { 340 - struct vmpressure *vmpr = css_to_vmpressure(css); 340 + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 341 341 struct vmpressure_event *ev; 342 342 343 343 mutex_lock(&vmpr->events_lock);