Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
"The bulk of changes are cleanups and preparations for the upcoming
kernfs conversion.

- The cgroup_event mechanism, which is and will be used only by memcg,
is moved to memcg.

- pidlist handling is updated so that it can be served by seq_file.

Also, the list is not sorted if sane_behavior is enabled. The cgroup
documentation explicitly states that the file is not sorted, but it
has been sorted for quite some time.

- All cgroup file handling now happens on top of seq_file. This is
to prepare for kernfs conversion. In addition, all operations are
restructured so that they map 1-1 to kernfs operations.

- Other cleanups and low-priority fixes"

* 'for-3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (40 commits)
cgroup: trivial style updates
cgroup: remove stray references to css_id
doc: cgroups: Fix typo in doc/cgroups
cgroup: fix fail path in cgroup_load_subsys()
cgroup: fix missing unlock on error in cgroup_load_subsys()
cgroup: remove for_each_root_subsys()
cgroup: implement for_each_css()
cgroup: factor out cgroup_subsys_state creation into create_css()
cgroup: combine css handling loops in cgroup_create()
cgroup: reorder operations in cgroup_create()
cgroup: make for_each_subsys() useable under cgroup_root_mutex
cgroup: css iterations and css_from_dir() are safe under cgroup_mutex
cgroup: unify pidlist and other file handling
cgroup: replace cftype->read_seq_string() with cftype->seq_show()
cgroup: attach cgroup_open_file to all cgroup files
cgroup: generalize cgroup_pidlist_open_file
cgroup: unify read path so that seq_file is always used
cgroup: unify cgroup_write_X64() and cgroup_write_string()
cgroup: remove cftype->read(), ->read_map() and ->write()
hugetlb_cgroup: convert away from cftype->read()
...

+1029 -1105
-20
Documentation/cgroups/cgroups.txt
··· 24 24 2.1 Basic Usage 25 25 2.2 Attaching processes 26 26 2.3 Mounting hierarchies by name 27 - 2.4 Notification API 28 27 3. Kernel API 29 28 3.1 Overview 30 29 3.2 Synchronization ··· 471 472 The name of the subsystem appears as part of the hierarchy description 472 473 in /proc/mounts and /proc/<pid>/cgroups. 473 474 474 - 2.4 Notification API 475 - -------------------- 476 - 477 - There is mechanism which allows to get notifications about changing 478 - status of a cgroup. 479 - 480 - To register a new notification handler you need to: 481 - - create a file descriptor for event notification using eventfd(2); 482 - - open a control file to be monitored (e.g. memory.usage_in_bytes); 483 - - write "<event_fd> <control_fd> <args>" to cgroup.event_control. 484 - Interpretation of args is defined by control file implementation; 485 - 486 - eventfd will be woken up by control file implementation or when the 487 - cgroup is removed. 488 - 489 - To unregister a notification handler just close eventfd. 490 - 491 - NOTE: Support of notifications should be implemented for the control 492 - file. See documentation for the subsystem. 493 475 494 476 3. Kernel API 495 477 =============
+2 -2
Documentation/cgroups/memory.txt
··· 577 577 per-node page counts including "hierarchical_<counter>" which sums up all 578 578 hierarchical children's values in addition to the memcg's own value. 579 579 580 - The ouput format of memory.numa_stat is: 580 + The output format of memory.numa_stat is: 581 581 582 582 total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... 583 583 file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... ··· 670 670 671 671 8.1 Interface 672 672 673 - This feature is disabled by default. It can be enabledi (and disabled again) by 673 + This feature is disabled by default. It can be enabled (and disabled again) by 674 674 writing to memory.move_charge_at_immigrate of the destination cgroup. 675 675 676 676 If you want to enable it:
+2 -2
Documentation/cgroups/resource_counter.txt
··· 97 97 (struct res_counter *rc, struct res_counter *top, 98 98 unsinged long val) 99 99 100 - Almost same as res_cunter_uncharge() but propagation of uncharge 101 - stops when rc == top. This is useful when kill a res_coutner in 100 + Almost same as res_counter_uncharge() but propagation of uncharge 101 + stops when rc == top. This is useful when kill a res_counter in 102 102 child cgroup. 103 103 104 104 2.1 Other accounting routines
+15 -20
block/blk-throttle.c
··· 1303 1303 return __blkg_prfill_rwstat(sf, pd, &rwstat); 1304 1304 } 1305 1305 1306 - static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, 1307 - struct cftype *cft, struct seq_file *sf) 1306 + static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) 1308 1307 { 1309 - struct blkcg *blkcg = css_to_blkcg(css); 1310 - 1311 - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 1312 - cft->private, true); 1308 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, 1309 + &blkcg_policy_throtl, seq_cft(sf)->private, true); 1313 1310 return 0; 1314 1311 } 1315 1312 ··· 1332 1335 return __blkg_prfill_u64(sf, pd, v); 1333 1336 } 1334 1337 1335 - static int tg_print_conf_u64(struct cgroup_subsys_state *css, 1336 - struct cftype *cft, struct seq_file *sf) 1338 + static int tg_print_conf_u64(struct seq_file *sf, void *v) 1337 1339 { 1338 - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, 1339 - &blkcg_policy_throtl, cft->private, false); 1340 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1341 + &blkcg_policy_throtl, seq_cft(sf)->private, false); 1340 1342 return 0; 1341 1343 } 1342 1344 1343 - static int tg_print_conf_uint(struct cgroup_subsys_state *css, 1344 - struct cftype *cft, struct seq_file *sf) 1345 + static int tg_print_conf_uint(struct seq_file *sf, void *v) 1345 1346 { 1346 - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, 1347 - &blkcg_policy_throtl, cft->private, false); 1347 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1348 + &blkcg_policy_throtl, seq_cft(sf)->private, false); 1348 1349 return 0; 1349 1350 } 1350 1351 ··· 1423 1428 { 1424 1429 .name = "throttle.read_bps_device", 1425 1430 .private = offsetof(struct throtl_grp, bps[READ]), 1426 - .read_seq_string = tg_print_conf_u64, 1431 + .seq_show = tg_print_conf_u64, 1427 1432 .write_string = tg_set_conf_u64, 1428 1433 .max_write_len = 256, 1429 1434 }, 1430 1435 { 1431 1436 
.name = "throttle.write_bps_device", 1432 1437 .private = offsetof(struct throtl_grp, bps[WRITE]), 1433 - .read_seq_string = tg_print_conf_u64, 1438 + .seq_show = tg_print_conf_u64, 1434 1439 .write_string = tg_set_conf_u64, 1435 1440 .max_write_len = 256, 1436 1441 }, 1437 1442 { 1438 1443 .name = "throttle.read_iops_device", 1439 1444 .private = offsetof(struct throtl_grp, iops[READ]), 1440 - .read_seq_string = tg_print_conf_uint, 1445 + .seq_show = tg_print_conf_uint, 1441 1446 .write_string = tg_set_conf_uint, 1442 1447 .max_write_len = 256, 1443 1448 }, 1444 1449 { 1445 1450 .name = "throttle.write_iops_device", 1446 1451 .private = offsetof(struct throtl_grp, iops[WRITE]), 1447 - .read_seq_string = tg_print_conf_uint, 1452 + .seq_show = tg_print_conf_uint, 1448 1453 .write_string = tg_set_conf_uint, 1449 1454 .max_write_len = 256, 1450 1455 }, 1451 1456 { 1452 1457 .name = "throttle.io_service_bytes", 1453 1458 .private = offsetof(struct tg_stats_cpu, service_bytes), 1454 - .read_seq_string = tg_print_cpu_rwstat, 1459 + .seq_show = tg_print_cpu_rwstat, 1455 1460 }, 1456 1461 { 1457 1462 .name = "throttle.io_serviced", 1458 1463 .private = offsetof(struct tg_stats_cpu, serviced), 1459 - .read_seq_string = tg_print_cpu_rwstat, 1464 + .seq_show = tg_print_cpu_rwstat, 1460 1465 }, 1461 1466 { } /* terminate */ 1462 1467 };
+58 -73
block/cfq-iosched.c
··· 1632 1632 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); 1633 1633 } 1634 1634 1635 - static int cfqg_print_weight_device(struct cgroup_subsys_state *css, 1636 - struct cftype *cft, struct seq_file *sf) 1635 + static int cfqg_print_weight_device(struct seq_file *sf, void *v) 1637 1636 { 1638 - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, 1639 - &blkcg_policy_cfq, 0, false); 1637 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1638 + cfqg_prfill_weight_device, &blkcg_policy_cfq, 1639 + 0, false); 1640 1640 return 0; 1641 1641 } 1642 1642 ··· 1650 1650 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); 1651 1651 } 1652 1652 1653 - static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, 1654 - struct cftype *cft, 1655 - struct seq_file *sf) 1653 + static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) 1656 1654 { 1657 - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, 1658 - &blkcg_policy_cfq, 0, false); 1655 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1656 + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 1657 + 0, false); 1659 1658 return 0; 1660 1659 } 1661 1660 1662 - static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1663 - struct seq_file *sf) 1661 + static int cfq_print_weight(struct seq_file *sf, void *v) 1664 1662 { 1665 - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); 1663 + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); 1666 1664 return 0; 1667 1665 } 1668 1666 1669 - static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, 1670 - struct cftype *cft, struct seq_file *sf) 1667 + static int cfq_print_leaf_weight(struct seq_file *sf, void *v) 1671 1668 { 1672 - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); 1669 + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); 1673 1670 return 0; 1674 1671 } 1675 1672 ··· 1759 1762 return __cfq_set_weight(css, cft, val, 
true); 1760 1763 } 1761 1764 1762 - static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, 1763 - struct seq_file *sf) 1765 + static int cfqg_print_stat(struct seq_file *sf, void *v) 1764 1766 { 1765 - struct blkcg *blkcg = css_to_blkcg(css); 1766 - 1767 - blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, 1768 - cft->private, false); 1767 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, 1768 + &blkcg_policy_cfq, seq_cft(sf)->private, false); 1769 1769 return 0; 1770 1770 } 1771 1771 1772 - static int cfqg_print_rwstat(struct cgroup_subsys_state *css, 1773 - struct cftype *cft, struct seq_file *sf) 1772 + static int cfqg_print_rwstat(struct seq_file *sf, void *v) 1774 1773 { 1775 - struct blkcg *blkcg = css_to_blkcg(css); 1776 - 1777 - blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, 1778 - cft->private, true); 1774 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, 1775 + &blkcg_policy_cfq, seq_cft(sf)->private, true); 1779 1776 return 0; 1780 1777 } 1781 1778 ··· 1789 1798 return __blkg_prfill_rwstat(sf, pd, &sum); 1790 1799 } 1791 1800 1792 - static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, 1793 - struct cftype *cft, struct seq_file *sf) 1801 + static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) 1794 1802 { 1795 - struct blkcg *blkcg = css_to_blkcg(css); 1796 - 1797 - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, 1798 - &blkcg_policy_cfq, cft->private, false); 1803 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1804 + cfqg_prfill_stat_recursive, &blkcg_policy_cfq, 1805 + seq_cft(sf)->private, false); 1799 1806 return 0; 1800 1807 } 1801 1808 1802 - static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, 1803 - struct cftype *cft, struct seq_file *sf) 1809 + static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) 1804 1810 { 1805 - struct blkcg *blkcg = css_to_blkcg(css); 1806 - 1807 - 
blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, 1808 - &blkcg_policy_cfq, cft->private, true); 1811 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1812 + cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, 1813 + seq_cft(sf)->private, true); 1809 1814 return 0; 1810 1815 } 1811 1816 ··· 1822 1835 } 1823 1836 1824 1837 /* print avg_queue_size */ 1825 - static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, 1826 - struct cftype *cft, struct seq_file *sf) 1838 + static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) 1827 1839 { 1828 - struct blkcg *blkcg = css_to_blkcg(css); 1829 - 1830 - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, 1831 - &blkcg_policy_cfq, 0, false); 1840 + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1841 + cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 1842 + 0, false); 1832 1843 return 0; 1833 1844 } 1834 1845 #endif /* CONFIG_DEBUG_BLK_CGROUP */ ··· 1836 1851 { 1837 1852 .name = "weight_device", 1838 1853 .flags = CFTYPE_ONLY_ON_ROOT, 1839 - .read_seq_string = cfqg_print_leaf_weight_device, 1854 + .seq_show = cfqg_print_leaf_weight_device, 1840 1855 .write_string = cfqg_set_leaf_weight_device, 1841 1856 .max_write_len = 256, 1842 1857 }, 1843 1858 { 1844 1859 .name = "weight", 1845 1860 .flags = CFTYPE_ONLY_ON_ROOT, 1846 - .read_seq_string = cfq_print_leaf_weight, 1861 + .seq_show = cfq_print_leaf_weight, 1847 1862 .write_u64 = cfq_set_leaf_weight, 1848 1863 }, 1849 1864 ··· 1851 1866 { 1852 1867 .name = "weight_device", 1853 1868 .flags = CFTYPE_NOT_ON_ROOT, 1854 - .read_seq_string = cfqg_print_weight_device, 1869 + .seq_show = cfqg_print_weight_device, 1855 1870 .write_string = cfqg_set_weight_device, 1856 1871 .max_write_len = 256, 1857 1872 }, 1858 1873 { 1859 1874 .name = "weight", 1860 1875 .flags = CFTYPE_NOT_ON_ROOT, 1861 - .read_seq_string = cfq_print_weight, 1876 + .seq_show = cfq_print_weight, 1862 1877 .write_u64 = cfq_set_weight, 1863 1878 }, 1864 1879 1865 1880 { 1866 1881 
.name = "leaf_weight_device", 1867 - .read_seq_string = cfqg_print_leaf_weight_device, 1882 + .seq_show = cfqg_print_leaf_weight_device, 1868 1883 .write_string = cfqg_set_leaf_weight_device, 1869 1884 .max_write_len = 256, 1870 1885 }, 1871 1886 { 1872 1887 .name = "leaf_weight", 1873 - .read_seq_string = cfq_print_leaf_weight, 1888 + .seq_show = cfq_print_leaf_weight, 1874 1889 .write_u64 = cfq_set_leaf_weight, 1875 1890 }, 1876 1891 ··· 1878 1893 { 1879 1894 .name = "time", 1880 1895 .private = offsetof(struct cfq_group, stats.time), 1881 - .read_seq_string = cfqg_print_stat, 1896 + .seq_show = cfqg_print_stat, 1882 1897 }, 1883 1898 { 1884 1899 .name = "sectors", 1885 1900 .private = offsetof(struct cfq_group, stats.sectors), 1886 - .read_seq_string = cfqg_print_stat, 1901 + .seq_show = cfqg_print_stat, 1887 1902 }, 1888 1903 { 1889 1904 .name = "io_service_bytes", 1890 1905 .private = offsetof(struct cfq_group, stats.service_bytes), 1891 - .read_seq_string = cfqg_print_rwstat, 1906 + .seq_show = cfqg_print_rwstat, 1892 1907 }, 1893 1908 { 1894 1909 .name = "io_serviced", 1895 1910 .private = offsetof(struct cfq_group, stats.serviced), 1896 - .read_seq_string = cfqg_print_rwstat, 1911 + .seq_show = cfqg_print_rwstat, 1897 1912 }, 1898 1913 { 1899 1914 .name = "io_service_time", 1900 1915 .private = offsetof(struct cfq_group, stats.service_time), 1901 - .read_seq_string = cfqg_print_rwstat, 1916 + .seq_show = cfqg_print_rwstat, 1902 1917 }, 1903 1918 { 1904 1919 .name = "io_wait_time", 1905 1920 .private = offsetof(struct cfq_group, stats.wait_time), 1906 - .read_seq_string = cfqg_print_rwstat, 1921 + .seq_show = cfqg_print_rwstat, 1907 1922 }, 1908 1923 { 1909 1924 .name = "io_merged", 1910 1925 .private = offsetof(struct cfq_group, stats.merged), 1911 - .read_seq_string = cfqg_print_rwstat, 1926 + .seq_show = cfqg_print_rwstat, 1912 1927 }, 1913 1928 { 1914 1929 .name = "io_queued", 1915 1930 .private = offsetof(struct cfq_group, stats.queued), 1916 - 
.read_seq_string = cfqg_print_rwstat, 1931 + .seq_show = cfqg_print_rwstat, 1917 1932 }, 1918 1933 1919 1934 /* the same statictics which cover the cfqg and its descendants */ 1920 1935 { 1921 1936 .name = "time_recursive", 1922 1937 .private = offsetof(struct cfq_group, stats.time), 1923 - .read_seq_string = cfqg_print_stat_recursive, 1938 + .seq_show = cfqg_print_stat_recursive, 1924 1939 }, 1925 1940 { 1926 1941 .name = "sectors_recursive", 1927 1942 .private = offsetof(struct cfq_group, stats.sectors), 1928 - .read_seq_string = cfqg_print_stat_recursive, 1943 + .seq_show = cfqg_print_stat_recursive, 1929 1944 }, 1930 1945 { 1931 1946 .name = "io_service_bytes_recursive", 1932 1947 .private = offsetof(struct cfq_group, stats.service_bytes), 1933 - .read_seq_string = cfqg_print_rwstat_recursive, 1948 + .seq_show = cfqg_print_rwstat_recursive, 1934 1949 }, 1935 1950 { 1936 1951 .name = "io_serviced_recursive", 1937 1952 .private = offsetof(struct cfq_group, stats.serviced), 1938 - .read_seq_string = cfqg_print_rwstat_recursive, 1953 + .seq_show = cfqg_print_rwstat_recursive, 1939 1954 }, 1940 1955 { 1941 1956 .name = "io_service_time_recursive", 1942 1957 .private = offsetof(struct cfq_group, stats.service_time), 1943 - .read_seq_string = cfqg_print_rwstat_recursive, 1958 + .seq_show = cfqg_print_rwstat_recursive, 1944 1959 }, 1945 1960 { 1946 1961 .name = "io_wait_time_recursive", 1947 1962 .private = offsetof(struct cfq_group, stats.wait_time), 1948 - .read_seq_string = cfqg_print_rwstat_recursive, 1963 + .seq_show = cfqg_print_rwstat_recursive, 1949 1964 }, 1950 1965 { 1951 1966 .name = "io_merged_recursive", 1952 1967 .private = offsetof(struct cfq_group, stats.merged), 1953 - .read_seq_string = cfqg_print_rwstat_recursive, 1968 + .seq_show = cfqg_print_rwstat_recursive, 1954 1969 }, 1955 1970 { 1956 1971 .name = "io_queued_recursive", 1957 1972 .private = offsetof(struct cfq_group, stats.queued), 1958 - .read_seq_string = cfqg_print_rwstat_recursive, 1973 + 
.seq_show = cfqg_print_rwstat_recursive, 1959 1974 }, 1960 1975 #ifdef CONFIG_DEBUG_BLK_CGROUP 1961 1976 { 1962 1977 .name = "avg_queue_size", 1963 - .read_seq_string = cfqg_print_avg_queue_size, 1978 + .seq_show = cfqg_print_avg_queue_size, 1964 1979 }, 1965 1980 { 1966 1981 .name = "group_wait_time", 1967 1982 .private = offsetof(struct cfq_group, stats.group_wait_time), 1968 - .read_seq_string = cfqg_print_stat, 1983 + .seq_show = cfqg_print_stat, 1969 1984 }, 1970 1985 { 1971 1986 .name = "idle_time", 1972 1987 .private = offsetof(struct cfq_group, stats.idle_time), 1973 - .read_seq_string = cfqg_print_stat, 1988 + .seq_show = cfqg_print_stat, 1974 1989 }, 1975 1990 { 1976 1991 .name = "empty_time", 1977 1992 .private = offsetof(struct cfq_group, stats.empty_time), 1978 - .read_seq_string = cfqg_print_stat, 1993 + .seq_show = cfqg_print_stat, 1979 1994 }, 1980 1995 { 1981 1996 .name = "dequeue", 1982 1997 .private = offsetof(struct cfq_group, stats.dequeue), 1983 - .read_seq_string = cfqg_print_stat, 1998 + .seq_show = cfqg_print_stat, 1984 1999 }, 1985 2000 { 1986 2001 .name = "unaccounted_time", 1987 2002 .private = offsetof(struct cfq_group, stats.unaccounted_time), 1988 - .read_seq_string = cfqg_print_stat, 2003 + .seq_show = cfqg_print_stat, 1989 2004 }, 1990 2005 #endif /* CONFIG_DEBUG_BLK_CGROUP */ 1991 2006 { } /* terminate */
-1
drivers/md/bcache/request.c
··· 163 163 static void bcachecg_destroy(struct cgroup *cgroup) 164 164 { 165 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 166 - free_css_id(&bcache_subsys, &cg->css); 167 166 kfree(cg); 168 167 } 169 168
+44 -68
include/linux/cgroup.h
··· 21 21 #include <linux/xattr.h> 22 22 #include <linux/fs.h> 23 23 #include <linux/percpu-refcount.h> 24 + #include <linux/seq_file.h> 24 25 25 26 #ifdef CONFIG_CGROUPS 26 27 ··· 29 28 struct cgroup_subsys; 30 29 struct inode; 31 30 struct cgroup; 32 - struct css_id; 33 - struct eventfd_ctx; 34 31 35 32 extern int cgroup_init_early(void); 36 33 extern int cgroup_init(void); ··· 78 79 struct cgroup_subsys_state *parent; 79 80 80 81 unsigned long flags; 81 - /* ID for this css, if possible */ 82 - struct css_id __rcu *id; 83 82 84 83 /* percpu_ref killing and RCU release */ 85 84 struct rcu_head rcu_head; ··· 236 239 struct rcu_head rcu_head; 237 240 struct work_struct destroy_work; 238 241 239 - /* List of events which userspace want to receive */ 240 - struct list_head event_list; 241 - spinlock_t event_list_lock; 242 - 243 242 /* directory xattrs */ 244 243 struct simple_xattrs xattrs; 245 244 }; ··· 272 279 * 273 280 * - "tasks" is removed. Everything should be at process 274 281 * granularity. Use "cgroup.procs" instead. 282 + * 283 + * - "cgroup.procs" is not sorted. pids will be unique unless they 284 + * got recycled inbetween reads. 275 285 * 276 286 * - "release_agent" and "notify_on_release" are removed. 277 287 * Replacement notification mechanism will be implemented. ··· 315 319 316 320 /* Unique id for this hierarchy. 
*/ 317 321 int hierarchy_id; 318 - 319 - /* A list running through the attached subsystems */ 320 - struct list_head subsys_list; 321 322 322 323 /* The root cgroup for this hierarchy */ 323 324 struct cgroup top_cgroup; ··· 382 389 }; 383 390 384 391 /* 385 - * cgroup_map_cb is an abstract callback API for reporting map-valued 386 - * control files 387 - */ 388 - 389 - struct cgroup_map_cb { 390 - int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 391 - void *state; 392 - }; 393 - 394 - /* 395 392 * struct cftype: handler definitions for cgroup control files 396 393 * 397 394 * When reading/writing to a file: ··· 428 445 */ 429 446 struct cgroup_subsys *ss; 430 447 431 - int (*open)(struct inode *inode, struct file *file); 432 - ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, 433 - struct file *file, 434 - char __user *buf, size_t nbytes, loff_t *ppos); 435 448 /* 436 449 * read_u64() is a shortcut for the common case of returning a 437 450 * single integer. Use it in place of read() ··· 437 458 * read_s64() is a signed version of read_u64() 438 459 */ 439 460 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 440 - /* 441 - * read_map() is used for defining a map of key/value 442 - * pairs. It should call cb->fill(cb, key, value) for each 443 - * entry. The key/value pairs (and their ordering) should not 444 - * change between reboots. 445 - */ 446 - int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, 447 - struct cgroup_map_cb *cb); 448 - /* 449 - * read_seq_string() is used for outputting a simple sequence 450 - * using seqfile. 
451 - */ 452 - int (*read_seq_string)(struct cgroup_subsys_state *css, 453 - struct cftype *cft, struct seq_file *m); 454 461 455 - ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, 456 - struct file *file, 457 - const char __user *buf, size_t nbytes, loff_t *ppos); 462 + /* generic seq_file read interface */ 463 + int (*seq_show)(struct seq_file *sf, void *v); 464 + 465 + /* optional ops, implement all or none */ 466 + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 467 + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 468 + void (*seq_stop)(struct seq_file *sf, void *v); 458 469 459 470 /* 460 471 * write_u64() is a shortcut for the common case of accepting ··· 473 504 * kick type for multiplexing. 474 505 */ 475 506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 476 - 477 - int (*release)(struct inode *inode, struct file *file); 478 - 479 - /* 480 - * register_event() callback will be used to add new userspace 481 - * waiter for changes related to the cftype. Implement it if 482 - * you want to provide this functionality. Use eventfd_signal() 483 - * on eventfd to send notification to userspace. 484 - */ 485 - int (*register_event)(struct cgroup_subsys_state *css, 486 - struct cftype *cft, struct eventfd_ctx *eventfd, 487 - const char *args); 488 - /* 489 - * unregister_event() callback will be called when userspace 490 - * closes the eventfd or on cgroup removing. 491 - * This callback must be implemented, if you want provide 492 - * notification functionality. 493 - */ 494 - void (*unregister_event)(struct cgroup_subsys_state *css, 495 - struct cftype *cft, 496 - struct eventfd_ctx *eventfd); 497 507 }; 498 508 499 509 /* ··· 483 535 struct cftype_set { 484 536 struct list_head node; /* chained at subsys->cftsets */ 485 537 struct cftype *cfts; 538 + }; 539 + 540 + /* 541 + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't 542 + * access directly. 
543 + */ 544 + struct cfent { 545 + struct list_head node; 546 + struct dentry *dentry; 547 + struct cftype *type; 548 + struct cgroup_subsys_state *css; 549 + 550 + /* file xattrs */ 551 + struct simple_xattrs xattrs; 552 + }; 553 + 554 + /* seq_file->private points to the following, only ->priv is public */ 555 + struct cgroup_open_file { 556 + struct cfent *cfe; 557 + void *priv; 486 558 }; 487 559 488 560 /* ··· 518 550 static inline const char *cgroup_name(const struct cgroup *cgrp) 519 551 { 520 552 return rcu_dereference(cgrp->name)->name; 553 + } 554 + 555 + static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) 556 + { 557 + struct cgroup_open_file *of = seq->private; 558 + return of->cfe->css; 559 + } 560 + 561 + static inline struct cftype *seq_cft(struct seq_file *seq) 562 + { 563 + struct cgroup_open_file *of = seq->private; 564 + return of->cfe->type; 521 565 } 522 566 523 567 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); ··· 611 631 #define MAX_CGROUP_TYPE_NAMELEN 32 612 632 const char *name; 613 633 614 - /* 615 - * Link to parent, and list entry in parent's children. 616 - * Protected by cgroup_lock() 617 - */ 634 + /* link to parent, protected by cgroup_lock() */ 618 635 struct cgroupfs_root *root; 619 - struct list_head sibling; 620 636 621 637 /* list of cftype_sets */ 622 638 struct list_head cftsets;
+3 -5
include/linux/vmpressure.h
··· 7 7 #include <linux/gfp.h> 8 8 #include <linux/types.h> 9 9 #include <linux/cgroup.h> 10 + #include <linux/eventfd.h> 10 11 11 12 struct vmpressure { 12 13 unsigned long scanned; ··· 34 33 extern void vmpressure_cleanup(struct vmpressure *vmpr); 35 34 extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 36 35 extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 37 - extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 38 - extern int vmpressure_register_event(struct cgroup_subsys_state *css, 39 - struct cftype *cft, 36 + extern int vmpressure_register_event(struct mem_cgroup *memcg, 40 37 struct eventfd_ctx *eventfd, 41 38 const char *args); 42 - extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, 43 - struct cftype *cft, 39 + extern void vmpressure_unregister_event(struct mem_cgroup *memcg, 44 40 struct eventfd_ctx *eventfd); 45 41 #else 46 42 static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+1 -2
init/Kconfig
··· 854 854 855 855 menuconfig CGROUPS 856 856 boolean "Control Group support" 857 - depends on EVENTFD 858 857 help 859 858 This option adds support for grouping sets of processes together, for 860 859 use with process control subsystems such as Cpusets, CFS, memory ··· 920 921 bool "Memory Resource Controller for Control Groups" 921 922 depends on RESOURCE_COUNTERS 922 923 select MM_OWNER 924 + select EVENTFD 923 925 help 924 926 Provides a memory resource controller that manages both anonymous 925 927 memory and page cache. (See Documentation/cgroups/memory.txt) ··· 1160 1160 1161 1161 config SCHED_AUTOGROUP 1162 1162 bool "Automatic process group scheduling" 1163 - select EVENTFD 1164 1163 select CGROUPS 1165 1164 select CGROUP_SCHED 1166 1165 select FAIR_GROUP_SCHED
+485 -727
kernel/cgroup.c
··· 41 41 #include <linux/rcupdate.h> 42 42 #include <linux/sched.h> 43 43 #include <linux/backing-dev.h> 44 - #include <linux/seq_file.h> 45 44 #include <linux/slab.h> 46 45 #include <linux/magic.h> 47 46 #include <linux/spinlock.h> ··· 55 56 #include <linux/pid_namespace.h> 56 57 #include <linux/idr.h> 57 58 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58 - #include <linux/eventfd.h> 59 - #include <linux/poll.h> 60 59 #include <linux/flex_array.h> /* used in cgroup_attach_task */ 61 60 #include <linux/kthread.h> 62 - #include <linux/file.h> 63 61 64 62 #include <linux/atomic.h> 63 + 64 + /* 65 + * pidlists linger the following amount before being destroyed. The goal 66 + * is avoiding frequent destruction in the middle of consecutive read calls 67 + * Expiring in the middle is a performance problem not a correctness one. 68 + * 1 sec should be enough. 69 + */ 70 + #define CGROUP_PIDLIST_DESTROY_DELAY HZ 65 71 66 72 /* 67 73 * cgroup_mutex is the master lock. Any modification to cgroup or its ··· 93 89 94 90 static DEFINE_MUTEX(cgroup_root_mutex); 95 91 92 + #define cgroup_assert_mutex_or_rcu_locked() \ 93 + rcu_lockdep_assert(rcu_read_lock_held() || \ 94 + lockdep_is_held(&cgroup_mutex), \ 95 + "cgroup_mutex or RCU read lock required"); 96 + 97 + #ifdef CONFIG_LOCKDEP 98 + #define cgroup_assert_mutex_or_root_locked() \ 99 + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ 100 + !lockdep_is_held(&cgroup_root_mutex))) 101 + #else 102 + #define cgroup_assert_mutex_or_root_locked() do { } while (0) 103 + #endif 104 + 96 105 /* 97 106 * cgroup destruction makes heavy use of work items and there can be a lot 98 107 * of concurrent destructions. Use a separate workqueue so that cgroup ··· 113 96 * which may lead to deadlock. 114 97 */ 115 98 static struct workqueue_struct *cgroup_destroy_wq; 99 + 100 + /* 101 + * pidlist destructions need to be flushed on cgroup destruction. 
Use a 102 + * separate workqueue as flush domain. 103 + */ 104 + static struct workqueue_struct *cgroup_pidlist_destroy_wq; 116 105 117 106 /* 118 107 * Generate an array of cgroup subsystem pointers. At boot time, this is ··· 141 118 142 119 /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 143 120 static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 144 - 145 - /* 146 - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 147 - */ 148 - struct cfent { 149 - struct list_head node; 150 - struct dentry *dentry; 151 - struct cftype *type; 152 - struct cgroup_subsys_state *css; 153 - 154 - /* file xattrs */ 155 - struct simple_xattrs xattrs; 156 - }; 157 - 158 - /* 159 - * cgroup_event represents events which userspace want to receive. 160 - */ 161 - struct cgroup_event { 162 - /* 163 - * css which the event belongs to. 164 - */ 165 - struct cgroup_subsys_state *css; 166 - /* 167 - * Control file which the event associated. 168 - */ 169 - struct cftype *cft; 170 - /* 171 - * eventfd to signal userspace about the event. 172 - */ 173 - struct eventfd_ctx *eventfd; 174 - /* 175 - * Each of these stored in a list by the cgroup. 176 - */ 177 - struct list_head list; 178 - /* 179 - * All fields below needed to unregister event when 180 - * userspace closes eventfd. 
181 - */ 182 - poll_table pt; 183 - wait_queue_head_t *wqh; 184 - wait_queue_t wait; 185 - struct work_struct remove; 186 - }; 187 121 188 122 /* The list of hierarchy roots */ 189 123 ··· 180 200 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 181 201 bool is_add); 182 202 static int cgroup_file_release(struct inode *inode, struct file *file); 203 + static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 183 204 184 205 /** 185 206 * cgroup_css - obtain a cgroup's css for the specified subsystem ··· 243 262 } 244 263 245 264 /** 246 - * for_each_subsys - iterate all loaded cgroup subsystems 247 - * @ss: the iteration cursor 248 - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 265 + * for_each_css - iterate all css's of a cgroup 266 + * @css: the iteration cursor 267 + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 268 + * @cgrp: the target cgroup to iterate css's of 249 269 * 250 270 * Should be called under cgroup_mutex. 251 271 */ 252 - #define for_each_subsys(ss, i) \ 253 - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ 254 - if (({ lockdep_assert_held(&cgroup_mutex); \ 255 - !((ss) = cgroup_subsys[i]); })) { } \ 272 + #define for_each_css(css, ssid, cgrp) \ 273 + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 274 + if (!((css) = rcu_dereference_check( \ 275 + (cgrp)->subsys[(ssid)], \ 276 + lockdep_is_held(&cgroup_mutex)))) { } \ 277 + else 278 + 279 + /** 280 + * for_each_subsys - iterate all loaded cgroup subsystems 281 + * @ss: the iteration cursor 282 + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 283 + * 284 + * Iterates through all loaded subsystems. Should be called under 285 + * cgroup_mutex or cgroup_root_mutex. 
286 + */ 287 + #define for_each_subsys(ss, ssid) \ 288 + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ 289 + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 290 + if (!((ss) = cgroup_subsys[(ssid)])) { } \ 256 291 else 257 292 258 293 /** ··· 282 285 #define for_each_builtin_subsys(ss, i) \ 283 286 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ 284 287 (((ss) = cgroup_subsys[i]) || true); (i)++) 285 - 286 - /* iterate each subsystem attached to a hierarchy */ 287 - #define for_each_root_subsys(root, ss) \ 288 - list_for_each_entry((ss), &(root)->subsys_list, sibling) 289 288 290 289 /* iterate across the active hierarchies */ 291 290 #define for_each_active_root(root) \ ··· 856 863 */ 857 864 deactivate_super(cgrp->root->sb); 858 865 859 - /* 860 - * if we're getting rid of the cgroup, refcount should ensure 861 - * that there are no pidlists left. 862 - */ 863 - BUG_ON(!list_empty(&cgrp->pidlists)); 866 + cgroup_pidlist_destroy_all(cgrp); 864 867 865 868 simple_xattrs_free(&cgrp->xattrs); 866 869 ··· 1039 1050 cgroup_css(cgroup_dummy_top, ss)); 1040 1051 cgroup_css(cgrp, ss)->cgroup = cgrp; 1041 1052 1042 - list_move(&ss->sibling, &root->subsys_list); 1043 1053 ss->root = root; 1044 1054 if (ss->bind) 1045 1055 ss->bind(cgroup_css(cgrp, ss)); ··· 1057 1069 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1058 1070 1059 1071 cgroup_subsys[i]->root = &cgroup_dummy_root; 1060 - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1061 1072 1062 1073 /* subsystem is now free - drop reference on module */ 1063 1074 module_put(ss->module); ··· 1083 1096 { 1084 1097 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1085 1098 struct cgroup_subsys *ss; 1099 + int ssid; 1086 1100 1087 1101 mutex_lock(&cgroup_root_mutex); 1088 - for_each_root_subsys(root, ss) 1089 - seq_printf(seq, ",%s", ss->name); 1102 + for_each_subsys(ss, ssid) 1103 + if (root->subsys_mask & (1 << ssid)) 1104 + seq_printf(seq, ",%s", ss->name); 1090 1105 if (root->flags & 
CGRP_ROOT_SANE_BEHAVIOR) 1091 1106 seq_puts(seq, ",sane_behavior"); 1092 1107 if (root->flags & CGRP_ROOT_NOPREFIX) ··· 1351 1362 INIT_LIST_HEAD(&cgrp->pidlists); 1352 1363 mutex_init(&cgrp->pidlist_mutex); 1353 1364 cgrp->dummy_css.cgroup = cgrp; 1354 - INIT_LIST_HEAD(&cgrp->event_list); 1355 - spin_lock_init(&cgrp->event_list_lock); 1356 1365 simple_xattrs_init(&cgrp->xattrs); 1357 1366 } 1358 1367 ··· 1358 1371 { 1359 1372 struct cgroup *cgrp = &root->top_cgroup; 1360 1373 1361 - INIT_LIST_HEAD(&root->subsys_list); 1362 1374 INIT_LIST_HEAD(&root->root_list); 1363 1375 root->number_of_cgroups = 1; 1364 1376 cgrp->root = root; ··· 1679 1693 return ERR_PTR(ret); 1680 1694 } 1681 1695 1682 - static void cgroup_kill_sb(struct super_block *sb) { 1696 + static void cgroup_kill_sb(struct super_block *sb) 1697 + { 1683 1698 struct cgroupfs_root *root = sb->s_fs_info; 1684 1699 struct cgroup *cgrp = &root->top_cgroup; 1685 1700 struct cgrp_cset_link *link, *tmp_link; ··· 1963 1976 bool threadgroup) 1964 1977 { 1965 1978 int retval, i, group_size; 1966 - struct cgroup_subsys *ss, *failed_ss = NULL; 1967 1979 struct cgroupfs_root *root = cgrp->root; 1980 + struct cgroup_subsys_state *css, *failed_css = NULL; 1968 1981 /* threadgroup list cursor and array */ 1969 1982 struct task_struct *leader = tsk; 1970 1983 struct task_and_cgroup *tc; ··· 2037 2050 /* 2038 2051 * step 1: check that we can legitimately attach to the cgroup. 2039 2052 */ 2040 - for_each_root_subsys(root, ss) { 2041 - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2042 - 2043 - if (ss->can_attach) { 2044 - retval = ss->can_attach(css, &tset); 2053 + for_each_css(css, i, cgrp) { 2054 + if (css->ss->can_attach) { 2055 + retval = css->ss->can_attach(css, &tset); 2045 2056 if (retval) { 2046 - failed_ss = ss; 2057 + failed_css = css; 2047 2058 goto out_cancel_attach; 2048 2059 } 2049 2060 } ··· 2077 2092 /* 2078 2093 * step 4: do subsystem attach callbacks. 
2079 2094 */ 2080 - for_each_root_subsys(root, ss) { 2081 - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2082 - 2083 - if (ss->attach) 2084 - ss->attach(css, &tset); 2085 - } 2095 + for_each_css(css, i, cgrp) 2096 + if (css->ss->attach) 2097 + css->ss->attach(css, &tset); 2086 2098 2087 2099 /* 2088 2100 * step 5: success! and cleanup ··· 2096 2114 } 2097 2115 out_cancel_attach: 2098 2116 if (retval) { 2099 - for_each_root_subsys(root, ss) { 2100 - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 2101 - 2102 - if (ss == failed_ss) 2117 + for_each_css(css, i, cgrp) { 2118 + if (css == failed_css) 2103 2119 break; 2104 - if (ss->cancel_attach) 2105 - ss->cancel_attach(css, &tset); 2120 + if (css->ss->cancel_attach) 2121 + css->ss->cancel_attach(css, &tset); 2106 2122 } 2107 2123 } 2108 2124 out_free_group_list: ··· 2128 2148 tsk = find_task_by_vpid(pid); 2129 2149 if (!tsk) { 2130 2150 rcu_read_unlock(); 2131 - ret= -ESRCH; 2151 + ret = -ESRCH; 2132 2152 goto out_unlock_cgroup; 2133 2153 } 2134 2154 /* ··· 2240 2260 return 0; 2241 2261 } 2242 2262 2243 - static int cgroup_release_agent_show(struct cgroup_subsys_state *css, 2244 - struct cftype *cft, struct seq_file *seq) 2263 + static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2245 2264 { 2246 - struct cgroup *cgrp = css->cgroup; 2265 + struct cgroup *cgrp = seq_css(seq)->cgroup; 2247 2266 2248 2267 if (!cgroup_lock_live_group(cgrp)) 2249 2268 return -ENODEV; ··· 2252 2273 return 0; 2253 2274 } 2254 2275 2255 - static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, 2256 - struct cftype *cft, struct seq_file *seq) 2276 + static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2257 2277 { 2258 - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); 2278 + struct cgroup *cgrp = seq_css(seq)->cgroup; 2279 + 2280 + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2259 2281 return 0; 2260 2282 } 2261 2283 2262 2284 /* A buffer size big enough for 
numbers or short strings */ 2263 2285 #define CGROUP_LOCAL_BUFFER_SIZE 64 2264 2286 2265 - static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, 2266 - struct cftype *cft, struct file *file, 2267 - const char __user *userbuf, size_t nbytes, 2268 - loff_t *unused_ppos) 2269 - { 2270 - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2271 - int retval = 0; 2272 - char *end; 2273 - 2274 - if (!nbytes) 2275 - return -EINVAL; 2276 - if (nbytes >= sizeof(buffer)) 2277 - return -E2BIG; 2278 - if (copy_from_user(buffer, userbuf, nbytes)) 2279 - return -EFAULT; 2280 - 2281 - buffer[nbytes] = 0; /* nul-terminate */ 2282 - if (cft->write_u64) { 2283 - u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2284 - if (*end) 2285 - return -EINVAL; 2286 - retval = cft->write_u64(css, cft, val); 2287 - } else { 2288 - s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2289 - if (*end) 2290 - return -EINVAL; 2291 - retval = cft->write_s64(css, cft, val); 2292 - } 2293 - if (!retval) 2294 - retval = nbytes; 2295 - return retval; 2296 - } 2297 - 2298 - static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, 2299 - struct cftype *cft, struct file *file, 2300 - const char __user *userbuf, size_t nbytes, 2301 - loff_t *unused_ppos) 2302 - { 2303 - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2304 - int retval = 0; 2305 - size_t max_bytes = cft->max_write_len; 2306 - char *buffer = local_buffer; 2307 - 2308 - if (!max_bytes) 2309 - max_bytes = sizeof(local_buffer) - 1; 2310 - if (nbytes >= max_bytes) 2311 - return -E2BIG; 2312 - /* Allocate a dynamic buffer if we need one */ 2313 - if (nbytes >= sizeof(local_buffer)) { 2314 - buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2315 - if (buffer == NULL) 2316 - return -ENOMEM; 2317 - } 2318 - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { 2319 - retval = -EFAULT; 2320 - goto out; 2321 - } 2322 - 2323 - buffer[nbytes] = 0; /* nul-terminate */ 2324 - retval = cft->write_string(css, cft, strstrip(buffer)); 2325 - if (!retval) 
2326 - retval = nbytes; 2327 - out: 2328 - if (buffer != local_buffer) 2329 - kfree(buffer); 2330 - return retval; 2331 - } 2332 - 2333 - static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2287 + static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, 2334 2288 size_t nbytes, loff_t *ppos) 2335 2289 { 2336 2290 struct cfent *cfe = __d_cfe(file->f_dentry); 2337 2291 struct cftype *cft = __d_cft(file->f_dentry); 2338 2292 struct cgroup_subsys_state *css = cfe->css; 2293 + size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; 2294 + char *buf; 2295 + int ret; 2339 2296 2340 - if (cft->write) 2341 - return cft->write(css, cft, file, buf, nbytes, ppos); 2342 - if (cft->write_u64 || cft->write_s64) 2343 - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); 2344 - if (cft->write_string) 2345 - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); 2346 - if (cft->trigger) { 2347 - int ret = cft->trigger(css, (unsigned int)cft->private); 2348 - return ret ? 
ret : nbytes; 2297 + if (nbytes >= max_bytes) 2298 + return -E2BIG; 2299 + 2300 + buf = kmalloc(nbytes + 1, GFP_KERNEL); 2301 + if (!buf) 2302 + return -ENOMEM; 2303 + 2304 + if (copy_from_user(buf, userbuf, nbytes)) { 2305 + ret = -EFAULT; 2306 + goto out_free; 2349 2307 } 2350 - return -EINVAL; 2351 - } 2352 2308 2353 - static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, 2354 - struct cftype *cft, struct file *file, 2355 - char __user *buf, size_t nbytes, loff_t *ppos) 2356 - { 2357 - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2358 - u64 val = cft->read_u64(css, cft); 2359 - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2309 + buf[nbytes] = '\0'; 2360 2310 2361 - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2362 - } 2363 - 2364 - static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, 2365 - struct cftype *cft, struct file *file, 2366 - char __user *buf, size_t nbytes, loff_t *ppos) 2367 - { 2368 - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2369 - s64 val = cft->read_s64(css, cft); 2370 - int len = sprintf(tmp, "%lld\n", (long long) val); 2371 - 2372 - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2373 - } 2374 - 2375 - static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2376 - size_t nbytes, loff_t *ppos) 2377 - { 2378 - struct cfent *cfe = __d_cfe(file->f_dentry); 2379 - struct cftype *cft = __d_cft(file->f_dentry); 2380 - struct cgroup_subsys_state *css = cfe->css; 2381 - 2382 - if (cft->read) 2383 - return cft->read(css, cft, file, buf, nbytes, ppos); 2384 - if (cft->read_u64) 2385 - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); 2386 - if (cft->read_s64) 2387 - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); 2388 - return -EINVAL; 2311 + if (cft->write_string) { 2312 + ret = cft->write_string(css, cft, strstrip(buf)); 2313 + } else if (cft->write_u64) { 2314 + unsigned long long v; 2315 + ret = kstrtoull(buf, 0, &v); 2316 + if (!ret) 2317 + ret = cft->write_u64(css, 
cft, v); 2318 + } else if (cft->write_s64) { 2319 + long long v; 2320 + ret = kstrtoll(buf, 0, &v); 2321 + if (!ret) 2322 + ret = cft->write_s64(css, cft, v); 2323 + } else if (cft->trigger) { 2324 + ret = cft->trigger(css, (unsigned int)cft->private); 2325 + } else { 2326 + ret = -EINVAL; 2327 + } 2328 + out_free: 2329 + kfree(buf); 2330 + return ret ?: nbytes; 2389 2331 } 2390 2332 2391 2333 /* ··· 2314 2414 * supports string->u64 maps, but can be extended in future. 2315 2415 */ 2316 2416 2317 - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2417 + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2318 2418 { 2319 - struct seq_file *sf = cb->state; 2320 - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 2419 + struct cftype *cft = seq_cft(seq); 2420 + 2421 + if (cft->seq_start) { 2422 + return cft->seq_start(seq, ppos); 2423 + } else { 2424 + /* 2425 + * The same behavior and code as single_open(). Returns 2426 + * !NULL if pos is at the beginning; otherwise, NULL. 2427 + */ 2428 + return NULL + !*ppos; 2429 + } 2430 + } 2431 + 2432 + static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2433 + { 2434 + struct cftype *cft = seq_cft(seq); 2435 + 2436 + if (cft->seq_next) { 2437 + return cft->seq_next(seq, v, ppos); 2438 + } else { 2439 + /* 2440 + * The same behavior and code as single_open(), always 2441 + * terminate after the initial read. 
2442 + */ 2443 + ++*ppos; 2444 + return NULL; 2445 + } 2446 + } 2447 + 2448 + static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2449 + { 2450 + struct cftype *cft = seq_cft(seq); 2451 + 2452 + if (cft->seq_stop) 2453 + cft->seq_stop(seq, v); 2321 2454 } 2322 2455 2323 2456 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2324 2457 { 2325 - struct cfent *cfe = m->private; 2326 - struct cftype *cft = cfe->type; 2327 - struct cgroup_subsys_state *css = cfe->css; 2458 + struct cftype *cft = seq_cft(m); 2459 + struct cgroup_subsys_state *css = seq_css(m); 2328 2460 2329 - if (cft->read_map) { 2330 - struct cgroup_map_cb cb = { 2331 - .fill = cgroup_map_add, 2332 - .state = m, 2333 - }; 2334 - return cft->read_map(css, cft, &cb); 2335 - } 2336 - return cft->read_seq_string(css, cft, m); 2461 + if (cft->seq_show) 2462 + return cft->seq_show(m, arg); 2463 + 2464 + if (cft->read_u64) 2465 + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); 2466 + else if (cft->read_s64) 2467 + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); 2468 + else 2469 + return -EINVAL; 2470 + return 0; 2337 2471 } 2338 2472 2339 - static const struct file_operations cgroup_seqfile_operations = { 2340 - .read = seq_read, 2341 - .write = cgroup_file_write, 2342 - .llseek = seq_lseek, 2343 - .release = cgroup_file_release, 2473 + static struct seq_operations cgroup_seq_operations = { 2474 + .start = cgroup_seqfile_start, 2475 + .next = cgroup_seqfile_next, 2476 + .stop = cgroup_seqfile_stop, 2477 + .show = cgroup_seqfile_show, 2344 2478 }; 2345 2479 2346 2480 static int cgroup_file_open(struct inode *inode, struct file *file) ··· 2383 2449 struct cftype *cft = __d_cft(file->f_dentry); 2384 2450 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2385 2451 struct cgroup_subsys_state *css; 2452 + struct cgroup_open_file *of; 2386 2453 int err; 2387 2454 2388 2455 err = generic_file_open(inode, file); ··· 2413 2478 WARN_ON_ONCE(cfe->css && cfe->css != css); 2414 2479 cfe->css 
= css; 2415 2480 2416 - if (cft->read_map || cft->read_seq_string) { 2417 - file->f_op = &cgroup_seqfile_operations; 2418 - err = single_open(file, cgroup_seqfile_show, cfe); 2419 - } else if (cft->open) { 2420 - err = cft->open(inode, file); 2481 + of = __seq_open_private(file, &cgroup_seq_operations, 2482 + sizeof(struct cgroup_open_file)); 2483 + if (of) { 2484 + of->cfe = cfe; 2485 + return 0; 2421 2486 } 2422 2487 2423 - if (css->ss && err) 2488 + if (css->ss) 2424 2489 css_put(css); 2425 - return err; 2490 + return -ENOMEM; 2426 2491 } 2427 2492 2428 2493 static int cgroup_file_release(struct inode *inode, struct file *file) 2429 2494 { 2430 2495 struct cfent *cfe = __d_cfe(file->f_dentry); 2431 - struct cftype *cft = __d_cft(file->f_dentry); 2432 2496 struct cgroup_subsys_state *css = cfe->css; 2433 - int ret = 0; 2434 2497 2435 - if (cft->release) 2436 - ret = cft->release(inode, file); 2437 2498 if (css->ss) 2438 2499 css_put(css); 2439 - if (file->f_op == &cgroup_seqfile_operations) 2440 - single_release(inode, file); 2441 - return ret; 2500 + return seq_release_private(inode, file); 2442 2501 } 2443 2502 2444 2503 /* ··· 2543 2614 } 2544 2615 2545 2616 static const struct file_operations cgroup_file_operations = { 2546 - .read = cgroup_file_read, 2617 + .read = seq_read, 2547 2618 .write = cgroup_file_write, 2548 2619 .llseek = generic_file_llseek, 2549 2620 .open = cgroup_file_open, ··· 2567 2638 .listxattr = cgroup_listxattr, 2568 2639 .removexattr = cgroup_removexattr, 2569 2640 }; 2570 - 2571 - /* 2572 - * Check if a file is a control file 2573 - */ 2574 - static inline struct cftype *__file_cft(struct file *file) 2575 - { 2576 - if (file_inode(file)->i_fop != &cgroup_file_operations) 2577 - return ERR_PTR(-EINVAL); 2578 - return __d_cft(file->f_dentry); 2579 - } 2580 2641 2581 2642 static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2582 2643 struct super_block *sb) ··· 2625 2706 if (cft->mode) 2626 2707 return cft->mode; 2627 2708 
2628 - if (cft->read || cft->read_u64 || cft->read_s64 || 2629 - cft->read_map || cft->read_seq_string) 2709 + if (cft->read_u64 || cft->read_s64 || cft->seq_show) 2630 2710 mode |= S_IRUGO; 2631 2711 2632 - if (cft->write || cft->write_u64 || cft->write_s64 || 2633 - cft->write_string || cft->trigger) 2712 + if (cft->write_u64 || cft->write_s64 || cft->write_string || 2713 + cft->trigger) 2634 2714 mode |= S_IWUSR; 2635 2715 2636 2716 return mode; ··· 2925 3007 * @parent_css: css whose children to walk 2926 3008 * 2927 3009 * This function returns the next child of @parent_css and should be called 2928 - * under RCU read lock. The only requirement is that @parent_css and 2929 - * @pos_css are accessible. The next sibling is guaranteed to be returned 2930 - * regardless of their states. 3010 + * under either cgroup_mutex or RCU read lock. The only requirement is 3011 + * that @parent_css and @pos_css are accessible. The next sibling is 3012 + * guaranteed to be returned regardless of their states. 2931 3013 */ 2932 3014 struct cgroup_subsys_state * 2933 3015 css_next_child(struct cgroup_subsys_state *pos_css, ··· 2937 3019 struct cgroup *cgrp = parent_css->cgroup; 2938 3020 struct cgroup *next; 2939 3021 2940 - WARN_ON_ONCE(!rcu_read_lock_held()); 3022 + cgroup_assert_mutex_or_rcu_locked(); 2941 3023 2942 3024 /* 2943 3025 * @pos could already have been removed. Once a cgroup is removed, ··· 2984 3066 * to visit for pre-order traversal of @root's descendants. @root is 2985 3067 * included in the iteration and the first node to be visited. 2986 3068 * 2987 - * While this function requires RCU read locking, it doesn't require the 2988 - * whole traversal to be contained in a single RCU critical section. This 2989 - * function will return the correct next descendant as long as both @pos 2990 - * and @root are accessible and @pos is a descendant of @root. 
3069 + * While this function requires cgroup_mutex or RCU read locking, it 3070 + * doesn't require the whole traversal to be contained in a single critical 3071 + * section. This function will return the correct next descendant as long 3072 + * as both @pos and @root are accessible and @pos is a descendant of @root. 2991 3073 */ 2992 3074 struct cgroup_subsys_state * 2993 3075 css_next_descendant_pre(struct cgroup_subsys_state *pos, ··· 2995 3077 { 2996 3078 struct cgroup_subsys_state *next; 2997 3079 2998 - WARN_ON_ONCE(!rcu_read_lock_held()); 3080 + cgroup_assert_mutex_or_rcu_locked(); 2999 3081 3000 3082 /* if first iteration, visit @root */ 3001 3083 if (!pos) ··· 3026 3108 * is returned. This can be used during pre-order traversal to skip 3027 3109 * subtree of @pos. 3028 3110 * 3029 - * While this function requires RCU read locking, it doesn't require the 3030 - * whole traversal to be contained in a single RCU critical section. This 3031 - * function will return the correct rightmost descendant as long as @pos is 3032 - * accessible. 3111 + * While this function requires cgroup_mutex or RCU read locking, it 3112 + * doesn't require the whole traversal to be contained in a single critical 3113 + * section. This function will return the correct rightmost descendant as 3114 + * long as @pos is accessible. 3033 3115 */ 3034 3116 struct cgroup_subsys_state * 3035 3117 css_rightmost_descendant(struct cgroup_subsys_state *pos) 3036 3118 { 3037 3119 struct cgroup_subsys_state *last, *tmp; 3038 3120 3039 - WARN_ON_ONCE(!rcu_read_lock_held()); 3121 + cgroup_assert_mutex_or_rcu_locked(); 3040 3122 3041 3123 do { 3042 3124 last = pos; ··· 3072 3154 * to visit for post-order traversal of @root's descendants. @root is 3073 3155 * included in the iteration and the last node to be visited. 3074 3156 * 3075 - * While this function requires RCU read locking, it doesn't require the 3076 - * whole traversal to be contained in a single RCU critical section. 
This 3077 - * function will return the correct next descendant as long as both @pos 3078 - * and @cgroup are accessible and @pos is a descendant of @cgroup. 3157 + * While this function requires cgroup_mutex or RCU read locking, it 3158 + * doesn't require the whole traversal to be contained in a single critical 3159 + * section. This function will return the correct next descendant as long 3160 + * as both @pos and @cgroup are accessible and @pos is a descendant of 3161 + * @cgroup. 3079 3162 */ 3080 3163 struct cgroup_subsys_state * 3081 3164 css_next_descendant_post(struct cgroup_subsys_state *pos, ··· 3084 3165 { 3085 3166 struct cgroup_subsys_state *next; 3086 3167 3087 - WARN_ON_ONCE(!rcu_read_lock_held()); 3168 + cgroup_assert_mutex_or_rcu_locked(); 3088 3169 3089 3170 /* if first iteration, visit leftmost descendant which may be @root */ 3090 3171 if (!pos) ··· 3423 3504 pid_t *list; 3424 3505 /* how many elements the above list has */ 3425 3506 int length; 3426 - /* how many files are using the current array */ 3427 - int use_count; 3428 3507 /* each of these stored in a list by its cgroup */ 3429 3508 struct list_head links; 3430 3509 /* pointer to the cgroup we belong to, for list removal purposes */ 3431 3510 struct cgroup *owner; 3432 - /* protects the other fields */ 3433 - struct rw_semaphore rwsem; 3511 + /* for delayed destruction */ 3512 + struct delayed_work destroy_dwork; 3434 3513 }; 3435 3514 3436 3515 /* ··· 3444 3527 else 3445 3528 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3446 3529 } 3530 + 3447 3531 static void pidlist_free(void *p) 3448 3532 { 3449 3533 if (is_vmalloc_addr(p)) 3450 3534 vfree(p); 3451 3535 else 3452 3536 kfree(p); 3537 + } 3538 + 3539 + /* 3540 + * Used to destroy all pidlists lingering waiting for destroy timer. None 3541 + * should be left afterwards. 
3542 + */ 3543 + static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) 3544 + { 3545 + struct cgroup_pidlist *l, *tmp_l; 3546 + 3547 + mutex_lock(&cgrp->pidlist_mutex); 3548 + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) 3549 + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); 3550 + mutex_unlock(&cgrp->pidlist_mutex); 3551 + 3552 + flush_workqueue(cgroup_pidlist_destroy_wq); 3553 + BUG_ON(!list_empty(&cgrp->pidlists)); 3554 + } 3555 + 3556 + static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) 3557 + { 3558 + struct delayed_work *dwork = to_delayed_work(work); 3559 + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, 3560 + destroy_dwork); 3561 + struct cgroup_pidlist *tofree = NULL; 3562 + 3563 + mutex_lock(&l->owner->pidlist_mutex); 3564 + 3565 + /* 3566 + * Destroy iff we didn't get queued again. The state won't change 3567 + * as destroy_dwork can only be queued while locked. 3568 + */ 3569 + if (!delayed_work_pending(dwork)) { 3570 + list_del(&l->links); 3571 + pidlist_free(l->list); 3572 + put_pid_ns(l->key.ns); 3573 + tofree = l; 3574 + } 3575 + 3576 + mutex_unlock(&l->owner->pidlist_mutex); 3577 + kfree(tofree); 3453 3578 } 3454 3579 3455 3580 /* ··· 3524 3565 return dest; 3525 3566 } 3526 3567 3568 + /* 3569 + * The two pid files - task and cgroup.procs - guaranteed that the result 3570 + * is sorted, which forced this whole pidlist fiasco. As pid order is 3571 + * different per namespace, each namespace needs differently sorted list, 3572 + * making it impossible to use, for example, single rbtree of member tasks 3573 + * sorted by task pointer. As pidlists can be fairly large, allocating one 3574 + * per open file is dangerous, so cgroup had to implement shared pool of 3575 + * pidlists keyed by cgroup and namespace. 3576 + * 3577 + * All this extra complexity was caused by the original implementation 3578 + * committing to an entirely unnecessary property. 
In the long term, we 3579 + * want to do away with it. Explicitly scramble sort order if 3580 + * sane_behavior so that no such expectation exists in the new interface. 3581 + * 3582 + * Scrambling is done by swapping every two consecutive bits, which is 3583 + * non-identity one-to-one mapping which disturbs sort order sufficiently. 3584 + */ 3585 + static pid_t pid_fry(pid_t pid) 3586 + { 3587 + unsigned a = pid & 0x55555555; 3588 + unsigned b = pid & 0xAAAAAAAA; 3589 + 3590 + return (a << 1) | (b >> 1); 3591 + } 3592 + 3593 + static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3594 + { 3595 + if (cgroup_sane_behavior(cgrp)) 3596 + return pid_fry(pid); 3597 + else 3598 + return pid; 3599 + } 3600 + 3527 3601 static int cmppid(const void *a, const void *b) 3528 3602 { 3529 3603 return *(pid_t *)a - *(pid_t *)b; 3604 + } 3605 + 3606 + static int fried_cmppid(const void *a, const void *b) 3607 + { 3608 + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); 3609 + } 3610 + 3611 + static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3612 + enum cgroup_filetype type) 3613 + { 3614 + struct cgroup_pidlist *l; 3615 + /* don't need task_nsproxy() if we're looking at ourself */ 3616 + struct pid_namespace *ns = task_active_pid_ns(current); 3617 + 3618 + lockdep_assert_held(&cgrp->pidlist_mutex); 3619 + 3620 + list_for_each_entry(l, &cgrp->pidlists, links) 3621 + if (l->key.type == type && l->key.ns == ns) 3622 + return l; 3623 + return NULL; 3530 3624 } 3531 3625 3532 3626 /* ··· 3588 3576 * of the use count, or returns NULL with no locks held if we're out of 3589 3577 * memory. 
3590 3578 */ 3591 - static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3592 - enum cgroup_filetype type) 3579 + static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, 3580 + enum cgroup_filetype type) 3593 3581 { 3594 3582 struct cgroup_pidlist *l; 3595 - /* don't need task_nsproxy() if we're looking at ourself */ 3596 - struct pid_namespace *ns = task_active_pid_ns(current); 3597 3583 3598 - /* 3599 - * We can't drop the pidlist_mutex before taking the l->rwsem in case 3600 - * the last ref-holder is trying to remove l from the list at the same 3601 - * time. Holding the pidlist_mutex precludes somebody taking whichever 3602 - * list we find out from under us - compare release_pid_array(). 3603 - */ 3604 - mutex_lock(&cgrp->pidlist_mutex); 3605 - list_for_each_entry(l, &cgrp->pidlists, links) { 3606 - if (l->key.type == type && l->key.ns == ns) { 3607 - /* make sure l doesn't vanish out from under us */ 3608 - down_write(&l->rwsem); 3609 - mutex_unlock(&cgrp->pidlist_mutex); 3610 - return l; 3611 - } 3612 - } 3584 + lockdep_assert_held(&cgrp->pidlist_mutex); 3585 + 3586 + l = cgroup_pidlist_find(cgrp, type); 3587 + if (l) 3588 + return l; 3589 + 3613 3590 /* entry not found; create a new one */ 3614 3591 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3615 - if (!l) { 3616 - mutex_unlock(&cgrp->pidlist_mutex); 3592 + if (!l) 3617 3593 return l; 3618 - } 3619 - init_rwsem(&l->rwsem); 3620 - down_write(&l->rwsem); 3594 + 3595 + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); 3621 3596 l->key.type = type; 3622 - l->key.ns = get_pid_ns(ns); 3597 + /* don't need task_nsproxy() if we're looking at ourself */ 3598 + l->key.ns = get_pid_ns(task_active_pid_ns(current)); 3623 3599 l->owner = cgrp; 3624 3600 list_add(&l->links, &cgrp->pidlists); 3625 - mutex_unlock(&cgrp->pidlist_mutex); 3626 3601 return l; 3627 3602 } 3628 3603 ··· 3625 3626 struct css_task_iter it; 3626 3627 struct task_struct *tsk; 
3627 3628 struct cgroup_pidlist *l; 3629 + 3630 + lockdep_assert_held(&cgrp->pidlist_mutex); 3628 3631 3629 3632 /* 3630 3633 * If cgroup gets more users after we read count, we won't have ··· 3654 3653 css_task_iter_end(&it); 3655 3654 length = n; 3656 3655 /* now sort & (if procs) strip out duplicates */ 3657 - sort(array, length, sizeof(pid_t), cmppid, NULL); 3656 + if (cgroup_sane_behavior(cgrp)) 3657 + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3658 + else 3659 + sort(array, length, sizeof(pid_t), cmppid, NULL); 3658 3660 if (type == CGROUP_FILE_PROCS) 3659 3661 length = pidlist_uniq(array, length); 3660 - l = cgroup_pidlist_find(cgrp, type); 3662 + 3663 + l = cgroup_pidlist_find_create(cgrp, type); 3661 3664 if (!l) { 3665 + mutex_unlock(&cgrp->pidlist_mutex); 3662 3666 pidlist_free(array); 3663 3667 return -ENOMEM; 3664 3668 } 3665 - /* store array, freeing old if necessary - lock already held */ 3669 + 3670 + /* store array, freeing old if necessary */ 3666 3671 pidlist_free(l->list); 3667 3672 l->list = array; 3668 3673 l->length = length; 3669 - l->use_count++; 3670 - up_write(&l->rwsem); 3671 3674 *lp = l; 3672 3675 return 0; 3673 3676 } ··· 3745 3740 * after a seek to the start). Use a binary-search to find the 3746 3741 * next pid to display, if any 3747 3742 */ 3748 - struct cgroup_pidlist *l = s->private; 3743 + struct cgroup_open_file *of = s->private; 3744 + struct cgroup *cgrp = seq_css(s)->cgroup; 3745 + struct cgroup_pidlist *l; 3746 + enum cgroup_filetype type = seq_cft(s)->private; 3749 3747 int index = 0, pid = *pos; 3750 - int *iter; 3748 + int *iter, ret; 3751 3749 3752 - down_read(&l->rwsem); 3750 + mutex_lock(&cgrp->pidlist_mutex); 3751 + 3752 + /* 3753 + * !NULL @of->priv indicates that this isn't the first start() 3754 + * after open. If the matching pidlist is around, we can use that. 3755 + * Look for it. Note that @of->priv can't be used directly. It 3756 + * could already have been destroyed. 
3757 + */ 3758 + if (of->priv) 3759 + of->priv = cgroup_pidlist_find(cgrp, type); 3760 + 3761 + /* 3762 + * Either this is the first start() after open or the matching 3763 + * pidlist has been destroyed inbetween. Create a new one. 3764 + */ 3765 + if (!of->priv) { 3766 + ret = pidlist_array_load(cgrp, type, 3767 + (struct cgroup_pidlist **)&of->priv); 3768 + if (ret) 3769 + return ERR_PTR(ret); 3770 + } 3771 + l = of->priv; 3772 + 3753 3773 if (pid) { 3754 3774 int end = l->length; 3755 3775 3756 3776 while (index < end) { 3757 3777 int mid = (index + end) / 2; 3758 - if (l->list[mid] == pid) { 3778 + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { 3759 3779 index = mid; 3760 3780 break; 3761 - } else if (l->list[mid] <= pid) 3781 + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) 3762 3782 index = mid + 1; 3763 3783 else 3764 3784 end = mid; ··· 3794 3764 return NULL; 3795 3765 /* Update the abstract position to be the actual pid that we found */ 3796 3766 iter = l->list + index; 3797 - *pos = *iter; 3767 + *pos = cgroup_pid_fry(cgrp, *iter); 3798 3768 return iter; 3799 3769 } 3800 3770 3801 3771 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3802 3772 { 3803 - struct cgroup_pidlist *l = s->private; 3804 - up_read(&l->rwsem); 3773 + struct cgroup_open_file *of = s->private; 3774 + struct cgroup_pidlist *l = of->priv; 3775 + 3776 + if (l) 3777 + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 3778 + CGROUP_PIDLIST_DESTROY_DELAY); 3779 + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); 3805 3780 } 3806 3781 3807 3782 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3808 3783 { 3809 - struct cgroup_pidlist *l = s->private; 3784 + struct cgroup_open_file *of = s->private; 3785 + struct cgroup_pidlist *l = of->priv; 3810 3786 pid_t *p = v; 3811 3787 pid_t *end = l->list + l->length; 3812 3788 /* ··· 3823 3787 if (p >= end) { 3824 3788 return NULL; 3825 3789 } else { 3826 - *pos = *p; 3790 + *pos = 
cgroup_pid_fry(seq_css(s)->cgroup, *p); 3827 3791 return p; 3828 3792 } 3829 3793 } ··· 3843 3807 .next = cgroup_pidlist_next, 3844 3808 .show = cgroup_pidlist_show, 3845 3809 }; 3846 - 3847 - static void cgroup_release_pid_array(struct cgroup_pidlist *l) 3848 - { 3849 - /* 3850 - * the case where we're the last user of this particular pidlist will 3851 - * have us remove it from the cgroup's list, which entails taking the 3852 - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> 3853 - * pidlist_mutex, we have to take pidlist_mutex first. 3854 - */ 3855 - mutex_lock(&l->owner->pidlist_mutex); 3856 - down_write(&l->rwsem); 3857 - BUG_ON(!l->use_count); 3858 - if (!--l->use_count) { 3859 - /* we're the last user if refcount is 0; remove and free */ 3860 - list_del(&l->links); 3861 - mutex_unlock(&l->owner->pidlist_mutex); 3862 - pidlist_free(l->list); 3863 - put_pid_ns(l->key.ns); 3864 - up_write(&l->rwsem); 3865 - kfree(l); 3866 - return; 3867 - } 3868 - mutex_unlock(&l->owner->pidlist_mutex); 3869 - up_write(&l->rwsem); 3870 - } 3871 - 3872 - static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3873 - { 3874 - struct cgroup_pidlist *l; 3875 - if (!(file->f_mode & FMODE_READ)) 3876 - return 0; 3877 - /* 3878 - * the seq_file will only be initialized if the file was opened for 3879 - * reading; hence we check if it's not null only in that case. 3880 - */ 3881 - l = ((struct seq_file *)file->private_data)->private; 3882 - cgroup_release_pid_array(l); 3883 - return seq_release(inode, file); 3884 - } 3885 - 3886 - static const struct file_operations cgroup_pidlist_operations = { 3887 - .read = seq_read, 3888 - .llseek = seq_lseek, 3889 - .write = cgroup_file_write, 3890 - .release = cgroup_pidlist_release, 3891 - }; 3892 - 3893 - /* 3894 - * The following functions handle opens on a file that displays a pidlist 3895 - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's 3896 - * in the cgroup. 
3897 - */ 3898 - /* helper function for the two below it */ 3899 - static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) 3900 - { 3901 - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 3902 - struct cgroup_pidlist *l; 3903 - int retval; 3904 - 3905 - /* Nothing to do for write-only files */ 3906 - if (!(file->f_mode & FMODE_READ)) 3907 - return 0; 3908 - 3909 - /* have the array populated */ 3910 - retval = pidlist_array_load(cgrp, type, &l); 3911 - if (retval) 3912 - return retval; 3913 - /* configure file information */ 3914 - file->f_op = &cgroup_pidlist_operations; 3915 - 3916 - retval = seq_open(file, &cgroup_pidlist_seq_operations); 3917 - if (retval) { 3918 - cgroup_release_pid_array(l); 3919 - return retval; 3920 - } 3921 - ((struct seq_file *)file->private_data)->private = l; 3922 - return 0; 3923 - } 3924 - static int cgroup_tasks_open(struct inode *unused, struct file *file) 3925 - { 3926 - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); 3927 - } 3928 - static int cgroup_procs_open(struct inode *unused, struct file *file) 3929 - { 3930 - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3931 - } 3932 3810 3933 3811 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3934 3812 struct cftype *cft) ··· 3878 3928 deactivate_super(sb); 3879 3929 } 3880 3930 3881 - /* 3882 - * Unregister event and free resources. 3883 - * 3884 - * Gets called from workqueue. 3885 - */ 3886 - static void cgroup_event_remove(struct work_struct *work) 3887 - { 3888 - struct cgroup_event *event = container_of(work, struct cgroup_event, 3889 - remove); 3890 - struct cgroup_subsys_state *css = event->css; 3891 - 3892 - remove_wait_queue(event->wqh, &event->wait); 3893 - 3894 - event->cft->unregister_event(css, event->cft, event->eventfd); 3895 - 3896 - /* Notify userspace the event is going away. 
*/ 3897 - eventfd_signal(event->eventfd, 1); 3898 - 3899 - eventfd_ctx_put(event->eventfd); 3900 - kfree(event); 3901 - css_put(css); 3902 - } 3903 - 3904 - /* 3905 - * Gets called on POLLHUP on eventfd when user closes it. 3906 - * 3907 - * Called with wqh->lock held and interrupts disabled. 3908 - */ 3909 - static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 3910 - int sync, void *key) 3911 - { 3912 - struct cgroup_event *event = container_of(wait, 3913 - struct cgroup_event, wait); 3914 - struct cgroup *cgrp = event->css->cgroup; 3915 - unsigned long flags = (unsigned long)key; 3916 - 3917 - if (flags & POLLHUP) { 3918 - /* 3919 - * If the event has been detached at cgroup removal, we 3920 - * can simply return knowing the other side will cleanup 3921 - * for us. 3922 - * 3923 - * We can't race against event freeing since the other 3924 - * side will require wqh->lock via remove_wait_queue(), 3925 - * which we hold. 3926 - */ 3927 - spin_lock(&cgrp->event_list_lock); 3928 - if (!list_empty(&event->list)) { 3929 - list_del_init(&event->list); 3930 - /* 3931 - * We are in atomic context, but cgroup_event_remove() 3932 - * may sleep, so we have to call it in workqueue. 3933 - */ 3934 - schedule_work(&event->remove); 3935 - } 3936 - spin_unlock(&cgrp->event_list_lock); 3937 - } 3938 - 3939 - return 0; 3940 - } 3941 - 3942 - static void cgroup_event_ptable_queue_proc(struct file *file, 3943 - wait_queue_head_t *wqh, poll_table *pt) 3944 - { 3945 - struct cgroup_event *event = container_of(pt, 3946 - struct cgroup_event, pt); 3947 - 3948 - event->wqh = wqh; 3949 - add_wait_queue(wqh, &event->wait); 3950 - } 3951 - 3952 - /* 3953 - * Parse input and register new cgroup event handler. 3954 - * 3955 - * Input must be in format '<event_fd> <control_fd> <args>'. 3956 - * Interpretation of args is defined by control file implementation. 
3957 - */ 3958 - static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, 3959 - struct cftype *cft, const char *buffer) 3960 - { 3961 - struct cgroup *cgrp = dummy_css->cgroup; 3962 - struct cgroup_event *event; 3963 - struct cgroup_subsys_state *cfile_css; 3964 - unsigned int efd, cfd; 3965 - struct fd efile; 3966 - struct fd cfile; 3967 - char *endp; 3968 - int ret; 3969 - 3970 - efd = simple_strtoul(buffer, &endp, 10); 3971 - if (*endp != ' ') 3972 - return -EINVAL; 3973 - buffer = endp + 1; 3974 - 3975 - cfd = simple_strtoul(buffer, &endp, 10); 3976 - if ((*endp != ' ') && (*endp != '\0')) 3977 - return -EINVAL; 3978 - buffer = endp + 1; 3979 - 3980 - event = kzalloc(sizeof(*event), GFP_KERNEL); 3981 - if (!event) 3982 - return -ENOMEM; 3983 - 3984 - INIT_LIST_HEAD(&event->list); 3985 - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 3986 - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 3987 - INIT_WORK(&event->remove, cgroup_event_remove); 3988 - 3989 - efile = fdget(efd); 3990 - if (!efile.file) { 3991 - ret = -EBADF; 3992 - goto out_kfree; 3993 - } 3994 - 3995 - event->eventfd = eventfd_ctx_fileget(efile.file); 3996 - if (IS_ERR(event->eventfd)) { 3997 - ret = PTR_ERR(event->eventfd); 3998 - goto out_put_efile; 3999 - } 4000 - 4001 - cfile = fdget(cfd); 4002 - if (!cfile.file) { 4003 - ret = -EBADF; 4004 - goto out_put_eventfd; 4005 - } 4006 - 4007 - /* the process need read permission on control file */ 4008 - /* AV: shouldn't we check that it's been opened for read instead? 
*/ 4009 - ret = inode_permission(file_inode(cfile.file), MAY_READ); 4010 - if (ret < 0) 4011 - goto out_put_cfile; 4012 - 4013 - event->cft = __file_cft(cfile.file); 4014 - if (IS_ERR(event->cft)) { 4015 - ret = PTR_ERR(event->cft); 4016 - goto out_put_cfile; 4017 - } 4018 - 4019 - if (!event->cft->ss) { 4020 - ret = -EBADF; 4021 - goto out_put_cfile; 4022 - } 4023 - 4024 - /* 4025 - * Determine the css of @cfile, verify it belongs to the same 4026 - * cgroup as cgroup.event_control, and associate @event with it. 4027 - * Remaining events are automatically removed on cgroup destruction 4028 - * but the removal is asynchronous, so take an extra ref. 4029 - */ 4030 - rcu_read_lock(); 4031 - 4032 - ret = -EINVAL; 4033 - event->css = cgroup_css(cgrp, event->cft->ss); 4034 - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); 4035 - if (event->css && event->css == cfile_css && css_tryget(event->css)) 4036 - ret = 0; 4037 - 4038 - rcu_read_unlock(); 4039 - if (ret) 4040 - goto out_put_cfile; 4041 - 4042 - if (!event->cft->register_event || !event->cft->unregister_event) { 4043 - ret = -EINVAL; 4044 - goto out_put_css; 4045 - } 4046 - 4047 - ret = event->cft->register_event(event->css, event->cft, 4048 - event->eventfd, buffer); 4049 - if (ret) 4050 - goto out_put_css; 4051 - 4052 - efile.file->f_op->poll(efile.file, &event->pt); 4053 - 4054 - spin_lock(&cgrp->event_list_lock); 4055 - list_add(&event->list, &cgrp->event_list); 4056 - spin_unlock(&cgrp->event_list_lock); 4057 - 4058 - fdput(cfile); 4059 - fdput(efile); 4060 - 4061 - return 0; 4062 - 4063 - out_put_css: 4064 - css_put(event->css); 4065 - out_put_cfile: 4066 - fdput(cfile); 4067 - out_put_eventfd: 4068 - eventfd_ctx_put(event->eventfd); 4069 - out_put_efile: 4070 - fdput(efile); 4071 - out_kfree: 4072 - kfree(event); 4073 - 4074 - return ret; 4075 - } 4076 - 4077 3931 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 4078 3932 struct cftype *cft) 4079 3933 { ··· 
3897 4143 static struct cftype cgroup_base_files[] = { 3898 4144 { 3899 4145 .name = "cgroup.procs", 3900 - .open = cgroup_procs_open, 4146 + .seq_start = cgroup_pidlist_start, 4147 + .seq_next = cgroup_pidlist_next, 4148 + .seq_stop = cgroup_pidlist_stop, 4149 + .seq_show = cgroup_pidlist_show, 4150 + .private = CGROUP_FILE_PROCS, 3901 4151 .write_u64 = cgroup_procs_write, 3902 - .release = cgroup_pidlist_release, 3903 4152 .mode = S_IRUGO | S_IWUSR, 3904 - }, 3905 - { 3906 - .name = "cgroup.event_control", 3907 - .write_string = cgroup_write_event_control, 3908 - .mode = S_IWUGO, 3909 4153 }, 3910 4154 { 3911 4155 .name = "cgroup.clone_children", ··· 3914 4162 { 3915 4163 .name = "cgroup.sane_behavior", 3916 4164 .flags = CFTYPE_ONLY_ON_ROOT, 3917 - .read_seq_string = cgroup_sane_behavior_show, 4165 + .seq_show = cgroup_sane_behavior_show, 3918 4166 }, 3919 4167 3920 4168 /* ··· 3925 4173 { 3926 4174 .name = "tasks", 3927 4175 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3928 - .open = cgroup_tasks_open, 4176 + .seq_start = cgroup_pidlist_start, 4177 + .seq_next = cgroup_pidlist_next, 4178 + .seq_stop = cgroup_pidlist_stop, 4179 + .seq_show = cgroup_pidlist_show, 4180 + .private = CGROUP_FILE_TASKS, 3929 4181 .write_u64 = cgroup_tasks_write, 3930 - .release = cgroup_pidlist_release, 3931 4182 .mode = S_IRUGO | S_IWUSR, 3932 4183 }, 3933 4184 { ··· 3942 4187 { 3943 4188 .name = "release_agent", 3944 4189 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3945 - .read_seq_string = cgroup_release_agent_show, 4190 + .seq_show = cgroup_release_agent_show, 3946 4191 .write_string = cgroup_release_agent_write, 3947 4192 .max_write_len = PATH_MAX, 3948 4193 }, ··· 4088 4333 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 4089 4334 } 4090 4335 4336 + /** 4337 + * create_css - create a cgroup_subsys_state 4338 + * @cgrp: the cgroup new css will be associated with 4339 + * @ss: the subsys of new css 4340 + * 4341 + * Create a new css associated with @cgrp - @ss 
pair. On success, the new 4342 + * css is online and installed in @cgrp with all interface files created. 4343 + * Returns 0 on success, -errno on failure. 4344 + */ 4345 + static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4346 + { 4347 + struct cgroup *parent = cgrp->parent; 4348 + struct cgroup_subsys_state *css; 4349 + int err; 4350 + 4351 + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 4352 + lockdep_assert_held(&cgroup_mutex); 4353 + 4354 + css = ss->css_alloc(cgroup_css(parent, ss)); 4355 + if (IS_ERR(css)) 4356 + return PTR_ERR(css); 4357 + 4358 + err = percpu_ref_init(&css->refcnt, css_release); 4359 + if (err) 4360 + goto err_free; 4361 + 4362 + init_css(css, ss, cgrp); 4363 + 4364 + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 4365 + if (err) 4366 + goto err_free; 4367 + 4368 + err = online_css(css); 4369 + if (err) 4370 + goto err_free; 4371 + 4372 + dget(cgrp->dentry); 4373 + css_get(css->parent); 4374 + 4375 + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4376 + parent->parent) { 4377 + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", 4378 + current->comm, current->pid, ss->name); 4379 + if (!strcmp(ss->name, "memory")) 4380 + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4381 + ss->warned_broken_hierarchy = true; 4382 + } 4383 + 4384 + return 0; 4385 + 4386 + err_free: 4387 + percpu_ref_cancel_init(&css->refcnt); 4388 + ss->css_free(css); 4389 + return err; 4390 + } 4391 + 4091 4392 /* 4092 4393 * cgroup_create - create a cgroup 4093 4394 * @parent: cgroup that will be parent of the new cgroup ··· 4155 4344 static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4156 4345 umode_t mode) 4157 4346 { 4158 - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; 4159 4347 struct cgroup *cgrp; 4160 4348 struct cgroup_name *name; 4161 4349 struct cgroupfs_root *root = parent->root; 4162 - int err = 0; 4350 + int ssid, err = 0; 4163 4351 struct cgroup_subsys *ss; 4164 4352 struct super_block *sb = root->sb; 4165 4353 ··· 4214 4404 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4215 4405 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4216 4406 4217 - for_each_root_subsys(root, ss) { 4218 - struct cgroup_subsys_state *css; 4219 - 4220 - css = ss->css_alloc(cgroup_css(parent, ss)); 4221 - if (IS_ERR(css)) { 4222 - err = PTR_ERR(css); 4223 - goto err_free_all; 4224 - } 4225 - css_ar[ss->subsys_id] = css; 4226 - 4227 - err = percpu_ref_init(&css->refcnt, css_release); 4228 - if (err) 4229 - goto err_free_all; 4230 - 4231 - init_css(css, ss, cgrp); 4232 - } 4233 - 4234 4407 /* 4235 4408 * Create directory. 
cgroup_create_file() returns with the new 4236 4409 * directory locked on success so that it can be populated without ··· 4221 4428 */ 4222 4429 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4223 4430 if (err < 0) 4224 - goto err_free_all; 4431 + goto err_unlock; 4225 4432 lockdep_assert_held(&dentry->d_inode->i_mutex); 4226 4433 4227 4434 cgrp->serial_nr = cgroup_serial_nr_next++; ··· 4233 4440 /* hold a ref to the parent's dentry */ 4234 4441 dget(parent->dentry); 4235 4442 4236 - /* creation succeeded, notify subsystems */ 4237 - for_each_root_subsys(root, ss) { 4238 - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4239 - 4240 - err = online_css(css); 4241 - if (err) 4242 - goto err_destroy; 4243 - 4244 - /* each css holds a ref to the cgroup's dentry and parent css */ 4245 - dget(dentry); 4246 - css_get(css->parent); 4247 - 4248 - /* mark it consumed for error path */ 4249 - css_ar[ss->subsys_id] = NULL; 4250 - 4251 - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4252 - parent->parent) { 4253 - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4254 - current->comm, current->pid, ss->name); 4255 - if (!strcmp(ss->name, "memory")) 4256 - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4257 - ss->warned_broken_hierarchy = true; 4258 - } 4259 - } 4260 - 4443 + /* 4444 + * @cgrp is now fully operational. If something fails after this 4445 + * point, it'll be released via the normal destruction path. 
4446 + */ 4261 4447 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4262 4448 4263 4449 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4264 4450 if (err) 4265 4451 goto err_destroy; 4266 4452 4267 - err = cgroup_populate_dir(cgrp, root->subsys_mask); 4268 - if (err) 4269 - goto err_destroy; 4453 + /* let's create and online css's */ 4454 + for_each_subsys(ss, ssid) { 4455 + if (root->subsys_mask & (1 << ssid)) { 4456 + err = create_css(cgrp, ss); 4457 + if (err) 4458 + goto err_destroy; 4459 + } 4460 + } 4270 4461 4271 4462 mutex_unlock(&cgroup_mutex); 4272 4463 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4273 4464 4274 4465 return 0; 4275 4466 4276 - err_free_all: 4277 - for_each_root_subsys(root, ss) { 4278 - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4279 - 4280 - if (css) { 4281 - percpu_ref_cancel_init(&css->refcnt); 4282 - ss->css_free(css); 4283 - } 4284 - } 4467 + err_unlock: 4285 4468 mutex_unlock(&cgroup_mutex); 4286 4469 /* Release the reference count that we took on the superblock */ 4287 4470 deactivate_super(sb); ··· 4270 4501 return err; 4271 4502 4272 4503 err_destroy: 4273 - for_each_root_subsys(root, ss) { 4274 - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; 4275 - 4276 - if (css) { 4277 - percpu_ref_cancel_init(&css->refcnt); 4278 - ss->css_free(css); 4279 - } 4280 - } 4281 4504 cgroup_destroy_locked(cgrp); 4282 4505 mutex_unlock(&cgroup_mutex); 4283 4506 mutex_unlock(&dentry->d_inode->i_mutex); ··· 4392 4631 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4393 4632 { 4394 4633 struct dentry *d = cgrp->dentry; 4395 - struct cgroup_event *event, *tmp; 4396 - struct cgroup_subsys *ss; 4634 + struct cgroup_subsys_state *css; 4397 4635 struct cgroup *child; 4398 4636 bool empty; 4637 + int ssid; 4399 4638 4400 4639 lockdep_assert_held(&d->d_inode->i_mutex); 4401 4640 lockdep_assert_held(&cgroup_mutex); ··· 4431 4670 * will be invoked to perform the rest of destruction once the 4432 4671 * percpu refs of all 
css's are confirmed to be killed. 4433 4672 */ 4434 - for_each_root_subsys(cgrp->root, ss) { 4435 - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); 4436 - 4437 - if (css) 4438 - kill_css(css); 4439 - } 4673 + for_each_css(css, ssid, cgrp) 4674 + kill_css(css); 4440 4675 4441 4676 /* 4442 4677 * Mark @cgrp dead. This prevents further task migration and child ··· 4466 4709 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4467 4710 dget(d); 4468 4711 cgroup_d_remove_dir(d); 4469 - 4470 - /* 4471 - * Unregister events and notify userspace. 4472 - * Notify userspace about cgroup removing only after rmdir of cgroup 4473 - * directory to avoid race between userspace and kernelspace. 4474 - */ 4475 - spin_lock(&cgrp->event_list_lock); 4476 - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4477 - list_del_init(&event->list); 4478 - schedule_work(&event->remove); 4479 - } 4480 - spin_unlock(&cgrp->event_list_lock); 4481 4712 4482 4713 return 0; 4483 4714 }; ··· 4537 4792 cgroup_init_cftsets(ss); 4538 4793 4539 4794 /* Create the top cgroup state for this subsystem */ 4540 - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4541 4795 ss->root = &cgroup_dummy_root; 4542 4796 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4543 4797 /* We don't handle early failures gracefully */ ··· 4610 4866 cgroup_init_cftsets(ss); 4611 4867 4612 4868 mutex_lock(&cgroup_mutex); 4869 + mutex_lock(&cgroup_root_mutex); 4613 4870 cgroup_subsys[ss->subsys_id] = ss; 4614 4871 4615 4872 /* ··· 4622 4877 if (IS_ERR(css)) { 4623 4878 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4624 4879 cgroup_subsys[ss->subsys_id] = NULL; 4880 + mutex_unlock(&cgroup_root_mutex); 4625 4881 mutex_unlock(&cgroup_mutex); 4626 4882 return PTR_ERR(css); 4627 4883 } 4628 4884 4629 - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4630 4885 ss->root = &cgroup_dummy_root; 4631 4886 4632 4887 /* our new subsystem will be attached to the dummy hierarchy. 
*/ ··· 4656 4911 write_unlock(&css_set_lock); 4657 4912 4658 4913 ret = online_css(css); 4659 - if (ret) 4914 + if (ret) { 4915 + ss->css_free(css); 4660 4916 goto err_unload; 4917 + } 4661 4918 4662 4919 /* success! */ 4920 + mutex_unlock(&cgroup_root_mutex); 4663 4921 mutex_unlock(&cgroup_mutex); 4664 4922 return 0; 4665 4923 4666 4924 err_unload: 4925 + mutex_unlock(&cgroup_root_mutex); 4667 4926 mutex_unlock(&cgroup_mutex); 4668 4927 /* @ss can't be mounted here as try_module_get() would fail */ 4669 4928 cgroup_unload_subsys(ss); ··· 4686 4937 void cgroup_unload_subsys(struct cgroup_subsys *ss) 4687 4938 { 4688 4939 struct cgrp_cset_link *link; 4940 + struct cgroup_subsys_state *css; 4689 4941 4690 4942 BUG_ON(ss->module == NULL); 4691 4943 ··· 4698 4948 BUG_ON(ss->root != &cgroup_dummy_root); 4699 4949 4700 4950 mutex_lock(&cgroup_mutex); 4951 + mutex_lock(&cgroup_root_mutex); 4701 4952 4702 - offline_css(cgroup_css(cgroup_dummy_top, ss)); 4953 + css = cgroup_css(cgroup_dummy_top, ss); 4954 + if (css) 4955 + offline_css(css); 4703 4956 4704 4957 /* deassign the subsys_id */ 4705 4958 cgroup_subsys[ss->subsys_id] = NULL; 4706 - 4707 - /* remove subsystem from the dummy root's list of subsystems */ 4708 - list_del_init(&ss->sibling); 4709 4959 4710 4960 /* 4711 4961 * disentangle the css from all css_sets attached to the dummy ··· 4729 4979 * need to free before marking as null because ss->css_free needs 4730 4980 * the cgrp->subsys pointer to find their state. 
4731 4981 */ 4732 - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); 4982 + if (css) 4983 + ss->css_free(css); 4733 4984 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); 4734 4985 4986 + mutex_unlock(&cgroup_root_mutex); 4735 4987 mutex_unlock(&cgroup_mutex); 4736 4988 } 4737 4989 EXPORT_SYMBOL_GPL(cgroup_unload_subsys); ··· 4852 5100 */ 4853 5101 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4854 5102 BUG_ON(!cgroup_destroy_wq); 5103 + 5104 + /* 5105 + * Used to destroy pidlists and separate to serve as flush domain. 5106 + * Cap @max_active to 1 too. 5107 + */ 5108 + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 5109 + 0, 1); 5110 + BUG_ON(!cgroup_pidlist_destroy_wq); 5111 + 4855 5112 return 0; 4856 5113 } 4857 5114 core_initcall(cgroup_wq_init); ··· 4904 5143 for_each_active_root(root) { 4905 5144 struct cgroup_subsys *ss; 4906 5145 struct cgroup *cgrp; 4907 - int count = 0; 5146 + int ssid, count = 0; 4908 5147 4909 5148 seq_printf(m, "%d:", root->hierarchy_id); 4910 - for_each_root_subsys(root, ss) 4911 - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 5149 + for_each_subsys(ss, ssid) 5150 + if (root->subsys_mask & (1 << ssid)) 5151 + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4912 5152 if (strlen(root->name)) 4913 5153 seq_printf(m, "%sname=%s", count ? "," : "", 4914 5154 root->name); ··· 5250 5488 * @dentry: directory dentry of interest 5251 5489 * @ss: subsystem of interest 5252 5490 * 5253 - * Must be called under RCU read lock. The caller is responsible for 5254 - * pinning the returned css if it needs to be accessed outside the RCU 5255 - * critical section. 5491 + * Must be called under cgroup_mutex or RCU read lock. The caller is 5492 + * responsible for pinning the returned css if it needs to be accessed 5493 + * outside the critical section. 
5256 5494 */ 5257 5495 struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 5258 5496 struct cgroup_subsys *ss) 5259 5497 { 5260 5498 struct cgroup *cgrp; 5261 5499 5262 - WARN_ON_ONCE(!rcu_read_lock_held()); 5500 + cgroup_assert_mutex_or_rcu_locked(); 5263 5501 5264 5502 /* is @dentry a cgroup dir? */ 5265 5503 if (!dentry->d_inode || ··· 5282 5520 { 5283 5521 struct cgroup *cgrp; 5284 5522 5285 - rcu_lockdep_assert(rcu_read_lock_held() || 5286 - lockdep_is_held(&cgroup_mutex), 5287 - "css_from_id() needs proper protection"); 5523 + cgroup_assert_mutex_or_rcu_locked(); 5288 5524 5289 5525 cgrp = idr_find(&ss->root->cgroup_idr, id); 5290 5526 if (cgrp) ··· 5330 5570 return count; 5331 5571 } 5332 5572 5333 - static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, 5334 - struct cftype *cft, 5335 - struct seq_file *seq) 5573 + static int current_css_set_cg_links_read(struct seq_file *seq, void *v) 5336 5574 { 5337 5575 struct cgrp_cset_link *link; 5338 5576 struct css_set *cset; ··· 5355 5597 } 5356 5598 5357 5599 #define MAX_TASKS_SHOWN_PER_CSS 25 5358 - static int cgroup_css_links_read(struct cgroup_subsys_state *css, 5359 - struct cftype *cft, struct seq_file *seq) 5600 + static int cgroup_css_links_read(struct seq_file *seq, void *v) 5360 5601 { 5602 + struct cgroup_subsys_state *css = seq_css(seq); 5361 5603 struct cgrp_cset_link *link; 5362 5604 5363 5605 read_lock(&css_set_lock); ··· 5403 5645 5404 5646 { 5405 5647 .name = "current_css_set_cg_links", 5406 - .read_seq_string = current_css_set_cg_links_read, 5648 + .seq_show = current_css_set_cg_links_read, 5407 5649 }, 5408 5650 5409 5651 { 5410 5652 .name = "cgroup_css_links", 5411 - .read_seq_string = cgroup_css_links_read, 5653 + .seq_show = cgroup_css_links_read, 5412 5654 }, 5413 5655 5414 5656 {
+3 -4
kernel/cgroup_freezer.c
··· 301 301 spin_unlock_irq(&freezer->lock); 302 302 } 303 303 304 - static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, 305 - struct seq_file *m) 304 + static int freezer_read(struct seq_file *m, void *v) 306 305 { 307 - struct cgroup_subsys_state *pos; 306 + struct cgroup_subsys_state *css = seq_css(m), *pos; 308 307 309 308 rcu_read_lock(); 310 309 ··· 457 458 { 458 459 .name = "state", 459 460 .flags = CFTYPE_NOT_ON_ROOT, 460 - .read_seq_string = freezer_read, 461 + .seq_show = freezer_read, 461 462 .write_string = freezer_write, 462 463 }, 463 464 {
+24 -49
kernel/cpuset.c
··· 1731 1731 * used, list of ranges of sequential numbers, is variable length, 1732 1732 * and since these maps can change value dynamically, one could read 1733 1733 * gibberish by doing partial reads while a list was changing. 1734 - * A single large read to a buffer that crosses a page boundary is 1735 - * ok, because the result being copied to user land is not recomputed 1736 - * across a page fault. 1737 1734 */ 1738 - 1739 - static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1735 + static int cpuset_common_seq_show(struct seq_file *sf, void *v) 1740 1736 { 1741 - size_t count; 1737 + struct cpuset *cs = css_cs(seq_css(sf)); 1738 + cpuset_filetype_t type = seq_cft(sf)->private; 1739 + ssize_t count; 1740 + char *buf, *s; 1741 + int ret = 0; 1742 + 1743 + count = seq_get_buf(sf, &buf); 1744 + s = buf; 1742 1745 1743 1746 mutex_lock(&callback_mutex); 1744 - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); 1745 - mutex_unlock(&callback_mutex); 1746 - 1747 - return count; 1748 - } 1749 - 1750 - static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1751 - { 1752 - size_t count; 1753 - 1754 - mutex_lock(&callback_mutex); 1755 - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); 1756 - mutex_unlock(&callback_mutex); 1757 - 1758 - return count; 1759 - } 1760 - 1761 - static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, 1762 - struct cftype *cft, struct file *file, 1763 - char __user *buf, size_t nbytes, 1764 - loff_t *ppos) 1765 - { 1766 - struct cpuset *cs = css_cs(css); 1767 - cpuset_filetype_t type = cft->private; 1768 - char *page; 1769 - ssize_t retval = 0; 1770 - char *s; 1771 - 1772 - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) 1773 - return -ENOMEM; 1774 - 1775 - s = page; 1776 1747 1777 1748 switch (type) { 1778 1749 case FILE_CPULIST: 1779 - s += cpuset_sprintf_cpulist(s, cs); 1750 + s += cpulist_scnprintf(s, count, cs->cpus_allowed); 1780 1751 break; 1781 1752 case 
FILE_MEMLIST: 1782 - s += cpuset_sprintf_memlist(s, cs); 1753 + s += nodelist_scnprintf(s, count, cs->mems_allowed); 1783 1754 break; 1784 1755 default: 1785 - retval = -EINVAL; 1786 - goto out; 1756 + ret = -EINVAL; 1757 + goto out_unlock; 1787 1758 } 1788 - *s++ = '\n'; 1789 1759 1790 - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1791 - out: 1792 - free_page((unsigned long)page); 1793 - return retval; 1760 + if (s < buf + count - 1) { 1761 + *s++ = '\n'; 1762 + seq_commit(sf, s - buf); 1763 + } else { 1764 + seq_commit(sf, -1); 1765 + } 1766 + out_unlock: 1767 + mutex_unlock(&callback_mutex); 1768 + return ret; 1794 1769 } 1795 1770 1796 1771 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) ··· 1822 1847 static struct cftype files[] = { 1823 1848 { 1824 1849 .name = "cpus", 1825 - .read = cpuset_common_file_read, 1850 + .seq_show = cpuset_common_seq_show, 1826 1851 .write_string = cpuset_write_resmask, 1827 1852 .max_write_len = (100U + 6 * NR_CPUS), 1828 1853 .private = FILE_CPULIST, ··· 1830 1855 1831 1856 { 1832 1857 .name = "mems", 1833 - .read = cpuset_common_file_read, 1858 + .seq_show = cpuset_common_seq_show, 1834 1859 .write_string = cpuset_write_resmask, 1835 1860 .max_write_len = (100U + 6 * MAX_NUMNODES), 1836 1861 .private = FILE_MEMLIST,
+6 -7
kernel/sched/core.c
··· 7852 7852 return ret; 7853 7853 } 7854 7854 7855 - static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7856 - struct cgroup_map_cb *cb) 7855 + static int cpu_stats_show(struct seq_file *sf, void *v) 7857 7856 { 7858 - struct task_group *tg = css_tg(css); 7857 + struct task_group *tg = css_tg(seq_css(sf)); 7859 7858 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7860 7859 7861 - cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7862 - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7863 - cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7860 + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7861 + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7862 + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7864 7863 7865 7864 return 0; 7866 7865 } ··· 7913 7914 }, 7914 7915 { 7915 7916 .name = "stat", 7916 - .read_map = cpu_stats_show, 7917 + .seq_show = cpu_stats_show, 7917 7918 }, 7918 7919 #endif 7919 7920 #ifdef CONFIG_RT_GROUP_SCHED
+8 -10
kernel/sched/cpuacct.c
··· 163 163 return err; 164 164 } 165 165 166 - static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 167 - struct cftype *cft, struct seq_file *m) 166 + static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) 168 167 { 169 - struct cpuacct *ca = css_ca(css); 168 + struct cpuacct *ca = css_ca(seq_css(m)); 170 169 u64 percpu; 171 170 int i; 172 171 ··· 182 183 [CPUACCT_STAT_SYSTEM] = "system", 183 184 }; 184 185 185 - static int cpuacct_stats_show(struct cgroup_subsys_state *css, 186 - struct cftype *cft, struct cgroup_map_cb *cb) 186 + static int cpuacct_stats_show(struct seq_file *sf, void *v) 187 187 { 188 - struct cpuacct *ca = css_ca(css); 188 + struct cpuacct *ca = css_ca(seq_css(sf)); 189 189 int cpu; 190 190 s64 val = 0; 191 191 ··· 194 196 val += kcpustat->cpustat[CPUTIME_NICE]; 195 197 } 196 198 val = cputime64_to_clock_t(val); 197 - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 199 + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); 198 200 199 201 val = 0; 200 202 for_each_online_cpu(cpu) { ··· 205 207 } 206 208 207 209 val = cputime64_to_clock_t(val); 208 - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 210 + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 209 211 210 212 return 0; 211 213 } ··· 218 220 }, 219 221 { 220 222 .name = "usage_percpu", 221 - .read_seq_string = cpuacct_percpu_seq_read, 223 + .seq_show = cpuacct_percpu_seq_show, 222 224 }, 223 225 { 224 226 .name = "stat", 225 - .read_map = cpuacct_stats_show, 227 + .seq_show = cpuacct_stats_show, 226 228 }, 227 229 { } /* terminate */ 228 230 };
+8 -14
mm/hugetlb_cgroup.c
··· 242 242 return; 243 243 } 244 244 245 - static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, 246 - struct cftype *cft, struct file *file, 247 - char __user *buf, size_t nbytes, 248 - loff_t *ppos) 245 + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 246 + struct cftype *cft) 249 247 { 250 - u64 val; 251 - char str[64]; 252 - int idx, name, len; 248 + int idx, name; 253 249 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 254 250 255 251 idx = MEMFILE_IDX(cft->private); 256 252 name = MEMFILE_ATTR(cft->private); 257 253 258 - val = res_counter_read_u64(&h_cg->hugepage[idx], name); 259 - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 260 - return simple_read_from_buffer(buf, nbytes, ppos, str, len); 254 + return res_counter_read_u64(&h_cg->hugepage[idx], name); 261 255 } 262 256 263 257 static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, ··· 331 337 cft = &h->cgroup_files[0]; 332 338 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 333 339 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 334 - cft->read = hugetlb_cgroup_read; 340 + cft->read_u64 = hugetlb_cgroup_read_u64; 335 341 cft->write_string = hugetlb_cgroup_write; 336 342 337 343 /* Add the usage file */ 338 344 cft = &h->cgroup_files[1]; 339 345 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 340 346 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 341 - cft->read = hugetlb_cgroup_read; 347 + cft->read_u64 = hugetlb_cgroup_read_u64; 342 348 343 349 /* Add the MAX usage file */ 344 350 cft = &h->cgroup_files[2]; 345 351 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 346 352 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 347 353 cft->trigger = hugetlb_cgroup_reset; 348 - cft->read = hugetlb_cgroup_read; 354 + cft->read_u64 = hugetlb_cgroup_read_u64; 349 355 350 356 /* Add the failcntfile */ 351 357 cft = &h->cgroup_files[3]; 352 358 snprintf(cft->name, MAX_CFTYPE_NAME, 
"%s.failcnt", buf); 353 359 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 354 360 cft->trigger = hugetlb_cgroup_reset; 355 - cft->read = hugetlb_cgroup_read; 361 + cft->read_u64 = hugetlb_cgroup_read_u64; 356 362 357 363 /* NULL terminate the last cft */ 358 364 cft = &h->cgroup_files[4];
+353 -75
mm/memcontrol.c
··· 45 45 #include <linux/swapops.h> 46 46 #include <linux/spinlock.h> 47 47 #include <linux/eventfd.h> 48 + #include <linux/poll.h> 48 49 #include <linux/sort.h> 49 50 #include <linux/fs.h> 50 51 #include <linux/seq_file.h> ··· 56 55 #include <linux/cpu.h> 57 56 #include <linux/oom.h> 58 57 #include <linux/lockdep.h> 58 + #include <linux/file.h> 59 59 #include "internal.h" 60 60 #include <net/sock.h> 61 61 #include <net/ip.h> ··· 229 227 struct eventfd_ctx *eventfd; 230 228 }; 231 229 230 + /* 231 + * cgroup_event represents events which userspace want to receive. 232 + */ 233 + struct mem_cgroup_event { 234 + /* 235 + * memcg which the event belongs to. 236 + */ 237 + struct mem_cgroup *memcg; 238 + /* 239 + * eventfd to signal userspace about the event. 240 + */ 241 + struct eventfd_ctx *eventfd; 242 + /* 243 + * Each of these stored in a list by the cgroup. 244 + */ 245 + struct list_head list; 246 + /* 247 + * register_event() callback will be used to add new userspace 248 + * waiter for changes related to this event. Use eventfd_signal() 249 + * on eventfd to send notification to userspace. 250 + */ 251 + int (*register_event)(struct mem_cgroup *memcg, 252 + struct eventfd_ctx *eventfd, const char *args); 253 + /* 254 + * unregister_event() callback will be called when userspace closes 255 + * the eventfd or on cgroup removing. This callback must be set, 256 + * if you want provide notification functionality. 257 + */ 258 + void (*unregister_event)(struct mem_cgroup *memcg, 259 + struct eventfd_ctx *eventfd); 260 + /* 261 + * All fields below needed to unregister event when 262 + * userspace closes eventfd. 
263 + */ 264 + poll_table pt; 265 + wait_queue_head_t *wqh; 266 + wait_queue_t wait; 267 + struct work_struct remove; 268 + }; 269 + 232 270 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 233 271 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 234 272 ··· 372 330 atomic_t numainfo_events; 373 331 atomic_t numainfo_updating; 374 332 #endif 333 + 334 + /* List of events which userspace want to receive */ 335 + struct list_head event_list; 336 + spinlock_t event_list_lock; 375 337 376 338 struct mem_cgroup_per_node *nodeinfo[0]; 377 339 /* WARNING: nodeinfo must be the last member here */ ··· 534 488 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 535 489 { 536 490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 537 - } 538 - 539 - struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) 540 - { 541 - return &mem_cgroup_from_css(css)->vmpressure; 542 491 } 543 492 544 493 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) ··· 3017 2976 } 3018 2977 3019 2978 #ifdef CONFIG_SLABINFO 3020 - static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3021 - struct cftype *cft, struct seq_file *m) 2979 + static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 3022 2980 { 3023 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2981 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3024 2982 struct memcg_cache_params *params; 3025 2983 3026 2984 if (!memcg_can_account_kmem(memcg)) ··· 5152 5112 return val << PAGE_SHIFT; 5153 5113 } 5154 5114 5155 - static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5156 - struct cftype *cft, struct file *file, 5157 - char __user *buf, size_t nbytes, loff_t *ppos) 5115 + static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 5116 + struct cftype *cft) 5158 5117 { 5159 5118 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5160 - char str[64]; 5161 5119 u64 val; 5162 - int name, len; 5120 + int 
name; 5163 5121 enum res_type type; 5164 5122 5165 5123 type = MEMFILE_TYPE(cft->private); ··· 5183 5145 BUG(); 5184 5146 } 5185 5147 5186 - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5187 - return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5148 + return val; 5188 5149 } 5189 5150 5190 5151 static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) ··· 5420 5383 #endif 5421 5384 5422 5385 #ifdef CONFIG_NUMA 5423 - static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5424 - struct cftype *cft, struct seq_file *m) 5386 + static int memcg_numa_stat_show(struct seq_file *m, void *v) 5425 5387 { 5426 5388 struct numa_stat { 5427 5389 const char *name; ··· 5436 5400 const struct numa_stat *stat; 5437 5401 int nid; 5438 5402 unsigned long nr; 5439 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5403 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5440 5404 5441 5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5442 5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); ··· 5475 5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5476 5440 } 5477 5441 5478 - static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5479 - struct seq_file *m) 5442 + static int memcg_stat_show(struct seq_file *m, void *v) 5480 5443 { 5481 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5444 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5482 5445 struct mem_cgroup *mi; 5483 5446 unsigned int i; 5484 5447 ··· 5686 5651 mem_cgroup_oom_notify_cb(iter); 5687 5652 } 5688 5653 5689 - static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5690 - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5654 + static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5655 + struct eventfd_ctx *eventfd, const char *args, enum res_type type) 5691 5656 { 5692 - struct mem_cgroup *memcg = 
mem_cgroup_from_css(css); 5693 5657 struct mem_cgroup_thresholds *thresholds; 5694 5658 struct mem_cgroup_threshold_ary *new; 5695 - enum res_type type = MEMFILE_TYPE(cft->private); 5696 5659 u64 threshold, usage; 5697 5660 int i, size, ret; 5698 5661 ··· 5767 5734 return ret; 5768 5735 } 5769 5736 5770 - static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5771 - struct cftype *cft, struct eventfd_ctx *eventfd) 5737 + static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5738 + struct eventfd_ctx *eventfd, const char *args) 5772 5739 { 5773 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5740 + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 5741 + } 5742 + 5743 + static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 5744 + struct eventfd_ctx *eventfd, const char *args) 5745 + { 5746 + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 5747 + } 5748 + 5749 + static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5750 + struct eventfd_ctx *eventfd, enum res_type type) 5751 + { 5774 5752 struct mem_cgroup_thresholds *thresholds; 5775 5753 struct mem_cgroup_threshold_ary *new; 5776 - enum res_type type = MEMFILE_TYPE(cft->private); 5777 5754 u64 usage; 5778 5755 int i, j, size; 5779 5756 ··· 5856 5813 mutex_unlock(&memcg->thresholds_lock); 5857 5814 } 5858 5815 5859 - static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5860 - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5816 + static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5817 + struct eventfd_ctx *eventfd) 5861 5818 { 5862 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5863 - struct mem_cgroup_eventfd_list *event; 5864 - enum res_type type = MEMFILE_TYPE(cft->private); 5819 + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 5820 + } 5865 5821 5866 - BUG_ON(type != _OOM_TYPE); 5822 + static void 
memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5823 + struct eventfd_ctx *eventfd) 5824 + { 5825 + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 5826 + } 5827 + 5828 + static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 5829 + struct eventfd_ctx *eventfd, const char *args) 5830 + { 5831 + struct mem_cgroup_eventfd_list *event; 5832 + 5867 5833 event = kmalloc(sizeof(*event), GFP_KERNEL); 5868 5834 if (!event) 5869 5835 return -ENOMEM; ··· 5890 5838 return 0; 5891 5839 } 5892 5840 5893 - static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5894 - struct cftype *cft, struct eventfd_ctx *eventfd) 5841 + static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 5842 + struct eventfd_ctx *eventfd) 5895 5843 { 5896 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5897 5844 struct mem_cgroup_eventfd_list *ev, *tmp; 5898 - enum res_type type = MEMFILE_TYPE(cft->private); 5899 - 5900 - BUG_ON(type != _OOM_TYPE); 5901 5845 5902 5846 spin_lock(&memcg_oom_lock); 5903 5847 ··· 5907 5859 spin_unlock(&memcg_oom_lock); 5908 5860 } 5909 5861 5910 - static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5911 - struct cftype *cft, struct cgroup_map_cb *cb) 5862 + static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 5912 5863 { 5913 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5864 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 5914 5865 5915 - cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5916 - 5917 - if (atomic_read(&memcg->under_oom)) 5918 - cb->fill(cb, "under_oom", 1); 5919 - else 5920 - cb->fill(cb, "under_oom", 0); 5866 + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 5867 + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 5921 5868 return 0; 5922 5869 } 5923 5870 ··· 6005 5962 } 6006 5963 #endif 6007 5964 5965 + /* 5966 + * DO NOT USE IN NEW FILES. 
5967 + * 5968 + * "cgroup.event_control" implementation. 5969 + * 5970 + * This is way over-engineered. It tries to support fully configurable 5971 + * events for each user. Such level of flexibility is completely 5972 + * unnecessary especially in the light of the planned unified hierarchy. 5973 + * 5974 + * Please deprecate this and replace with something simpler if at all 5975 + * possible. 5976 + */ 5977 + 5978 + /* 5979 + * Unregister event and free resources. 5980 + * 5981 + * Gets called from workqueue. 5982 + */ 5983 + static void memcg_event_remove(struct work_struct *work) 5984 + { 5985 + struct mem_cgroup_event *event = 5986 + container_of(work, struct mem_cgroup_event, remove); 5987 + struct mem_cgroup *memcg = event->memcg; 5988 + 5989 + remove_wait_queue(event->wqh, &event->wait); 5990 + 5991 + event->unregister_event(memcg, event->eventfd); 5992 + 5993 + /* Notify userspace the event is going away. */ 5994 + eventfd_signal(event->eventfd, 1); 5995 + 5996 + eventfd_ctx_put(event->eventfd); 5997 + kfree(event); 5998 + css_put(&memcg->css); 5999 + } 6000 + 6001 + /* 6002 + * Gets called on POLLHUP on eventfd when user closes it. 6003 + * 6004 + * Called with wqh->lock held and interrupts disabled. 6005 + */ 6006 + static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 6007 + int sync, void *key) 6008 + { 6009 + struct mem_cgroup_event *event = 6010 + container_of(wait, struct mem_cgroup_event, wait); 6011 + struct mem_cgroup *memcg = event->memcg; 6012 + unsigned long flags = (unsigned long)key; 6013 + 6014 + if (flags & POLLHUP) { 6015 + /* 6016 + * If the event has been detached at cgroup removal, we 6017 + * can simply return knowing the other side will cleanup 6018 + * for us. 6019 + * 6020 + * We can't race against event freeing since the other 6021 + * side will require wqh->lock via remove_wait_queue(), 6022 + * which we hold. 
6023 + */ 6024 + spin_lock(&memcg->event_list_lock); 6025 + if (!list_empty(&event->list)) { 6026 + list_del_init(&event->list); 6027 + /* 6028 + * We are in atomic context, but memcg_event_remove() 6029 + * may sleep, so we have to call it in workqueue. 6030 + */ 6031 + schedule_work(&event->remove); 6032 + } 6033 + spin_unlock(&memcg->event_list_lock); 6034 + } 6035 + 6036 + return 0; 6037 + } 6038 + 6039 + static void memcg_event_ptable_queue_proc(struct file *file, 6040 + wait_queue_head_t *wqh, poll_table *pt) 6041 + { 6042 + struct mem_cgroup_event *event = 6043 + container_of(pt, struct mem_cgroup_event, pt); 6044 + 6045 + event->wqh = wqh; 6046 + add_wait_queue(wqh, &event->wait); 6047 + } 6048 + 6049 + /* 6050 + * DO NOT USE IN NEW FILES. 6051 + * 6052 + * Parse input and register new cgroup event handler. 6053 + * 6054 + * Input must be in format '<event_fd> <control_fd> <args>'. 6055 + * Interpretation of args is defined by control file implementation. 6056 + */ 6057 + static int memcg_write_event_control(struct cgroup_subsys_state *css, 6058 + struct cftype *cft, const char *buffer) 6059 + { 6060 + struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6061 + struct mem_cgroup_event *event; 6062 + struct cgroup_subsys_state *cfile_css; 6063 + unsigned int efd, cfd; 6064 + struct fd efile; 6065 + struct fd cfile; 6066 + const char *name; 6067 + char *endp; 6068 + int ret; 6069 + 6070 + efd = simple_strtoul(buffer, &endp, 10); 6071 + if (*endp != ' ') 6072 + return -EINVAL; 6073 + buffer = endp + 1; 6074 + 6075 + cfd = simple_strtoul(buffer, &endp, 10); 6076 + if ((*endp != ' ') && (*endp != '\0')) 6077 + return -EINVAL; 6078 + buffer = endp + 1; 6079 + 6080 + event = kzalloc(sizeof(*event), GFP_KERNEL); 6081 + if (!event) 6082 + return -ENOMEM; 6083 + 6084 + event->memcg = memcg; 6085 + INIT_LIST_HEAD(&event->list); 6086 + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 6087 + init_waitqueue_func_entry(&event->wait, memcg_event_wake); 6088 + 
INIT_WORK(&event->remove, memcg_event_remove); 6089 + 6090 + efile = fdget(efd); 6091 + if (!efile.file) { 6092 + ret = -EBADF; 6093 + goto out_kfree; 6094 + } 6095 + 6096 + event->eventfd = eventfd_ctx_fileget(efile.file); 6097 + if (IS_ERR(event->eventfd)) { 6098 + ret = PTR_ERR(event->eventfd); 6099 + goto out_put_efile; 6100 + } 6101 + 6102 + cfile = fdget(cfd); 6103 + if (!cfile.file) { 6104 + ret = -EBADF; 6105 + goto out_put_eventfd; 6106 + } 6107 + 6108 + /* the process need read permission on control file */ 6109 + /* AV: shouldn't we check that it's been opened for read instead? */ 6110 + ret = inode_permission(file_inode(cfile.file), MAY_READ); 6111 + if (ret < 0) 6112 + goto out_put_cfile; 6113 + 6114 + /* 6115 + * Determine the event callbacks and set them in @event. This used 6116 + * to be done via struct cftype but cgroup core no longer knows 6117 + * about these events. The following is crude but the whole thing 6118 + * is for compatibility anyway. 6119 + * 6120 + * DO NOT ADD NEW FILES. 6121 + */ 6122 + name = cfile.file->f_dentry->d_name.name; 6123 + 6124 + if (!strcmp(name, "memory.usage_in_bytes")) { 6125 + event->register_event = mem_cgroup_usage_register_event; 6126 + event->unregister_event = mem_cgroup_usage_unregister_event; 6127 + } else if (!strcmp(name, "memory.oom_control")) { 6128 + event->register_event = mem_cgroup_oom_register_event; 6129 + event->unregister_event = mem_cgroup_oom_unregister_event; 6130 + } else if (!strcmp(name, "memory.pressure_level")) { 6131 + event->register_event = vmpressure_register_event; 6132 + event->unregister_event = vmpressure_unregister_event; 6133 + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 6134 + event->register_event = memsw_cgroup_usage_register_event; 6135 + event->unregister_event = memsw_cgroup_usage_unregister_event; 6136 + } else { 6137 + ret = -EINVAL; 6138 + goto out_put_cfile; 6139 + } 6140 + 6141 + /* 6142 + * Verify @cfile should belong to @css. 
Also, remaining events are 6143 + * automatically removed on cgroup destruction but the removal is 6144 + * asynchronous, so take an extra ref on @css. 6145 + */ 6146 + rcu_read_lock(); 6147 + 6148 + ret = -EINVAL; 6149 + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, 6150 + &mem_cgroup_subsys); 6151 + if (cfile_css == css && css_tryget(css)) 6152 + ret = 0; 6153 + 6154 + rcu_read_unlock(); 6155 + if (ret) 6156 + goto out_put_cfile; 6157 + 6158 + ret = event->register_event(memcg, event->eventfd, buffer); 6159 + if (ret) 6160 + goto out_put_css; 6161 + 6162 + efile.file->f_op->poll(efile.file, &event->pt); 6163 + 6164 + spin_lock(&memcg->event_list_lock); 6165 + list_add(&event->list, &memcg->event_list); 6166 + spin_unlock(&memcg->event_list_lock); 6167 + 6168 + fdput(cfile); 6169 + fdput(efile); 6170 + 6171 + return 0; 6172 + 6173 + out_put_css: 6174 + css_put(css); 6175 + out_put_cfile: 6176 + fdput(cfile); 6177 + out_put_eventfd: 6178 + eventfd_ctx_put(event->eventfd); 6179 + out_put_efile: 6180 + fdput(efile); 6181 + out_kfree: 6182 + kfree(event); 6183 + 6184 + return ret; 6185 + } 6186 + 6008 6187 static struct cftype mem_cgroup_files[] = { 6009 6188 { 6010 6189 .name = "usage_in_bytes", 6011 6190 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6012 - .read = mem_cgroup_read, 6013 - .register_event = mem_cgroup_usage_register_event, 6014 - .unregister_event = mem_cgroup_usage_unregister_event, 6191 + .read_u64 = mem_cgroup_read_u64, 6015 6192 }, 6016 6193 { 6017 6194 .name = "max_usage_in_bytes", 6018 6195 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6019 6196 .trigger = mem_cgroup_reset, 6020 - .read = mem_cgroup_read, 6197 + .read_u64 = mem_cgroup_read_u64, 6021 6198 }, 6022 6199 { 6023 6200 .name = "limit_in_bytes", 6024 6201 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6025 6202 .write_string = mem_cgroup_write, 6026 - .read = mem_cgroup_read, 6203 + .read_u64 = mem_cgroup_read_u64, 6027 6204 }, 6028 6205 { 6029 6206 .name = 
"soft_limit_in_bytes", 6030 6207 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6031 6208 .write_string = mem_cgroup_write, 6032 - .read = mem_cgroup_read, 6209 + .read_u64 = mem_cgroup_read_u64, 6033 6210 }, 6034 6211 { 6035 6212 .name = "failcnt", 6036 6213 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6037 6214 .trigger = mem_cgroup_reset, 6038 - .read = mem_cgroup_read, 6215 + .read_u64 = mem_cgroup_read_u64, 6039 6216 }, 6040 6217 { 6041 6218 .name = "stat", 6042 - .read_seq_string = memcg_stat_show, 6219 + .seq_show = memcg_stat_show, 6043 6220 }, 6044 6221 { 6045 6222 .name = "force_empty", ··· 6270 6007 .flags = CFTYPE_INSANE, 6271 6008 .write_u64 = mem_cgroup_hierarchy_write, 6272 6009 .read_u64 = mem_cgroup_hierarchy_read, 6010 + }, 6011 + { 6012 + .name = "cgroup.event_control", /* XXX: for compat */ 6013 + .write_string = memcg_write_event_control, 6014 + .flags = CFTYPE_NO_PREFIX, 6015 + .mode = S_IWUGO, 6273 6016 }, 6274 6017 { 6275 6018 .name = "swappiness", ··· 6289 6020 }, 6290 6021 { 6291 6022 .name = "oom_control", 6292 - .read_map = mem_cgroup_oom_control_read, 6023 + .seq_show = mem_cgroup_oom_control_read, 6293 6024 .write_u64 = mem_cgroup_oom_control_write, 6294 - .register_event = mem_cgroup_oom_register_event, 6295 - .unregister_event = mem_cgroup_oom_unregister_event, 6296 6025 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6297 6026 }, 6298 6027 { 6299 6028 .name = "pressure_level", 6300 - .register_event = vmpressure_register_event, 6301 - .unregister_event = vmpressure_unregister_event, 6302 6029 }, 6303 6030 #ifdef CONFIG_NUMA 6304 6031 { 6305 6032 .name = "numa_stat", 6306 - .read_seq_string = memcg_numa_stat_show, 6033 + .seq_show = memcg_numa_stat_show, 6307 6034 }, 6308 6035 #endif 6309 6036 #ifdef CONFIG_MEMCG_KMEM ··· 6307 6042 .name = "kmem.limit_in_bytes", 6308 6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6309 6044 .write_string = mem_cgroup_write, 6310 - .read = mem_cgroup_read, 6045 + .read_u64 = 
mem_cgroup_read_u64, 6311 6046 }, 6312 6047 { 6313 6048 .name = "kmem.usage_in_bytes", 6314 6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6315 - .read = mem_cgroup_read, 6050 + .read_u64 = mem_cgroup_read_u64, 6316 6051 }, 6317 6052 { 6318 6053 .name = "kmem.failcnt", 6319 6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6320 6055 .trigger = mem_cgroup_reset, 6321 - .read = mem_cgroup_read, 6056 + .read_u64 = mem_cgroup_read_u64, 6322 6057 }, 6323 6058 { 6324 6059 .name = "kmem.max_usage_in_bytes", 6325 6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6326 6061 .trigger = mem_cgroup_reset, 6327 - .read = mem_cgroup_read, 6062 + .read_u64 = mem_cgroup_read_u64, 6328 6063 }, 6329 6064 #ifdef CONFIG_SLABINFO 6330 6065 { 6331 6066 .name = "kmem.slabinfo", 6332 - .read_seq_string = mem_cgroup_slabinfo_read, 6067 + .seq_show = mem_cgroup_slabinfo_read, 6333 6068 }, 6334 6069 #endif 6335 6070 #endif ··· 6341 6076 { 6342 6077 .name = "memsw.usage_in_bytes", 6343 6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6344 - .read = mem_cgroup_read, 6345 - .register_event = mem_cgroup_usage_register_event, 6346 - .unregister_event = mem_cgroup_usage_unregister_event, 6079 + .read_u64 = mem_cgroup_read_u64, 6347 6080 }, 6348 6081 { 6349 6082 .name = "memsw.max_usage_in_bytes", 6350 6083 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6351 6084 .trigger = mem_cgroup_reset, 6352 - .read = mem_cgroup_read, 6085 + .read_u64 = mem_cgroup_read_u64, 6353 6086 }, 6354 6087 { 6355 6088 .name = "memsw.limit_in_bytes", 6356 6089 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6357 6090 .write_string = mem_cgroup_write, 6358 - .read = mem_cgroup_read, 6091 + .read_u64 = mem_cgroup_read_u64, 6359 6092 }, 6360 6093 { 6361 6094 .name = "memsw.failcnt", 6362 6095 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6363 6096 .trigger = mem_cgroup_reset, 6364 - .read = mem_cgroup_read, 6097 + .read_u64 = mem_cgroup_read_u64, 6365 6098 }, 6366 6099 { }, /* terminate */ 6367 
6100 }; ··· 6531 6268 mutex_init(&memcg->thresholds_lock); 6532 6269 spin_lock_init(&memcg->move_lock); 6533 6270 vmpressure_init(&memcg->vmpressure); 6271 + INIT_LIST_HEAD(&memcg->event_list); 6272 + spin_lock_init(&memcg->event_list_lock); 6534 6273 6535 6274 return &memcg->css; 6536 6275 ··· 6608 6343 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6609 6344 { 6610 6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6346 + struct mem_cgroup_event *event, *tmp; 6347 + 6348 + /* 6349 + * Unregister events and notify userspace. 6350 + * Notify userspace about cgroup removing only after rmdir of cgroup 6351 + * directory to avoid race between userspace and kernelspace. 6352 + */ 6353 + spin_lock(&memcg->event_list_lock); 6354 + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 6355 + list_del_init(&event->list); 6356 + schedule_work(&event->remove); 6357 + } 6358 + spin_unlock(&memcg->event_list_lock); 6611 6359 6612 6360 kmem_cgroup_css_offline(memcg); 6613 6361
+1 -1
mm/page_cgroup.c
··· 451 451 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry 452 452 * @ent: swap entry to be looked up. 453 453 * 454 - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 454 + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) 455 455 */ 456 456 unsigned short lookup_swap_cgroup_id(swp_entry_t ent) 457 457 {
+9 -17
mm/vmpressure.c
··· 278 278 279 279 /** 280 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 281 - * @css: css that is interested in vmpressure notifications 282 - * @cft: cgroup control files handle 281 + * @memcg: memcg that is interested in vmpressure notifications 283 282 * @eventfd: eventfd context to link notifications with 284 283 * @args: event arguments (used to set up a pressure level threshold) 285 284 * ··· 288 289 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 289 290 * "critical"). 290 291 * 291 - * This function should not be used directly, just pass it to (struct 292 - * cftype).register_event, and then cgroup core will handle everything by 293 - * itself. 292 + * To be used as memcg event method. 294 293 */ 295 - int vmpressure_register_event(struct cgroup_subsys_state *css, 296 - struct cftype *cft, struct eventfd_ctx *eventfd, 297 - const char *args) 294 + int vmpressure_register_event(struct mem_cgroup *memcg, 295 + struct eventfd_ctx *eventfd, const char *args) 298 296 { 299 - struct vmpressure *vmpr = css_to_vmpressure(css); 297 + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 300 298 struct vmpressure_event *ev; 301 299 int level; 302 300 ··· 321 325 322 326 /** 323 327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 324 - * @css: css handle 325 - * @cft: cgroup control files handle 328 + * @memcg: memcg handle 326 329 * @eventfd: eventfd context that was used to link vmpressure with the @cg 327 330 * 328 331 * This function does internal manipulations to detach the @eventfd from 329 332 * the vmpressure notifications, and then frees internal resources 330 333 * associated with the @eventfd (but the @eventfd itself is not freed). 331 334 * 332 - * This function should not be used directly, just pass it to (struct 333 - * cftype).unregister_event, and then cgroup core will handle everything 334 - * by itself. 335 + * To be used as memcg event method. 
335 336 */ 336 - void vmpressure_unregister_event(struct cgroup_subsys_state *css, 337 - struct cftype *cft, 337 + void vmpressure_unregister_event(struct mem_cgroup *memcg, 338 338 struct eventfd_ctx *eventfd) 339 339 { 340 - struct vmpressure *vmpr = css_to_vmpressure(css); 340 + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 341 341 struct vmpressure_event *ev; 342 342 343 343 mutex_lock(&vmpr->events_lock);
+4 -4
net/core/netprio_cgroup.c
··· 173 173 return css->cgroup->id; 174 174 } 175 175 176 - static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 177 - struct cgroup_map_cb *cb) 176 + static int read_priomap(struct seq_file *sf, void *v) 178 177 { 179 178 struct net_device *dev; 180 179 181 180 rcu_read_lock(); 182 181 for_each_netdev_rcu(&init_net, dev) 183 - cb->fill(cb, dev->name, netprio_prio(css, dev)); 182 + seq_printf(sf, "%s %u\n", dev->name, 183 + netprio_prio(seq_css(sf), dev)); 184 184 rcu_read_unlock(); 185 185 return 0; 186 186 } ··· 238 238 }, 239 239 { 240 240 .name = "ifpriomap", 241 - .read_map = read_priomap, 241 + .seq_show = read_priomap, 242 242 .write_string = write_priomap, 243 243 }, 244 244 { } /* terminate */
+3 -4
security/device_cgroup.c
··· 274 274 sprintf(str, "%u", m); 275 275 } 276 276 277 - static int devcgroup_seq_read(struct cgroup_subsys_state *css, 278 - struct cftype *cft, struct seq_file *m) 277 + static int devcgroup_seq_show(struct seq_file *m, void *v) 279 278 { 280 - struct dev_cgroup *devcgroup = css_to_devcgroup(css); 279 + struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); 281 280 struct dev_exception_item *ex; 282 281 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 283 282 ··· 678 679 }, 679 680 { 680 681 .name = "list", 681 - .read_seq_string = devcgroup_seq_read, 682 + .seq_show = devcgroup_seq_show, 682 683 .private = DEVCG_LIST, 683 684 }, 684 685 { } /* terminate */