Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memcg: completely decouple memcg and obj stocks

Let's completely decouple the memcg and obj per-cpu stocks. This will
enable us to make memcg per-cpu stocks to be used without disabling irqs.
Also it will enable us to make obj stocks nmi safe independently which is
required to make kmalloc/slab safe for allocations from nmi context.

Link: https://lkml.kernel.org/r/20250506225533.2580386-4-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Shakeel Butt and committed by
Andrew Morton
c80509ef 3523dd7a

+92 -57
+92 -57
mm/memcontrol.c
··· 1778 1778 * nr_pages in a single cacheline. This may change in future. 1779 1779 */ 1780 1780 #define NR_MEMCG_STOCK 7 1781 + #define FLUSHING_CACHED_CHARGE 0 1781 1782 struct memcg_stock_pcp { 1782 - local_trylock_t memcg_lock; 1783 + local_trylock_t lock; 1783 1784 uint8_t nr_pages[NR_MEMCG_STOCK]; 1784 1785 struct mem_cgroup *cached[NR_MEMCG_STOCK]; 1785 1786 1786 - local_trylock_t obj_lock; 1787 + struct work_struct work; 1788 + unsigned long flags; 1789 + }; 1790 + 1791 + static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { 1792 + .lock = INIT_LOCAL_TRYLOCK(lock), 1793 + }; 1794 + 1795 + struct obj_stock_pcp { 1796 + local_trylock_t lock; 1787 1797 unsigned int nr_bytes; 1788 1798 struct obj_cgroup *cached_objcg; 1789 1799 struct pglist_data *cached_pgdat; ··· 1802 1792 1803 1793 struct work_struct work; 1804 1794 unsigned long flags; 1805 - #define FLUSHING_CACHED_CHARGE 0 1806 1795 }; 1807 - static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { 1808 - .memcg_lock = INIT_LOCAL_TRYLOCK(memcg_lock), 1809 - .obj_lock = INIT_LOCAL_TRYLOCK(obj_lock), 1796 + 1797 + static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { 1798 + .lock = INIT_LOCAL_TRYLOCK(lock), 1810 1799 }; 1800 + 1811 1801 static DEFINE_MUTEX(percpu_charge_mutex); 1812 1802 1813 - static void drain_obj_stock(struct memcg_stock_pcp *stock); 1814 - static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 1803 + static void drain_obj_stock(struct obj_stock_pcp *stock); 1804 + static bool obj_stock_flush_required(struct obj_stock_pcp *stock, 1815 1805 struct mem_cgroup *root_memcg); 1816 1806 1817 1807 /** ··· 1834 1824 int i; 1835 1825 1836 1826 if (nr_pages > MEMCG_CHARGE_BATCH || 1837 - !local_trylock_irqsave(&memcg_stock.memcg_lock, flags)) 1827 + !local_trylock_irqsave(&memcg_stock.lock, flags)) 1838 1828 return ret; 1839 1829 1840 1830 stock = this_cpu_ptr(&memcg_stock); ··· 1851 1841 break; 1852 1842 } 1853 1843 1854 - 
local_unlock_irqrestore(&memcg_stock.memcg_lock, flags); 1844 + local_unlock_irqrestore(&memcg_stock.lock, flags); 1855 1845 1856 1846 return ret; 1857 1847 } ··· 1892 1882 drain_stock(stock, i); 1893 1883 } 1894 1884 1895 - static void drain_local_stock(struct work_struct *dummy) 1885 + static void drain_local_memcg_stock(struct work_struct *dummy) 1896 1886 { 1897 1887 struct memcg_stock_pcp *stock; 1898 1888 unsigned long flags; ··· 1900 1890 if (WARN_ONCE(!in_task(), "drain in non-task context")) 1901 1891 return; 1902 1892 1903 - local_lock_irqsave(&memcg_stock.obj_lock, flags); 1904 - stock = this_cpu_ptr(&memcg_stock); 1905 - drain_obj_stock(stock); 1906 - local_unlock_irqrestore(&memcg_stock.obj_lock, flags); 1893 + local_lock_irqsave(&memcg_stock.lock, flags); 1907 1894 1908 - local_lock_irqsave(&memcg_stock.memcg_lock, flags); 1909 1895 stock = this_cpu_ptr(&memcg_stock); 1910 1896 drain_stock_fully(stock); 1911 1897 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 1912 - local_unlock_irqrestore(&memcg_stock.memcg_lock, flags); 1898 + 1899 + local_unlock_irqrestore(&memcg_stock.lock, flags); 1900 + } 1901 + 1902 + static void drain_local_obj_stock(struct work_struct *dummy) 1903 + { 1904 + struct obj_stock_pcp *stock; 1905 + unsigned long flags; 1906 + 1907 + if (WARN_ONCE(!in_task(), "drain in non-task context")) 1908 + return; 1909 + 1910 + local_lock_irqsave(&obj_stock.lock, flags); 1911 + 1912 + stock = this_cpu_ptr(&obj_stock); 1913 + drain_obj_stock(stock); 1914 + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 1915 + 1916 + local_unlock_irqrestore(&obj_stock.lock, flags); 1913 1917 } 1914 1918 1915 1919 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ··· 1946 1922 VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); 1947 1923 1948 1924 if (nr_pages > MEMCG_CHARGE_BATCH || 1949 - !local_trylock_irqsave(&memcg_stock.memcg_lock, flags)) { 1925 + !local_trylock_irqsave(&memcg_stock.lock, flags)) { 1950 1926 /* 1951 1927 * In case of 
larger than batch refill or unlikely failure to 1952 - * lock the percpu memcg_lock, uncharge memcg directly. 1928 + * lock the percpu memcg_stock.lock, uncharge memcg directly. 1953 1929 */ 1954 1930 memcg_uncharge(memcg, nr_pages); 1955 1931 return; ··· 1981 1957 WRITE_ONCE(stock->nr_pages[i], nr_pages); 1982 1958 } 1983 1959 1984 - local_unlock_irqrestore(&memcg_stock.memcg_lock, flags); 1960 + local_unlock_irqrestore(&memcg_stock.lock, flags); 1985 1961 } 1986 1962 1987 - static bool is_drain_needed(struct memcg_stock_pcp *stock, 1988 - struct mem_cgroup *root_memcg) 1963 + static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock, 1964 + struct mem_cgroup *root_memcg) 1989 1965 { 1990 1966 struct mem_cgroup *memcg; 1991 1967 bool flush = false; 1992 1968 int i; 1993 1969 1994 1970 rcu_read_lock(); 1995 - 1996 - if (obj_stock_flush_required(stock, root_memcg)) { 1997 - flush = true; 1998 - goto out; 1999 - } 2000 - 2001 1971 for (i = 0; i < NR_MEMCG_STOCK; ++i) { 2002 1972 memcg = READ_ONCE(stock->cached[i]); 2003 1973 if (!memcg) ··· 2003 1985 break; 2004 1986 } 2005 1987 } 2006 - out: 2007 1988 rcu_read_unlock(); 2008 1989 return flush; 2009 1990 } ··· 2027 2010 migrate_disable(); 2028 2011 curcpu = smp_processor_id(); 2029 2012 for_each_online_cpu(cpu) { 2030 - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2031 - bool flush = is_drain_needed(stock, root_memcg); 2013 + struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu); 2014 + struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu); 2032 2015 2033 - if (flush && 2034 - !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2016 + if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) && 2017 + is_memcg_drain_needed(memcg_st, root_memcg) && 2018 + !test_and_set_bit(FLUSHING_CACHED_CHARGE, 2019 + &memcg_st->flags)) { 2035 2020 if (cpu == curcpu) 2036 - drain_local_stock(&stock->work); 2021 + drain_local_memcg_stock(&memcg_st->work); 2037 2022 else if (!cpu_is_isolated(cpu)) 
2038 - schedule_work_on(cpu, &stock->work); 2023 + schedule_work_on(cpu, &memcg_st->work); 2024 + } 2025 + 2026 + if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) && 2027 + obj_stock_flush_required(obj_st, root_memcg) && 2028 + !test_and_set_bit(FLUSHING_CACHED_CHARGE, 2029 + &obj_st->flags)) { 2030 + if (cpu == curcpu) 2031 + drain_local_obj_stock(&obj_st->work); 2032 + else if (!cpu_is_isolated(cpu)) 2033 + schedule_work_on(cpu, &obj_st->work); 2039 2034 } 2040 2035 } 2041 2036 migrate_enable(); ··· 2056 2027 2057 2028 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2058 2029 { 2059 - struct memcg_stock_pcp *stock; 2030 + struct obj_stock_pcp *obj_st; 2060 2031 unsigned long flags; 2061 2032 2062 - stock = &per_cpu(memcg_stock, cpu); 2033 + obj_st = &per_cpu(obj_stock, cpu); 2063 2034 2064 - /* drain_obj_stock requires obj_lock */ 2065 - local_lock_irqsave(&memcg_stock.obj_lock, flags); 2066 - drain_obj_stock(stock); 2067 - local_unlock_irqrestore(&memcg_stock.obj_lock, flags); 2035 + /* drain_obj_stock requires objstock.lock */ 2036 + local_lock_irqsave(&obj_stock.lock, flags); 2037 + drain_obj_stock(obj_st); 2038 + local_unlock_irqrestore(&obj_stock.lock, flags); 2068 2039 2069 2040 /* no need for the local lock */ 2070 - drain_stock_fully(stock); 2041 + drain_stock_fully(&per_cpu(memcg_stock, cpu)); 2071 2042 2072 2043 return 0; 2073 2044 } ··· 2864 2835 } 2865 2836 2866 2837 static void __account_obj_stock(struct obj_cgroup *objcg, 2867 - struct memcg_stock_pcp *stock, int nr, 2838 + struct obj_stock_pcp *stock, int nr, 2868 2839 struct pglist_data *pgdat, enum node_stat_item idx) 2869 2840 { 2870 2841 int *bytes; ··· 2915 2886 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 2916 2887 struct pglist_data *pgdat, enum node_stat_item idx) 2917 2888 { 2918 - struct memcg_stock_pcp *stock; 2889 + struct obj_stock_pcp *stock; 2919 2890 unsigned long flags; 2920 2891 bool ret = false; 2921 2892 2922 - 
local_lock_irqsave(&memcg_stock.obj_lock, flags); 2893 + local_lock_irqsave(&obj_stock.lock, flags); 2923 2894 2924 - stock = this_cpu_ptr(&memcg_stock); 2895 + stock = this_cpu_ptr(&obj_stock); 2925 2896 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { 2926 2897 stock->nr_bytes -= nr_bytes; 2927 2898 ret = true; ··· 2930 2901 __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); 2931 2902 } 2932 2903 2933 - local_unlock_irqrestore(&memcg_stock.obj_lock, flags); 2904 + local_unlock_irqrestore(&obj_stock.lock, flags); 2934 2905 2935 2906 return ret; 2936 2907 } 2937 2908 2938 - static void drain_obj_stock(struct memcg_stock_pcp *stock) 2909 + static void drain_obj_stock(struct obj_stock_pcp *stock) 2939 2910 { 2940 2911 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); 2941 2912 ··· 2996 2967 obj_cgroup_put(old); 2997 2968 } 2998 2969 2999 - static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2970 + static bool obj_stock_flush_required(struct obj_stock_pcp *stock, 3000 2971 struct mem_cgroup *root_memcg) 3001 2972 { 3002 2973 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); 3003 2974 struct mem_cgroup *memcg; 2975 + bool flush = false; 3004 2976 2977 + rcu_read_lock(); 3005 2978 if (objcg) { 3006 2979 memcg = obj_cgroup_memcg(objcg); 3007 2980 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3008 - return true; 2981 + flush = true; 3009 2982 } 2983 + rcu_read_unlock(); 3010 2984 3011 - return false; 2985 + return flush; 3012 2986 } 3013 2987 3014 2988 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3015 2989 bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, 3016 2990 enum node_stat_item idx) 3017 2991 { 3018 - struct memcg_stock_pcp *stock; 2992 + struct obj_stock_pcp *stock; 3019 2993 unsigned long flags; 3020 2994 unsigned int nr_pages = 0; 3021 2995 3022 - local_lock_irqsave(&memcg_stock.obj_lock, flags); 2996 + local_lock_irqsave(&obj_stock.lock, 
flags); 3023 2997 3024 - stock = this_cpu_ptr(&memcg_stock); 2998 + stock = this_cpu_ptr(&obj_stock); 3025 2999 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ 3026 3000 drain_obj_stock(stock); 3027 3001 obj_cgroup_get(objcg); ··· 3044 3012 stock->nr_bytes &= (PAGE_SIZE - 1); 3045 3013 } 3046 3014 3047 - local_unlock_irqrestore(&memcg_stock.obj_lock, flags); 3015 + local_unlock_irqrestore(&obj_stock.lock, flags); 3048 3016 3049 3017 if (nr_pages) 3050 3018 obj_cgroup_uncharge_pages(objcg, nr_pages); ··· 5109 5077 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 5110 5078 memcg_hotplug_cpu_dead); 5111 5079 5112 - for_each_possible_cpu(cpu) 5080 + for_each_possible_cpu(cpu) { 5113 5081 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 5114 - drain_local_stock); 5082 + drain_local_memcg_stock); 5083 + INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work, 5084 + drain_local_obj_stock); 5085 + } 5115 5086 5116 5087 memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids); 5117 5088 memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,