Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

apparmor: cache buffers on percpu list if there is lock contention

commit df323337e507 ("apparmor: Use a memory pool instead per-CPU caches")

changed buffer allocation to use a memory pool, however on a heavily
loaded machine there can be lock contention on the global buffers
lock. Add a percpu list to cache buffers on when lock contention is
encountered.

When allocating buffers attempt to use cached buffers first,
before taking the global buffers lock. When freeing buffers
try to put them back to the global list but if contention is
encountered, put the buffer on the percpu list.

The length of time a buffer is held on the percpu list is dynamically
adjusted based on lock contention. The amount of hold time is
increased and decreased linearly.

v5:
- simplify base patch by removing: improvements can be added later
- MAX_LOCAL and must lock
- contention scaling.
v4:
- fix percpu ->count buffer count which had been spliced across a
debug patch.
- introduce define for MAX_LOCAL_COUNT
- rework count check and locking around it.
- update commit message to reference the commit that introduced the
  memory pool.
v3:
- limit number of buffers that can be pushed onto the percpu
list. This avoids a problem on some kernels where one percpu
list can inherit buffers from another cpu after a reschedule,
causing more kernel memory to be used than is necessary. Under
normal conditions this should eventually return to normal
but under pathological conditions the extra memory consumption
may have been unbounded
v2:
- dynamically adjust buffer hold time on percpu list based on
lock contention.
v1:
- cache buffers on percpu list on lock contention

Reported-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Georgia Garcia <georgia.garcia@canonical.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>

+62 -5
+62 -5
security/apparmor/lsm.c
··· 49 49 DECLARE_FLEX_ARRAY(char, buffer); 50 50 }; 51 51 52 + struct aa_local_cache { 53 + unsigned int hold; 54 + unsigned int count; 55 + struct list_head head; 56 + }; 57 + 52 58 #define RESERVE_COUNT 2 53 59 static int reserve_count = RESERVE_COUNT; 54 60 static int buffer_count; 55 61 56 62 static LIST_HEAD(aa_global_buffers); 57 63 static DEFINE_SPINLOCK(aa_buffers_lock); 64 + static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); 58 65 59 66 /* 60 67 * LSM hook functions ··· 1796 1789 char *aa_get_buffer(bool in_atomic) 1797 1790 { 1798 1791 union aa_buffer *aa_buf; 1792 + struct aa_local_cache *cache; 1799 1793 bool try_again = true; 1800 1794 gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); 1801 1795 1796 + /* use per cpu cached buffers first */ 1797 + cache = get_cpu_ptr(&aa_local_buffers); 1798 + if (!list_empty(&cache->head)) { 1799 + aa_buf = list_first_entry(&cache->head, union aa_buffer, list); 1800 + list_del(&aa_buf->list); 1801 + cache->hold--; 1802 + cache->count--; 1803 + put_cpu_ptr(&aa_local_buffers); 1804 + return &aa_buf->buffer[0]; 1805 + } 1806 + put_cpu_ptr(&aa_local_buffers); 1807 + 1808 + if (!spin_trylock(&aa_buffers_lock)) { 1809 + cache = get_cpu_ptr(&aa_local_buffers); 1810 + cache->hold += 1; 1811 + put_cpu_ptr(&aa_local_buffers); 1812 + spin_lock(&aa_buffers_lock); 1813 + } else { 1814 + cache = get_cpu_ptr(&aa_local_buffers); 1815 + put_cpu_ptr(&aa_local_buffers); 1816 + } 1802 1817 retry: 1803 - spin_lock(&aa_buffers_lock); 1804 1818 if (buffer_count > reserve_count || 1805 1819 (in_atomic && !list_empty(&aa_global_buffers))) { 1806 1820 aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, ··· 1847 1819 if (!aa_buf) { 1848 1820 if (try_again) { 1849 1821 try_again = false; 1822 + spin_lock(&aa_buffers_lock); 1850 1823 goto retry; 1851 1824 } 1852 1825 pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); ··· 1859 1830 void aa_put_buffer(char *buf) 1860 1831 { 1861 1832 union 
aa_buffer *aa_buf; 1833 + struct aa_local_cache *cache; 1862 1834 1863 1835 if (!buf) 1864 1836 return; 1865 1837 aa_buf = container_of(buf, union aa_buffer, buffer[0]); 1866 1838 1867 - spin_lock(&aa_buffers_lock); 1868 - list_add(&aa_buf->list, &aa_global_buffers); 1869 - buffer_count++; 1870 - spin_unlock(&aa_buffers_lock); 1839 + cache = get_cpu_ptr(&aa_local_buffers); 1840 + if (!cache->hold) { 1841 + put_cpu_ptr(&aa_local_buffers); 1842 + 1843 + if (spin_trylock(&aa_buffers_lock)) { 1844 + /* put back on global list */ 1845 + list_add(&aa_buf->list, &aa_global_buffers); 1846 + buffer_count++; 1847 + spin_unlock(&aa_buffers_lock); 1848 + cache = get_cpu_ptr(&aa_local_buffers); 1849 + put_cpu_ptr(&aa_local_buffers); 1850 + return; 1851 + } 1852 + /* contention on global list, fallback to percpu */ 1853 + cache = get_cpu_ptr(&aa_local_buffers); 1854 + cache->hold += 1; 1855 + } 1856 + 1857 + /* cache in percpu list */ 1858 + list_add(&aa_buf->list, &cache->head); 1859 + cache->count++; 1860 + put_cpu_ptr(&aa_local_buffers); 1871 1861 } 1872 1862 1873 1863 /* ··· 1928 1880 union aa_buffer *aa_buf; 1929 1881 int i, num; 1930 1882 1883 + /* 1884 + * per cpu set of cached allocated buffers used to help reduce 1885 + * lock contention 1886 + */ 1887 + for_each_possible_cpu(i) { 1888 + per_cpu(aa_local_buffers, i).hold = 0; 1889 + per_cpu(aa_local_buffers, i).count = 0; 1890 + INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); 1891 + } 1931 1892 /* 1932 1893 * A function may require two buffers at once. Usually the buffers are 1933 1894 * used for a short period of time and are shared. On UP kernel buffers