x86/sgx: Set active memcg prior to shmem allocation

When the system runs out of enclave memory, SGX can reclaim EPC pages
by swapping to normal RAM. These backing pages are allocated via a
per-enclave shared memory area. Since SGX allows unlimited
overcommit of EPC memory, the reclaimer thread can allocate a
large number of backing RAM pages in response to EPC memory
pressure.

When the shared memory backing RAM allocation occurs in the
context of the reclaimer thread (ksgxd), the shared memory is
charged to the root memory control group: the reclaimer is a
kernel thread and has no mm of its own to charge. The shmem
usage of the enclave is therefore not properly accounted for,
making cgroups ineffective at limiting the amount of RAM an
enclave can consume.
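
The misaccounting follows from how the memcg charge target is
picked. Below is a minimal sketch, loosely following the logic of
get_mem_cgroup_from_mm() in mm/memcontrol.c; the real code also
handles interrupt context and css reference counting:

	static struct mem_cgroup *charge_target(struct mm_struct *mm)
	{
		struct mem_cgroup *memcg;

		if (!mm) {
			/* A set_active_memcg() override wins... */
			if (current->active_memcg)
				return current->active_memcg;
			/*
			 * ...otherwise a kthread such as ksgxd, with
			 * mm == NULL and no override, charges the root.
			 */
			return root_mem_cgroup;
		}

		/* Normal task context: the memcg of the mm's owner. */
		rcu_read_lock();
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		rcu_read_unlock();

		return memcg;
	}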

For example, when using a cgroup to launch a set of test
enclaves, the kernel fails to properly account for 50% - 75%
of shmem page allocations on average. In the worst case, when
nearly all allocations occur in the reclaimer thread, the
kernel charges less than one percent of the shmem used by an
enclave to the correct cgroup.

SGX stores a list of mm_structs that are associated with
an enclave. Pick one of them during reclaim and charge the
shmem allocation to that mm's memcg. The one that gets picked
is arbitrary, but this list almost always has only one mm. The
cases where there is more than one mm with different memcgs
are not worth considering.
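
Charging the picked memcg then uses the stock set_active_memcg()
borrow/restore idiom. A minimal sketch of the pattern, with
do_backing_alloc() as a hypothetical stand-in for the shmem
allocation (the patch below applies it in sgx_encl_alloc_backing()):

	struct mem_cgroup *old_memcg;
	int ret;

	/* Redirect this task's allocation charges to "memcg". */
	old_memcg = set_active_memcg(memcg);

	ret = do_backing_alloc();	/* hypothetical shmem allocation */

	/* Restore the previous charge target, drop the picked reference. */
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);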

Create a new function - sgx_encl_alloc_backing(). This function
is used whenever a new backing storage page needs to be
allocated. Previously the same function was used both to
allocate a page and to retrieve a previously allocated page;
lookups now go through the new sgx_encl_lookup_backing().
Prior to a backing page allocation, if there is an mm_struct
associated with the enclave that is requesting the allocation,
its memcg is set as the active memory control group.
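
In short, the call sites split as follows (both forms appear in
the diff below):

	/*
	 * Reclaimer paths: the page may need to be allocated, so
	 * charge the enclave's memcg:
	 */
	ret = sgx_encl_alloc_backing(encl, page_index, &backing);

	/*
	 * ELDU fault path: the page was written out at reclaim time
	 * and must already exist, so a plain lookup suffices:
	 */
	ret = sgx_encl_lookup_backing(encl, page_index, &b);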

[ dhansen: - fix merge conflict with ELDU fixes
- check against actual ksgxd_tsk, not ->mm ]

Cc: stable@vger.kernel.org
Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://lkml.kernel.org/r/20220520174248.4918-1-kristen@linux.intel.com

---
 arch/x86/kernel/cpu/sgx/encl.c | 105 +++++++++++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/sgx/encl.h |   7 +++++--
 arch/x86/kernel/cpu/sgx/main.c |   9 +++++++--
 3 files changed, 115 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -152,7 +152,7 @@
 
 	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 
-	ret = sgx_encl_get_backing(encl, page_index, &b);
+	ret = sgx_encl_lookup_backing(encl, page_index, &b);
 	if (ret)
 		return ret;
 
@@ -718,7 +718,7 @@
  *   0 on success,
  *   -errno otherwise.
  */
-int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
 	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
@@ -741,6 +741,107 @@
 	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
 
 	return 0;
+}
+
+/*
+ * When called from ksgxd, returns the mem_cgroup of a struct mm stored
+ * in the enclave's mm_list. When not called from ksgxd, just returns
+ * the mem_cgroup of the current task.
+ */
+static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
+{
+	struct mem_cgroup *memcg = NULL;
+	struct sgx_encl_mm *encl_mm;
+	int idx;
+
+	/*
+	 * If called from normal task context, return the mem_cgroup
+	 * of the current task's mm. The remainder of the handling is for
+	 * ksgxd.
+	 */
+	if (!current_is_ksgxd())
+		return get_mem_cgroup_from_mm(current->mm);
+
+	/*
+	 * Search the enclave's mm_list to find an mm associated with
+	 * this enclave to charge the allocation to.
+	 */
+	idx = srcu_read_lock(&encl->srcu);
+
+	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+		if (!mmget_not_zero(encl_mm->mm))
+			continue;
+
+		memcg = get_mem_cgroup_from_mm(encl_mm->mm);
+
+		mmput_async(encl_mm->mm);
+
+		break;
+	}
+
+	srcu_read_unlock(&encl->srcu, idx);
+
+	/*
+	 * In the rare case that there isn't an mm associated with
+	 * the enclave, set memcg to the current active mem_cgroup.
+	 * This will be the root mem_cgroup if there is no active
+	 * mem_cgroup.
+	 */
+	if (!memcg)
+		return get_mem_cgroup_from_mm(NULL);
+
+	return memcg;
+}
+
+/**
+ * sgx_encl_alloc_backing() - allocate a new backing storage page
+ * @encl:	an enclave pointer
+ * @page_index:	enclave page index
+ * @backing:	data for accessing backing storage for the page
+ *
+ * When called from ksgxd, sets the active memcg from one of the
+ * mms in the enclave's mm_list prior to any backing page allocation,
+ * in order to ensure that shmem page allocations are charged to the
+ * enclave.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise.
+ */
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+			   struct sgx_backing *backing)
+{
+	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
+	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+	int ret;
+
+	ret = sgx_encl_get_backing(encl, page_index, backing);
+
+	set_active_memcg(memcg);
+	mem_cgroup_put(encl_memcg);
+
+	return ret;
+}
+
+/**
+ * sgx_encl_lookup_backing() - retrieve an existing backing storage page
+ * @encl:	an enclave pointer
+ * @page_index:	enclave page index
+ * @backing:	data for accessing backing storage for the page
+ *
+ * Retrieve a backing page for loading data back into an EPC page with ELDU.
+ * It is the caller's responsibility to ensure that it is appropriate to use
+ * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
+ * not used correctly, this will cause an allocation which is not accounted for.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise.
+ */
+int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+			    struct sgx_backing *backing)
+{
+	return sgx_encl_get_backing(encl, page_index, backing);
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -103,10 +103,13 @@
 int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
 		     unsigned long end, unsigned long vm_flags);
 
+bool current_is_ksgxd(void);
 void sgx_encl_release(struct kref *ref);
 int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
-int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
-			 struct sgx_backing *backing);
+int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+			    struct sgx_backing *backing);
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+			   struct sgx_backing *backing);
 void sgx_encl_put_backing(struct sgx_backing *backing);
 int sgx_encl_test_and_clear_young(struct mm_struct *mm,
 				  struct sgx_encl_page *page);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -313,7 +313,7 @@
 	sgx_encl_put_backing(backing);
 
 	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
-		ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
+		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
 					   &secs_backing);
 		if (ret)
 			goto out;
@@ -384,7 +384,7 @@
 		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
 
 		mutex_lock(&encl_page->encl->lock);
-		ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
+		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
 		if (ret) {
 			mutex_unlock(&encl_page->encl->lock);
 			goto skip;
@@ -473,6 +473,11 @@
 	ksgxd_tsk = tsk;
 
 	return true;
+}
+
+bool current_is_ksgxd(void)
+{
+	return current == ksgxd_tsk;
 }
 
 static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)