Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'bpf: Handle MEM_RCU type properly'

Yonghong Song says:

====================

Patch set [1] added RCU support for bpf programs. In [1], an RCU
pointer is considered to be trusted and non-NULL. This is actually
not true in some cases. The RCU pointer could be NULL, and a non-NULL
RCU pointer may have a reference count of 0. This small patch set
fixes these problems. Patch 1 is the kernel fix. Patch 2 adjusts the
selftests accordingly. Patch 3 adds documentation for the newly-introduced
KF_RCU flag.

[1] https://lore.kernel.org/all/20221124053201.2372298-1-yhs@fb.com/

Changelog:
v1 -> v2:
- Handle the case where the RCU pointer could be NULL.
- non_null_rcu_ptr->rcu_field can be marked as MEM_RCU as well.
- Adjusted the code to avoid changing existing error messages.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+102 -24
+9
Documentation/bpf/kfuncs.rst
··· 191 191 calls. At the moment they only require CAP_SYS_BOOT capability, but more can be 192 192 added later. 193 193 194 + 2.4.8 KF_RCU flag 195 + ----------------- 196 + 197 + The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument. 198 + When used together with KF_ACQUIRE, it indicates the kfunc should have a 199 + single argument which must be a trusted argument or a MEM_RCU pointer. 200 + The argument may have reference count of 0 and the kfunc must take this 201 + into consideration. 202 + 194 203 2.5 Registering the kfuncs 195 204 -------------------------- 196 205
+1 -1
include/linux/bpf_verifier.h
··· 682 682 } 683 683 } 684 684 685 - #define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) 685 + #define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) 686 686 687 687 static inline bool bpf_type_has_unsafe_modifiers(u32 type) 688 688 {
+1
include/linux/btf.h
··· 70 70 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ 71 71 #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ 72 72 #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ 73 + #define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ 73 74 74 75 /* 75 76 * Return the name of the passed struct, if exists, or halt the build if for
+14
kernel/bpf/helpers.c
··· 1838 1838 } 1839 1839 1840 1840 /** 1841 + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task 1842 + * acquired by this kfunc which is not stored in a map as a kptr, must be 1843 + * released by calling bpf_task_release(). 1844 + * @p: The task on which a reference is being acquired. 1845 + */ 1846 + struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) 1847 + { 1848 + if (!refcount_inc_not_zero(&p->rcu_users)) 1849 + return NULL; 1850 + return p; 1851 + } 1852 + 1853 + /** 1841 1854 * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task 1842 1855 * kptr acquired by this kfunc which is not subsequently stored in a map, must 1843 1856 * be released by calling bpf_task_release(). ··· 2026 2013 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) 2027 2014 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) 2028 2015 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) 2016 + BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) 2029 2017 BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) 2030 2018 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) 2031 2019 #ifdef CONFIG_CGROUPS
+32 -13
kernel/bpf/verifier.c
··· 4275 4275 return true; 4276 4276 4277 4277 /* If a register is not referenced, it is trusted if it has the 4278 - * MEM_ALLOC, MEM_RCU or PTR_TRUSTED type modifiers, and no others. Some of the 4278 + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the 4279 4279 * other type modifiers may be safe, but we elect to take an opt-in 4280 4280 * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are 4281 4281 * not. ··· 4285 4285 */ 4286 4286 return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && 4287 4287 !bpf_type_has_unsafe_modifiers(reg->type); 4288 + } 4289 + 4290 + static bool is_rcu_reg(const struct bpf_reg_state *reg) 4291 + { 4292 + return reg->type & MEM_RCU; 4288 4293 } 4289 4294 4290 4295 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, ··· 4790 4785 4791 4786 if (flag & MEM_RCU) { 4792 4787 /* Mark value register as MEM_RCU only if it is protected by 4793 - * bpf_rcu_read_lock() and the ptr reg is trusted. MEM_RCU 4788 + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU 4794 4789 * itself can already indicate trustedness inside the rcu 4795 - * read lock region. Also mark it as PTR_TRUSTED. 4790 + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since 4791 + * it could be null in some cases. 4796 4792 */ 4797 - if (!env->cur_state->active_rcu_lock || !is_trusted_reg(reg)) 4793 + if (!env->cur_state->active_rcu_lock || 4794 + !(is_trusted_reg(reg) || is_rcu_reg(reg))) 4798 4795 flag &= ~MEM_RCU; 4799 4796 else 4800 - flag |= PTR_TRUSTED; 4797 + flag |= PTR_MAYBE_NULL; 4801 4798 } else if (reg->type & MEM_RCU) { 4802 4799 /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged 4803 4800 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. 
··· 5964 5957 .types = { 5965 5958 PTR_TO_BTF_ID, 5966 5959 PTR_TO_BTF_ID | PTR_TRUSTED, 5967 - PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED, 5960 + PTR_TO_BTF_ID | MEM_RCU, 5968 5961 }, 5969 5962 }; 5970 5963 static const struct bpf_reg_types percpu_btf_ptr_types = { ··· 6143 6136 case PTR_TO_BTF_ID: 6144 6137 case PTR_TO_BTF_ID | MEM_ALLOC: 6145 6138 case PTR_TO_BTF_ID | PTR_TRUSTED: 6146 - case PTR_TO_BTF_ID | MEM_RCU | PTR_TRUSTED: 6139 + case PTR_TO_BTF_ID | MEM_RCU: 6147 6140 case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: 6148 6141 /* When referenced PTR_TO_BTF_ID is passed to release function, 6149 6142 * it's fixed offset must be 0. In the other cases, fixed offset ··· 8045 8038 return meta->kfunc_flags & KF_DESTRUCTIVE; 8046 8039 } 8047 8040 8041 + static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) 8042 + { 8043 + return meta->kfunc_flags & KF_RCU; 8044 + } 8045 + 8048 8046 static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) 8049 8047 { 8050 8048 return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); ··· 8734 8722 switch (kf_arg_type) { 8735 8723 case KF_ARG_PTR_TO_ALLOC_BTF_ID: 8736 8724 case KF_ARG_PTR_TO_BTF_ID: 8737 - if (!is_kfunc_trusted_args(meta)) 8725 + if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) 8738 8726 break; 8739 8727 8740 8728 if (!is_trusted_reg(reg)) { 8741 - verbose(env, "R%d must be referenced or trusted\n", regno); 8742 - return -EINVAL; 8729 + if (!is_kfunc_rcu(meta)) { 8730 + verbose(env, "R%d must be referenced or trusted\n", regno); 8731 + return -EINVAL; 8732 + } 8733 + if (!is_rcu_reg(reg)) { 8734 + verbose(env, "R%d must be a rcu pointer\n", regno); 8735 + return -EINVAL; 8736 + } 8743 8737 } 8738 + 8744 8739 fallthrough; 8745 8740 case KF_ARG_PTR_TO_CTX: 8746 8741 /* Trusted arguments have the same offset checks as release arguments */ ··· 8858 8839 case KF_ARG_PTR_TO_BTF_ID: 8859 8840 /* Only base_type is checked, further checks are done here */ 8860 8841 if ((base_type(reg->type) != 
PTR_TO_BTF_ID || 8861 - bpf_type_has_unsafe_modifiers(reg->type)) && 8842 + (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && 8862 8843 !reg2btf_ids[base_type(reg->type)]) { 8863 8844 verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); 8864 8845 verbose(env, "expected %s or socket\n", ··· 8973 8954 } else if (rcu_unlock) { 8974 8955 bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ 8975 8956 if (reg->type & MEM_RCU) { 8976 - reg->type &= ~(MEM_RCU | PTR_TRUSTED); 8957 + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); 8977 8958 reg->type |= PTR_UNTRUSTED; 8978 8959 } 8979 8960 })); ··· 11313 11294 bool is_null) 11314 11295 { 11315 11296 if (type_may_be_null(reg->type) && reg->id == id && 11316 - !WARN_ON_ONCE(!reg->id)) { 11297 + (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { 11317 11298 /* Old offset (both fixed and variable parts) should have been 11318 11299 * known-zero, because we don't allow pointer arithmetic on 11319 11300 * pointers that might be NULL. If we see this happening, don't
+45 -10
tools/testing/selftests/bpf/progs/rcu_read_lock.c
··· 23 23 void bpf_key_put(struct bpf_key *key) __ksym; 24 24 void bpf_rcu_read_lock(void) __ksym; 25 25 void bpf_rcu_read_unlock(void) __ksym; 26 - struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; 26 + struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) __ksym; 27 27 void bpf_task_release(struct task_struct *p) __ksym; 28 28 29 29 SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") 30 30 int get_cgroup_id(void *ctx) 31 31 { 32 32 struct task_struct *task; 33 + struct css_set *cgroups; 33 34 34 35 task = bpf_get_current_task_btf(); 35 36 if (task->pid != target_pid) ··· 38 37 39 38 /* simulate bpf_get_current_cgroup_id() helper */ 40 39 bpf_rcu_read_lock(); 41 - cgroup_id = task->cgroups->dfl_cgrp->kn->id; 40 + cgroups = task->cgroups; 41 + if (!cgroups) 42 + goto unlock; 43 + cgroup_id = cgroups->dfl_cgrp->kn->id; 44 + unlock: 42 45 bpf_rcu_read_unlock(); 43 46 return 0; 44 47 } ··· 61 56 bpf_rcu_read_lock(); 62 57 /* region including helper using rcu ptr real_parent */ 63 58 real_parent = task->real_parent; 59 + if (!real_parent) 60 + goto out; 64 61 ptr = bpf_task_storage_get(&map_a, real_parent, &init_val, 65 62 BPF_LOCAL_STORAGE_GET_F_CREATE); 66 63 if (!ptr) ··· 99 92 bpf_rcu_read_unlock(); 100 93 bpf_rcu_read_lock(); 101 94 real_parent = task->real_parent; 95 + if (!real_parent) 96 + goto out; 102 97 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 98 + out: 103 99 bpf_rcu_read_unlock(); 104 100 return 0; 105 101 } ··· 115 105 task = bpf_get_current_task_btf(); 116 106 bpf_rcu_read_lock(); 117 107 real_parent = task->real_parent; 108 + if (!real_parent) 109 + goto out; 118 110 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 111 + out: 119 112 bpf_rcu_read_unlock(); 120 113 return 0; 121 114 } ··· 134 121 135 122 bpf_rcu_read_lock(); 136 123 real_parent = task->real_parent; 124 + if (!real_parent) 125 + goto out; 137 126 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 127 + out: 138 128 bpf_rcu_read_unlock(); 139 
129 return 0; 140 130 } ··· 145 129 SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") 146 130 int task_acquire(void *ctx) 147 131 { 148 - struct task_struct *task, *real_parent; 132 + struct task_struct *task, *real_parent, *gparent; 149 133 150 134 task = bpf_get_current_task_btf(); 151 135 bpf_rcu_read_lock(); 152 136 real_parent = task->real_parent; 137 + if (!real_parent) 138 + goto out; 139 + 140 + /* rcu_ptr->rcu_field */ 141 + gparent = real_parent->real_parent; 142 + if (!gparent) 143 + goto out; 144 + 153 145 /* acquire a reference which can be used outside rcu read lock region */ 154 - real_parent = bpf_task_acquire(real_parent); 146 + gparent = bpf_task_acquire_not_zero(gparent); 147 + if (!gparent) 148 + goto out; 149 + 150 + (void)bpf_task_storage_get(&map_a, gparent, 0, 0); 151 + bpf_task_release(gparent); 152 + out: 155 153 bpf_rcu_read_unlock(); 156 - (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 157 - bpf_task_release(real_parent); 158 154 return 0; 159 155 } 160 156 ··· 209 181 /* non-sleepable: missing bpf_rcu_read_unlock() in one path */ 210 182 bpf_rcu_read_lock(); 211 183 real_parent = task->real_parent; 184 + if (!real_parent) 185 + goto out; 212 186 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 213 187 if (real_parent) 214 188 bpf_rcu_read_unlock(); 189 + out: 215 190 return 0; 216 191 } 217 192 ··· 230 199 /* sleepable helper in rcu read lock region */ 231 200 bpf_rcu_read_lock(); 232 201 real_parent = task->real_parent; 202 + if (!real_parent) 203 + goto out; 233 204 regs = (struct pt_regs *)bpf_task_pt_regs(real_parent); 234 - if (!regs) { 235 - bpf_rcu_read_unlock(); 236 - return 0; 237 - } 205 + if (!regs) 206 + goto out; 238 207 239 208 ptr = (void *)PT_REGS_IP(regs); 240 209 (void)bpf_copy_from_user_task(&value, sizeof(uint32_t), ptr, task, 0); 241 210 user_data = value; 242 211 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 212 + out: 243 213 bpf_rcu_read_unlock(); 244 214 return 0; 245 215 } ··· 271 239 
bpf_rcu_read_lock(); 272 240 bpf_rcu_read_lock(); 273 241 real_parent = task->real_parent; 242 + if (!real_parent) 243 + goto out; 274 244 (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); 245 + out: 275 246 bpf_rcu_read_unlock(); 276 247 bpf_rcu_read_unlock(); 277 248 return 0;