Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0
/*
 * BPF filter support for io_uring. Supports SQE opcodes for now.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io_uring.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "bpf_filter.h"
#include "net.h"
#include "openclose.h"

struct io_bpf_filter {
	refcount_t refs;
	struct bpf_prog *prog;
	struct io_bpf_filter *next;
};

/* Deny if this is set as the filter */
static const struct io_bpf_filter dummy_filter;

static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
				      struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	bctx->opcode = req->opcode;
	bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
	bctx->user_data = req->cqe.user_data;
	/* clear residual, anything from pdu_size and below */
	memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
	       sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));

	/*
	 * Opcodes can provide a handler for populating more data into bctx,
	 * for filters to use.
	 */
	if (def->filter_pdu_size) {
		bctx->pdu_size = def->filter_pdu_size;
		def->filter_populate(bctx, req);
	}
}

/*
 * Run registered filters for a given opcode. A filter returning 0 denies
 * execution of the request, a return of 1 allows it. If any filter for an
 * opcode returns 0, filter processing stops and the request is denied.
 *
 * __io_uring_run_bpf_filters() returns 0 on success, allowing the request
 * to run, and -EACCES when the request is denied.
 */
int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
			       struct io_kiocb *req)
{
	struct io_bpf_filter *filter;
	struct io_uring_bpf_ctx bpf_ctx;
	int ret;

	/* Fast check for existence of filters outside of RCU */
	if (!rcu_access_pointer(filters[req->opcode]))
		return 0;

	/*
	 * req->opcode has already been validated to be within the range
	 * of what we expect, io_init_req() does this.
	 */
	guard(rcu)();
	filter = rcu_dereference(filters[req->opcode]);
	if (!filter)
		return 0;
	else if (filter == &dummy_filter)
		return -EACCES;

	io_uring_populate_bpf_ctx(&bpf_ctx, req);

	/*
	 * Iterate registered filters. The opcode is allowed IFF all filters
	 * return 1; if any filter returns 0, the opcode is denied.
	 */
	do {
		if (filter == &dummy_filter)
			return -EACCES;
		ret = bpf_prog_run(filter->prog, &bpf_ctx);
		if (!ret)
			return -EACCES;
		filter = filter->next;
	} while (filter);

	return 0;
}

static void io_free_bpf_filters(struct rcu_head *head)
{
	struct io_bpf_filter __rcu **filter;
	struct io_bpf_filters *filters;
	int i;

	filters = container_of(head, struct io_bpf_filters, rcu_head);
	scoped_guard(spinlock, &filters->lock) {
		filter = filters->filters;
		if (!filter)
			return;
	}

	for (i = 0; i < IORING_OP_LAST; i++) {
		struct io_bpf_filter *f;

		rcu_read_lock();
		f = rcu_dereference(filter[i]);
		while (f) {
			struct io_bpf_filter *next = f->next;

			/*
			 * Even if stacked, dummy filter will always be last
			 * as it can only get installed into an empty spot.
			 */
			if (f == &dummy_filter)
				break;

			/* Someone still holds a ref, stop iterating. */
			if (!refcount_dec_and_test(&f->refs))
				break;

			bpf_prog_destroy(f->prog);
			kfree(f);
			f = next;
		}
		rcu_read_unlock();
	}
	kfree(filters->filters);
	kfree(filters);
}

static void __io_put_bpf_filters(struct io_bpf_filters *filters)
{
	if (refcount_dec_and_test(&filters->refs))
		call_rcu(&filters->rcu_head, io_free_bpf_filters);
}

void io_put_bpf_filters(struct io_restriction *res)
{
	if (res->bpf_filters)
		__io_put_bpf_filters(res->bpf_filters);
}

static struct io_bpf_filters *io_new_bpf_filters(void)
{
	struct io_bpf_filters *filters __free(kfree) = NULL;

	filters = kzalloc_obj(*filters, GFP_KERNEL_ACCOUNT);
	if (!filters)
		return ERR_PTR(-ENOMEM);

	filters->filters = kzalloc_objs(struct io_bpf_filter *, IORING_OP_LAST,
					GFP_KERNEL_ACCOUNT);
	if (!filters->filters)
		return ERR_PTR(-ENOMEM);

	refcount_set(&filters->refs, 1);
	spin_lock_init(&filters->lock);
	return no_free_ptr(filters);
}

/*
 * Validate classic BPF filter instructions. Only allow a safe subset of
 * operations - no packet data access, just context field loads and basic
 * ALU/jump operations.
 */
static int io_uring_check_cbpf_filter(struct sock_filter *filter,
				      unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}

void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
{
	if (!src->bpf_filters)
		return;

	rcu_read_lock();
	/*
	 * If the src filter is going away, just ignore it.
	 */
	if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
		dst->bpf_filters = src->bpf_filters;
		dst->bpf_filters_cow = true;
	}
	rcu_read_unlock();
}

/*
 * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
 * modifications need to be made.
 */
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
{
	struct io_bpf_filters *filters;
	struct io_bpf_filter *srcf;
	int i;

	filters = io_new_bpf_filters();
	if (IS_ERR(filters))
		return filters;

	/*
	 * Iterate filters from src and assign in destination. Grabbing
	 * a reference is enough, we don't need to duplicate the memory.
	 * This is safe because filters are only ever added to the
	 * front of the list, hence the only memory ever touched inside
	 * a filter is the refcount.
	 */
	rcu_read_lock();
	for (i = 0; i < IORING_OP_LAST; i++) {
		srcf = rcu_dereference(src->bpf_filters->filters[i]);
		if (!srcf) {
			continue;
		} else if (srcf == &dummy_filter) {
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
			continue;
		}

		/*
		 * Getting a ref on the first node is enough, putting the
		 * filter and iterating nodes to free will stop on the first
		 * one that doesn't hit zero when dropping.
		 */
		if (!refcount_inc_not_zero(&srcf->refs))
			goto err;
		rcu_assign_pointer(filters->filters[i], srcf);
	}
	rcu_read_unlock();
	return filters;
err:
	rcu_read_unlock();
	__io_put_bpf_filters(filters);
	return ERR_PTR(-EBUSY);
}

#define IO_URING_BPF_FILTER_FLAGS	(IO_URING_BPF_FILTER_DENY_REST | \
					 IO_URING_BPF_FILTER_SZ_STRICT)

static int io_bpf_filter_import(struct io_uring_bpf *reg,
				struct io_uring_bpf __user *arg)
{
	const struct io_issue_def *def;
	int ret;

	if (copy_from_user(reg, arg, sizeof(*reg)))
		return -EFAULT;
	if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
		return -EINVAL;
	if (reg->cmd_flags || reg->resv)
		return -EINVAL;

	if (reg->filter.opcode >= IORING_OP_LAST)
		return -EINVAL;
	if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
		return -EINVAL;
	if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
		return -EINVAL;

	/* Verify filter size */
	def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];

	/* same size, always ok */
	ret = 0;
	if (reg->filter.pdu_size == def->filter_pdu_size)
		;
	/* size differs, fail in strict mode */
	else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT)
		ret = -EMSGSIZE;
	/* userspace filter is bigger, always disallow */
	else if (reg->filter.pdu_size > def->filter_pdu_size)
		ret = -EMSGSIZE;

	/* copy back kernel filter size */
	reg->filter.pdu_size = def->filter_pdu_size;
	if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
		return -EFAULT;

	return ret;
}

int io_register_bpf_filter(struct io_restriction *res,
			   struct io_uring_bpf __user *arg)
{
	struct io_bpf_filters *filters, *old_filters = NULL;
	struct io_bpf_filter *filter, *old_filter;
	struct io_uring_bpf reg;
	struct bpf_prog *prog;
	struct sock_fprog fprog;
	int ret;

	ret = io_bpf_filter_import(&reg, arg);
	if (ret)
		return ret;

	fprog.len = reg.filter.filter_len;
	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);

	ret = bpf_prog_create_from_user(&prog, &fprog,
					io_uring_check_cbpf_filter, false);
	if (ret)
		return ret;

	/*
	 * No existing filters, allocate set.
	 */
	filters = res->bpf_filters;
	if (!filters) {
		filters = io_new_bpf_filters();
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
	} else if (res->bpf_filters_cow) {
		filters = io_bpf_filter_cow(res);
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
		/*
		 * Stash old filters, we'll put them once we know we'll
		 * succeed. Until then, res->bpf_filters is left untouched.
		 */
		old_filters = res->bpf_filters;
	}

	filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT);
	if (!filter) {
		ret = -ENOMEM;
		goto err;
	}
	refcount_set(&filter->refs, 1);
	filter->prog = prog;

	/*
	 * Success - install the new filter set now. If we did COW, put
	 * the old filters as we're replacing them.
	 */
	if (old_filters) {
		__io_put_bpf_filters(old_filters);
		res->bpf_filters_cow = false;
	}
	res->bpf_filters = filters;

	/*
	 * Insert filter - if the current opcode already has a filter
	 * attached, add to the set.
	 */
	rcu_read_lock();
	spin_lock_bh(&filters->lock);
	old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
	if (old_filter)
		filter->next = old_filter;
	rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);

	/*
	 * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered
	 * opcodes with the dummy filter, which causes them to be denied.
	 */
	if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
		for (int i = 0; i < IORING_OP_LAST; i++) {
			if (i == reg.filter.opcode)
				continue;
			old_filter = rcu_dereference(filters->filters[i]);
			if (old_filter)
				continue;
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
		}
	}

	spin_unlock_bh(&filters->lock);
	rcu_read_unlock();
	return 0;
err:
	if (filters != res->bpf_filters)
		__io_put_bpf_filters(filters);
err_prog:
	bpf_prog_destroy(prog);
	return ret;
}
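
For reference, here is a minimal userspace sketch (not part of the file above) of the kind of classic BPF program io_uring_check_cbpf_filter() would accept: a filter that denies IORING_OP_OPENAT and allows everything else, per the 0-denies/1-allows convention documented at __io_uring_run_bpf_filters(). It assumes struct io_uring_bpf_ctx begins with the one-byte opcode field on a little-endian machine; check the uapi definition before relying on that offset. How the program is handed to the kernel (via filter_ptr/filter_len of struct io_uring_bpf and the registration path into io_register_bpf_filter()) is wired up outside this file.

/*
 * Hypothetical example filter; all offsets/assumptions noted above.
 */
#include <stddef.h>
#include <linux/filter.h>	/* struct sock_filter, BPF_STMT(), BPF_JUMP() */
#include <linux/io_uring.h>	/* IORING_OP_OPENAT */

static struct sock_filter deny_openat[] = {
	/* A = first 32-bit word of io_uring_bpf_ctx (assumed to hold opcode) */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 0),
	/* keep only the low byte, assumed to be the opcode itself */
	BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xff),
	/* opcode == OPENAT? fall through to deny, otherwise skip to allow */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IORING_OP_OPENAT, 0, 1),
	/* deny: returning 0 makes __io_uring_run_bpf_filters() return -EACCES */
	BPF_STMT(BPF_RET | BPF_K, 0),
	/* allow: a non-zero return lets the request run */
	BPF_STMT(BPF_RET | BPF_K, 1),
};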