Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: xtables: don't save/restore jumpstack offset

In most cases there is no reentrancy into ip/ip6tables.

For skbs sent by REJECT or SYNPROXY targets, there is one level
of reentrancy, but it's not relevant as those targets issue an absolute
verdict, i.e. the jumpstack can be clobbered since it's not used
after the target issues an absolute verdict (ACCEPT, DROP, STOLEN, etc).

So the only special case where it is relevant is the TEE target, which
returns XT_CONTINUE.

This patch changes ip(6)_do_table to always use the jump stack starting
from 0.

When we detect we're operating on an skb sent via TEE (percpu
nf_skb_duplicated is 1) we switch to an alternate stack to leave
the original one alone.

Since there is no TEE support for arptables, arpt_do_table doesn't need
to test whether TEE is active.

The jump stack overflow tests are no longer needed either --
since ->stacksize is the largest possible call depth, we cannot exceed it.

A much better alternative to the external jumpstack would be to just
declare a jumps[32] stack on the local stack frame, but that would mean
we'd have to reject iptables rulesets that used to work before.

Another alternative would be to start rejecting rulesets with a larger
call depth, e.g. 1000 -- in this case it would be feasible to allocate the
entire stack in the percpu area which would avoid one dereference.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
7814b6ec e7c8899f

+48 -49
-1
include/linux/netfilter/x_tables.h
··· 222 222 * @stacksize jumps (number of user chains) can possibly be made. 223 223 */ 224 224 unsigned int stacksize; 225 - unsigned int __percpu *stackptr; 226 225 void ***jumpstack; 227 226 228 227 unsigned char entries[0] __aligned(8);
+3 -8
net/ipv4/netfilter/arp_tables.c
··· 280 280 table_base = private->entries; 281 281 jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; 282 282 283 + /* No TEE support for arptables, so no need to switch to alternate 284 + * stack. All targets that reenter must return absolute verdicts. 285 + */ 283 286 e = get_entry(table_base, private->hook_entry[hook]); 284 287 285 288 acpar.in = state->in; ··· 328 325 } 329 326 if (table_base + v 330 327 != arpt_next_entry(e)) { 331 - 332 - if (stackidx >= private->stacksize) { 333 - verdict = NF_DROP; 334 - break; 335 - } 336 328 jumpstack[stackidx++] = e; 337 329 } 338 330 ··· 335 337 continue; 336 338 } 337 339 338 - /* Targets which reenter must return 339 - * abs. verdicts 340 - */ 341 340 acpar.target = t->u.kernel.target; 342 341 acpar.targinfo = t->data; 343 342 verdict = t->u.kernel.target->target(skb, &acpar);
+20 -17
net/ipv4/netfilter/ip_tables.c
··· 296 296 const char *indev, *outdev; 297 297 const void *table_base; 298 298 struct ipt_entry *e, **jumpstack; 299 - unsigned int *stackptr, origptr, cpu; 299 + unsigned int stackidx, cpu; 300 300 const struct xt_table_info *private; 301 301 struct xt_action_param acpar; 302 302 unsigned int addend; 303 303 304 304 /* Initialization */ 305 + stackidx = 0; 305 306 ip = ip_hdr(skb); 306 307 indev = state->in ? state->in->name : nulldevname; 307 308 outdev = state->out ? state->out->name : nulldevname; ··· 332 331 smp_read_barrier_depends(); 333 332 table_base = private->entries; 334 333 jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; 335 - stackptr = per_cpu_ptr(private->stackptr, cpu); 336 - origptr = *stackptr; 334 + 335 + /* Switch to alternate jumpstack if we're being invoked via TEE. 336 + * TEE issues XT_CONTINUE verdict on original skb so we must not 337 + * clobber the jumpstack. 338 + * 339 + * For recursion via REJECT or SYNPROXY the stack will be clobbered 340 + * but it is no problem since absolute verdict is issued by these. 
341 + */ 342 + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); 337 343 338 344 e = get_entry(table_base, private->hook_entry[hook]); 339 345 340 - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", 341 - table->name, hook, origptr, 346 + pr_debug("Entering %s(hook %u), UF %p\n", 347 + table->name, hook, 342 348 get_entry(table_base, private->underflow[hook])); 343 349 344 350 do { ··· 391 383 verdict = (unsigned int)(-v) - 1; 392 384 break; 393 385 } 394 - if (*stackptr <= origptr) { 386 + if (stackidx == 0) { 395 387 e = get_entry(table_base, 396 388 private->underflow[hook]); 397 389 pr_debug("Underflow (this is normal) " 398 390 "to %p\n", e); 399 391 } else { 400 - e = jumpstack[--*stackptr]; 392 + e = jumpstack[--stackidx]; 401 393 pr_debug("Pulled %p out from pos %u\n", 402 - e, *stackptr); 394 + e, stackidx); 403 395 e = ipt_next_entry(e); 404 396 } 405 397 continue; 406 398 } 407 399 if (table_base + v != ipt_next_entry(e) && 408 400 !(e->ip.flags & IPT_F_GOTO)) { 409 - if (*stackptr >= private->stacksize) { 410 - verdict = NF_DROP; 411 - break; 412 - } 413 - jumpstack[(*stackptr)++] = e; 401 + jumpstack[stackidx++] = e; 414 402 pr_debug("Pushed %p into pos %u\n", 415 - e, *stackptr - 1); 403 + e, stackidx - 1); 416 404 } 417 405 418 406 e = get_entry(table_base, v); ··· 427 423 /* Verdict */ 428 424 break; 429 425 } while (!acpar.hotdrop); 430 - pr_debug("Exiting %s; resetting sp from %u to %u\n", 431 - __func__, *stackptr, origptr); 432 - *stackptr = origptr; 426 + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); 427 + 433 428 xt_write_recseq_end(addend); 434 429 local_bh_enable(); 435 430
+14 -12
net/ipv6/netfilter/ip6_tables.c
··· 324 324 const char *indev, *outdev; 325 325 const void *table_base; 326 326 struct ip6t_entry *e, **jumpstack; 327 - unsigned int *stackptr, origptr, cpu; 327 + unsigned int stackidx, cpu; 328 328 const struct xt_table_info *private; 329 329 struct xt_action_param acpar; 330 330 unsigned int addend; 331 331 332 332 /* Initialization */ 333 + stackidx = 0; 333 334 indev = state->in ? state->in->name : nulldevname; 334 335 outdev = state->out ? state->out->name : nulldevname; 335 336 /* We handle fragments by dealing with the first fragment as ··· 358 357 cpu = smp_processor_id(); 359 358 table_base = private->entries; 360 359 jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; 361 - stackptr = per_cpu_ptr(private->stackptr, cpu); 362 - origptr = *stackptr; 360 + 361 + /* Switch to alternate jumpstack if we're being invoked via TEE. 362 + * TEE issues XT_CONTINUE verdict on original skb so we must not 363 + * clobber the jumpstack. 364 + * 365 + * For recursion via REJECT or SYNPROXY the stack will be clobbered 366 + * but it is no problem since absolute verdict is issued by these. 
367 + */ 368 + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); 363 369 364 370 e = get_entry(table_base, private->hook_entry[hook]); 365 371 ··· 414 406 verdict = (unsigned int)(-v) - 1; 415 407 break; 416 408 } 417 - if (*stackptr <= origptr) 409 + if (stackidx == 0) 418 410 e = get_entry(table_base, 419 411 private->underflow[hook]); 420 412 else 421 - e = ip6t_next_entry(jumpstack[--*stackptr]); 413 + e = ip6t_next_entry(jumpstack[--stackidx]); 422 414 continue; 423 415 } 424 416 if (table_base + v != ip6t_next_entry(e) && 425 417 !(e->ipv6.flags & IP6T_F_GOTO)) { 426 - if (*stackptr >= private->stacksize) { 427 - verdict = NF_DROP; 428 - break; 429 - } 430 - jumpstack[(*stackptr)++] = e; 418 + jumpstack[stackidx++] = e; 431 419 } 432 420 433 421 e = get_entry(table_base, v); ··· 440 436 /* Verdict */ 441 437 break; 442 438 } while (!acpar.hotdrop); 443 - 444 - *stackptr = origptr; 445 439 446 440 xt_write_recseq_end(addend); 447 441 local_bh_enable();
+11 -11
net/netfilter/x_tables.c
··· 67 67 [NFPROTO_IPV6] = "ip6", 68 68 }; 69 69 70 - /* Allow this many total (re)entries. */ 71 - static const unsigned int xt_jumpstack_multiplier = 2; 72 - 73 70 /* Registration hooks for targets. */ 74 71 int xt_register_target(struct xt_target *target) 75 72 { ··· 685 688 kvfree(info->jumpstack); 686 689 } 687 690 688 - free_percpu(info->stackptr); 689 - 690 691 kvfree(info); 691 692 } 692 693 EXPORT_SYMBOL(xt_free_table_info); ··· 732 737 unsigned int size; 733 738 int cpu; 734 739 735 - i->stackptr = alloc_percpu(unsigned int); 736 - if (i->stackptr == NULL) 737 - return -ENOMEM; 738 - 739 740 size = sizeof(void **) * nr_cpu_ids; 740 741 if (size > PAGE_SIZE) 741 742 i->jumpstack = vzalloc(size); ··· 744 753 if (i->stacksize == 0) 745 754 return 0; 746 755 747 - i->stacksize *= xt_jumpstack_multiplier; 748 - size = sizeof(void *) * i->stacksize; 756 + /* Jumpstack needs to be able to record two full callchains, one 757 + * from the first rule set traversal, plus one table reentrancy 758 + * via -j TEE without clobbering the callchain that brought us to 759 + * TEE target. 760 + * 761 + * This is done by allocating two jumpstacks per cpu, on reentry 762 + * the upper half of the stack is used. 763 + * 764 + * see the jumpstack setup in ipt_do_table() for more details. 765 + */ 766 + size = sizeof(void *) * i->stacksize * 2u; 749 767 for_each_possible_cpu(cpu) { 750 768 if (size > PAGE_SIZE) 751 769 i->jumpstack[cpu] = vmalloc_node(size,