Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Track alignment of register values in the verifier.

Currently if we add only constant values to pointers we can fully
validate the alignment, and properly check if we need to reject the
program on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS architectures.

However, once an unknown value is introduced, we only allow byte-sized
memory accesses, which is too restrictive.

Add logic to track the known minimum alignment of register values,
and propagate this state into registers containing pointers.

The most common paradigm that makes use of this new logic is computing
the transport header using the IP header length field. For example:

struct ethhdr *ep = skb->data;
struct iphdr *iph = (struct iphdr *) (ep + 1);
struct tcphdr *th;
...
n = iph->ihl;
th = ((void *)iph + (n * 4));
port = th->dest;

The existing code will reject the load of th->dest because it cannot
validate that the alignment is at least 2 once "n * 4" is added to
the packet pointer.

In the new code, the register holding "n * 4" will have a reg->min_align
value of 4, because any value multiplied by 4 will be at least 4-byte
aligned. (Actually, the eBPF code emitted by the compiler in this case
is most likely to use a shift left by 2, but the end result is identical.)

At the critical addition:

th = ((void *)iph + (n * 4));

The register holding 'th' will start with a reg->off value of 14. The
pointer addition will transform that reg into something that looks like:

reg->aux_off = 14
reg->aux_off_align = 4

Next, the verifier will look at the th->dest load, and it will see
a load offset of 2, and first check:

if (reg->aux_off_align % size)

which will pass because aux_off_align is 4 and the access size is 2
(4 % 2 == 0). Next, reg_off will be computed:

reg_off = reg->off;
...
reg_off += reg->aux_off;

plus we have off==2, and it will thus check:

if ((NET_IP_ALIGN + reg_off + off) % size != 0)

which evaluates to:

if ((NET_IP_ALIGN + 14 + 2) % size != 0)

On strict alignment architectures, NET_IP_ALIGN is 2, thus:

if ((2 + 14 + 2) % size != 0)

which passes.

These pointer transformations and checks work regardless of whether
the constant offset or the variable with known alignment is added
first to the pointer register.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>

+92 -19
+3
include/linux/bpf_verifier.h
··· 40 40 */ 41 41 s64 min_value; 42 42 u64 max_value; 43 + u32 min_align; 44 + u32 aux_off; 45 + u32 aux_off_align; 43 46 }; 44 47 45 48 enum bpf_stack_slot_type {
+89 -19
kernel/bpf/verifier.c
··· 241 241 if (reg->max_value != BPF_REGISTER_MAX_RANGE) 242 242 verbose(",max_value=%llu", 243 243 (unsigned long long)reg->max_value); 244 + if (reg->min_align) 245 + verbose(",min_align=%u", reg->min_align); 246 + if (reg->aux_off) 247 + verbose(",aux_off=%u", reg->aux_off); 248 + if (reg->aux_off_align) 249 + verbose(",aux_off_align=%u", reg->aux_off_align); 244 250 } 245 251 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { 246 252 if (state->stack_slot_type[i] == STACK_SPILL) ··· 472 466 regs[i].imm = 0; 473 467 regs[i].min_value = BPF_REGISTER_MIN_RANGE; 474 468 regs[i].max_value = BPF_REGISTER_MAX_RANGE; 469 + regs[i].min_align = 0; 470 + regs[i].aux_off = 0; 471 + regs[i].aux_off_align = 0; 475 472 } 476 473 477 474 /* frame pointer */ ··· 501 492 { 502 493 regs[regno].min_value = BPF_REGISTER_MIN_RANGE; 503 494 regs[regno].max_value = BPF_REGISTER_MAX_RANGE; 495 + regs[regno].min_align = 0; 504 496 } 505 497 506 498 static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, ··· 789 779 } 790 780 791 781 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, 792 - int off, int size) 782 + int off, int size, bool strict) 793 783 { 794 - if (reg->id && size != 1) { 795 - verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); 796 - return -EACCES; 784 + int reg_off; 785 + 786 + /* Byte size accesses are always allowed. 
*/ 787 + if (!strict || size == 1) 788 + return 0; 789 + 790 + reg_off = reg->off; 791 + if (reg->id) { 792 + if (reg->aux_off_align % size) { 793 + verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", 794 + reg->aux_off_align, size); 795 + return -EACCES; 796 + } 797 + reg_off += reg->aux_off; 797 798 } 798 799 799 800 /* skb->data is NET_IP_ALIGN-ed */ 800 - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { 801 + if ((NET_IP_ALIGN + reg_off + off) % size != 0) { 801 802 verbose("misaligned packet access off %d+%d+%d size %d\n", 802 - NET_IP_ALIGN, reg->off, off, size); 803 + NET_IP_ALIGN, reg_off, off, size); 803 804 return -EACCES; 804 805 } 805 806 ··· 818 797 } 819 798 820 799 static int check_val_ptr_alignment(const struct bpf_reg_state *reg, 821 - int size) 800 + int size, bool strict) 822 801 { 823 - if (size != 1) { 802 + if (strict && size != 1) { 824 803 verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); 825 804 return -EACCES; 826 805 } ··· 831 810 static int check_ptr_alignment(const struct bpf_reg_state *reg, 832 811 int off, int size) 833 812 { 813 + bool strict = false; 814 + 815 + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) 816 + strict = true; 817 + 834 818 switch (reg->type) { 835 819 case PTR_TO_PACKET: 836 - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : 837 - check_pkt_ptr_alignment(reg, off, size); 820 + return check_pkt_ptr_alignment(reg, off, size, strict); 838 821 case PTR_TO_MAP_VALUE_ADJ: 839 - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 
0 : 840 - check_val_ptr_alignment(reg, size); 822 + return check_val_ptr_alignment(reg, size, strict); 841 823 default: 842 824 if (off % size != 0) { 843 825 verbose("misaligned access off %d size %d\n", ··· 907 883 value_regno); 908 884 /* note that reg.[id|off|range] == 0 */ 909 885 state->regs[value_regno].type = reg_type; 886 + state->regs[value_regno].aux_off = 0; 887 + state->regs[value_regno].aux_off_align = 0; 910 888 } 911 889 912 890 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { ··· 1481 1455 */ 1482 1456 dst_reg->off += imm; 1483 1457 } else { 1458 + bool had_id; 1459 + 1484 1460 if (src_reg->type == PTR_TO_PACKET) { 1485 1461 /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ 1486 1462 tmp_reg = *dst_reg; /* save r7 state */ ··· 1516 1488 src_reg->imm); 1517 1489 return -EACCES; 1518 1490 } 1491 + 1492 + had_id = (dst_reg->id != 0); 1493 + 1519 1494 /* dst_reg stays as pkt_ptr type and since some positive 1520 1495 * integer value was added to the pointer, increment its 'id' 1521 1496 */ 1522 1497 dst_reg->id = ++env->id_gen; 1523 1498 1524 - /* something was added to pkt_ptr, set range and off to zero */ 1499 + /* something was added to pkt_ptr, set range to zero */ 1500 + dst_reg->aux_off = dst_reg->off; 1525 1501 dst_reg->off = 0; 1526 1502 dst_reg->range = 0; 1503 + if (had_id) 1504 + dst_reg->aux_off_align = min(dst_reg->aux_off_align, 1505 + src_reg->min_align); 1506 + else 1507 + dst_reg->aux_off_align = src_reg->min_align; 1527 1508 } 1528 1509 return 0; 1529 1510 } ··· 1706 1669 reg->min_value = BPF_REGISTER_MIN_RANGE; 1707 1670 } 1708 1671 1672 + static u32 calc_align(u32 imm) 1673 + { 1674 + if (!imm) 1675 + return 1U << 31; 1676 + return imm - ((imm - 1) & imm); 1677 + } 1678 + 1709 1679 static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, 1710 1680 struct bpf_insn *insn) 1711 1681 { ··· 1720 1676 s64 min_val = BPF_REGISTER_MIN_RANGE; 1721 1677 u64 max_val = BPF_REGISTER_MAX_RANGE; 1722 1678 u8 opcode = 
BPF_OP(insn->code); 1679 + u32 dst_align, src_align; 1723 1680 1724 1681 dst_reg = &regs[insn->dst_reg]; 1682 + src_align = 0; 1725 1683 if (BPF_SRC(insn->code) == BPF_X) { 1726 1684 check_reg_overflow(&regs[insn->src_reg]); 1727 1685 min_val = regs[insn->src_reg].min_value; ··· 1739 1693 regs[insn->src_reg].type != UNKNOWN_VALUE) { 1740 1694 min_val = BPF_REGISTER_MIN_RANGE; 1741 1695 max_val = BPF_REGISTER_MAX_RANGE; 1696 + src_align = 0; 1697 + } else { 1698 + src_align = regs[insn->src_reg].min_align; 1742 1699 } 1743 1700 } else if (insn->imm < BPF_REGISTER_MAX_RANGE && 1744 1701 (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { 1745 1702 min_val = max_val = insn->imm; 1703 + src_align = calc_align(insn->imm); 1746 1704 } 1705 + 1706 + dst_align = dst_reg->min_align; 1747 1707 1748 1708 /* We don't know anything about what was done to this register, mark it 1749 1709 * as unknown. ··· 1775 1723 dst_reg->min_value += min_val; 1776 1724 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1777 1725 dst_reg->max_value += max_val; 1726 + dst_reg->min_align = min(src_align, dst_align); 1778 1727 break; 1779 1728 case BPF_SUB: 1780 1729 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1781 1730 dst_reg->min_value -= min_val; 1782 1731 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1783 1732 dst_reg->max_value -= max_val; 1733 + dst_reg->min_align = min(src_align, dst_align); 1784 1734 break; 1785 1735 case BPF_MUL: 1786 1736 if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1787 1737 dst_reg->min_value *= min_val; 1788 1738 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1789 1739 dst_reg->max_value *= max_val; 1740 + dst_reg->min_align = max(src_align, dst_align); 1790 1741 break; 1791 1742 case BPF_AND: 1792 1743 /* Disallow AND'ing of negative numbers, ain't nobody got time ··· 1801 1746 else 1802 1747 dst_reg->min_value = 0; 1803 1748 dst_reg->max_value = max_val; 1749 + dst_reg->min_align = max(src_align, dst_align); 1804 1750 break; 1805 1751 case BPF_LSH: 1806 
1752 /* Gotta have special overflow logic here, if we're shifting 1807 1753 * more than MAX_RANGE then just assume we have an invalid 1808 1754 * range. 1809 1755 */ 1810 - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1756 + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { 1811 1757 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1812 - else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1813 - dst_reg->min_value <<= min_val; 1814 - 1758 + dst_reg->min_align = 1; 1759 + } else { 1760 + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) 1761 + dst_reg->min_value <<= min_val; 1762 + if (!dst_reg->min_align) 1763 + dst_reg->min_align = 1; 1764 + dst_reg->min_align <<= min_val; 1765 + } 1815 1766 if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) 1816 1767 dst_reg->max_value = BPF_REGISTER_MAX_RANGE; 1817 1768 else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) ··· 1827 1766 /* RSH by a negative number is undefined, and the BPF_RSH is an 1828 1767 * unsigned shift, so make the appropriate casts. 1829 1768 */ 1830 - if (min_val < 0 || dst_reg->min_value < 0) 1769 + if (min_val < 0 || dst_reg->min_value < 0) { 1831 1770 dst_reg->min_value = BPF_REGISTER_MIN_RANGE; 1832 - else 1771 + } else { 1833 1772 dst_reg->min_value = 1834 1773 (u64)(dst_reg->min_value) >> min_val; 1774 + } 1775 + if (min_val < 0) { 1776 + dst_reg->min_align = 1; 1777 + } else { 1778 + dst_reg->min_align >>= (u64) min_val; 1779 + if (!dst_reg->min_align) 1780 + dst_reg->min_align = 1; 1781 + } 1835 1782 if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) 1836 1783 dst_reg->max_value >>= max_val; 1837 1784 break; ··· 1941 1872 regs[insn->dst_reg].imm = insn->imm; 1942 1873 regs[insn->dst_reg].max_value = insn->imm; 1943 1874 regs[insn->dst_reg].min_value = insn->imm; 1875 + regs[insn->dst_reg].min_align = calc_align(insn->imm); 1944 1876 } 1945 1877 1946 1878 } else if (opcode > BPF_END) {