Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: icmp: Add RFC 5837 support

Add the ability to append the incoming IP interface information to
ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb as the caller of __icmp_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Ido Schimmel and committed by Jakub Kicinski
f0e7036f b8a7826e

+251 -1
+17
Documentation/networking/ip-sysctl.rst
@@ -1796,6 +1796,23 @@
 
 	Default: 0 (disabled)
 
+icmp_errors_extension_mask - UNSIGNED INTEGER
+	Bitmask of ICMP extensions to append to ICMPv4 error messages
+	("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
+	The original datagram is trimmed / padded to 128 bytes in order to be
+	compatible with applications that do not comply with RFC 4884.
+
+	Possible extensions are:
+
+	==== ==============================================================
+	0x01 Incoming IP interface information according to RFC 5837.
+	     Extension will include the index, IPv4 address (if present),
+	     name and MTU of the IP interface that received the datagram
+	     which elicited the ICMP error.
+	==== ==============================================================
+
+	Default: 0x00 (no extensions)
+
 igmp_max_memberships - INTEGER
 	Change the maximum number of multicast groups we can subscribe to.
 	Default: 20
+32
include/linux/icmp.h
··· 40 40 struct sock_ee_data_rfc4884 *out, 41 41 int thlen, int off); 42 42 43 + /* RFC 4884 */ 44 + #define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128 45 + #define ICMP_EXT_VERSION_2 2 46 + 47 + /* ICMP Extension Object Classes */ 48 + #define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */ 49 + 50 + /* Interface Information Object - RFC 5837 */ 51 + enum { 52 + ICMP_EXT_CTYPE_IIO_ROLE_IIF, 53 + }; 54 + 55 + #define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6) 56 + #define ICMP_EXT_CTYPE_IIO_MTU BIT(0) 57 + #define ICMP_EXT_CTYPE_IIO_NAME BIT(1) 58 + #define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2) 59 + #define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3) 60 + 61 + struct icmp_ext_iio_name_subobj { 62 + u8 len; 63 + char name[IFNAMSIZ]; 64 + }; 65 + 66 + enum { 67 + /* RFC 5837 - Incoming IP Interface Role */ 68 + ICMP_ERR_EXT_IIO_IIF, 69 + /* Add new constants above. Used by "icmp_errors_extension_mask" 70 + * sysctl. 71 + */ 72 + ICMP_ERR_EXT_COUNT, 73 + }; 74 + 43 75 #endif /* _LINUX_ICMP_H */
+1
include/net/netns/ipv4.h
··· 135 135 u8 sysctl_icmp_echo_ignore_broadcasts; 136 136 u8 sysctl_icmp_ignore_bogus_error_responses; 137 137 u8 sysctl_icmp_errors_use_inbound_ifaddr; 138 + u8 sysctl_icmp_errors_extension_mask; 138 139 int sysctl_icmp_ratelimit; 139 140 int sysctl_icmp_ratemask; 140 141 int sysctl_icmp_msgs_per_sec;
+190 -1
net/ipv4/icmp.c
··· 582 582 return ERR_PTR(err); 583 583 } 584 584 585 + struct icmp_ext_iio_addr4_subobj { 586 + __be16 afi; 587 + __be16 reserved; 588 + __be32 addr4; 589 + }; 590 + 591 + static unsigned int icmp_ext_iio_len(void) 592 + { 593 + return sizeof(struct icmp_extobj_hdr) + 594 + /* ifIndex */ 595 + sizeof(__be32) + 596 + /* Interface Address Sub-Object */ 597 + sizeof(struct icmp_ext_iio_addr4_subobj) + 598 + /* Interface Name Sub-Object. Length must be a multiple of 4 599 + * bytes. 600 + */ 601 + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + 602 + /* MTU */ 603 + sizeof(__be32); 604 + } 605 + 606 + static unsigned int icmp_ext_max_len(u8 ext_objs) 607 + { 608 + unsigned int ext_max_len; 609 + 610 + ext_max_len = sizeof(struct icmp_ext_hdr); 611 + 612 + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) 613 + ext_max_len += icmp_ext_iio_len(); 614 + 615 + return ext_max_len; 616 + } 617 + 618 + static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) 619 + { 620 + struct in_device *in_dev; 621 + struct in_ifaddr *ifa; 622 + 623 + in_dev = __in_dev_get_rcu(dev); 624 + if (!in_dev) 625 + return 0; 626 + 627 + /* It is unclear from RFC 5837 which IP address should be chosen, but 628 + * it makes sense to choose a global unicast address. 629 + */ 630 + in_dev_for_each_ifa_rcu(ifa, in_dev) { 631 + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) 632 + continue; 633 + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || 634 + ipv4_is_multicast(ifa->ifa_address)) 635 + continue; 636 + return ifa->ifa_address; 637 + } 638 + 639 + return 0; 640 + } 641 + 642 + static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, 643 + int iif) 644 + { 645 + struct icmp_ext_iio_name_subobj *name_subobj; 646 + struct icmp_extobj_hdr *objh; 647 + struct net_device *dev; 648 + __be32 data; 649 + 650 + if (!iif) 651 + return; 652 + 653 + /* Add the fields in the order specified by RFC 5837. 
*/ 654 + objh = skb_put(skb, sizeof(*objh)); 655 + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; 656 + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); 657 + 658 + data = htonl(iif); 659 + skb_put_data(skb, &data, sizeof(__be32)); 660 + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; 661 + 662 + rcu_read_lock(); 663 + 664 + dev = dev_get_by_index_rcu(net, iif); 665 + if (!dev) 666 + goto out; 667 + 668 + data = icmp_ext_iio_addr4_find(dev); 669 + if (data) { 670 + struct icmp_ext_iio_addr4_subobj *addr4_subobj; 671 + 672 + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); 673 + addr4_subobj->afi = htons(ICMP_AFI_IP); 674 + addr4_subobj->addr4 = data; 675 + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; 676 + } 677 + 678 + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); 679 + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); 680 + netdev_copy_name(dev, name_subobj->name); 681 + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; 682 + 683 + data = htonl(READ_ONCE(dev->mtu)); 684 + skb_put_data(skb, &data, sizeof(__be32)); 685 + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; 686 + 687 + out: 688 + rcu_read_unlock(); 689 + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); 690 + } 691 + 692 + static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, 693 + u8 ext_objs, int iif) 694 + { 695 + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) 696 + icmp_ext_iio_iif_append(net, skb, iif); 697 + } 698 + 699 + static struct sk_buff * 700 + icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, 701 + unsigned int room, int iif) 702 + { 703 + unsigned int payload_len, ext_max_len, ext_len; 704 + struct icmp_ext_hdr *ext_hdr; 705 + struct sk_buff *skb; 706 + u8 ext_objs; 707 + int nhoff; 708 + 709 + switch (icmph->type) { 710 + case ICMP_DEST_UNREACH: 711 + case ICMP_TIME_EXCEEDED: 712 + case ICMP_PARAMETERPROB: 713 + break; 714 + default: 715 + return NULL; 716 + } 717 + 718 + ext_objs = 
READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); 719 + if (!ext_objs) 720 + return NULL; 721 + 722 + ext_max_len = icmp_ext_max_len(ext_objs); 723 + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) 724 + return NULL; 725 + 726 + skb = skb_clone(skb_in, GFP_ATOMIC); 727 + if (!skb) 728 + return NULL; 729 + 730 + nhoff = skb_network_offset(skb); 731 + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); 732 + 733 + if (!pskb_network_may_pull(skb, payload_len)) 734 + goto free_skb; 735 + 736 + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || 737 + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) 738 + goto free_skb; 739 + 740 + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) 741 + goto free_skb; 742 + 743 + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); 744 + ext_hdr->version = ICMP_EXT_VERSION_2; 745 + 746 + icmp_ext_objs_append(net, skb, ext_objs, iif); 747 + 748 + /* Do not send an empty extension structure. */ 749 + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; 750 + if (ext_len == sizeof(*ext_hdr)) 751 + goto free_skb; 752 + 753 + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); 754 + /* The length of the original datagram in 32-bit words (RFC 4884). 
*/ 755 + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); 756 + 757 + return skb; 758 + 759 + free_skb: 760 + consume_skb(skb); 761 + return NULL; 762 + } 763 + 585 764 /* 586 765 * Send an ICMP message in response to a situation 587 766 * ··· 780 601 struct icmp_bxm icmp_param; 781 602 struct rtable *rt = skb_rtable(skb_in); 782 603 bool apply_ratelimit = false; 604 + struct sk_buff *ext_skb; 783 605 struct ipcm_cookie ipc; 784 606 struct flowi4 fl4; 785 607 __be32 saddr; ··· 950 770 if (room <= (int)sizeof(struct iphdr)) 951 771 goto ende; 952 772 953 - icmp_param.data_len = skb_in->len - icmp_param.offset; 773 + ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, 774 + parm->iif); 775 + if (ext_skb) 776 + icmp_param.skb = ext_skb; 777 + 778 + icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; 954 779 if (icmp_param.data_len > room) 955 780 icmp_param.data_len = room; 956 781 icmp_param.head_len = sizeof(struct icmphdr); ··· 970 785 trace_icmp_send(skb_in, type, code); 971 786 972 787 icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); 788 + 789 + if (ext_skb) 790 + consume_skb(ext_skb); 973 791 ende: 974 792 ip_rt_put(rt); 975 793 out_unlock: ··· 1690 1502 net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; 1691 1503 net->ipv4.sysctl_icmp_ratemask = 0x1818; 1692 1504 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; 1505 + net->ipv4.sysctl_icmp_errors_extension_mask = 0; 1693 1506 net->ipv4.sysctl_icmp_msgs_per_sec = 1000; 1694 1507 net->ipv4.sysctl_icmp_msgs_burst = 50; 1695 1508
+11
net/ipv4/sysctl_net_ipv4.c
··· 48 48 static int tcp_plb_max_cong_thresh = 256; 49 49 static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; 50 50 static int tcp_ecn_mode_max = 2; 51 + static u32 icmp_errors_extension_mask_all = 52 + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); 51 53 52 54 /* obsolete */ 53 55 static int sysctl_tcp_low_latency __read_mostly; ··· 675 673 .proc_handler = proc_dou8vec_minmax, 676 674 .extra1 = SYSCTL_ZERO, 677 675 .extra2 = SYSCTL_ONE 676 + }, 677 + { 678 + .procname = "icmp_errors_extension_mask", 679 + .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, 680 + .maxlen = sizeof(u8), 681 + .mode = 0644, 682 + .proc_handler = proc_dou8vec_minmax, 683 + .extra1 = SYSCTL_ZERO, 684 + .extra2 = &icmp_errors_extension_mask_all, 678 685 }, 679 686 { 680 687 .procname = "icmp_ratelimit",