Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mlx4-next'

Or Gerlitz says:

====================
mlx4: Add CHECKSUM_COMPLETE support

These patches from Shani, Matan and me add support for
CHECKSUM_COMPLETE reporting on non-TCP/UDP packets such as
GRE and ICMP. I'd like to deeply thank Jerry Chu for his
innovation and support in that effort.

Based on the feedback from Eric and Ido Shamay, in V2 we dropped
the patch which removed the calls to napi_gro_frags() and added
a patch which makes the RX code go through that path
regardless of the checksum status.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+195 -61
+1 -1
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
··· 115 115 "tso_packets", 116 116 "xmit_more", 117 117 "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed", 118 - "rx_csum_good", "rx_csum_none", "tx_chksum_offload", 118 + "rx_csum_good", "rx_csum_none", "rx_csum_complete", "tx_chksum_offload", 119 119 120 120 /* packet statistics */ 121 121 "broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
+5
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
··· 1893 1893 priv->rx_ring[i]->packets = 0; 1894 1894 priv->rx_ring[i]->csum_ok = 0; 1895 1895 priv->rx_ring[i]->csum_none = 0; 1896 + priv->rx_ring[i]->csum_complete = 0; 1896 1897 } 1897 1898 } 1898 1899 ··· 2503 2502 2504 2503 /* Query for default mac and max mtu */ 2505 2504 priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; 2505 + 2506 + if (mdev->dev->caps.rx_checksum_flags_port[priv->port] & 2507 + MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP) 2508 + priv->flags |= MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP; 2506 2509 2507 2510 /* Set default MAC */ 2508 2511 dev->addr_len = ETH_ALEN;
+2
drivers/net/ethernet/mellanox/mlx4/en_port.c
··· 155 155 stats->rx_bytes = 0; 156 156 priv->port_stats.rx_chksum_good = 0; 157 157 priv->port_stats.rx_chksum_none = 0; 158 + priv->port_stats.rx_chksum_complete = 0; 158 159 for (i = 0; i < priv->rx_ring_num; i++) { 159 160 stats->rx_packets += priv->rx_ring[i]->packets; 160 161 stats->rx_bytes += priv->rx_ring[i]->bytes; 161 162 priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; 162 163 priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; 164 + priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete; 163 165 } 164 166 stats->tx_packets = 0; 165 167 stats->tx_bytes = 0;
+173 -59
drivers/net/ethernet/mellanox/mlx4/en_rx.c
··· 42 42 #include <linux/vmalloc.h> 43 43 #include <linux/irq.h> 44 44 45 + #if IS_ENABLED(CONFIG_IPV6) 46 + #include <net/ip6_checksum.h> 47 + #endif 48 + 45 49 #include "mlx4_en.h" 46 50 47 51 static int mlx4_alloc_pages(struct mlx4_en_priv *priv, ··· 647 643 } 648 644 } 649 645 646 + /* When hardware doesn't strip the vlan, we need to calculate the checksum 647 + * over it and add it to the hardware's checksum calculation 648 + */ 649 + static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum, 650 + struct vlan_hdr *vlanh) 651 + { 652 + return csum_add(hw_checksum, *(__wsum *)vlanh); 653 + } 654 + 655 + /* Although the stack expects checksum which doesn't include the pseudo 656 + * header, the HW adds it. To address that, we are subtracting the pseudo 657 + * header checksum from the checksum value provided by the HW. 658 + */ 659 + static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, 660 + struct iphdr *iph) 661 + { 662 + __u16 length_for_csum = 0; 663 + __wsum csum_pseudo_header = 0; 664 + 665 + length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2)); 666 + csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr, 667 + length_for_csum, iph->protocol, 0); 668 + skb->csum = csum_sub(hw_checksum, csum_pseudo_header); 669 + } 670 + 671 + #if IS_ENABLED(CONFIG_IPV6) 672 + /* In IPv6 packets, besides subtracting the pseudo header checksum, 673 + * we also compute/add the IP header checksum which 674 + * is not added by the HW. 
675 + */ 676 + static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb, 677 + struct ipv6hdr *ipv6h) 678 + { 679 + __wsum csum_pseudo_hdr = 0; 680 + 681 + if (ipv6h->nexthdr == IPPROTO_FRAGMENT || ipv6h->nexthdr == IPPROTO_HOPOPTS) 682 + return -1; 683 + hw_checksum = csum_add(hw_checksum, (__force __wsum)(ipv6h->nexthdr << 8)); 684 + 685 + csum_pseudo_hdr = csum_partial(&ipv6h->saddr, 686 + sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0); 687 + csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len); 688 + csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ntohs(ipv6h->nexthdr)); 689 + 690 + skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr); 691 + skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0)); 692 + return 0; 693 + } 694 + #endif 695 + static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, 696 + int hwtstamp_rx_filter) 697 + { 698 + __wsum hw_checksum = 0; 699 + 700 + void *hdr = (u8 *)va + sizeof(struct ethhdr); 701 + 702 + hw_checksum = csum_unfold((__force __sum16)cqe->checksum); 703 + 704 + if (((struct ethhdr *)va)->h_proto == htons(ETH_P_8021Q) && 705 + hwtstamp_rx_filter != HWTSTAMP_FILTER_NONE) { 706 + /* next protocol non IPv4 or IPv6 */ 707 + if (((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto 708 + != htons(ETH_P_IP) && 709 + ((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto 710 + != htons(ETH_P_IPV6)) 711 + return -1; 712 + hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr); 713 + hdr += sizeof(struct vlan_hdr); 714 + } 715 + 716 + if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) 717 + get_fixed_ipv4_csum(hw_checksum, skb, hdr); 718 + #if IS_ENABLED(CONFIG_IPV6) 719 + else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) 720 + if (get_fixed_ipv6_csum(hw_checksum, skb, hdr)) 721 + return -1; 722 + #endif 723 + return 0; 724 + } 725 + 650 726 int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) 651 727 { 
652 728 struct mlx4_en_priv *priv = netdev_priv(dev); ··· 828 744 (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL)); 829 745 830 746 if (likely(dev->features & NETIF_F_RXCSUM)) { 831 - if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && 832 - (cqe->checksum == cpu_to_be16(0xffff))) { 833 - ring->csum_ok++; 834 - /* This packet is eligible for GRO if it is: 835 - * - DIX Ethernet (type interpretation) 836 - * - TCP/IP (v4) 837 - * - without IP options 838 - * - not an IP fragment 839 - * - no LLS polling in progress 840 - */ 841 - if (!mlx4_en_cq_busy_polling(cq) && 842 - (dev->features & NETIF_F_GRO)) { 843 - struct sk_buff *gro_skb = napi_get_frags(&cq->napi); 844 - if (!gro_skb) 845 - goto next; 846 - 847 - nr = mlx4_en_complete_rx_desc(priv, 848 - rx_desc, frags, gro_skb, 849 - length); 850 - if (!nr) 851 - goto next; 852 - 853 - skb_shinfo(gro_skb)->nr_frags = nr; 854 - gro_skb->len = length; 855 - gro_skb->data_len = length; 856 - gro_skb->ip_summed = CHECKSUM_UNNECESSARY; 857 - 858 - if (l2_tunnel) 859 - gro_skb->csum_level = 1; 860 - if ((cqe->vlan_my_qpn & 861 - cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) && 862 - (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { 863 - u16 vid = be16_to_cpu(cqe->sl_vid); 864 - 865 - __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid); 866 - } 867 - 868 - if (dev->features & NETIF_F_RXHASH) 869 - skb_set_hash(gro_skb, 870 - be32_to_cpu(cqe->immed_rss_invalid), 871 - PKT_HASH_TYPE_L3); 872 - 873 - skb_record_rx_queue(gro_skb, cq->ring); 874 - skb_mark_napi_id(gro_skb, &cq->napi); 875 - 876 - if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { 877 - timestamp = mlx4_en_get_cqe_ts(cqe); 878 - mlx4_en_fill_hwtstamps(mdev, 879 - skb_hwtstamps(gro_skb), 880 - timestamp); 881 - } 882 - 883 - napi_gro_frags(&cq->napi); 884 - goto next; 747 + if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP | 748 + MLX4_CQE_STATUS_UDP)) { 749 + if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && 750 + cqe->checksum == 
cpu_to_be16(0xffff)) { 751 + ip_summed = CHECKSUM_UNNECESSARY; 752 + ring->csum_ok++; 753 + } else { 754 + ip_summed = CHECKSUM_NONE; 755 + ring->csum_none++; 885 756 } 886 - 887 - /* GRO not possible, complete processing here */ 888 - ip_summed = CHECKSUM_UNNECESSARY; 889 757 } else { 890 - ip_summed = CHECKSUM_NONE; 891 - ring->csum_none++; 758 + if (priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP && 759 + (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | 760 + MLX4_CQE_STATUS_IPV6))) { 761 + ip_summed = CHECKSUM_COMPLETE; 762 + ring->csum_complete++; 763 + } else { 764 + ip_summed = CHECKSUM_NONE; 765 + ring->csum_none++; 766 + } 892 767 } 893 768 } else { 894 769 ip_summed = CHECKSUM_NONE; 895 770 ring->csum_none++; 896 771 } 897 772 773 + /* This packet is eligible for GRO if it is: 774 + * - DIX Ethernet (type interpretation) 775 + * - TCP/IP (v4) 776 + * - without IP options 777 + * - not an IP fragment 778 + * - no LLS polling in progress 779 + */ 780 + if (!mlx4_en_cq_busy_polling(cq) && 781 + (dev->features & NETIF_F_GRO)) { 782 + struct sk_buff *gro_skb = napi_get_frags(&cq->napi); 783 + if (!gro_skb) 784 + goto next; 785 + 786 + nr = mlx4_en_complete_rx_desc(priv, 787 + rx_desc, frags, gro_skb, 788 + length); 789 + if (!nr) 790 + goto next; 791 + 792 + if (ip_summed == CHECKSUM_COMPLETE) { 793 + void *va = skb_frag_address(skb_shinfo(gro_skb)->frags); 794 + if (check_csum(cqe, gro_skb, va, ring->hwtstamp_rx_filter)) { 795 + ip_summed = CHECKSUM_NONE; 796 + ring->csum_none++; 797 + ring->csum_complete--; 798 + } 799 + } 800 + 801 + skb_shinfo(gro_skb)->nr_frags = nr; 802 + gro_skb->len = length; 803 + gro_skb->data_len = length; 804 + gro_skb->ip_summed = ip_summed; 805 + 806 + if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY) 807 + gro_skb->encapsulation = 1; 808 + if ((cqe->vlan_my_qpn & 809 + cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) && 810 + (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { 811 + u16 vid = be16_to_cpu(cqe->sl_vid); 812 + 813 + 
__vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid); 814 + } 815 + 816 + if (dev->features & NETIF_F_RXHASH) 817 + skb_set_hash(gro_skb, 818 + be32_to_cpu(cqe->immed_rss_invalid), 819 + PKT_HASH_TYPE_L3); 820 + 821 + skb_record_rx_queue(gro_skb, cq->ring); 822 + skb_mark_napi_id(gro_skb, &cq->napi); 823 + 824 + if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { 825 + timestamp = mlx4_en_get_cqe_ts(cqe); 826 + mlx4_en_fill_hwtstamps(mdev, 827 + skb_hwtstamps(gro_skb), 828 + timestamp); 829 + } 830 + 831 + napi_gro_frags(&cq->napi); 832 + goto next; 833 + } 834 + 835 + /* GRO not possible, complete processing here */ 898 836 skb = mlx4_en_rx_skb(priv, rx_desc, frags, length); 899 837 if (!skb) { 900 838 priv->stats.rx_dropped++; ··· 926 820 if (unlikely(priv->validate_loopback)) { 927 821 validate_loopback(priv, skb); 928 822 goto next; 823 + } 824 + 825 + if (ip_summed == CHECKSUM_COMPLETE) { 826 + if (check_csum(cqe, skb, skb->data, ring->hwtstamp_rx_filter)) { 827 + ip_summed = CHECKSUM_NONE; 828 + ring->csum_complete--; 829 + ring->csum_none++; 830 + } 929 831 } 930 832 931 833 skb->ip_summed = ip_summed;
+9
drivers/net/ethernet/mellanox/mlx4/main.c
··· 1629 1629 struct mlx4_init_hca_param init_hca; 1630 1630 u64 icm_size; 1631 1631 int err; 1632 + struct mlx4_config_dev_params params; 1632 1633 1633 1634 if (!mlx4_is_slave(dev)) { 1634 1635 err = mlx4_QUERY_FW(dev); ··· 1763 1762 goto unmap_bf; 1764 1763 } 1765 1764 1765 + /* Query CONFIG_DEV parameters */ 1766 + err = mlx4_config_dev_retrieval(dev, &params); 1767 + if (err && err != -ENOTSUPP) { 1768 + mlx4_err(dev, "Failed to query CONFIG_DEV parameters\n"); 1769 + } else if (!err) { 1770 + dev->caps.rx_checksum_flags_port[1] = params.rx_csum_flags_port_1; 1771 + dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2; 1772 + } 1766 1773 priv->eq_table.inta_pin = adapter.inta_pin; 1767 1774 memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); 1768 1775
+4 -1
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
··· 326 326 #endif 327 327 unsigned long csum_ok; 328 328 unsigned long csum_none; 329 + unsigned long csum_complete; 329 330 int hwtstamp_rx_filter; 330 331 cpumask_var_t affinity_mask; 331 332 }; ··· 450 449 unsigned long rx_alloc_failed; 451 450 unsigned long rx_chksum_good; 452 451 unsigned long rx_chksum_none; 452 + unsigned long rx_chksum_complete; 453 453 unsigned long tx_chksum_offload; 454 454 #define NUM_PORT_STATS 9 455 455 }; ··· 509 507 MLX4_EN_FLAG_ENABLE_HW_LOOPBACK = (1 << 2), 510 508 /* whether we need to drop packets that hardware loopback-ed */ 511 509 MLX4_EN_FLAG_RX_FILTER_NEEDED = (1 << 3), 512 - MLX4_EN_FLAG_FORCE_PROMISC = (1 << 4) 510 + MLX4_EN_FLAG_FORCE_PROMISC = (1 << 4), 511 + MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP = (1 << 5), 513 512 }; 514 513 515 514 #define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE)
+1
include/linux/mlx4/device.h
··· 497 497 u16 hca_core_clock; 498 498 u64 phys_port_id[MLX4_MAX_PORTS + 1]; 499 499 int tunnel_offload_mode; 500 + u8 rx_checksum_flags_port[MLX4_MAX_PORTS + 1]; 500 501 }; 501 502 502 503 struct mlx4_buf_list {