Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mlxsw-events-processing-performance'

Petr Machata says:

====================
mlxsw: Improve events processing performance

Amit Cohen writes:

Spectrum ASICs only support a single interrupt, it means that all the
events are handled by one IRQ (interrupt request) handler.

Currently, we schedule a tasklet to handle events in EQ, then we also use
tasklet for CQ, SDQ and RDQ. Tasklet runs in softIRQ (software IRQ)
context, and will be run on the same CPU which scheduled it. It means that
today we have one CPU which handles all the packets (both network packets
and EMADs) from hardware.

The existing implementation is not efficient and can be improved.

Measuring latency of EMADs in the driver (without the time in FW) shows
that latency is increased by factor of 28 (x28) when network traffic is
handled by the driver.

Measuring throughput in CPU shows that CPU can handle ~35% fewer packets
of a specific flow when corrupted packets are also handled by the driver.
There are cases where these values are even worse; we measured a decrease
of ~44% in packet rate.

This can be improved if network packets and EMADs are handled in
parallel by several CPUs, and more than that, if different types of traffic
are handled in parallel. We can achieve this using NAPI.

This set converts the driver to process completions from hardware via NAPI.
The idea is to add NAPI instance per CQ (which is mapped 1:1 to SDQ/RDQ),
which means that each DQ can be handled separately. We have a DQ for EMADs
and DQs for each trap group (like LLDP, BGP, L3 drops, etc..). See more
details in commit messages.

An additional improvement which is done as part of this set is related to
doorbells' ring. The idea is to handle small chunks of Rx packets (which
is also recommended when using NAPI) and ring doorbells once per chunk. This
reduces the access to hardware which is expensive (time wise) and might
take time because of memory barriers.

With this set we can see better performance.
To summarize:

EMADs latency:
+------------------------------------------------------------------------+
| | Before this set | Now |
|------------------|---------------------------|-------------------------|
| Increased factor | x28 | x1.5 |
+------------------------------------------------------------------------+
Note that we can see even measurements that show better latency when
traffic is handled by the driver.

Throughput:
+------------------------------------------------------------------------+
| | Before this set | Now |
|-------------|----------------------------|-----------------------------|
| Reduced | 35% | 6% |
| packet rate | | |
+------------------------------------------------------------------------+

Additional improvements are planned - use page pool for buffer allocations
and avoid cache miss of each SKB using napi_build_skb().

Patch set overview:
Patches #1-#2 improve access to hardware by reducing doorbells' rings
Patches #3-#4 are preparations for NAPI usage
Patch #5 converts the driver to use NAPI
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+150 -54
+150 -54
drivers/net/ethernet/mellanox/mlxsw/pci.c
··· 82 82 u8 num; /* queue number */ 83 83 u8 elem_size; /* size of one element */ 84 84 enum mlxsw_pci_queue_type type; 85 - struct tasklet_struct tasklet; /* queue processing tasklet */ 86 85 struct mlxsw_pci *pci; 87 - struct { 88 - enum mlxsw_pci_cqe_v v; 89 - struct mlxsw_pci_queue *dq; 90 - } cq; 86 + union { 87 + struct { 88 + enum mlxsw_pci_cqe_v v; 89 + struct mlxsw_pci_queue *dq; 90 + struct napi_struct napi; 91 + } cq; 92 + struct { 93 + struct tasklet_struct tasklet; 94 + } eq; 95 + } u; 91 96 }; 92 97 93 98 struct mlxsw_pci_queue_type_group { ··· 132 127 u8 num_cqs; /* Number of CQs */ 133 128 u8 num_sdqs; /* Number of SDQs */ 134 129 bool skip_reset; 130 + struct net_device *napi_dev_tx; 131 + struct net_device *napi_dev_rx; 135 132 }; 136 133 137 - static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q) 134 + static int mlxsw_pci_napi_devs_init(struct mlxsw_pci *mlxsw_pci) 138 135 { 139 - tasklet_schedule(&q->tasklet); 136 + int err; 137 + 138 + mlxsw_pci->napi_dev_tx = alloc_netdev_dummy(0); 139 + if (!mlxsw_pci->napi_dev_tx) 140 + return -ENOMEM; 141 + strscpy(mlxsw_pci->napi_dev_tx->name, "mlxsw_tx", 142 + sizeof(mlxsw_pci->napi_dev_tx->name)); 143 + 144 + mlxsw_pci->napi_dev_rx = alloc_netdev_dummy(0); 145 + if (!mlxsw_pci->napi_dev_rx) { 146 + err = -ENOMEM; 147 + goto err_alloc_rx; 148 + } 149 + strscpy(mlxsw_pci->napi_dev_rx->name, "mlxsw_rx", 150 + sizeof(mlxsw_pci->napi_dev_rx->name)); 151 + dev_set_threaded(mlxsw_pci->napi_dev_rx, true); 152 + 153 + return 0; 154 + 155 + err_alloc_rx: 156 + free_netdev(mlxsw_pci->napi_dev_tx); 157 + return err; 158 + } 159 + 160 + static void mlxsw_pci_napi_devs_fini(struct mlxsw_pci *mlxsw_pci) 161 + { 162 + free_netdev(mlxsw_pci->napi_dev_rx); 163 + free_netdev(mlxsw_pci->napi_dev_tx); 140 164 } 141 165 142 166 static char *__mlxsw_pci_queue_elem_get(struct mlxsw_pci_queue *q, ··· 324 290 return err; 325 291 326 292 cq = mlxsw_pci_cq_get(mlxsw_pci, cq_num); 327 - cq->cq.dq = q; 293 + 
cq->u.cq.dq = q; 328 294 mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); 329 295 return 0; 330 296 } ··· 433 399 return err; 434 400 435 401 cq = mlxsw_pci_cq_get(mlxsw_pci, cq_num); 436 - cq->cq.dq = q; 402 + cq->u.cq.dq = q; 437 403 438 404 mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); 439 405 ··· 455 421 elem_info = mlxsw_pci_queue_elem_info_get(q, i); 456 422 mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info); 457 423 } 458 - cq->cq.dq = NULL; 424 + cq->u.cq.dq = NULL; 459 425 mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num); 460 426 461 427 return err; ··· 477 443 static void mlxsw_pci_cq_pre_init(struct mlxsw_pci *mlxsw_pci, 478 444 struct mlxsw_pci_queue *q) 479 445 { 480 - q->cq.v = mlxsw_pci->max_cqe_ver; 446 + q->u.cq.v = mlxsw_pci->max_cqe_ver; 481 447 482 - if (q->cq.v == MLXSW_PCI_CQE_V2 && 448 + if (q->u.cq.v == MLXSW_PCI_CQE_V2 && 483 449 q->num < mlxsw_pci->num_sdqs && 484 450 !mlxsw_core_sdq_supports_cqe_v2(mlxsw_pci->core)) 485 - q->cq.v = MLXSW_PCI_CQE_V1; 451 + q->u.cq.v = MLXSW_PCI_CQE_V1; 486 452 } 487 453 488 454 static unsigned int mlxsw_pci_read32_off(struct mlxsw_pci *mlxsw_pci, ··· 664 630 mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); 665 631 666 632 out: 667 - /* Everything is set up, ring doorbell to pass elem to HW */ 668 633 q->producer_counter++; 669 - mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); 670 634 return; 671 635 } 672 636 ··· 676 644 677 645 elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); 678 646 elem = elem_info->elem; 679 - owner_bit = mlxsw_pci_cqe_owner_get(q->cq.v, elem); 647 + owner_bit = mlxsw_pci_cqe_owner_get(q->u.cq.v, elem); 680 648 if (mlxsw_pci_elem_hw_owned(q, owner_bit)) 681 649 return NULL; 682 650 q->consumer_counter++; ··· 684 652 return elem; 685 653 } 686 654 687 - static void mlxsw_pci_cq_rx_tasklet(struct tasklet_struct *t) 655 + static bool mlxsw_pci_cq_cqe_to_handle(struct mlxsw_pci_queue *q) 688 656 { 689 - struct mlxsw_pci_queue *q = from_tasklet(q, t, tasklet); 690 - 
struct mlxsw_pci_queue *rdq = q->cq.dq; 657 + struct mlxsw_pci_queue_elem_info *elem_info; 658 + bool owner_bit; 659 + 660 + elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); 661 + owner_bit = mlxsw_pci_cqe_owner_get(q->u.cq.v, elem_info->elem); 662 + return !mlxsw_pci_elem_hw_owned(q, owner_bit); 663 + } 664 + 665 + static int mlxsw_pci_napi_poll_cq_rx(struct napi_struct *napi, int budget) 666 + { 667 + struct mlxsw_pci_queue *q = container_of(napi, struct mlxsw_pci_queue, 668 + u.cq.napi); 669 + struct mlxsw_pci_queue *rdq = q->u.cq.dq; 691 670 struct mlxsw_pci *mlxsw_pci = q->pci; 692 - int credits = q->count >> 1; 693 - int items = 0; 671 + int work_done = 0; 694 672 char *cqe; 673 + 674 + /* If the budget is 0, Rx processing should be skipped. */ 675 + if (unlikely(!budget)) 676 + return 0; 695 677 696 678 while ((cqe = mlxsw_pci_cq_sw_cqe_get(q))) { 697 679 u16 wqe_counter = mlxsw_pci_cqe_wqe_counter_get(cqe); 698 - u8 sendq = mlxsw_pci_cqe_sr_get(q->cq.v, cqe); 699 - u8 dqn = mlxsw_pci_cqe_dqn_get(q->cq.v, cqe); 700 - char ncqe[MLXSW_PCI_CQE_SIZE_MAX]; 680 + u8 sendq = mlxsw_pci_cqe_sr_get(q->u.cq.v, cqe); 681 + u8 dqn = mlxsw_pci_cqe_dqn_get(q->u.cq.v, cqe); 701 682 702 683 if (unlikely(sendq)) { 703 684 WARN_ON_ONCE(1); ··· 722 677 continue; 723 678 } 724 679 725 - memcpy(ncqe, cqe, q->elem_size); 726 - mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 727 - 728 680 mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq, 729 - wqe_counter, q->cq.v, ncqe); 681 + wqe_counter, q->u.cq.v, cqe); 730 682 731 - if (++items == credits) 683 + if (++work_done == budget) 732 684 break; 733 685 } 734 686 735 - mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 687 + mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 688 + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, rdq); 689 + 690 + if (work_done < budget) 691 + goto processing_completed; 692 + 693 + /* The driver still has outstanding work to do, budget was exhausted. 694 + * Return exactly budget. 
In that case, the NAPI instance will be polled 695 + * again. 696 + */ 697 + if (mlxsw_pci_cq_cqe_to_handle(q)) 698 + goto out; 699 + 700 + /* The driver processed all the completions and handled exactly 701 + * 'budget'. Return 'budget - 1' to distinguish from the case that 702 + * driver still has completions to handle. 703 + */ 704 + if (work_done == budget) 705 + work_done--; 706 + 707 + processing_completed: 708 + if (napi_complete_done(napi, work_done)) 709 + mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 710 + out: 711 + return work_done; 736 712 } 737 713 738 - static void mlxsw_pci_cq_tx_tasklet(struct tasklet_struct *t) 714 + static int mlxsw_pci_napi_poll_cq_tx(struct napi_struct *napi, int budget) 739 715 { 740 - struct mlxsw_pci_queue *q = from_tasklet(q, t, tasklet); 741 - struct mlxsw_pci_queue *sdq = q->cq.dq; 716 + struct mlxsw_pci_queue *q = container_of(napi, struct mlxsw_pci_queue, 717 + u.cq.napi); 718 + struct mlxsw_pci_queue *sdq = q->u.cq.dq; 742 719 struct mlxsw_pci *mlxsw_pci = q->pci; 743 - int credits = q->count >> 1; 744 - int items = 0; 720 + int work_done = 0; 745 721 char *cqe; 746 722 747 723 while ((cqe = mlxsw_pci_cq_sw_cqe_get(q))) { 748 724 u16 wqe_counter = mlxsw_pci_cqe_wqe_counter_get(cqe); 749 - u8 sendq = mlxsw_pci_cqe_sr_get(q->cq.v, cqe); 750 - u8 dqn = mlxsw_pci_cqe_dqn_get(q->cq.v, cqe); 725 + u8 sendq = mlxsw_pci_cqe_sr_get(q->u.cq.v, cqe); 726 + u8 dqn = mlxsw_pci_cqe_dqn_get(q->u.cq.v, cqe); 751 727 char ncqe[MLXSW_PCI_CQE_SIZE_MAX]; 752 728 753 729 if (unlikely(!sendq)) { ··· 785 719 mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 786 720 787 721 mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq, 788 - wqe_counter, q->cq.v, ncqe); 722 + wqe_counter, q->u.cq.v, ncqe); 789 723 790 - if (++items == credits) 791 - break; 724 + work_done++; 792 725 } 793 726 727 + /* If the budget is 0 napi_complete_done() should never be called. 
*/ 728 + if (unlikely(!budget)) 729 + goto processing_completed; 730 + 731 + work_done = min(work_done, budget - 1); 732 + if (unlikely(!napi_complete_done(napi, work_done))) 733 + goto out; 734 + 735 + processing_completed: 794 736 mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 737 + out: 738 + return work_done; 795 739 } 796 740 797 741 static enum mlxsw_pci_cq_type ··· 817 741 return MLXSW_PCI_CQ_RDQ; 818 742 } 819 743 820 - static void mlxsw_pci_cq_tasklet_setup(struct mlxsw_pci_queue *q, 821 - enum mlxsw_pci_cq_type cq_type) 744 + static void mlxsw_pci_cq_napi_setup(struct mlxsw_pci_queue *q, 745 + enum mlxsw_pci_cq_type cq_type) 822 746 { 747 + struct mlxsw_pci *mlxsw_pci = q->pci; 748 + 823 749 switch (cq_type) { 824 750 case MLXSW_PCI_CQ_SDQ: 825 - tasklet_setup(&q->tasklet, mlxsw_pci_cq_tx_tasklet); 751 + netif_napi_add(mlxsw_pci->napi_dev_tx, &q->u.cq.napi, 752 + mlxsw_pci_napi_poll_cq_tx); 826 753 break; 827 754 case MLXSW_PCI_CQ_RDQ: 828 - tasklet_setup(&q->tasklet, mlxsw_pci_cq_rx_tasklet); 755 + netif_napi_add(mlxsw_pci->napi_dev_rx, &q->u.cq.napi, 756 + mlxsw_pci_napi_poll_cq_rx); 829 757 break; 830 758 } 759 + 760 + napi_enable(&q->u.cq.napi); 761 + } 762 + 763 + static void mlxsw_pci_cq_napi_teardown(struct mlxsw_pci_queue *q) 764 + { 765 + napi_disable(&q->u.cq.napi); 766 + netif_napi_del(&q->u.cq.napi); 831 767 } 832 768 833 769 static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, ··· 853 765 for (i = 0; i < q->count; i++) { 854 766 char *elem = mlxsw_pci_queue_elem_get(q, i); 855 767 856 - mlxsw_pci_cqe_owner_set(q->cq.v, elem, 1); 768 + mlxsw_pci_cqe_owner_set(q->u.cq.v, elem, 1); 857 769 } 858 770 859 - if (q->cq.v == MLXSW_PCI_CQE_V1) 771 + if (q->u.cq.v == MLXSW_PCI_CQE_V1) 860 772 mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox, 861 773 MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_1); 862 - else if (q->cq.v == MLXSW_PCI_CQE_V2) 774 + else if (q->u.cq.v == MLXSW_PCI_CQE_V2) 863 775 mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox, 864 776 
MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_2); 865 777 ··· 874 786 err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num); 875 787 if (err) 876 788 return err; 877 - mlxsw_pci_cq_tasklet_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q)); 789 + mlxsw_pci_cq_napi_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q)); 878 790 mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 879 791 mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 880 792 return 0; ··· 883 795 static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci, 884 796 struct mlxsw_pci_queue *q) 885 797 { 798 + mlxsw_pci_cq_napi_teardown(q); 886 799 mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num); 887 800 } 888 801 889 802 static u16 mlxsw_pci_cq_elem_count(const struct mlxsw_pci_queue *q) 890 803 { 891 - return q->cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_COUNT : 804 + return q->u.cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_COUNT : 892 805 MLXSW_PCI_CQE01_COUNT; 893 806 } 894 807 895 808 static u8 mlxsw_pci_cq_elem_size(const struct mlxsw_pci_queue *q) 896 809 { 897 - return q->cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_SIZE : 810 + return q->u.cq.v == MLXSW_PCI_CQE_V2 ? 
MLXSW_PCI_CQE2_SIZE : 898 811 MLXSW_PCI_CQE01_SIZE; 899 812 } 900 813 ··· 918 829 static void mlxsw_pci_eq_tasklet(struct tasklet_struct *t) 919 830 { 920 831 unsigned long active_cqns[BITS_TO_LONGS(MLXSW_PCI_CQS_MAX)]; 921 - struct mlxsw_pci_queue *q = from_tasklet(q, t, tasklet); 832 + struct mlxsw_pci_queue *q = from_tasklet(q, t, u.eq.tasklet); 922 833 struct mlxsw_pci *mlxsw_pci = q->pci; 923 834 int credits = q->count >> 1; 924 835 u8 cqn, cq_count; ··· 944 855 cq_count = mlxsw_pci->num_cqs; 945 856 for_each_set_bit(cqn, active_cqns, cq_count) { 946 857 q = mlxsw_pci_cq_get(mlxsw_pci, cqn); 947 - mlxsw_pci_queue_tasklet_schedule(q); 858 + napi_schedule(&q->u.cq.napi); 948 859 } 949 860 } 950 861 ··· 980 891 err = mlxsw_cmd_sw2hw_eq(mlxsw_pci->core, mbox, q->num); 981 892 if (err) 982 893 return err; 983 - tasklet_setup(&q->tasklet, mlxsw_pci_eq_tasklet); 894 + tasklet_setup(&q->u.eq.tasklet, mlxsw_pci_eq_tasklet); 984 895 mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 985 896 mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 986 897 return 0; ··· 1541 1452 struct mlxsw_pci_queue *q; 1542 1453 1543 1454 q = mlxsw_pci_eq_get(mlxsw_pci); 1544 - mlxsw_pci_queue_tasklet_schedule(q); 1455 + tasklet_schedule(&q->u.eq.tasklet); 1545 1456 return IRQ_HANDLED; 1546 1457 } 1547 1458 ··· 1813 1724 if (err) 1814 1725 goto err_requery_resources; 1815 1726 1727 + err = mlxsw_pci_napi_devs_init(mlxsw_pci); 1728 + if (err) 1729 + goto err_napi_devs_init; 1730 + 1816 1731 err = mlxsw_pci_aqs_init(mlxsw_pci, mbox); 1817 1732 if (err) 1818 1733 goto err_aqs_init; ··· 1834 1741 err_request_eq_irq: 1835 1742 mlxsw_pci_aqs_fini(mlxsw_pci); 1836 1743 err_aqs_init: 1744 + mlxsw_pci_napi_devs_fini(mlxsw_pci); 1745 + err_napi_devs_init: 1837 1746 err_requery_resources: 1838 1747 err_config_profile: 1839 1748 err_cqe_v_check: ··· 1863 1768 1864 1769 free_irq(pci_irq_vector(mlxsw_pci->pdev, 0), mlxsw_pci); 1865 1770 mlxsw_pci_aqs_fini(mlxsw_pci); 1771 + 
mlxsw_pci_napi_devs_fini(mlxsw_pci); 1866 1772 mlxsw_pci_fw_area_fini(mlxsw_pci); 1867 1773 mlxsw_pci_free_irq_vectors(mlxsw_pci); 1868 1774 }