Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ath10k: enable napi on RX path for sdio

For TCP RX, the number of TCP ACKs sent to the remote is 1/2 of the
number of TCP data packets received from the remote. This puts many
small packets on the TX path of the SDIO bus, which reduces the
bandwidth available for TCP RX packets.

This patch enables NAPI on the RX path. Since GRO is enabled by
default, TCP RX packets are then no longer fed to the TCP stack
immediately from mac80211; they are fed to the TCP stack after NAPI
completes. If RX bundling is enabled, they are fed to the TCP stack
once per RX bundle. For example, if the RX bundle size is 32, the TCP
stack receives one large packet whose length is nearly 1500*32 bytes
and sends a single TCP ACK for it, which reduces the TCP ACK ratio
from 1/2 to 1/32. This results in a significant performance
improvement for TCP RX.

TCP RX throughput is 240Mbps without this patch and reaches 390Mbps
with this patch. There is no obvious difference in CPU usage with and
without NAPI.

call stack for each RX packet on GRO path:
(skb length is about 1500 bytes)
skb_gro_receive ([kernel.kallsyms])
tcp4_gro_receive ([kernel.kallsyms])
inet_gro_receive ([kernel.kallsyms])
dev_gro_receive ([kernel.kallsyms])
napi_gro_receive ([kernel.kallsyms])
ieee80211_deliver_skb ([mac80211])
ieee80211_rx_handlers ([mac80211])
ieee80211_prepare_and_rx_handle ([mac80211])
ieee80211_rx_napi ([mac80211])
ath10k_htt_rx_proc_rx_ind_hl ([ath10k_core])
ath10k_htt_rx_pktlog_completion_handler ([ath10k_core])
ath10k_sdio_napi_poll ([ath10k_sdio])
net_rx_action ([kernel.kallsyms])
__softirqentry_text_start ([kernel.kallsyms])
do_softirq ([kernel.kallsyms])

call stack for napi complete and send tcp ack from tcp stack:
(skb length is about 1500*32 bytes)
_tcp_ack_snd_check ([kernel.kallsyms])
tcp_v4_do_rcv ([kernel.kallsyms])
tcp_v4_rcv ([kernel.kallsyms])
local_deliver_finish ([kernel.kallsyms])
ip_local_deliver ([kernel.kallsyms])
ip_rcv_finish ([kernel.kallsyms])
ip_rcv ([kernel.kallsyms])
netif_receive_skb_core ([kernel.kallsyms])
netif_receive_skb_one_core ([kernel.kallsyms])
netif_receive_skb ([kernel.kallsyms])
netif_receive_skb_internal ([kernel.kallsyms])
napi_gro_complete ([kernel.kallsyms])
napi_gro_flush ([kernel.kallsyms])
napi_complete_done ([kernel.kallsyms])
ath10k_sdio_napi_poll ([ath10k_sdio])
net_rx_action ([kernel.kallsyms])
__softirqentry_text_start ([kernel.kallsyms])
do_softirq ([kernel.kallsyms])

Tested with QCA6174 SDIO with firmware
WLAN.RMH.4.4.1-00017-QCARMSWP-1.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>

authored by

Wen Gong and committed by
Kalle Valo
cfee8793 fcaf49d0

+73 -8
+2
drivers/net/wireless/ath/ath10k/core.c
··· 3220 3220 init_waitqueue_head(&ar->htt.empty_tx_wq); 3221 3221 init_waitqueue_head(&ar->wmi.tx_credits_wq); 3222 3222 3223 + skb_queue_head_init(&ar->htt.rx_indication_head); 3224 + 3223 3225 init_completion(&ar->offchan_tx_completed); 3224 3226 INIT_WORK(&ar->offchan_tx_work, ath10k_offchan_tx_work); 3225 3227 skb_queue_head_init(&ar->offchan_tx_queue);
+3
drivers/net/wireless/ath/ath10k/htt.h
··· 1869 1869 struct ath10k *ar; 1870 1870 enum ath10k_htc_ep_id eid; 1871 1871 1872 + struct sk_buff_head rx_indication_head; 1873 + 1872 1874 u8 target_version_major; 1873 1875 u8 target_version_minor; 1874 1876 struct completion target_version_received; ··· 2285 2283 void ath10k_htt_rx_pktlog_completion_handler(struct ath10k *ar, 2286 2284 struct sk_buff *skb); 2287 2285 int ath10k_htt_txrx_compl_task(struct ath10k *ar, int budget); 2286 + int ath10k_htt_rx_hl_indication(struct ath10k *ar, int budget); 2288 2287 void ath10k_htt_set_tx_ops(struct ath10k_htt *htt); 2289 2288 void ath10k_htt_set_rx_ops(struct ath10k_htt *htt); 2290 2289 #endif
+40 -8
drivers/net/wireless/ath/ath10k/htt_rx.c
··· 2359 2359 memcpy(skb->data + offset, &qos_ctrl, IEEE80211_QOS_CTL_LEN); 2360 2360 } 2361 2361 2362 - ieee80211_rx_ni(ar->hw, skb); 2362 + if (ar->napi.dev) 2363 + ieee80211_rx_napi(ar->hw, NULL, skb, &ar->napi); 2364 + else 2365 + ieee80211_rx_ni(ar->hw, skb); 2363 2366 2364 2367 /* We have delivered the skb to the upper layers (mac80211) so we 2365 2368 * must not free it. ··· 3763 3760 break; 3764 3761 } 3765 3762 case HTT_T2H_MSG_TYPE_RX_IND: 3766 - if (ar->bus_param.dev_type == ATH10K_DEV_TYPE_HL) 3767 - return ath10k_htt_rx_proc_rx_ind_hl(htt, 3768 - &resp->rx_ind_hl, 3769 - skb, 3770 - HTT_RX_PN_CHECK, 3771 - HTT_RX_NON_TKIP_MIC); 3772 - else 3763 + if (ar->bus_param.dev_type != ATH10K_DEV_TYPE_HL) { 3773 3764 ath10k_htt_rx_proc_rx_ind_ll(htt, &resp->rx_ind); 3765 + } else { 3766 + skb_queue_tail(&htt->rx_indication_head, skb); 3767 + return false; 3768 + } 3774 3769 break; 3775 3770 case HTT_T2H_MSG_TYPE_PEER_MAP: { 3776 3771 struct htt_peer_map_event ev = { ··· 3957 3956 3958 3957 return quota; 3959 3958 } 3959 + 3960 + int ath10k_htt_rx_hl_indication(struct ath10k *ar, int budget) 3961 + { 3962 + struct htt_resp *resp; 3963 + struct ath10k_htt *htt = &ar->htt; 3964 + struct sk_buff *skb; 3965 + bool release; 3966 + int quota; 3967 + 3968 + for (quota = 0; quota < budget; quota++) { 3969 + skb = skb_dequeue(&htt->rx_indication_head); 3970 + if (!skb) 3971 + break; 3972 + 3973 + resp = (struct htt_resp *)skb->data; 3974 + 3975 + release = ath10k_htt_rx_proc_rx_ind_hl(htt, 3976 + &resp->rx_ind_hl, 3977 + skb, 3978 + HTT_RX_PN_CHECK, 3979 + HTT_RX_NON_TKIP_MIC); 3980 + 3981 + if (release) 3982 + dev_kfree_skb_any(skb); 3983 + 3984 + ath10k_dbg(ar, ATH10K_DBG_HTT, "rx indication poll pending count:%d\n", 3985 + skb_queue_len(&htt->rx_indication_head)); 3986 + } 3987 + return quota; 3988 + } 3989 + EXPORT_SYMBOL(ath10k_htt_rx_hl_indication); 3960 3990 3961 3991 int ath10k_htt_txrx_compl_task(struct ath10k *ar, int budget) 3962 3992 {
+28
drivers/net/wireless/ath/ath10k/sdio.c
··· 1339 1339 ep = &ar->htc.endpoint[cb->eid]; 1340 1340 ep->ep_ops.ep_rx_complete(ar, skb); 1341 1341 } 1342 + 1343 + if (test_bit(ATH10K_FLAG_CORE_REGISTERED, &ar->dev_flags)) 1344 + napi_schedule(&ar->napi); 1342 1345 } 1343 1346 1344 1347 static void ath10k_sdio_write_async_work(struct work_struct *work) ··· 1732 1729 struct ath10k_sdio *ar_sdio = ath10k_sdio_priv(ar); 1733 1730 int ret; 1734 1731 1732 + napi_enable(&ar->napi); 1733 + 1735 1734 /* Sleep 20 ms before HIF interrupts are disabled. 1736 1735 * This will give target plenty of time to process the BMI done 1737 1736 * request before interrupts are disabled. ··· 1858 1853 } 1859 1854 1860 1855 spin_unlock_bh(&ar_sdio->wr_async_lock); 1856 + 1857 + napi_synchronize(&ar->napi); 1858 + napi_disable(&ar->napi); 1861 1859 } 1862 1860 1863 1861 #ifdef CONFIG_PM ··· 2055 2047 2056 2048 #endif /* CONFIG_PM_SLEEP */ 2057 2049 2050 + static int ath10k_sdio_napi_poll(struct napi_struct *ctx, int budget) 2051 + { 2052 + struct ath10k *ar = container_of(ctx, struct ath10k, napi); 2053 + int done; 2054 + 2055 + done = ath10k_htt_rx_hl_indication(ar, budget); 2056 + ath10k_dbg(ar, ATH10K_DBG_SDIO, "napi poll: done: %d, budget:%d\n", done, budget); 2057 + 2058 + if (done < budget) 2059 + napi_complete_done(ctx, done); 2060 + 2061 + return done; 2062 + } 2063 + 2058 2064 static int ath10k_sdio_probe(struct sdio_func *func, 2059 2065 const struct sdio_device_id *id) 2060 2066 { ··· 2093 2071 dev_err(&func->dev, "failed to allocate core\n"); 2094 2072 return -ENOMEM; 2095 2073 } 2074 + 2075 + netif_napi_add(&ar->napi_dev, &ar->napi, ath10k_sdio_napi_poll, 2076 + ATH10K_NAPI_BUDGET); 2096 2077 2097 2078 ath10k_dbg(ar, ATH10K_DBG_BOOT, 2098 2079 "sdio new func %d vendor 0x%x device 0x%x block 0x%x/0x%x\n", ··· 2209 2184 func->num, func->vendor, func->device); 2210 2185 2211 2186 ath10k_core_unregister(ar); 2187 + 2188 + netif_napi_del(&ar->napi); 2189 + 2212 2190 ath10k_core_destroy(ar); 2213 2191 2214 2192 
flush_workqueue(ar_sdio->workqueue);