Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

i40e: xsk: add RX multi-buffer support

This patch is inspired by the multi-buffer support in the non-zc path for
i40e, as well as by the patch adding zc support on ice. Each subsequent
frag is added to the skb_shared_info of the first frag for possible
xdp_prog use, and is also added to the xsk buffer list so the buffers can
be accessed in af_xdp.

For XDP_PASS, new pages are allocated for frags and contents are copied
from memory backed by xsk_buff_pool.

Replace next_to_clean with next_to_process, as done in the non-zc path,
and advance it for every buffer; change the semantics of next_to_clean to
point to the first buffer of a packet. The driver will use next_to_process
in the same way next_to_clean was used previously.

For the non multi-buffer case, next_to_process and next_to_clean will
always be the same since each packet consists of a single buffer.

Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
Link: https://lore.kernel.org/r/20230719132421.584801-14-maciej.fijalkowski@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Tirthendu Sarkar and committed by
Alexei Starovoitov
1c9ba9c1 1bbc04de

+84 -18
-5
drivers/net/ethernet/intel/i40e/i40e_main.c
··· 3585 3585 if (ring->xsk_pool) { 3586 3586 ring->rx_buf_len = 3587 3587 xsk_pool_get_rx_frame_size(ring->xsk_pool); 3588 - /* For AF_XDP ZC, we disallow packets to span on 3589 - * multiple buffers, thus letting us skip that 3590 - * handling in the fast-path. 3591 - */ 3592 - chain_len = 1; 3593 3588 ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, 3594 3589 MEM_TYPE_XSK_BUFF_POOL, 3595 3590 NULL);
+2 -2
drivers/net/ethernet/intel/i40e/i40e_txrx.c
··· 2284 2284 * If the buffer is an EOP buffer, this function exits returning false, 2285 2285 * otherwise return true indicating that this is in fact a non-EOP buffer. 2286 2286 */ 2287 - static bool i40e_is_non_eop(struct i40e_ring *rx_ring, 2288 - union i40e_rx_desc *rx_desc) 2287 + bool i40e_is_non_eop(struct i40e_ring *rx_ring, 2288 + union i40e_rx_desc *rx_desc) 2289 2289 { 2290 2290 /* if we are the last buffer then there is nothing else to do */ 2291 2291 #define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
+2
drivers/net/ethernet/intel/i40e/i40e_txrx.h
··· 473 473 bool __i40e_chk_linearize(struct sk_buff *skb); 474 474 int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, 475 475 u32 flags); 476 + bool i40e_is_non_eop(struct i40e_ring *rx_ring, 477 + union i40e_rx_desc *rx_desc); 476 478 477 479 /** 478 480 * i40e_get_head - Retrieve head from head writeback
+80 -11
drivers/net/ethernet/intel/i40e/i40e_xsk.c
··· 294 294 { 295 295 unsigned int totalsize = xdp->data_end - xdp->data_meta; 296 296 unsigned int metasize = xdp->data - xdp->data_meta; 297 + struct skb_shared_info *sinfo = NULL; 297 298 struct sk_buff *skb; 299 + u32 nr_frags = 0; 298 300 301 + if (unlikely(xdp_buff_has_frags(xdp))) { 302 + sinfo = xdp_get_shared_info_from_buff(xdp); 303 + nr_frags = sinfo->nr_frags; 304 + } 299 305 net_prefetch(xdp->data_meta); 300 306 301 307 /* allocate a skb to store the frags */ ··· 318 312 __skb_pull(skb, metasize); 319 313 } 320 314 315 + if (likely(!xdp_buff_has_frags(xdp))) 316 + goto out; 317 + 318 + for (int i = 0; i < nr_frags; i++) { 319 + struct skb_shared_info *skinfo = skb_shinfo(skb); 320 + skb_frag_t *frag = &sinfo->frags[i]; 321 + struct page *page; 322 + void *addr; 323 + 324 + page = dev_alloc_page(); 325 + if (!page) { 326 + dev_kfree_skb(skb); 327 + return NULL; 328 + } 329 + addr = page_to_virt(page); 330 + 331 + memcpy(addr, skb_frag_page(frag), skb_frag_size(frag)); 332 + 333 + __skb_fill_page_desc_noacc(skinfo, skinfo->nr_frags++, 334 + addr, 0, skb_frag_size(frag)); 335 + } 336 + 321 337 out: 322 338 xsk_buff_free(xdp); 323 339 return skb; ··· 350 322 union i40e_rx_desc *rx_desc, 351 323 unsigned int *rx_packets, 352 324 unsigned int *rx_bytes, 353 - unsigned int size, 354 325 unsigned int xdp_res, 355 326 bool *failure) 356 327 { 357 328 struct sk_buff *skb; 358 329 359 330 *rx_packets = 1; 360 - *rx_bytes = size; 331 + *rx_bytes = xdp_get_buff_len(xdp_buff); 361 332 362 333 if (likely(xdp_res == I40E_XDP_REDIR) || xdp_res == I40E_XDP_TX) 363 334 return; ··· 390 363 return; 391 364 } 392 365 393 - *rx_bytes = skb->len; 394 366 i40e_process_skb_fields(rx_ring, rx_desc, skb); 395 367 napi_gro_receive(&rx_ring->q_vector->napi, skb); 396 368 return; ··· 398 372 /* Should never get here, as all valid cases have been handled already. 
399 373 */ 400 374 WARN_ON_ONCE(1); 375 + } 376 + 377 + static int 378 + i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, 379 + struct xdp_buff *xdp, const unsigned int size) 380 + { 381 + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); 382 + 383 + if (!xdp_buff_has_frags(first)) { 384 + sinfo->nr_frags = 0; 385 + sinfo->xdp_frags_size = 0; 386 + xdp_buff_set_frags_flag(first); 387 + } 388 + 389 + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { 390 + xsk_buff_free(first); 391 + return -ENOMEM; 392 + } 393 + 394 + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, 395 + virt_to_page(xdp->data_hard_start), 0, size); 396 + sinfo->xdp_frags_size += size; 397 + xsk_buff_add_frag(xdp); 398 + 399 + return 0; 401 400 } 402 401 403 402 /** ··· 435 384 int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) 436 385 { 437 386 unsigned int total_rx_bytes = 0, total_rx_packets = 0; 387 + u16 next_to_process = rx_ring->next_to_process; 438 388 u16 next_to_clean = rx_ring->next_to_clean; 439 389 u16 count_mask = rx_ring->count - 1; 440 390 unsigned int xdp_res, xdp_xmit = 0; 391 + struct xdp_buff *first = NULL; 441 392 struct bpf_prog *xdp_prog; 442 393 bool failure = false; 443 394 u16 cleaned_count; 395 + 396 + if (next_to_process != next_to_clean) 397 + first = *i40e_rx_bi(rx_ring, next_to_clean); 444 398 445 399 /* NB! xdp_prog will always be !NULL, due to the fact that 446 400 * this path is enabled by setting an XDP program. 
··· 460 404 unsigned int size; 461 405 u64 qword; 462 406 463 - rx_desc = I40E_RX_DESC(rx_ring, next_to_clean); 407 + rx_desc = I40E_RX_DESC(rx_ring, next_to_process); 464 408 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); 465 409 466 410 /* This memory barrier is needed to keep us from reading ··· 473 417 i40e_clean_programming_status(rx_ring, 474 418 rx_desc->raw.qword[0], 475 419 qword); 476 - bi = *i40e_rx_bi(rx_ring, next_to_clean); 420 + bi = *i40e_rx_bi(rx_ring, next_to_process); 477 421 xsk_buff_free(bi); 478 - next_to_clean = (next_to_clean + 1) & count_mask; 422 + next_to_process = (next_to_process + 1) & count_mask; 479 423 continue; 480 424 } 481 425 ··· 484 428 if (!size) 485 429 break; 486 430 487 - bi = *i40e_rx_bi(rx_ring, next_to_clean); 431 + bi = *i40e_rx_bi(rx_ring, next_to_process); 488 432 xsk_buff_set_size(bi, size); 489 433 xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool); 490 434 491 - xdp_res = i40e_run_xdp_zc(rx_ring, bi, xdp_prog); 492 - i40e_handle_xdp_result_zc(rx_ring, bi, rx_desc, &rx_packets, 493 - &rx_bytes, size, xdp_res, &failure); 435 + if (!first) 436 + first = bi; 437 + else if (i40e_add_xsk_frag(rx_ring, first, bi, size)) 438 + break; 439 + 440 + next_to_process = (next_to_process + 1) & count_mask; 441 + 442 + if (i40e_is_non_eop(rx_ring, rx_desc)) 443 + continue; 444 + 445 + xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog); 446 + i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets, 447 + &rx_bytes, xdp_res, &failure); 448 + first->flags = 0; 449 + next_to_clean = next_to_process; 494 450 if (failure) 495 451 break; 496 452 total_rx_packets += rx_packets; 497 453 total_rx_bytes += rx_bytes; 498 454 xdp_xmit |= xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR); 499 - next_to_clean = (next_to_clean + 1) & count_mask; 455 + first = NULL; 500 456 } 501 457 502 458 rx_ring->next_to_clean = next_to_clean; 459 + rx_ring->next_to_process = next_to_process; 503 460 cleaned_count = (next_to_clean - rx_ring->next_to_use 
- 1) & count_mask; 504 461 505 462 if (cleaned_count >= I40E_RX_BUFFER_WRITE)