Merge tag 'net-6.7-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

+17 -3

Documentation/process/maintainer-netdev.rst

··· 193 193 Generally speaking, the patches get triaged quickly (in less than 194 194 48h). But be patient, if your patch is active in patchwork (i.e. it's 195 195 listed on the project's patch list) the chances it was missed are close to zero. 196 - Asking the maintainer for status updates on your 197 - patch is a good way to ensure your patch is ignored or pushed to the 198 - bottom of the priority list. 196 + 197 + The high volume of development on netdev makes reviewers move on 198 + from discussions relatively quickly. New comments and replies 199 + are very unlikely to arrive after a week of silence. If a patch 200 + is no longer active in patchwork and the thread went idle for more 201 + than a week - clarify the next steps and/or post the next version. 202 + 203 + For RFC postings specifically, if nobody responded in a week - reviewers 204 + either missed the posting or have no strong opinions. If the code is ready, 205 + repost as a PATCH. 206 + 207 + Emails saying just "ping" or "bump" are considered rude. If you can't figure 208 + out the status of the patch from patchwork or where the discussion has 209 + landed - describe your best guess and ask if it's correct. For example:: 210 + 211 + I don't understand what the next steps are. Person X seems to be unhappy 212 + with A, should I do B and repost the patches? 199 213 200 214 .. _Changes requested: 201 215

+3

MAINTAINERS

··· 14992 14992 M: Paolo Abeni <pabeni@redhat.com> 14993 14993 L: netdev@vger.kernel.org 14994 14994 S: Maintained 14995 + P: Documentation/process/maintainer-netdev.rst 14995 14996 Q: https://patchwork.kernel.org/project/netdevbpf/list/ 14996 14997 T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git 14997 14998 T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git ··· 15044 15043 M: Paolo Abeni <pabeni@redhat.com> 15045 15044 L: netdev@vger.kernel.org 15046 15045 S: Maintained 15046 + P: Documentation/process/maintainer-netdev.rst 15047 15047 Q: https://patchwork.kernel.org/project/netdevbpf/list/ 15048 15048 B: mailto:netdev@vger.kernel.org 15049 15049 T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git ··· 15055 15053 F: Documentation/process/maintainer-netdev.rst 15056 15054 F: Documentation/userspace-api/netlink/ 15057 15055 F: include/linux/in.h 15056 + F: include/linux/indirect_call_wrapper.h 15058 15057 F: include/linux/net.h 15059 15058 F: include/linux/netdevice.h 15060 15059 F: include/net/

+12 -5

drivers/dpll/dpll_netlink.c

··· 1093 1093 return -ENOMEM; 1094 1094 hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, 1095 1095 DPLL_CMD_PIN_ID_GET); 1096 - if (!hdr) 1096 + if (!hdr) { 1097 + nlmsg_free(msg); 1097 1098 return -EMSGSIZE; 1098 - 1099 + } 1099 1100 pin = dpll_pin_find_from_nlattr(info); 1100 1101 if (!IS_ERR(pin)) { 1101 1102 ret = dpll_msg_add_pin_handle(msg, pin); ··· 1124 1123 return -ENOMEM; 1125 1124 hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, 1126 1125 DPLL_CMD_PIN_GET); 1127 - if (!hdr) 1126 + if (!hdr) { 1127 + nlmsg_free(msg); 1128 1128 return -EMSGSIZE; 1129 + } 1129 1130 ret = dpll_cmd_pin_get_one(msg, pin, info->extack); 1130 1131 if (ret) { 1131 1132 nlmsg_free(msg); ··· 1259 1256 return -ENOMEM; 1260 1257 hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, 1261 1258 DPLL_CMD_DEVICE_ID_GET); 1262 - if (!hdr) 1259 + if (!hdr) { 1260 + nlmsg_free(msg); 1263 1261 return -EMSGSIZE; 1262 + } 1264 1263 1265 1264 dpll = dpll_device_find_from_nlattr(info); 1266 1265 if (!IS_ERR(dpll)) { ··· 1289 1284 return -ENOMEM; 1290 1285 hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, 1291 1286 DPLL_CMD_DEVICE_GET); 1292 - if (!hdr) 1287 + if (!hdr) { 1288 + nlmsg_free(msg); 1293 1289 return -EMSGSIZE; 1290 + } 1294 1291 1295 1292 ret = dpll_device_get_one(dpll, msg, info->extack); 1296 1293 if (ret) {

+14

drivers/net/ethernet/amd/xgbe/xgbe-drv.c

··· 682 682 static void xgbe_service_timer(struct timer_list *t) 683 683 { 684 684 struct xgbe_prv_data *pdata = from_timer(pdata, t, service_timer); 685 + struct xgbe_channel *channel; 686 + unsigned int i; 685 687 686 688 queue_work(pdata->dev_workqueue, &pdata->service_work); 687 689 688 690 mod_timer(&pdata->service_timer, jiffies + HZ); 691 + 692 + if (!pdata->tx_usecs) 693 + return; 694 + 695 + for (i = 0; i < pdata->channel_count; i++) { 696 + channel = pdata->channel[i]; 697 + if (!channel->tx_ring || channel->tx_timer_active) 698 + break; 699 + channel->tx_timer_active = 1; 700 + mod_timer(&channel->tx_timer, 701 + jiffies + usecs_to_jiffies(pdata->tx_usecs)); 702 + } 689 703 } 690 704 691 705 static void xgbe_init_timers(struct xgbe_prv_data *pdata)

+8 -3

drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c

··· 314 314 315 315 cmd->base.phy_address = pdata->phy.address; 316 316 317 - cmd->base.autoneg = pdata->phy.autoneg; 318 - cmd->base.speed = pdata->phy.speed; 319 - cmd->base.duplex = pdata->phy.duplex; 317 + if (netif_carrier_ok(netdev)) { 318 + cmd->base.speed = pdata->phy.speed; 319 + cmd->base.duplex = pdata->phy.duplex; 320 + } else { 321 + cmd->base.speed = SPEED_UNKNOWN; 322 + cmd->base.duplex = DUPLEX_UNKNOWN; 323 + } 320 324 325 + cmd->base.autoneg = pdata->phy.autoneg; 321 326 cmd->base.port = PORT_NONE; 322 327 323 328 XGBE_LM_COPY(cmd, supported, lks, supported);

+13 -1

drivers/net/ethernet/amd/xgbe/xgbe-mdio.c

··· 1193 1193 if (pdata->phy.duplex != DUPLEX_FULL) 1194 1194 return -EINVAL; 1195 1195 1196 - xgbe_set_mode(pdata, mode); 1196 + /* Force the mode change for SFI in Fixed PHY config. 1197 + * Fixed PHY configs needs PLL to be enabled while doing mode set. 1198 + * When the SFP module isn't connected during boot, driver assumes 1199 + * AN is ON and attempts autonegotiation. However, if the connected 1200 + * SFP comes up in Fixed PHY config, the link will not come up as 1201 + * PLL isn't enabled while the initial mode set command is issued. 1202 + * So, force the mode change for SFI in Fixed PHY configuration to 1203 + * fix link issues. 1204 + */ 1205 + if (mode == XGBE_MODE_SFI) 1206 + xgbe_change_mode(pdata, mode); 1207 + else 1208 + xgbe_set_mode(pdata, mode); 1197 1209 1198 1210 return 0; 1199 1211 }

+9 -7

drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c

··· 3844 3844 struct i40e_pf *pf = vf->pf; 3845 3845 struct i40e_vsi *vsi = NULL; 3846 3846 int aq_ret = 0; 3847 - int i, ret; 3847 + int i; 3848 3848 3849 3849 if (!i40e_sync_vf_state(vf, I40E_VF_STATE_ACTIVE)) { 3850 3850 aq_ret = -EINVAL; ··· 3868 3868 } 3869 3869 3870 3870 cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); 3871 - if (!cfilter) 3872 - return -ENOMEM; 3871 + if (!cfilter) { 3872 + aq_ret = -ENOMEM; 3873 + goto err_out; 3874 + } 3873 3875 3874 3876 /* parse destination mac address */ 3875 3877 for (i = 0; i < ETH_ALEN; i++) ··· 3919 3917 3920 3918 /* Adding cloud filter programmed as TC filter */ 3921 3919 if (tcf.dst_port) 3922 - ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); 3920 + aq_ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); 3923 3921 else 3924 - ret = i40e_add_del_cloud_filter(vsi, cfilter, true); 3925 - if (ret) { 3922 + aq_ret = i40e_add_del_cloud_filter(vsi, cfilter, true); 3923 + if (aq_ret) { 3926 3924 dev_err(&pf->pdev->dev, 3927 3925 "VF %d: Failed to add cloud filter, err %pe aq_err %s\n", 3928 - vf->vf_id, ERR_PTR(ret), 3926 + vf->vf_id, ERR_PTR(aq_ret), 3929 3927 i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); 3930 3928 goto err_free; 3931 3929 }

+3 -9

drivers/net/ethernet/intel/ice/ice_main.c

··· 7401 7401 goto err_vsi_rebuild; 7402 7402 } 7403 7403 7404 - /* configure PTP timestamping after VSI rebuild */ 7405 - if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags)) { 7406 - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) 7407 - ice_ptp_cfg_timestamp(pf, false); 7408 - else if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) 7409 - /* for E82x PHC owner always need to have interrupts */ 7410 - ice_ptp_cfg_timestamp(pf, true); 7411 - } 7412 - 7413 7404 err = ice_vsi_rebuild_by_type(pf, ICE_VSI_SWITCHDEV_CTRL); 7414 7405 if (err) { 7415 7406 dev_err(dev, "Switchdev CTRL VSI rebuild failed: %d\n", err); ··· 7452 7461 ice_plug_aux_dev(pf); 7453 7462 if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) 7454 7463 ice_lag_rebuild(pf); 7464 + 7465 + /* Restore timestamp mode settings after VSI rebuild */ 7466 + ice_ptp_restore_timestamp_mode(pf); 7455 7467 return; 7456 7468 7457 7469 err_vsi_rebuild:

+79 -67

drivers/net/ethernet/intel/ice/ice_ptp.c

··· 256 256 } 257 257 258 258 /** 259 - * ice_ptp_configure_tx_tstamp - Enable or disable Tx timestamp interrupt 260 - * @pf: The PF pointer to search in 261 - * @on: bool value for whether timestamp interrupt is enabled or disabled 259 + * ice_ptp_cfg_tx_interrupt - Configure Tx timestamp interrupt for the device 260 + * @pf: Board private structure 261 + * 262 + * Program the device to respond appropriately to the Tx timestamp interrupt 263 + * cause. 262 264 */ 263 - static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) 265 + static void ice_ptp_cfg_tx_interrupt(struct ice_pf *pf) 264 266 { 267 + struct ice_hw *hw = &pf->hw; 268 + bool enable; 265 269 u32 val; 266 270 271 + switch (pf->ptp.tx_interrupt_mode) { 272 + case ICE_PTP_TX_INTERRUPT_ALL: 273 + /* React to interrupts across all quads. */ 274 + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); 275 + enable = true; 276 + break; 277 + case ICE_PTP_TX_INTERRUPT_NONE: 278 + /* Do not react to interrupts on any quad. */ 279 + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); 280 + enable = false; 281 + break; 282 + case ICE_PTP_TX_INTERRUPT_SELF: 283 + default: 284 + enable = pf->ptp.tstamp_config.tx_type == HWTSTAMP_TX_ON; 285 + break; 286 + } 287 + 267 288 /* Configure the Tx timestamp interrupt */ 268 - val = rd32(&pf->hw, PFINT_OICR_ENA); 269 - if (on) 289 + val = rd32(hw, PFINT_OICR_ENA); 290 + if (enable) 270 291 val |= PFINT_OICR_TSYN_TX_M; 271 292 else 272 293 val &= ~PFINT_OICR_TSYN_TX_M; 273 - wr32(&pf->hw, PFINT_OICR_ENA, val); 274 - } 275 - 276 - /** 277 - * ice_set_tx_tstamp - Enable or disable Tx timestamping 278 - * @pf: The PF pointer to search in 279 - * @on: bool value for whether timestamps are enabled or disabled 280 - */ 281 - static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) 282 - { 283 - struct ice_vsi *vsi; 284 - u16 i; 285 - 286 - vsi = ice_get_main_vsi(pf); 287 - if (!vsi) 288 - return; 289 - 290 - /* Set the timestamp enable flag for all the Tx rings */ 291 - ice_for_each_txq(vsi, i) { 292 - if (!vsi->tx_rings[i]) 293 - continue; 294 - vsi->tx_rings[i]->ptp_tx = on; 295 - } 296 - 297 - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) 298 - ice_ptp_configure_tx_tstamp(pf, on); 299 - 300 - pf->ptp.tstamp_config.tx_type = on ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; 294 + wr32(hw, PFINT_OICR_ENA, val); 301 295 } 302 296 303 297 /** ··· 305 311 u16 i; 306 312 307 313 vsi = ice_get_main_vsi(pf); 308 - if (!vsi) 314 + if (!vsi || !vsi->rx_rings) 309 315 return; 310 316 311 317 /* Set the timestamp flag for all the Rx rings */ ··· 314 320 continue; 315 321 vsi->rx_rings[i]->ptp_rx = on; 316 322 } 317 - 318 - pf->ptp.tstamp_config.rx_filter = on ? HWTSTAMP_FILTER_ALL : 319 - HWTSTAMP_FILTER_NONE; 320 323 } 321 324 322 325 /** 323 - * ice_ptp_cfg_timestamp - Configure timestamp for init/deinit 326 + * ice_ptp_disable_timestamp_mode - Disable current timestamp mode 324 327 * @pf: Board private structure 325 - * @ena: bool value to enable or disable time stamp 326 328 * 327 - * This function will configure timestamping during PTP initialization 328 - * and deinitialization 329 + * Called during preparation for reset to temporarily disable timestamping on 330 + * the device. Called during remove to disable timestamping while cleaning up 331 + * driver resources. 329 332 */ 330 - void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) 333 + static void ice_ptp_disable_timestamp_mode(struct ice_pf *pf) 331 334 { 332 - ice_set_tx_tstamp(pf, ena); 333 - ice_set_rx_tstamp(pf, ena); 335 + struct ice_hw *hw = &pf->hw; 336 + u32 val; 337 + 338 + val = rd32(hw, PFINT_OICR_ENA); 339 + val &= ~PFINT_OICR_TSYN_TX_M; 340 + wr32(hw, PFINT_OICR_ENA, val); 341 + 342 + ice_set_rx_tstamp(pf, false); 343 + } 344 + 345 + /** 346 + * ice_ptp_restore_timestamp_mode - Restore timestamp configuration 347 + * @pf: Board private structure 348 + * 349 + * Called at the end of rebuild to restore timestamp configuration after 350 + * a device reset. 351 + */ 352 + void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) 353 + { 354 + struct ice_hw *hw = &pf->hw; 355 + bool enable_rx; 356 + 357 + ice_ptp_cfg_tx_interrupt(pf); 358 + 359 + enable_rx = pf->ptp.tstamp_config.rx_filter == HWTSTAMP_FILTER_ALL; 360 + ice_set_rx_tstamp(pf, enable_rx); 361 + 362 + /* Trigger an immediate software interrupt to ensure that timestamps 363 + * which occurred during reset are handled now. 364 + */ 365 + wr32(hw, PFINT_OICR, PFINT_OICR_TSYN_TX_M); 366 + ice_flush(hw); 334 367 } 335 368 336 369 /** ··· 2058 2037 { 2059 2038 switch (config->tx_type) { 2060 2039 case HWTSTAMP_TX_OFF: 2061 - ice_set_tx_tstamp(pf, false); 2040 + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_OFF; 2062 2041 break; 2063 2042 case HWTSTAMP_TX_ON: 2064 - ice_set_tx_tstamp(pf, true); 2043 + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_ON; 2065 2044 break; 2066 2045 default: 2067 2046 return -ERANGE; ··· 2069 2048 2070 2049 switch (config->rx_filter) { 2071 2050 case HWTSTAMP_FILTER_NONE: 2072 - ice_set_rx_tstamp(pf, false); 2051 + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; 2073 2052 break; 2074 2053 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: 2075 2054 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: ··· 2085 2064 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: 2086 2065 case HWTSTAMP_FILTER_NTP_ALL: 2087 2066 case HWTSTAMP_FILTER_ALL: 2088 - ice_set_rx_tstamp(pf, true); 2067 + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_ALL; 2089 2068 break; 2090 2069 default: 2091 2070 return -ERANGE; 2092 2071 } 2072 + 2073 + /* Immediately update the device timestamping mode */ 2074 + ice_ptp_restore_timestamp_mode(pf); 2093 2075 2094 2076 return 0; 2095 2077 } ··· 2761 2737 clear_bit(ICE_FLAG_PTP, pf->flags); 2762 2738 2763 2739 /* Disable timestamping for both Tx and Rx */ 2764 - ice_ptp_cfg_timestamp(pf, false); 2740 + ice_ptp_disable_timestamp_mode(pf); 2765 2741 2766 2742 kthread_cancel_delayed_work_sync(&ptp->work); 2767 2743 ··· 2827 2803 /* Release the global hardware lock */ 2828 2804 ice_ptp_unlock(hw); 2829 2805 2830 - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) { 2831 - /* The clock owner for this device type handles the timestamp 2832 - * interrupt for all ports. 2833 - */ 2834 - ice_ptp_configure_tx_tstamp(pf, true); 2835 - 2836 - /* React on all quads interrupts for E82x */ 2837 - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); 2838 - 2806 + if (!ice_is_e810(hw)) { 2839 2807 /* Enable quad interrupts */ 2840 2808 err = ice_ptp_tx_ena_intr(pf, true, itr); 2841 2809 if (err) ··· 2897 2881 case ICE_PHY_E810: 2898 2882 return ice_ptp_init_tx_e810(pf, &ptp_port->tx); 2899 2883 case ICE_PHY_E822: 2900 - /* Non-owner PFs don't react to any interrupts on E82x, 2901 - * neither on own quad nor on others 2902 - */ 2903 - if (!ice_ptp_pf_handles_tx_interrupt(pf)) { 2904 - ice_ptp_configure_tx_tstamp(pf, false); 2905 - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); 2906 - } 2907 2884 kthread_init_delayed_work(&ptp_port->ov_work, 2908 2885 ice_ptp_wait_for_offsets); 2909 2886 ··· 3041 3032 /* Start the PHY timestamping block */ 3042 3033 ice_ptp_reset_phy_timestamping(pf); 3043 3034 3035 + /* Configure initial Tx interrupt settings */ 3036 + ice_ptp_cfg_tx_interrupt(pf); 3037 + 3044 3038 set_bit(ICE_FLAG_PTP, pf->flags); 3045 3039 err = ice_ptp_init_work(pf, ptp); 3046 3040 if (err) ··· 3079 3067 return; 3080 3068 3081 3069 /* Disable timestamping for both Tx and Rx */ 3082 - ice_ptp_cfg_timestamp(pf, false); 3070 + ice_ptp_disable_timestamp_mode(pf); 3083 3071 3084 3072 ice_ptp_remove_auxbus_device(pf); 3085 3073

+2 -3

drivers/net/ethernet/intel/ice/ice_ptp.h

··· 292 292 struct ice_pf; 293 293 int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr); 294 294 int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr); 295 - void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena); 295 + void ice_ptp_restore_timestamp_mode(struct ice_pf *pf); 296 296 297 297 void ice_ptp_extts_event(struct ice_pf *pf); 298 298 s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb); ··· 317 317 return -EOPNOTSUPP; 318 318 } 319 319 320 - static inline void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) { } 321 - 320 + static inline void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) { } 322 321 static inline void ice_ptp_extts_event(struct ice_pf *pf) { } 323 322 static inline s8 324 323 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb)

-3

drivers/net/ethernet/intel/ice/ice_txrx.c

··· 2306 2306 if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) 2307 2307 return; 2308 2308 2309 - if (!tx_ring->ptp_tx) 2310 - return; 2311 - 2312 2309 /* Tx timestamps cannot be sampled when doing TSO */ 2313 2310 if (first->tx_flags & ICE_TX_FLAGS_TSO) 2314 2311 return;

-1

drivers/net/ethernet/intel/ice/ice_txrx.h

··· 380 380 #define ICE_TX_FLAGS_RING_VLAN_L2TAG2 BIT(2) 381 381 u8 flags; 382 382 u8 dcb_tc; /* Traffic class of ring */ 383 - u8 ptp_tx; 384 383 } ____cacheline_internodealigned_in_smp; 385 384 386 385 static inline bool ice_ring_uses_build_skb(struct ice_rx_ring *ring)

+19 -1

drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c

··· 1088 1088 struct ethhdr *eth_hdr; 1089 1089 bool new = false; 1090 1090 int err = 0; 1091 + u64 vf_num; 1091 1092 u32 ring; 1092 1093 1093 1094 if (!flow_cfg->max_flows) { ··· 1101 1100 if (!(pfvf->flags & OTX2_FLAG_NTUPLE_SUPPORT)) 1102 1101 return -ENOMEM; 1103 1102 1104 - if (ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) 1103 + /* Number of queues on a VF can be greater or less than 1104 + * the PF's queue. Hence no need to check for the 1105 + * queue count. Hence no need to check queue count if PF 1106 + * is installing for its VF. Below is the expected vf_num value 1107 + * based on the ethtool commands. 1108 + * 1109 + * e.g. 1110 + * 1. ethtool -U <netdev> ... action -1 ==> vf_num:255 1111 + * 2. ethtool -U <netdev> ... action <queue_num> ==> vf_num:0 1112 + * 3. ethtool -U <netdev> ... vf <vf_idx> queue <queue_num> ==> 1113 + * vf_num:vf_idx+1 1114 + */ 1115 + vf_num = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie); 1116 + if (!is_otx2_vf(pfvf->pcifunc) && !vf_num && 1117 + ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) 1105 1118 return -EINVAL; 1106 1119 1107 1120 if (fsp->location >= otx2_get_maxflows(flow_cfg)) ··· 1197 1182 flow_cfg->nr_flows++; 1198 1183 } 1199 1184 1185 + if (flow->is_vf) 1186 + netdev_info(pfvf->netdev, 1187 + "Make sure that VF's queue number is within its queue limit\n"); 1200 1188 return 0; 1201 1189 } 1202 1190

+2

drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c

··· 1934 1934 /* Clear RSS enable flag */ 1935 1935 rss = &pf->hw.rss_info; 1936 1936 rss->enable = false; 1937 + if (!netif_is_rxfh_configured(netdev)) 1938 + kfree(rss->rss_ctx[DEFAULT_RSS_CONTEXT_GROUP]); 1937 1939 1938 1940 /* Cleanup Queue IRQ */ 1939 1941 vec = pci_irq_vector(pf->pdev,

+1 -3

drivers/net/ethernet/realtek/r8169_main.c

··· 2599 2599 rx_mode &= ~AcceptMulticast; 2600 2600 } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || 2601 2601 dev->flags & IFF_ALLMULTI || 2602 - tp->mac_version == RTL_GIGA_MAC_VER_35 || 2603 - tp->mac_version == RTL_GIGA_MAC_VER_46 || 2604 - tp->mac_version == RTL_GIGA_MAC_VER_48) { 2602 + tp->mac_version == RTL_GIGA_MAC_VER_35) { 2605 2603 /* accept all multicasts */ 2606 2604 } else if (netdev_mc_empty(dev)) { 2607 2605 rx_mode &= ~AcceptMulticast;

+1 -1

drivers/net/ethernet/stmicro/stmmac/Kconfig

··· 280 280 config DWMAC_LOONGSON 281 281 tristate "Loongson PCI DWMAC support" 282 282 default MACH_LOONGSON64 283 - depends on STMMAC_ETH && PCI 283 + depends on (MACH_LOONGSON64 || COMPILE_TEST) && STMMAC_ETH && PCI 284 284 depends on COMMON_CLK 285 285 help 286 286 This selects the LOONGSON PCI bus support for the stmmac driver,

+5 -3

drivers/net/ethernet/wangxun/libwx/wx_hw.c

··· 1769 1769 wx->subsystem_device_id = pdev->subsystem_device; 1770 1770 } else { 1771 1771 err = wx_flash_read_dword(wx, 0xfffdc, &ssid); 1772 - if (!err) 1773 - wx->subsystem_device_id = swab16((u16)ssid); 1772 + if (err < 0) { 1773 + wx_err(wx, "read of internal subsystem device id failed\n"); 1774 + return err; 1775 + } 1774 1776 1775 - return err; 1777 + wx->subsystem_device_id = swab16((u16)ssid); 1776 1778 } 1777 1779 1778 1780 wx->mac_table = kcalloc(wx->mac.num_rar_entries,

+1 -3

drivers/net/ethernet/wangxun/ngbe/ngbe_main.c

··· 121 121 122 122 /* PCI config space info */ 123 123 err = wx_sw_init(wx); 124 - if (err < 0) { 125 - wx_err(wx, "read of internal subsystem device id failed\n"); 124 + if (err < 0) 126 125 return err; 127 - } 128 126 129 127 /* mac type, phy type , oem type */ 130 128 ngbe_init_type_code(wx);

+1 -3

drivers/net/ethernet/wangxun/txgbe/txgbe_main.c

··· 364 364 365 365 /* PCI config space info */ 366 366 err = wx_sw_init(wx); 367 - if (err < 0) { 368 - wx_err(wx, "read of internal subsystem device id failed\n"); 367 + if (err < 0) 369 368 return err; 370 - } 371 369 372 370 txgbe_init_type_code(wx); 373 371

+1 -1

drivers/net/ethernet/xilinx/xilinx_axienet_main.c

··· 822 822 if (lp->features & XAE_FEATURE_FULL_TX_CSUM) { 823 823 /* Tx Full Checksum Offload Enabled */ 824 824 cur_p->app0 |= 2; 825 - } else if (lp->features & XAE_FEATURE_PARTIAL_RX_CSUM) { 825 + } else if (lp->features & XAE_FEATURE_PARTIAL_TX_CSUM) { 826 826 csum_start_off = skb_transport_offset(skb); 827 827 csum_index_off = csum_start_off + skb->csum_offset; 828 828 /* Tx Partial Checksum Offload Enabled */

+46 -22

drivers/net/hyperv/netvsc_drv.c

··· 2206 2206 goto upper_link_failed; 2207 2207 } 2208 2208 2209 - /* set slave flag before open to prevent IPv6 addrconf */ 2210 - vf_netdev->flags |= IFF_SLAVE; 2211 - 2212 2209 schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); 2213 2210 2214 2211 call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); ··· 2312 2315 2313 2316 } 2314 2317 2315 - /* Fallback path to check synthetic vf with 2316 - * help of mac addr 2318 + /* Fallback path to check synthetic vf with help of mac addr. 2319 + * Because this function can be called before vf_netdev is 2320 + * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied 2321 + * from dev_addr, also try to match to its dev_addr. 2322 + * Note: On Hyper-V and Azure, it's not possible to set a MAC address 2323 + * on a VF that matches to the MAC of a unrelated NETVSC device. 2317 2324 */ 2318 2325 list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { 2319 2326 ndev = hv_get_drvdata(ndev_ctx->device_ctx); 2320 - if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) { 2321 - netdev_notice(vf_netdev, 2322 - "falling back to mac addr based matching\n"); 2327 + if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) || 2328 + ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) 2323 2329 return ndev; 2324 - } 2325 2330 } 2326 2331 2327 2332 netdev_notice(vf_netdev, 2328 2333 "no netdev found for vf serial:%u\n", serial); 2329 2334 return NULL; 2335 + } 2336 + 2337 + static int netvsc_prepare_bonding(struct net_device *vf_netdev) 2338 + { 2339 + struct net_device *ndev; 2340 + 2341 + ndev = get_netvsc_byslot(vf_netdev); 2342 + if (!ndev) 2343 + return NOTIFY_DONE; 2344 + 2345 + /* set slave flag before open to prevent IPv6 addrconf */ 2346 + vf_netdev->flags |= IFF_SLAVE; 2347 + return NOTIFY_DONE; 2330 2348 } 2331 2349 2332 2350 static int netvsc_register_vf(struct net_device *vf_netdev) ··· 2543 2531 goto devinfo_failed; 2544 2532 } 2545 2533 2534 + /* We must get rtnl lock before scheduling nvdev->subchan_work, 2535 + * otherwise netvsc_subchan_work() can get rtnl lock first and wait 2536 + * all subchannels to show up, but that may not happen because 2537 + * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer() 2538 + * -> ... -> device_add() -> ... -> __device_attach() can't get 2539 + * the device lock, so all the subchannels can't be processed -- 2540 + * finally netvsc_subchan_work() hangs forever. 2541 + * 2542 + * The rtnl lock also needs to be held before rndis_filter_device_add() 2543 + * which advertises nvsp_2_vsc_capability / sriov bit, and triggers 2544 + * VF NIC offering and registering. If VF NIC finished register_netdev() 2545 + * earlier it may cause name based config failure. 2546 + */ 2547 + rtnl_lock(); 2548 + 2546 2549 nvdev = rndis_filter_device_add(dev, device_info); 2547 2550 if (IS_ERR(nvdev)) { 2548 2551 ret = PTR_ERR(nvdev); ··· 2566 2539 } 2567 2540 2568 2541 eth_hw_addr_set(net, device_info->mac_adr); 2569 - 2570 - /* We must get rtnl lock before scheduling nvdev->subchan_work, 2571 - * otherwise netvsc_subchan_work() can get rtnl lock first and wait 2572 - * all subchannels to show up, but that may not happen because 2573 - * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer() 2574 - * -> ... -> device_add() -> ... -> __device_attach() can't get 2575 - * the device lock, so all the subchannels can't be processed -- 2576 - * finally netvsc_subchan_work() hangs forever. 2577 - */ 2578 - rtnl_lock(); 2579 2542 2580 2543 if (nvdev->num_chn > 1) 2581 2544 schedule_work(&nvdev->subchan_work); ··· 2603 2586 return 0; 2604 2587 2605 2588 register_failed: 2606 - rtnl_unlock(); 2607 2589 rndis_filter_device_remove(dev, nvdev); 2608 2590 rndis_failed: 2591 + rtnl_unlock(); 2609 2592 netvsc_devinfo_put(device_info); 2610 2593 devinfo_failed: 2611 2594 free_percpu(net_device_ctx->vf_stats); ··· 2770 2753 return NOTIFY_DONE; 2771 2754 2772 2755 switch (event) { 2756 + case NETDEV_POST_INIT: 2757 + return netvsc_prepare_bonding(event_dev); 2773 2758 case NETDEV_REGISTER: 2774 2759 return netvsc_register_vf(event_dev); 2775 2760 case NETDEV_UNREGISTER: ··· 2807 2788 } 2808 2789 netvsc_ring_bytes = ring_size * PAGE_SIZE; 2809 2790 2791 + register_netdevice_notifier(&netvsc_netdev_notifier); 2792 + 2810 2793 ret = vmbus_driver_register(&netvsc_drv); 2811 2794 if (ret) 2812 - return ret; 2795 + goto err_vmbus_reg; 2813 2796 2814 - register_netdevice_notifier(&netvsc_netdev_notifier); 2815 2797 return 0; 2798 + 2799 + err_vmbus_reg: 2800 + unregister_netdevice_notifier(&netvsc_netdev_notifier); 2801 + return ret; 2816 2802 } 2817 2803 2818 2804 MODULE_LICENSE("GPL");

+1 -1

drivers/net/ipa/reg/gsi_reg-v5.0.c

··· 78 78 0x0001c000 + 0x12000 * GSI_EE_AP, 0x80); 79 79 80 80 static const u32 reg_ev_ch_e_cntxt_1_fmask[] = { 81 - [R_LENGTH] = GENMASK(19, 0), 81 + [R_LENGTH] = GENMASK(23, 0), 82 82 }; 83 83 84 84 REG_STRIDE_FIELDS(EV_CH_E_CNTXT_1, ev_ch_e_cntxt_1,

+20 -2

drivers/net/netkit.c

··· 7 7 #include <linux/filter.h> 8 8 #include <linux/netfilter_netdev.h> 9 9 #include <linux/bpf_mprog.h> 10 + #include <linux/indirect_call_wrapper.h> 10 11 11 12 #include <net/netkit.h> 12 13 #include <net/dst.h> ··· 69 68 netdev_tx_t ret_dev = NET_XMIT_SUCCESS; 70 69 const struct bpf_mprog_entry *entry; 71 70 struct net_device *peer; 71 + int len = skb->len; 72 72 73 73 rcu_read_lock(); 74 74 peer = rcu_dereference(nk->peer); ··· 87 85 case NETKIT_PASS: 88 86 skb->protocol = eth_type_trans(skb, skb->dev); 89 87 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 90 - __netif_rx(skb); 88 + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { 89 + dev_sw_netstats_tx_add(dev, 1, len); 90 + dev_sw_netstats_rx_add(peer, len); 91 + } else { 92 + goto drop_stats; 93 + } 91 94 break; 92 95 case NETKIT_REDIRECT: 96 + dev_sw_netstats_tx_add(dev, 1, len); 93 97 skb_do_redirect(skb); 94 98 break; 95 99 case NETKIT_DROP: 96 100 default: 97 101 drop: 98 102 kfree_skb(skb); 103 + drop_stats: 99 104 dev_core_stats_tx_dropped_inc(dev); 100 105 ret_dev = NET_XMIT_DROP; 101 106 break; ··· 178 169 rcu_read_unlock(); 179 170 } 180 171 181 - static struct net_device *netkit_peer_dev(struct net_device *dev) 172 + INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev) 182 173 { 183 174 return rcu_dereference(netkit_priv(dev)->peer); 175 + } 176 + 177 + static void netkit_get_stats(struct net_device *dev, 178 + struct rtnl_link_stats64 *stats) 179 + { 180 + dev_fetch_sw_netstats(stats, dev->tstats); 181 + stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); 184 182 } 185 183 186 184 static void netkit_uninit(struct net_device *dev); ··· 200 184 .ndo_set_rx_headroom = netkit_set_headroom, 201 185 .ndo_get_iflink = netkit_get_iflink, 202 186 .ndo_get_peer_dev = netkit_peer_dev, 187 + .ndo_get_stats64 = netkit_get_stats, 203 188 .ndo_uninit = netkit_uninit, 204 189 .ndo_features_check = passthru_features_check, 205 190 }; ··· 235 218 236 219 ether_setup(dev); 237 220 dev->max_mtu = ETH_MAX_MTU; 221 + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 238 222 239 223 dev->flags |= IFF_NOARP; 240 224 dev->priv_flags &= ~IFF_TX_SKB_SHARING;

+4 -4

drivers/net/usb/aqc111.c

··· 1079 1079 u16 pkt_count = 0; 1080 1080 u64 desc_hdr = 0; 1081 1081 u16 vlan_tag = 0; 1082 - u32 skb_len = 0; 1082 + u32 skb_len; 1083 1083 1084 1084 if (!skb) 1085 1085 goto err; 1086 1086 1087 - if (skb->len == 0) 1087 + skb_len = skb->len; 1088 + if (skb_len < sizeof(desc_hdr)) 1088 1089 goto err; 1089 1090 1090 - skb_len = skb->len; 1091 1091 /* RX Descriptor Header */ 1092 - skb_trim(skb, skb->len - sizeof(desc_hdr)); 1092 + skb_trim(skb, skb_len - sizeof(desc_hdr)); 1093 1093 desc_hdr = le64_to_cpup((u64 *)skb_tail_pointer(skb)); 1094 1094 1095 1095 /* Check these packets */

+2 -2

drivers/net/usb/ax88179_178a.c

··· 1583 1583 1584 1584 *tmp16 = AX_PHYPWR_RSTCTL_IPRL; 1585 1585 ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_PHYPWR_RSTCTL, 2, 2, tmp16); 1586 - msleep(200); 1586 + msleep(500); 1587 1587 1588 1588 *tmp = AX_CLK_SELECT_ACS | AX_CLK_SELECT_BCS; 1589 1589 ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_CLK_SELECT, 1, 1, tmp); 1590 - msleep(100); 1590 + msleep(200); 1591 1591 1592 1592 /* Ethernet PHY Auto Detach*/ 1593 1593 ax88179_auto_detach(dev);

+1

drivers/net/usb/qmi_wwan.c

··· 1289 1289 {QMI_FIXED_INTF(0x19d2, 0x0168, 4)}, 1290 1290 {QMI_FIXED_INTF(0x19d2, 0x0176, 3)}, 1291 1291 {QMI_FIXED_INTF(0x19d2, 0x0178, 3)}, 1292 + {QMI_FIXED_INTF(0x19d2, 0x0189, 4)}, /* ZTE MF290 */ 1292 1293 {QMI_FIXED_INTF(0x19d2, 0x0191, 4)}, /* ZTE EuFi890 */ 1293 1294 {QMI_FIXED_INTF(0x19d2, 0x0199, 1)}, /* ZTE MF820S */ 1294 1295 {QMI_FIXED_INTF(0x19d2, 0x0200, 1)},

+13 -33

drivers/net/veth.c

··· 236 236 data[tx_idx + j] += *(u64 *)(base + offset); 237 237 } 238 238 } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); 239 - pp_idx = tx_idx + VETH_TQ_STATS_LEN; 240 239 } 240 + pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; 241 241 242 242 page_pool_stats: 243 243 veth_get_page_pool_stats(dev, &data[pp_idx]); ··· 373 373 skb_tx_timestamp(skb); 374 374 if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 375 375 if (!use_napi) 376 - dev_lstats_add(dev, length); 376 + dev_sw_netstats_tx_add(dev, 1, length); 377 377 else 378 378 __veth_xdp_flush(rq); 379 379 } else { ··· 385 385 rcu_read_unlock(); 386 386 387 387 return ret; 388 - } 389 - 390 - static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) 391 - { 392 - struct veth_priv *priv = netdev_priv(dev); 393 - 394 - dev_lstats_read(dev, packets, bytes); 395 - return atomic64_read(&priv->dropped); 396 388 } 397 389 398 390 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) ··· 424 432 struct veth_priv *priv = netdev_priv(dev); 425 433 struct net_device *peer; 426 434 struct veth_stats rx; 427 - u64 packets, bytes; 428 435 429 - tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); 430 - tot->tx_bytes = bytes; 431 - tot->tx_packets = packets; 436 + tot->tx_dropped = atomic64_read(&priv->dropped); 437 + dev_fetch_sw_netstats(tot, dev->tstats); 432 438 433 439 veth_stats_rx(&rx, dev); 434 440 tot->tx_dropped += rx.xdp_tx_err; 435 441 tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; 436 - tot->rx_bytes = rx.xdp_bytes; 437 - tot->rx_packets = rx.xdp_packets; 442 + tot->rx_bytes += rx.xdp_bytes; 443 + tot->rx_packets += rx.xdp_packets; 438 444 439 445 rcu_read_lock(); 440 446 peer = rcu_dereference(priv->peer); 441 447 if (peer) { 442 - veth_stats_tx(peer, &packets, &bytes); 443 - tot->rx_bytes += bytes; 444 - tot->rx_packets += packets; 448 + struct rtnl_link_stats64 tot_peer = {}; 449 + 450 + dev_fetch_sw_netstats(&tot_peer, peer->tstats); 451 + tot->rx_bytes += tot_peer.tx_bytes; 452 + tot->rx_packets += tot_peer.tx_packets; 445 453 446 454 veth_stats_rx(&rx, peer); 447 455 tot->tx_dropped += rx.peer_tq_xdp_xmit_err; ··· 1498 1506 1499 1507 static int veth_dev_init(struct net_device *dev) 1500 1508 { 1501 - int err; 1502 - 1503 - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 1504 - if (!dev->lstats) 1505 - return -ENOMEM; 1506 - 1507 - err = veth_alloc_queues(dev); 1508 - if (err) { 1509 - free_percpu(dev->lstats); 1510 - return err; 1511 - } 1512 - 1513 - return 0; 1509 + return veth_alloc_queues(dev); 1514 1510 } 1515 1511 1516 1512 static void veth_dev_free(struct net_device *dev) 1517 1513 { 1518 1514 veth_free_queues(dev); 1519 - free_percpu(dev->lstats); 1520 1515 } 1521 1516 1522 1517 #ifdef CONFIG_NET_POLL_CONTROLLER ··· 1775 1796 NETIF_F_HW_VLAN_STAG_RX); 1776 1797 dev->needs_free_netdev = true; 1777 1798 dev->priv_destructor = veth_dev_free; 1799 + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; 1778 1800 dev->max_mtu = ETH_MAX_MTU; 1779 1801 1780 1802 dev->hw_features = VETH_FEATURES;

+10 -28

drivers/net/vrf.c

··· 121 121 int ifindex; 122 122 }; 123 123 124 - struct pcpu_dstats { 125 - u64 tx_pkts; 126 - u64 tx_bytes; 127 - u64 tx_drps; 128 - u64 rx_pkts; 129 - u64 rx_bytes; 130 - u64 rx_drps; 131 - struct u64_stats_sync syncp; 132 - }; 133 - 134 124 static void vrf_rx_stats(struct net_device *dev, int len) 135 125 { 136 126 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 137 127 138 128 u64_stats_update_begin(&dstats->syncp); 139 - dstats->rx_pkts++; 129 + dstats->rx_packets++; 140 130 dstats->rx_bytes += len; 141 131 u64_stats_update_end(&dstats->syncp); 142 132 } ··· 151 161 do { 152 162 start = u64_stats_fetch_begin(&dstats->syncp); 153 163 tbytes = dstats->tx_bytes; 154 - tpkts = dstats->tx_pkts; 155 - tdrops = dstats->tx_drps; 164 + tpkts = dstats->tx_packets; 165 + tdrops = dstats->tx_drops; 156 166 rbytes = dstats->rx_bytes; 157 - rpkts = dstats->rx_pkts; 167 + rpkts = dstats->rx_packets; 158 168 } while (u64_stats_fetch_retry(&dstats->syncp, start)); 159 169 stats->tx_bytes += tbytes; 160 170 stats->tx_packets += tpkts; ··· 411 421 if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) 412 422 vrf_rx_stats(dev, len); 413 423 else 414 - this_cpu_inc(dev->dstats->rx_drps); 424 + this_cpu_inc(dev->dstats->rx_drops); 415 425 416 426 return NETDEV_TX_OK; 417 427 } ··· 606 616 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 607 617 608 618 u64_stats_update_begin(&dstats->syncp); 609 - dstats->tx_pkts++; 619 + dstats->tx_packets++; 610 620 dstats->tx_bytes += len; 611 621 u64_stats_update_end(&dstats->syncp); 612 622 } else { 613 - this_cpu_inc(dev->dstats->tx_drps); 623 + this_cpu_inc(dev->dstats->tx_drops); 614 624 } 615 625 616 626 return ret; ··· 1164 1174 1165 1175 vrf_rtable_release(dev, vrf); 1166 1176 vrf_rt6_release(dev, vrf); 1167 - 1168 - free_percpu(dev->dstats); 1169 - dev->dstats = NULL; 1170 1177 } 1171 1178 1172 1179 static int vrf_dev_init(struct net_device *dev) 1173 1180 { 1174 1181 struct net_vrf *vrf = netdev_priv(dev); 1175 1182 1176 - dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 1177 - if (!dev->dstats) 1178 - goto out_nomem; 1179 - 1180 1183 /* create the default dst which points back to us */ 1181 1184 if (vrf_rtable_create(dev) != 0) 1182 - goto out_stats; 1185 + goto out_nomem; 1183 1186 1184 1187 if (vrf_rt6_create(dev) != 0) 1185 1188 goto out_rth; ··· 1186 1203 1187 1204 out_rth: 1188 1205 vrf_rtable_release(dev, vrf); 1189 - out_stats: 1190 - free_percpu(dev->dstats); 1191 - dev->dstats = NULL; 1192 1206 out_nomem: 1193 1207 return -ENOMEM; 1194 1208 } ··· 1684 1704 dev->min_mtu = IPV6_MIN_MTU; 1685 1705 dev->max_mtu = IP6_MAX_MTU; 1686 1706 dev->mtu = dev->max_mtu; 1707 + 1708 + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; 1687 1709 } 1688 1710 1689 1711 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],

+2 -2

drivers/net/wireguard/device.c

··· 210 210 */ 211 211 while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) { 212 212 dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue)); 213 - ++dev->stats.tx_dropped; 213 + DEV_STATS_INC(dev, tx_dropped); 214 214 } 215 215 skb_queue_splice_tail(&packets, &peer->staged_packet_queue); 216 216 spin_unlock_bh(&peer->staged_packet_queue.lock); ··· 228 228 else if (skb->protocol == htons(ETH_P_IPV6)) 229 229 icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); 230 230 err: 231 - ++dev->stats.tx_errors; 231 + DEV_STATS_INC(dev, tx_errors); 232 232 kfree_skb(skb); 233 233 return ret; 234 234 }

+6 -6

drivers/net/wireguard/receive.c

··· 416 416 net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n", 417 417 dev->name, skb, peer->internal_id, 418 418 &peer->endpoint.addr); 419 - ++dev->stats.rx_errors; 420 - ++dev->stats.rx_frame_errors; 419 + DEV_STATS_INC(dev, rx_errors); 420 + DEV_STATS_INC(dev, rx_frame_errors); 421 421 goto packet_processed; 422 422 dishonest_packet_type: 423 423 net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n", 424 424 dev->name, peer->internal_id, &peer->endpoint.addr); 425 - ++dev->stats.rx_errors; 426 - ++dev->stats.rx_frame_errors; 425 + DEV_STATS_INC(dev, rx_errors); 426 + DEV_STATS_INC(dev, rx_frame_errors); 427 427 goto packet_processed; 428 428 dishonest_packet_size: 429 429 net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n", 430 430 dev->name, peer->internal_id, &peer->endpoint.addr); 431 - ++dev->stats.rx_errors; 432 - ++dev->stats.rx_length_errors; 431 + DEV_STATS_INC(dev, rx_errors); 432 + DEV_STATS_INC(dev, rx_length_errors); 433 433 goto packet_processed; 434 434 packet_processed: 435 435 dev_kfree_skb(skb);

+2 -1

drivers/net/wireguard/send.c

··· 333 333 void wg_packet_purge_staged_packets(struct wg_peer *peer) 334 334 { 335 335 spin_lock_bh(&peer->staged_packet_queue.lock); 336 - peer->device->dev->stats.tx_dropped += peer->staged_packet_queue.qlen; 336 + DEV_STATS_ADD(peer->device->dev, tx_dropped, 337 + peer->staged_packet_queue.qlen); 337 338 __skb_queue_purge(&peer->staged_packet_queue); 338 339 spin_unlock_bh(&peer->staged_packet_queue.lock); 339 340 }

+6 -1

drivers/nfc/virtual_ncidev.c

··· 26 26 struct mutex mtx; 27 27 struct sk_buff *send_buff; 28 28 struct wait_queue_head wq; 29 + bool running; 29 30 }; 30 31 31 32 static int virtual_nci_open(struct nci_dev *ndev) 32 33 { 34 + struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); 35 + 36 + vdev->running = true; 33 37 return 0; 34 38 } 35 39 ··· 44 40 mutex_lock(&vdev->mtx); 45 41 kfree_skb(vdev->send_buff); 46 42 vdev->send_buff = NULL; 43 + vdev->running = false; 47 44 mutex_unlock(&vdev->mtx); 48 45 49 46 return 0; ··· 55 50 struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); 56 51 57 52 mutex_lock(&vdev->mtx); 58 - if (vdev->send_buff) { 53 + if (vdev->send_buff || !vdev->running) { 59 54 mutex_unlock(&vdev->mtx); 60 55 kfree_skb(skb); 61 56 return -1;

+2 -1

drivers/s390/net/Kconfig

··· 103 103 config ISM 104 104 tristate "Support for ISM vPCI Adapter" 105 105 depends on PCI 106 + imply SMC 106 107 default n 107 108 help 108 109 Select this option if you want to use the Internal Shared Memory 109 - vPCI Adapter. 110 + vPCI Adapter. The adapter can be used with the SMC network protocol. 110 111 111 112 To compile as a module choose M. The module name is ism. 112 113 If unsure, choose N.

+46 -47

drivers/s390/net/ism_drv.c

··· 30 30 MODULE_DEVICE_TABLE(pci, ism_device_table); 31 31 32 32 static debug_info_t *ism_debug_info; 33 - static const struct smcd_ops ism_ops; 34 33 35 34 #define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ 36 35 static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ ··· 288 289 return ret; 289 290 } 290 291 291 - static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, 292 - u32 vid) 293 - { 294 - union ism_query_rgid cmd; 295 - 296 - memset(&cmd, 0, sizeof(cmd)); 297 - cmd.request.hdr.cmd = ISM_QUERY_RGID; 298 - cmd.request.hdr.len = sizeof(cmd.request); 299 - 300 - cmd.request.rgid = rgid; 301 - cmd.request.vlan_valid = vid_valid; 302 - cmd.request.vlan_id = vid; 303 - 304 - return ism_cmd(ism, &cmd); 305 - } 306 - 307 292 static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) 308 293 { 309 294 clear_bit(dmb->sba_idx, ism->sba_bitmap); ··· 412 429 return ism_cmd(ism, &cmd); 413 430 } 414 431 415 - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, 416 - u32 event_code, u64 info) 417 - { 418 - union ism_sig_ieq cmd; 419 - 420 - memset(&cmd, 0, sizeof(cmd)); 421 - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; 422 - cmd.request.hdr.len = sizeof(cmd.request); 423 - 424 - cmd.request.rgid = rgid; 425 - cmd.request.trigger_irq = trigger_irq; 426 - cmd.request.event_code = event_code; 427 - cmd.request.info = info; 428 - 429 - return ism_cmd(ism, &cmd); 430 - } 431 - 432 432 static unsigned int max_bytes(unsigned int start, unsigned int len, 433 433 unsigned int boundary) 434 434 { ··· 469 503 } 470 504 EXPORT_SYMBOL_GPL(ism_get_seid); 471 505 472 - static u16 ism_get_chid(struct ism_dev *ism) 473 - { 474 - if (!ism || !ism->pdev) 475 - return 0; 476 - 477 - return to_zpci(ism->pdev)->pchid; 478 - } 479 - 480 506 static void ism_handle_event(struct ism_dev *ism) 481 507 { 482 508 struct ism_event *entry; ··· 525 567 } 526 568 spin_unlock(&ism->lock); 527 569 return IRQ_HANDLED; 528 - } 529 - 530 - static u64 ism_get_local_gid(struct ism_dev *ism) 531 - { 532 - return ism->local_gid; 533 570 } 534 571 535 572 static int ism_dev_init(struct ism_dev *ism) ··· 727 774 /*************************** SMC-D Implementation *****************************/ 728 775 729 776 #if IS_ENABLED(CONFIG_SMC) 777 + static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, 778 + u32 vid) 779 + { 780 + union ism_query_rgid cmd; 781 + 782 + memset(&cmd, 0, sizeof(cmd)); 783 + cmd.request.hdr.cmd = ISM_QUERY_RGID; 784 + cmd.request.hdr.len = sizeof(cmd.request); 785 + 786 + cmd.request.rgid = rgid; 787 + cmd.request.vlan_valid = vid_valid; 788 + cmd.request.vlan_id = vid; 789 + 790 + return ism_cmd(ism, &cmd); 791 + } 792 + 730 793 static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, 731 794 u32 vid) 732 795 { ··· 780 811 return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); 781 812 } 782 813 814 + static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, 815 + u32 event_code, u64 info) 816 + { 817 + union ism_sig_ieq cmd; 818 + 819 + memset(&cmd, 0, sizeof(cmd)); 820 + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; 821 + cmd.request.hdr.len = sizeof(cmd.request); 822 + 823 + cmd.request.rgid = rgid; 824 + cmd.request.trigger_irq = trigger_irq; 825 + cmd.request.event_code = event_code; 826 + cmd.request.info = info; 827 + 828 + return ism_cmd(ism, &cmd); 829 + } 830 + 783 831 static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, 784 832 u32 event_code, u64 info) 785 833 { ··· 816 830 SYSTEM_EID.type[0] != '0'; 817 831 } 818 832 833 + static u64 ism_get_local_gid(struct ism_dev *ism) 834 + { 835 + return ism->local_gid; 836 + } 837 + 819 838 static u64 smcd_get_local_gid(struct smcd_dev *smcd) 820 839 { 821 840 return ism_get_local_gid(smcd->priv); 841 + } 842 + 843 + static u16 ism_get_chid(struct ism_dev *ism) 844 + { 845 + if (!ism || !ism->pdev) 846 + return 0; 847 + 848 + return to_zpci(ism->pdev)->pchid; 822 849 } 823 850 824 851 static u16 smcd_get_chid(struct smcd_dev *smcd)

+16

include/linux/bpf_verifier.h

··· 301 301 struct tnum callback_ret_range; 302 302 bool in_async_callback_fn; 303 303 bool in_exception_callback_fn; 304 + /* For callback calling functions that limit number of possible 305 + * callback executions (e.g. bpf_loop) keeps track of current 306 + * simulated iteration number. 307 + * Value in frame N refers to number of times callback with frame 308 + * N+1 was simulated, e.g. for the following call: 309 + * 310 + * bpf_loop(..., fn, ...); | suppose current frame is N 311 + * | fn would be simulated in frame N+1 312 + * | number of simulations is tracked in frame N 313 + */ 314 + u32 callback_depth; 304 315 305 316 /* The following fields should be last. See copy_func_state() */ 306 317 int acquired_refs; ··· 411 400 struct bpf_idx_pair *jmp_history; 412 401 u32 jmp_history_cnt; 413 402 u32 dfs_depth; 403 + u32 callback_unroll_depth; 414 404 }; 415 405 416 406 #define bpf_get_spilled_reg(slot, frame, mask) \ ··· 523 511 * this instruction, regardless of any heuristics 524 512 */ 525 513 bool force_checkpoint; 514 + /* true if instruction is a call to a helper function that 515 + * accepts callback function as a parameter. 516 + */ 517 + bool calls_callback; 526 518 }; 527 519 528 520 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */

+26 -4

include/linux/netdevice.h

··· 1797 1797 ML_PRIV_CAN, 1798 1798 }; 1799 1799 1800 + enum netdev_stat_type { 1801 + NETDEV_PCPU_STAT_NONE, 1802 + NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ 1803 + NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ 1804 + NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ 1805 + }; 1806 + 1800 1807 /** 1801 1808 * struct net_device - The DEVICE structure. 1802 1809 * ··· 1998 1991 * 1999 1992 * @ml_priv: Mid-layer private 2000 1993 * @ml_priv_type: Mid-layer private type 2001 - * @lstats: Loopback statistics 2002 - * @tstats: Tunnel statistics 2003 - * @dstats: Dummy statistics 2004 - * @vstats: Virtual ethernet statistics 1994 + * 1995 + * @pcpu_stat_type: Type of device statistics which the core should 1996 + * allocate/free: none, lstats, tstats, dstats. none 1997 + * means the driver is handling statistics allocation/ 1998 + * freeing internally. 1999 + * @lstats: Loopback statistics: packets, bytes 2000 + * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes 2001 + * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes 2005 2002 * 2006 2003 * @garp_port: GARP 2007 2004 * @mrp_port: MRP ··· 2365 2354 void *ml_priv; 2366 2355 enum netdev_ml_priv_type ml_priv_type; 2367 2356 2357 + enum netdev_stat_type pcpu_stat_type:8; 2368 2358 union { 2369 2359 struct pcpu_lstats __percpu *lstats; 2370 2360 struct pcpu_sw_netstats __percpu *tstats; ··· 2766 2754 u64_stats_t tx_bytes; 2767 2755 struct u64_stats_sync syncp; 2768 2756 } __aligned(4 * sizeof(u64)); 2757 + 2758 + struct pcpu_dstats { 2759 + u64 rx_packets; 2760 + u64 rx_bytes; 2761 + u64 rx_drops; 2762 + u64 tx_packets; 2763 + u64 tx_bytes; 2764 + u64 tx_drops; 2765 + struct u64_stats_sync syncp; 2766 + } __aligned(8 * sizeof(u64)); 2769 2767 2770 2768 struct pcpu_lstats { 2771 2769 u64_stats_t packets;

+6

include/net/netkit.h

··· 10 10 int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); 11 11 int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); 12 12 int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); 13 + INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); 13 14 #else 14 15 static inline int netkit_prog_attach(const union bpf_attr *attr, 15 16 struct bpf_prog *prog) ··· 34 33 union bpf_attr __user *uattr) 35 34 { 36 35 return -EINVAL; 36 + } 37 + 38 + static inline struct net_device *netkit_peer_dev(struct net_device *dev) 39 + { 40 + return NULL; 37 41 } 38 42 #endif /* CONFIG_NETKIT */ 39 43 #endif /* __NET_NETKIT_H */

+1 -1

include/trace/events/rxrpc.h

··· 328 328 E_(rxrpc_rtt_tx_ping, "PING") 329 329 330 330 #define rxrpc_rtt_rx_traces \ 331 - EM(rxrpc_rtt_rx_cancel, "CNCL") \ 331 + EM(rxrpc_rtt_rx_other_ack, "OACK") \ 332 332 EM(rxrpc_rtt_rx_obsolete, "OBSL") \ 333 333 EM(rxrpc_rtt_rx_lost, "LOST") \ 334 334 EM(rxrpc_rtt_rx_ping_response, "PONG") \

+284 -154

kernel/bpf/verifier.c

··· 547 547 return func_id == BPF_FUNC_dynptr_data; 548 548 } 549 549 550 - static bool is_callback_calling_kfunc(u32 btf_id); 550 + static bool is_sync_callback_calling_kfunc(u32 btf_id); 551 551 static bool is_bpf_throw_kfunc(struct bpf_insn *insn); 552 552 553 - static bool is_callback_calling_function(enum bpf_func_id func_id) 553 + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) 554 554 { 555 555 return func_id == BPF_FUNC_for_each_map_elem || 556 - func_id == BPF_FUNC_timer_set_callback || 557 556 func_id == BPF_FUNC_find_vma || 558 557 func_id == BPF_FUNC_loop || 559 558 func_id == BPF_FUNC_user_ringbuf_drain; ··· 561 562 static bool is_async_callback_calling_function(enum bpf_func_id func_id) 562 563 { 563 564 return func_id == BPF_FUNC_timer_set_callback; 565 + } 566 + 567 + static bool is_callback_calling_function(enum bpf_func_id func_id) 568 + { 569 + return is_sync_callback_calling_function(func_id) || 570 + is_async_callback_calling_function(func_id); 571 + } 572 + 573 + static bool is_sync_callback_calling_insn(struct bpf_insn *insn) 574 + { 575 + return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || 576 + (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); 564 577 } 565 578 566 579 static bool is_storage_get_function(enum bpf_func_id func_id) ··· 1819 1808 dst_state->first_insn_idx = src->first_insn_idx; 1820 1809 dst_state->last_insn_idx = src->last_insn_idx; 1821 1810 dst_state->dfs_depth = src->dfs_depth; 1811 + dst_state->callback_unroll_depth = src->callback_unroll_depth; 1822 1812 dst_state->used_as_loop_entry = src->used_as_loop_entry; 1823 1813 for (i = 0; i <= src->curframe; i++) { 1824 1814 dst = dst_state->frame[i]; ··· 3451 3439 reg->subreg_def = DEF_NOT_SUBREG; 3452 3440 } 3453 3441 3454 - static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 3455 - enum reg_arg_type t) 3442 + static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, 3443 + enum reg_arg_type t) 3456 3444 { 3457 - struct bpf_verifier_state *vstate = env->cur_state; 3458 - struct bpf_func_state *state = vstate->frame[vstate->curframe]; 3459 3445 struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; 3460 - struct bpf_reg_state *reg, *regs = state->regs; 3446 + struct bpf_reg_state *reg; 3461 3447 bool rw64; 3462 3448 3463 3449 if (regno >= MAX_BPF_REG) { ··· 3494 3484 mark_reg_unknown(env, regs, regno); 3495 3485 } 3496 3486 return 0; 3487 + } 3488 + 3489 + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 3490 + enum reg_arg_type t) 3491 + { 3492 + struct bpf_verifier_state *vstate = env->cur_state; 3493 + struct bpf_func_state *state = vstate->frame[vstate->curframe]; 3494 + 3495 + return __check_reg_arg(env, state->regs, regno, t); 3497 3496 } 3498 3497 3499 3498 static void mark_jmp_point(struct bpf_verifier_env *env, int idx) ··· 3743 3724 } 3744 3725 } 3745 3726 3727 + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); 3728 + 3746 3729 /* For given verifier state backtrack_insn() is called from the last insn to 3747 3730 * the first insn. Its purpose is to compute a bitmask of registers and 3748 3731 * stack slots that needs precision in the parent verifier state. ··· 3920 3899 return -EFAULT; 3921 3900 return 0; 3922 3901 } 3923 - } else if ((bpf_helper_call(insn) && 3924 - is_callback_calling_function(insn->imm) && 3925 - !is_async_callback_calling_function(insn->imm)) || 3926 - (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { 3927 - /* callback-calling helper or kfunc call, which means 3928 - * we are exiting from subprog, but unlike the subprog 3929 - * call handling above, we shouldn't propagate 3930 - * precision of r1-r5 (if any requested), as they are 3931 - * not actually arguments passed directly to callback 3932 - * subprogs 3902 + } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { 3903 + /* exit from callback subprog to callback-calling helper or 3904 + * kfunc call. Use idx/subseq_idx check to discern it from 3905 + * straight line code backtracking. 3906 + * Unlike the subprog call handling above, we shouldn't 3907 + * propagate precision of r1-r5 (if any requested), as they are 3908 + * not actually arguments passed directly to callback subprogs 3933 3909 */ 3934 3910 if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { 3935 3911 verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); ··· 3961 3943 } else if (opcode == BPF_EXIT) { 3962 3944 bool r0_precise; 3963 3945 3946 + /* Backtracking to a nested function call, 'idx' is a part of 3947 + * the inner frame 'subseq_idx' is a part of the outer frame. 3948 + * In case of a regular function call, instructions giving 3949 + * precision to registers R1-R5 should have been found already. 3950 + * In case of a callback, it is ok to have R1-R5 marked for 3951 + * backtracking, as these registers are set by the function 3952 + * invoking callback. 3953 + */ 3954 + if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) 3955 + for (i = BPF_REG_1; i <= BPF_REG_5; i++) 3956 + bt_clear_reg(bt, i); 3964 3957 if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { 3965 - /* if backtracing was looking for registers R1-R5 3966 - * they should have been found already. 3967 - */ 3968 3958 verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); 3969 3959 WARN_ONCE(1, "verifier backtracking bug"); 3970 3960 return -EFAULT; ··· 9376 9350 /* after the call registers r0 - r5 were scratched */ 9377 9351 for (i = 0; i < CALLER_SAVED_REGS; i++) { 9378 9352 mark_reg_not_init(env, regs, caller_saved[i]); 9379 - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); 9353 + __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); 9380 9354 } 9381 9355 } 9382 9356 ··· 9389 9363 struct bpf_func_state *caller, 9390 9364 struct bpf_func_state *callee, int insn_idx); 9391 9365 9392 - static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 9393 - int *insn_idx, int subprog, 9394 - set_callee_state_fn set_callee_state_cb) 9366 + static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, 9367 + set_callee_state_fn set_callee_state_cb, 9368 + struct bpf_verifier_state *state) 9395 9369 { 9396 - struct bpf_verifier_state *state = env->cur_state; 9397 9370 struct bpf_func_state *caller, *callee; 9398 9371 int err; 9399 9372 ··· 9402 9377 return -E2BIG; 9403 9378 } 9404 9379 9405 - caller = state->frame[state->curframe]; 9406 9380 if (state->frame[state->curframe + 1]) { 9407 9381 verbose(env, "verifier bug. Frame %d already allocated\n", 9408 9382 state->curframe + 1); 9409 9383 return -EFAULT; 9410 9384 } 9411 9385 9412 - err = btf_check_subprog_call(env, subprog, caller->regs); 9413 - if (err == -EFAULT) 9414 - return err; 9415 - if (subprog_is_global(env, subprog)) { 9416 - if (err) { 9417 - verbose(env, "Caller passes invalid args into func#%d\n", 9418 - subprog); 9419 - return err; 9420 - } else { 9421 - if (env->log.level & BPF_LOG_LEVEL) 9422 - verbose(env, 9423 - "Func#%d is global and valid. Skipping.\n", 9424 - subprog); 9425 - clear_caller_saved_regs(env, caller->regs); 9426 - 9427 - /* All global functions return a 64-bit SCALAR_VALUE */ 9428 - mark_reg_unknown(env, caller->regs, BPF_REG_0); 9429 - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; 9430 - 9431 - /* continue with next insn after call */ 9432 - return 0; 9433 - } 9434 - } 9435 - 9436 - /* set_callee_state is used for direct subprog calls, but we are 9437 - * interested in validating only BPF helpers that can call subprogs as 9438 - * callbacks 9439 - */ 9440 - if (set_callee_state_cb != set_callee_state) { 9441 - env->subprog_info[subprog].is_cb = true; 9442 - if (bpf_pseudo_kfunc_call(insn) && 9443 - !is_callback_calling_kfunc(insn->imm)) { 9444 - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", 9445 - func_id_name(insn->imm), insn->imm); 9446 - return -EFAULT; 9447 - } else if (!bpf_pseudo_kfunc_call(insn) && 9448 - !is_callback_calling_function(insn->imm)) { /* helper */ 9449 - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", 9450 - func_id_name(insn->imm), insn->imm); 9451 - return -EFAULT; 9452 - } 9453 - } 9454 - 9455 - if (insn->code == (BPF_JMP | BPF_CALL) && 9456 - insn->src_reg == 0 && 9457 - insn->imm == BPF_FUNC_timer_set_callback) { 9458 - struct bpf_verifier_state *async_cb; 9459 - 9460 - /* there is no real recursion here. timer callbacks are async */ 9461 - env->subprog_info[subprog].is_async_cb = true; 9462 - async_cb = push_async_cb(env, env->subprog_info[subprog].start, 9463 - *insn_idx, subprog); 9464 - if (!async_cb) 9465 - return -EFAULT; 9466 - callee = async_cb->frame[0]; 9467 - callee->async_entry_cnt = caller->async_entry_cnt + 1; 9468 - 9469 - /* Convert bpf_timer_set_callback() args into timer callback args */ 9470 - err = set_callee_state_cb(env, caller, callee, *insn_idx); 9471 - if (err) 9472 - return err; 9473 - 9474 - clear_caller_saved_regs(env, caller->regs); 9475 - mark_reg_unknown(env, caller->regs, BPF_REG_0); 9476 - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; 9477 - /* continue with next insn after call */ 9478 - return 0; 9479 - } 9480 - 9386 + caller = state->frame[state->curframe]; 9481 9387 callee = kzalloc(sizeof(*callee), GFP_KERNEL); 9482 9388 if (!callee) 9483 9389 return -ENOMEM; ··· 9420 9464 */ 9421 9465 init_func_state(env, callee, 9422 9466 /* remember the callsite, it will be used by bpf_exit */ 9423 - *insn_idx /* callsite */, 9467 + callsite, 9424 9468 state->curframe + 1 /* frameno within this callchain */, 9425 9469 subprog /* subprog number within this prog */); 9426 - 9427 9470 /* Transfer references to the callee */ 9428 9471 err = copy_reference_state(callee, caller); 9472 + err = err ?: set_callee_state_cb(env, caller, callee, callsite); 9429 9473 if (err) 9430 9474 goto err_out; 9431 - 9432 - err = set_callee_state_cb(env, caller, callee, *insn_idx); 9433 - if (err) 9434 - goto err_out; 9435 - 9436 - clear_caller_saved_regs(env, caller->regs); 9437 9475 9438 9476 /* only increment it after check_reg_arg() finished */ 9439 9477 state->curframe++; 9478 + 9479 + return 0; 9480 + 9481 + err_out: 9482 + free_func_state(callee); 9483 + state->frame[state->curframe + 1] = NULL; 9484 + return err; 9485 + } 9486 + 9487 + static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 9488 + int insn_idx, int subprog, 9489 + set_callee_state_fn set_callee_state_cb) 9490 + { 9491 + struct bpf_verifier_state *state = env->cur_state, *callback_state; 9492 + struct bpf_func_state *caller, *callee; 9493 + int err; 9494 + 9495 + caller = state->frame[state->curframe]; 9496 + err = btf_check_subprog_call(env, subprog, caller->regs); 9497 + if (err == -EFAULT) 9498 + return err; 9499 + 9500 + /* set_callee_state is used for direct subprog calls, but we are 9501 + * interested in validating only BPF helpers that can call subprogs as 9502 + * callbacks 9503 + */ 9504 + env->subprog_info[subprog].is_cb = true; 9505 + if (bpf_pseudo_kfunc_call(insn) && 9506 + !is_sync_callback_calling_kfunc(insn->imm)) { 9507 + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", 9508 + func_id_name(insn->imm), insn->imm); 9509 + return -EFAULT; 9510 + } else if (!bpf_pseudo_kfunc_call(insn) && 9511 + !is_callback_calling_function(insn->imm)) { /* helper */ 9512 + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", 9513 + func_id_name(insn->imm), insn->imm); 9514 + return -EFAULT; 9515 + } 9516 + 9517 + if (insn->code == (BPF_JMP | BPF_CALL) && 9518 + insn->src_reg == 0 && 9519 + insn->imm == BPF_FUNC_timer_set_callback) { 9520 + struct bpf_verifier_state *async_cb; 9521 + 9522 + /* there is no real recursion here. timer callbacks are async */ 9523 + env->subprog_info[subprog].is_async_cb = true; 9524 + async_cb = push_async_cb(env, env->subprog_info[subprog].start, 9525 + insn_idx, subprog); 9526 + if (!async_cb) 9527 + return -EFAULT; 9528 + callee = async_cb->frame[0]; 9529 + callee->async_entry_cnt = caller->async_entry_cnt + 1; 9530 + 9531 + /* Convert bpf_timer_set_callback() args into timer callback args */ 9532 + err = set_callee_state_cb(env, caller, callee, insn_idx); 9533 + if (err) 9534 + return err; 9535 + 9536 + return 0; 9537 + } 9538 + 9539 + /* for callback functions enqueue entry to callback and 9540 + * proceed with next instruction within current frame. 9541 + */ 9542 + callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); 9543 + if (!callback_state) 9544 + return -ENOMEM; 9545 + 9546 + err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, 9547 + callback_state); 9548 + if (err) 9549 + return err; 9550 + 9551 + callback_state->callback_unroll_depth++; 9552 + callback_state->frame[callback_state->curframe - 1]->callback_depth++; 9553 + caller->callback_depth = 0; 9554 + return 0; 9555 + } 9556 + 9557 + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 9558 + int *insn_idx) 9559 + { 9560 + struct bpf_verifier_state *state = env->cur_state; 9561 + struct bpf_func_state *caller; 9562 + int err, subprog, target_insn; 9563 + 9564 + target_insn = *insn_idx + insn->imm + 1; 9565 + subprog = find_subprog(env, target_insn); 9566 + if (subprog < 0) { 9567 + verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); 9568 + return -EFAULT; 9569 + } 9570 + 9571 + caller = state->frame[state->curframe]; 9572 + err = btf_check_subprog_call(env, subprog, caller->regs); 9573 + if (err == -EFAULT) 9574 + return err; 9575 + if (subprog_is_global(env, subprog)) { 9576 + if (err) { 9577 + verbose(env, "Caller passes invalid args into func#%d\n", subprog); 9578 + return err; 9579 + } 9580 + 9581 + if (env->log.level & BPF_LOG_LEVEL) 9582 + verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); 9583 + clear_caller_saved_regs(env, caller->regs); 9584 + 9585 + /* All global functions return a 64-bit SCALAR_VALUE */ 9586 + mark_reg_unknown(env, caller->regs, BPF_REG_0); 9587 + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; 9588 + 9589 + /* continue with next insn after call */ 9590 + return 0; 9591 + } 9592 + 9593 + /* for regular function entry setup new frame and continue 9594 + * from that frame. 9595 + */ 9596 + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); 9597 + if (err) 9598 + return err; 9599 + 9600 + clear_caller_saved_regs(env, caller->regs); 9440 9601 9441 9602 /* and go analyze first insn of the callee */ 9442 9603 *insn_idx = env->subprog_info[subprog].start - 1; ··· 9562 9489 verbose(env, "caller:\n"); 9563 9490 print_verifier_state(env, caller, true); 9564 9491 verbose(env, "callee:\n"); 9565 - print_verifier_state(env, callee, true); 9492 + print_verifier_state(env, state->frame[state->curframe], true); 9566 9493 } 9567 - return 0; 9568 9494 9569 - err_out: 9570 - free_func_state(callee); 9571 - state->frame[state->curframe + 1] = NULL; 9572 - return err; 9495 + return 0; 9573 9496 } 9574 9497 9575 9498 int map_set_for_each_callback_args(struct bpf_verifier_env *env, ··· 9607 9538 for (i = BPF_REG_1; i <= BPF_REG_5; i++) 9608 9539 callee->regs[i] = caller->regs[i]; 9609 9540 return 0; 9610 - } 9611 - 9612 - static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 9613 - int *insn_idx) 9614 - { 9615 - int subprog, target_insn; 9616 - 9617 - target_insn = *insn_idx + insn->imm + 1; 9618 - subprog = find_subprog(env, target_insn); 9619 - if (subprog < 0) { 9620 - verbose(env, "verifier bug. No program starts at insn %d\n", 9621 - target_insn); 9622 - return -EFAULT; 9623 - } 9624 - 9625 - return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); 9626 9541 } 9627 9542 9628 9543 static int set_map_elem_callback_state(struct bpf_verifier_env *env, ··· 9801 9748 9802 9749 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) 9803 9750 { 9804 - struct bpf_verifier_state *state = env->cur_state; 9751 + struct bpf_verifier_state *state = env->cur_state, *prev_st; 9805 9752 struct bpf_func_state *caller, *callee; 9806 9753 struct bpf_reg_state *r0; 9754 + bool in_callback_fn; 9807 9755 int err; 9808 9756 9809 9757 callee = state->frame[state->curframe]; ··· 9833 9779 verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); 9834 9780 return -EINVAL; 9835 9781 } 9782 + if (!calls_callback(env, callee->callsite)) { 9783 + verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", 9784 + *insn_idx, callee->callsite); 9785 + return -EFAULT; 9786 + } 9836 9787 } else { 9837 9788 /* return to the caller whatever r0 had in the callee */ 9838 9789 caller->regs[BPF_REG_0] = *r0; ··· 9855 9796 return err; 9856 9797 } 9857 9798 9858 - *insn_idx = callee->callsite + 1; 9799 + /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, 9800 + * there function call logic would reschedule callback visit. If iteration 9801 + * converges is_state_visited() would prune that visit eventually. 9802 + */ 9803 + in_callback_fn = callee->in_callback_fn; 9804 + if (in_callback_fn) 9805 + *insn_idx = callee->callsite; 9806 + else 9807 + *insn_idx = callee->callsite + 1; 9808 + 9859 9809 if (env->log.level & BPF_LOG_LEVEL) { 9860 9810 verbose(env, "returning from callee:\n"); 9861 9811 print_verifier_state(env, callee, true); ··· 9875 9807 * bpf_throw, this will be done by copy_verifier_state for extra frames. */ 9876 9808 free_func_state(callee); 9877 9809 state->frame[state->curframe--] = NULL; 9810 + 9811 + /* for callbacks widen imprecise scalars to make programs like below verify: 9812 + * 9813 + * struct ctx { int i; } 9814 + * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } 9815 + * ... 9816 + * struct ctx = { .i = 0; } 9817 + * bpf_loop(100, cb, &ctx, 0); 9818 + * 9819 + * This is similar to what is done in process_iter_next_call() for open 9820 + * coded iterators. 9821 + */ 9822 + prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; 9823 + if (prev_st) { 9824 + err = widen_imprecise_scalars(env, prev_st, state); 9825 + if (err) 9826 + return err; 9827 + } 9878 9828 return 0; 9879 9829 } 9880 9830 ··· 10295 10209 } 10296 10210 break; 10297 10211 case BPF_FUNC_for_each_map_elem: 10298 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 10299 - set_map_elem_callback_state); 10212 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 10213 + set_map_elem_callback_state); 10300 10214 break; 10301 10215 case BPF_FUNC_timer_set_callback: 10302 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 10303 - set_timer_callback_state); 10216 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 10217 + set_timer_callback_state); 10304 10218 break; 10305 10219 case BPF_FUNC_find_vma: 10306 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 10307 - set_find_vma_callback_state); 10220 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 10221 + set_find_vma_callback_state); 10308 10222 break; 10309 10223 case BPF_FUNC_snprintf: 10310 10224 err = check_bpf_snprintf_call(env, regs); 10311 10225 break; 10312 10226 case BPF_FUNC_loop: 10313 10227 update_loop_inline_state(env, meta.subprogno); 10314 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 10315 - set_loop_callback_state); 10228 + /* Verifier relies on R1 value to determine if bpf_loop() iteration 10229 + * is finished, thus mark it precise. 10230 + */ 10231 + err = mark_chain_precision(env, BPF_REG_1); 10232 + if (err) 10233 + return err; 10234 + if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { 10235 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 10236 + set_loop_callback_state); 10237 + } else { 10238 + cur_func(env)->callback_depth = 0; 10239 + if (env->log.level & BPF_LOG_LEVEL2) 10240 + verbose(env, "frame%d bpf_loop iteration limit reached\n", 10241 + env->cur_state->curframe); 10242 + } 10316 10243 break; 10317 10244 case BPF_FUNC_dynptr_from_mem: 10318 10245 if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { ··· 10421 10322 break; 10422 10323 } 10423 10324 case BPF_FUNC_user_ringbuf_drain: 10424 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 10425 - set_user_ringbuf_callback_state); 10325 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 10326 + set_user_ringbuf_callback_state); 10426 10327 break; 10427 10328 } 10428 10329 ··· 11310 11211 btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; 11311 11212 } 11312 11213 11313 - static bool is_callback_calling_kfunc(u32 btf_id) 11214 + static bool is_sync_callback_calling_kfunc(u32 btf_id) 11314 11215 { 11315 11216 return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; 11316 11217 } ··· 12062 11963 return -EACCES; 12063 11964 } 12064 11965 11966 + /* Check the arguments */ 11967 + err = check_kfunc_args(env, &meta, insn_idx); 11968 + if (err < 0) 11969 + return err; 11970 + 11971 + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { 11972 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 11973 + set_rbtree_add_callback_state); 11974 + if (err) { 11975 + verbose(env, "kfunc %s#%d failed callback verification\n", 11976 + func_name, meta.func_id); 11977 + return err; 11978 + } 11979 + } 11980 + 12065 11981 rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); 12066 11982 rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); 12067 11983 ··· 12112 11998 return -EINVAL; 12113 11999 } 12114 12000 12115 - /* Check the arguments */ 12116 - err = check_kfunc_args(env, &meta, insn_idx); 12117 - if (err < 0) 12118 - return err; 12119 12001 /* In case of release function, we get register number of refcounted 12120 12002 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 12121 12003 */ ··· 12140 12030 err = release_reference(env, release_ref_obj_id); 12141 12031 if (err) { 12142 12032 verbose(env, "kfunc %s#%d reference has not been acquired before\n", 12143 - func_name, meta.func_id); 12144 - return err; 12145 - } 12146 - } 12147 - 12148 - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { 12149 - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 12150 - set_rbtree_add_callback_state); 12151 - if (err) { 12152 - verbose(env, "kfunc %s#%d failed callback verification\n", 12153 12033 func_name, meta.func_id); 12154 12034 return err; 12155 12035 } ··· 15508 15408 return env->insn_aux_data[insn_idx].force_checkpoint; 15509 15409 } 15510 15410 15411 + static void mark_calls_callback(struct bpf_verifier_env *env, int idx) 15412 + { 15413 + env->insn_aux_data[idx].calls_callback = true; 15414 + } 15415 + 15416 + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) 15417 + { 15418 + return env->insn_aux_data[insn_idx].calls_callback; 15419 + } 15511 15420 15512 15421 enum { 15513 15422 DONE_EXPLORING = 0, ··· 15630 15521 * async state will be pushed for further exploration. 15631 15522 */ 15632 15523 mark_prune_point(env, t); 15524 + /* For functions that invoke callbacks it is not known how many times 15525 + * callback would be called. Verifier models callback calling functions 15526 + * by repeatedly visiting callback bodies and returning to origin call 15527 + * instruction. 15528 + * In order to stop such iteration verifier needs to identify when a 15529 + * state identical some state from a previous iteration is reached. 15530 + * Check below forces creation of checkpoint before callback calling 15531 + * instruction to allow search for such identical states. 15532 + */ 15533 + if (is_sync_callback_calling_insn(insn)) { 15534 + mark_calls_callback(env, t); 15535 + mark_force_checkpoint(env, t); 15536 + mark_prune_point(env, t); 15537 + mark_jmp_point(env, t); 15538 + } 15633 15539 if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { 15634 15540 struct bpf_kfunc_call_arg_meta meta; 15635 15541 ··· 17114 16990 } 17115 16991 goto skip_inf_loop_check; 17116 16992 } 16993 + if (calls_callback(env, insn_idx)) { 16994 + if (states_equal(env, &sl->state, cur, true)) 16995 + goto hit; 16996 + goto skip_inf_loop_check; 16997 + } 17117 16998 /* attempt to detect infinite loop to avoid unnecessary doomed work */ 17118 16999 if (states_maybe_looping(&sl->state, cur) && 17119 17000 states_equal(env, &sl->state, cur, false) && 17120 - !iter_active_depths_differ(&sl->state, cur)) { 17001 + !iter_active_depths_differ(&sl->state, cur) && 17002 + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { 17121 17003 verbose_linfo(env, insn_idx, "; "); 17122 17004 verbose(env, "infinite loop detected at insn %d\n", insn_idx); 17123 17005 verbose(env, "cur state:");

+56 -1

net/core/dev.c

··· 10051 10051 } 10052 10052 EXPORT_SYMBOL(netif_tx_stop_all_queues); 10053 10053 10054 + static int netdev_do_alloc_pcpu_stats(struct net_device *dev) 10055 + { 10056 + void __percpu *v; 10057 + 10058 + /* Drivers implementing ndo_get_peer_dev must support tstat 10059 + * accounting, so that skb_do_redirect() can bump the dev's 10060 + * RX stats upon network namespace switch. 10061 + */ 10062 + if (dev->netdev_ops->ndo_get_peer_dev && 10063 + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) 10064 + return -EOPNOTSUPP; 10065 + 10066 + switch (dev->pcpu_stat_type) { 10067 + case NETDEV_PCPU_STAT_NONE: 10068 + return 0; 10069 + case NETDEV_PCPU_STAT_LSTATS: 10070 + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); 10071 + break; 10072 + case NETDEV_PCPU_STAT_TSTATS: 10073 + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 10074 + break; 10075 + case NETDEV_PCPU_STAT_DSTATS: 10076 + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 10077 + break; 10078 + default: 10079 + return -EINVAL; 10080 + } 10081 + 10082 + return v ? 0 : -ENOMEM; 10083 + } 10084 + 10085 + static void netdev_do_free_pcpu_stats(struct net_device *dev) 10086 + { 10087 + switch (dev->pcpu_stat_type) { 10088 + case NETDEV_PCPU_STAT_NONE: 10089 + return; 10090 + case NETDEV_PCPU_STAT_LSTATS: 10091 + free_percpu(dev->lstats); 10092 + break; 10093 + case NETDEV_PCPU_STAT_TSTATS: 10094 + free_percpu(dev->tstats); 10095 + break; 10096 + case NETDEV_PCPU_STAT_DSTATS: 10097 + free_percpu(dev->dstats); 10098 + break; 10099 + } 10100 + } 10101 + 10054 10102 /** 10055 10103 * register_netdevice() - register a network device 10056 10104 * @dev: device to register ··· 10159 10111 goto err_uninit; 10160 10112 } 10161 10113 10114 + ret = netdev_do_alloc_pcpu_stats(dev); 10115 + if (ret) 10116 + goto err_uninit; 10117 + 10162 10118 ret = dev_index_reserve(net, dev->ifindex); 10163 10119 if (ret < 0) 10164 - goto err_uninit; 10120 + goto err_free_pcpu; 10165 10121 dev->ifindex = ret; 10166 10122 10167 10123 /* Transfer changeable features to wanted_features and enable ··· 10271 10219 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); 10272 10220 err_ifindex_release: 10273 10221 dev_index_release(net, dev->ifindex); 10222 + err_free_pcpu: 10223 + netdev_do_free_pcpu_stats(dev); 10274 10224 err_uninit: 10275 10225 if (dev->netdev_ops->ndo_uninit) 10276 10226 dev->netdev_ops->ndo_uninit(dev); ··· 10525 10471 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 10526 10472 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 10527 10473 10474 + netdev_do_free_pcpu_stats(dev); 10528 10475 if (dev->priv_destructor) 10529 10476 dev->priv_destructor(dev); 10530 10477 if (dev->needs_free_netdev)

+14 -5

net/core/filter.c

··· 81 81 #include <net/xdp.h> 82 82 #include <net/mptcp.h> 83 83 #include <net/netfilter/nf_conntrack_bpf.h> 84 + #include <net/netkit.h> 84 85 #include <linux/un.h> 85 86 86 87 #include "dev.h" ··· 2469 2468 DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); 2470 2469 EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); 2471 2470 2471 + static struct net_device *skb_get_peer_dev(struct net_device *dev) 2472 + { 2473 + const struct net_device_ops *ops = dev->netdev_ops; 2474 + 2475 + if (likely(ops->ndo_get_peer_dev)) 2476 + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, 2477 + netkit_peer_dev, dev); 2478 + return NULL; 2479 + } 2480 + 2472 2481 int skb_do_redirect(struct sk_buff *skb) 2473 2482 { 2474 2483 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ··· 2492 2481 if (unlikely(!dev)) 2493 2482 goto out_drop; 2494 2483 if (flags & BPF_F_PEER) { 2495 - const struct net_device_ops *ops = dev->netdev_ops; 2496 - 2497 - if (unlikely(!ops->ndo_get_peer_dev || 2498 - !skb_at_tc_ingress(skb))) 2484 + if (unlikely(!skb_at_tc_ingress(skb))) 2499 2485 goto out_drop; 2500 - dev = ops->ndo_get_peer_dev(dev); 2486 + dev = skb_get_peer_dev(dev); 2501 2487 if (unlikely(!dev || 2502 2488 !(dev->flags & IFF_UP) || 2503 2489 net_eq(net, dev_net(dev)))) 2504 2490 goto out_drop; 2505 2491 skb->dev = dev; 2492 + dev_sw_netstats_rx_add(dev, skb->len); 2506 2493 return -EAGAIN; 2507 2494 } 2508 2495 return flags & BPF_F_NEIGH ?

+1

net/ipv4/inet_diag.c

··· 1481 1481 module_init(inet_diag_init); 1482 1482 module_exit(inet_diag_exit); 1483 1483 MODULE_LICENSE("GPL"); 1484 + MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); 1484 1485 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); 1485 1486 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);

+1

net/ipv4/raw_diag.c

··· 257 257 module_init(raw_diag_init); 258 258 module_exit(raw_diag_exit); 259 259 MODULE_LICENSE("GPL"); 260 + MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); 260 261 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); 261 262 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);

+1 -1

net/ipv4/route.c

··· 780 780 goto reject_redirect; 781 781 } 782 782 783 - n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); 783 + n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); 784 784 if (!n) 785 785 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); 786 786 if (!IS_ERR(n)) {

+1

net/ipv4/tcp_diag.c

··· 247 247 module_init(tcp_diag_init); 248 248 module_exit(tcp_diag_exit); 249 249 MODULE_LICENSE("GPL"); 250 + MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG"); 250 251 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);

+1

net/ipv4/udp_diag.c

··· 296 296 module_init(udp_diag_init); 297 297 module_exit(udp_diag_exit); 298 298 MODULE_LICENSE("GPL"); 299 + MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); 299 300 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); 300 301 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);

+1

net/mptcp/mptcp_diag.c

··· 245 245 module_init(mptcp_diag_init); 246 246 module_exit(mptcp_diag_exit); 247 247 MODULE_LICENSE("GPL"); 248 + MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG"); 248 249 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);

+1

net/packet/diag.c

··· 262 262 module_init(packet_diag_init); 263 263 module_exit(packet_diag_exit); 264 264 MODULE_LICENSE("GPL"); 265 + MODULE_DESCRIPTION("PACKET socket monitoring via SOCK_DIAG"); 265 266 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);

+4 -3

net/rxrpc/conn_client.c

··· 73 73 static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, 74 74 gfp_t gfp) 75 75 { 76 + static atomic_t rxrpc_bundle_id; 76 77 struct rxrpc_bundle *bundle; 77 78 78 79 bundle = kzalloc(sizeof(*bundle), gfp); ··· 86 85 bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); 87 86 bundle->service_id = call->dest_srx.srx_service; 88 87 bundle->security_level = call->security_level; 88 + bundle->debug_id = atomic_inc_return(&rxrpc_bundle_id); 89 89 refcount_set(&bundle->ref, 1); 90 90 atomic_set(&bundle->active, 1); 91 91 INIT_LIST_HEAD(&bundle->waiting_calls); ··· 107 105 108 106 static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) 109 107 { 110 - trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); 108 + trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), 109 + rxrpc_bundle_free); 111 110 rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); 112 111 key_put(bundle->key); 113 112 kfree(bundle); ··· 242 239 */ 243 240 int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) 244 241 { 245 - static atomic_t rxrpc_bundle_id; 246 242 struct rxrpc_bundle *bundle, *candidate; 247 243 struct rxrpc_local *local = call->local; 248 244 struct rb_node *p, **pp, *parent; ··· 308 306 } 309 307 310 308 _debug("new bundle"); 311 - candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); 312 309 rb_link_node(&candidate->local_node, parent, pp); 313 310 rb_insert_color(&candidate->local_node, &local->client_bundles); 314 311 call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call);

+29 -32

net/rxrpc/input.c

··· 643 643 clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); 644 644 smp_mb(); /* Read data before setting avail bit */ 645 645 set_bit(i, &call->rtt_avail); 646 - if (type != rxrpc_rtt_rx_cancel) 647 - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, 648 - sent_at, resp_time); 649 - else 650 - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_cancel, i, 651 - orig_serial, acked_serial, 0, 0); 646 + rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, 647 + sent_at, resp_time); 652 648 matched = true; 653 649 } 654 650 ··· 797 801 summary.ack_reason, nr_acks); 798 802 rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); 799 803 800 - switch (ack.reason) { 801 - case RXRPC_ACK_PING_RESPONSE: 802 - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, 803 - rxrpc_rtt_rx_ping_response); 804 - break; 805 - case RXRPC_ACK_REQUESTED: 806 - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, 807 - rxrpc_rtt_rx_requested_ack); 808 - break; 809 - default: 810 - if (acked_serial != 0) 804 + if (acked_serial != 0) { 805 + switch (ack.reason) { 806 + case RXRPC_ACK_PING_RESPONSE: 811 807 rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, 812 - rxrpc_rtt_rx_cancel); 813 - break; 814 - } 815 - 816 - if (ack.reason == RXRPC_ACK_PING) { 817 - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, 818 - rxrpc_propose_ack_respond_to_ping); 819 - } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { 820 - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, 821 - rxrpc_propose_ack_respond_to_ack); 808 + rxrpc_rtt_rx_ping_response); 809 + break; 810 + case RXRPC_ACK_REQUESTED: 811 + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, 812 + rxrpc_rtt_rx_requested_ack); 813 + break; 814 + default: 815 + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, 816 + rxrpc_rtt_rx_other_ack); 817 + break; 818 + } 822 819 } 823 820 824 821 /* If we get an EXCEEDS_WINDOW ACK from the server, it probably ··· 824 835 rxrpc_is_client_call(call)) { 825 836 rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 826 837 0, -ENETRESET); 827 - return; 838 + goto send_response; 828 839 } 829 840 830 841 /* If we get an OUT_OF_SEQUENCE ACK from the server, that can also ··· 838 849 rxrpc_is_client_call(call)) { 839 850 rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 840 851 0, -ENETRESET); 841 - return; 852 + goto send_response; 842 853 } 843 854 844 855 /* Discard any out-of-order or duplicate ACKs (outside lock). */ ··· 846 857 trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, 847 858 first_soft_ack, call->acks_first_seq, 848 859 prev_pkt, call->acks_prev_seq); 849 - return; 860 + goto send_response; 850 861 } 851 862 852 863 info.rxMTU = 0; ··· 886 897 case RXRPC_CALL_SERVER_AWAIT_ACK: 887 898 break; 888 899 default: 889 - return; 900 + goto send_response; 890 901 } 891 902 892 903 if (before(hard_ack, call->acks_hard_ack) || ··· 898 909 if (after(hard_ack, call->acks_hard_ack)) { 899 910 if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { 900 911 rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); 901 - return; 912 + goto send_response; 902 913 } 903 914 } 904 915 ··· 916 927 rxrpc_propose_ack_ping_for_lost_reply); 917 928 918 929 rxrpc_congestion_management(call, skb, &summary, acked_serial); 930 + 931 + send_response: 932 + if (ack.reason == RXRPC_ACK_PING) 933 + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, 934 + rxrpc_propose_ack_respond_to_ping); 935 + else if (sp->hdr.flags & RXRPC_REQUEST_ACK) 936 + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, 937 + rxrpc_propose_ack_respond_to_ack); 919 938 } 920 939 921 940 /*

+1

net/sctp/diag.c

··· 527 527 module_init(sctp_diag_init); 528 528 module_exit(sctp_diag_exit); 529 529 MODULE_LICENSE("GPL"); 530 + MODULE_DESCRIPTION("SCTP socket monitoring via SOCK_DIAG"); 530 531 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);

+6 -2

net/smc/af_smc.c

··· 598 598 struct smc_llc_qentry *qentry; 599 599 int rc; 600 600 601 - /* receive CONFIRM LINK request from server over RoCE fabric */ 602 - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, 601 + /* Receive CONFIRM LINK request from server over RoCE fabric. 602 + * Increasing the client's timeout by twice as much as the server's 603 + * timeout by default can temporarily avoid decline messages of 604 + * both sides crossing or colliding 605 + */ 606 + qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, 603 607 SMC_LLC_CONFIRM_LINK); 604 608 if (!qentry) { 605 609 struct smc_clc_msg_decline dclc;

+1

net/smc/smc_diag.c

··· 268 268 module_init(smc_diag_init); 269 269 module_exit(smc_diag_exit); 270 270 MODULE_LICENSE("GPL"); 271 + MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); 271 272 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); 272 273 MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);

+1

net/tipc/diag.c

··· 113 113 module_exit(tipc_diag_exit); 114 114 115 115 MODULE_LICENSE("Dual BSD/GPL"); 116 + MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); 116 117 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC);

+3

net/tls/tls_sw.c

··· 1232 1232 lock_sock(sk); 1233 1233 1234 1234 retry: 1235 + /* same checks as in tls_sw_push_pending_record() */ 1235 1236 rec = ctx->open_rec; 1236 1237 if (!rec) 1237 1238 goto unlock; 1238 1239 1239 1240 msg_pl = &rec->msg_plaintext; 1241 + if (msg_pl->sg.size == 0) 1242 + goto unlock; 1240 1243 1241 1244 /* Check the BPF advisor and perform transmission. */ 1242 1245 ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA,

+1

net/unix/diag.c

··· 339 339 module_init(unix_diag_init); 340 340 module_exit(unix_diag_exit); 341 341 MODULE_LICENSE("GPL"); 342 + MODULE_DESCRIPTION("UNIX socket monitoring via SOCK_DIAG"); 342 343 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */);

+1

net/vmw_vsock/diag.c

··· 174 174 module_init(vsock_diag_init); 175 175 module_exit(vsock_diag_exit); 176 176 MODULE_LICENSE("GPL"); 177 + MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); 177 178 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 178 179 40 /* AF_VSOCK */);

+1

net/xdp/xsk_diag.c

··· 211 211 module_init(xsk_diag_init); 212 212 module_exit(xsk_diag_exit); 213 213 MODULE_LICENSE("GPL"); 214 + MODULE_DESCRIPTION("XDP socket monitoring via SOCK_DIAG"); 214 215 MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP);

+1 -1

tools/net/ynl/Makefile.deps

··· 18 18 CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) 19 19 CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) 20 20 CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) 21 - CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_H,nfsd.h) 21 + CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h)

+1 -1

tools/net/ynl/generated/devlink-user.c

··· 15 15 /* Enums */ 16 16 static const char * const devlink_op_strmap[] = { 17 17 [3] = "get", 18 - [7] = "port-get", 18 + // skip "port-get", duplicate reply value 19 19 [DEVLINK_CMD_PORT_NEW] = "port-new", 20 20 [13] = "sb-get", 21 21 [17] = "sb-pool-get",

+6

tools/net/ynl/ynl-gen-c.py

··· 1505 1505 cw.block_start(line=f"static const char * const {map_name}[] =") 1506 1506 for op_name, op in family.msgs.items(): 1507 1507 if op.rsp_value: 1508 + # Make sure we don't add duplicated entries, if multiple commands 1509 + # produce the same response in legacy families. 1510 + if family.rsp_by_value[op.rsp_value] != op: 1511 + cw.p(f'// skip "{op_name}", duplicate reply value') 1512 + continue 1513 + 1508 1514 if op.req_value == op.rsp_value: 1509 1515 cw.p(f'[{op.enum_name}] = "{op_name}",') 1510 1516 else:

+189 -126

tools/testing/selftests/bpf/prog_tests/tc_redirect.c

··· 24 24 25 25 #include "test_progs.h" 26 26 #include "network_helpers.h" 27 + #include "netlink_helpers.h" 27 28 #include "test_tc_neigh_fib.skel.h" 28 29 #include "test_tc_neigh.skel.h" 29 30 #include "test_tc_peer.skel.h" ··· 111 110 } 112 111 } 113 112 113 + enum dev_mode { 114 + MODE_VETH, 115 + MODE_NETKIT, 116 + }; 117 + 114 118 struct netns_setup_result { 115 - int ifindex_veth_src; 116 - int ifindex_veth_src_fwd; 117 - int ifindex_veth_dst; 118 - int ifindex_veth_dst_fwd; 119 + enum dev_mode dev_mode; 120 + int ifindex_src; 121 + int ifindex_src_fwd; 122 + int ifindex_dst; 123 + int ifindex_dst_fwd; 119 124 }; 120 125 121 126 static int get_ifaddr(const char *name, char *ifaddr) ··· 144 137 return 0; 145 138 } 146 139 140 + static int create_netkit(int mode, char *prim, char *peer) 141 + { 142 + struct rtattr *linkinfo, *data, *peer_info; 143 + struct rtnl_handle rth = { .fd = -1 }; 144 + const char *type = "netkit"; 145 + struct { 146 + struct nlmsghdr n; 147 + struct ifinfomsg i; 148 + char buf[1024]; 149 + } req = {}; 150 + int err; 151 + 152 + err = rtnl_open(&rth, 0); 153 + if (!ASSERT_OK(err, "open_rtnetlink")) 154 + return err; 155 + 156 + memset(&req, 0, sizeof(req)); 157 + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); 158 + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; 159 + req.n.nlmsg_type = RTM_NEWLINK; 160 + req.i.ifi_family = AF_UNSPEC; 161 + 162 + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim)); 163 + linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO); 164 + addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type)); 165 + data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA); 166 + addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); 167 + peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO); 168 + req.n.nlmsg_len += sizeof(struct ifinfomsg); 169 + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer)); 170 + addattr_nest_end(&req.n, peer_info); 171 + addattr_nest_end(&req.n, data); 172 + addattr_nest_end(&req.n, linkinfo); 173 + 174 + err = rtnl_talk(&rth, &req.n, NULL); 175 + ASSERT_OK(err, "talk_rtnetlink"); 176 + rtnl_close(&rth); 177 + return err; 178 + } 179 + 147 180 static int netns_setup_links_and_routes(struct netns_setup_result *result) 148 181 { 149 182 struct nstoken *nstoken = NULL; 150 - char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {}; 183 + char src_fwd_addr[IFADDR_STR_LEN+1] = {}; 184 + int err; 151 185 152 - SYS(fail, "ip link add veth_src type veth peer name veth_src_fwd"); 153 - SYS(fail, "ip link add veth_dst type veth peer name veth_dst_fwd"); 186 + if (result->dev_mode == MODE_VETH) { 187 + SYS(fail, "ip link add src type veth peer name src_fwd"); 188 + SYS(fail, "ip link add dst type veth peer name dst_fwd"); 154 189 155 - SYS(fail, "ip link set veth_dst_fwd address " MAC_DST_FWD); 156 - SYS(fail, "ip link set veth_dst address " MAC_DST); 190 + SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD); 191 + SYS(fail, "ip link set dst address " MAC_DST); 192 + } else if (result->dev_mode == MODE_NETKIT) { 193 + err = create_netkit(NETKIT_L3, "src", "src_fwd"); 194 + if (!ASSERT_OK(err, "create_ifindex_src")) 195 + goto fail; 196 + err = create_netkit(NETKIT_L3, "dst", "dst_fwd"); 197 + if (!ASSERT_OK(err, "create_ifindex_dst")) 198 + goto fail; 199 + } 157 200 158 - if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) 201 + if (get_ifaddr("src_fwd", src_fwd_addr)) 159 202 goto fail; 160 203 161 - result->ifindex_veth_src = if_nametoindex("veth_src"); 162 - if (!ASSERT_GT(result->ifindex_veth_src, 0, "ifindex_veth_src")) 204 + result->ifindex_src = if_nametoindex("src"); 205 + if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src")) 163 206 goto fail; 164 207 165 - result->ifindex_veth_src_fwd = if_nametoindex("veth_src_fwd"); 166 - if (!ASSERT_GT(result->ifindex_veth_src_fwd, 0, "ifindex_veth_src_fwd")) 208 + result->ifindex_src_fwd = if_nametoindex("src_fwd"); 209 + if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd")) 167 210 goto fail; 168 211 169 - result->ifindex_veth_dst = if_nametoindex("veth_dst"); 170 - if (!ASSERT_GT(result->ifindex_veth_dst, 0, "ifindex_veth_dst")) 212 + result->ifindex_dst = if_nametoindex("dst"); 213 + if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst")) 171 214 goto fail; 172 215 173 - result->ifindex_veth_dst_fwd = if_nametoindex("veth_dst_fwd"); 174 - if (!ASSERT_GT(result->ifindex_veth_dst_fwd, 0, "ifindex_veth_dst_fwd")) 216 + result->ifindex_dst_fwd = if_nametoindex("dst_fwd"); 217 + if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd")) 175 218 goto fail; 176 219 177 - SYS(fail, "ip link set veth_src netns " NS_SRC); 178 - SYS(fail, "ip link set veth_src_fwd netns " NS_FWD); 179 - SYS(fail, "ip link set veth_dst_fwd netns " NS_FWD); 180 - SYS(fail, "ip link set veth_dst netns " NS_DST); 220 + SYS(fail, "ip link set src netns " NS_SRC); 221 + SYS(fail, "ip link set src_fwd netns " NS_FWD); 222 + SYS(fail, "ip link set dst_fwd netns " NS_FWD); 223 + SYS(fail, "ip link set dst netns " NS_DST); 181 224 182 225 /** setup in 'src' namespace */ 183 226 nstoken = open_netns(NS_SRC); 184 227 if (!ASSERT_OK_PTR(nstoken, "setns src")) 185 228 goto fail; 186 229 187 - SYS(fail, "ip addr add " IP4_SRC "/32 dev veth_src"); 188 - SYS(fail, "ip addr add " IP6_SRC "/128 dev veth_src nodad"); 189 - SYS(fail, "ip link set dev veth_src up"); 230 + SYS(fail, "ip addr add " IP4_SRC "/32 dev src"); 231 + SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad"); 232 + SYS(fail, "ip link set dev src up"); 190 233 191 - SYS(fail, "ip route add " IP4_DST "/32 dev veth_src scope global"); 192 - SYS(fail, "ip route add " IP4_NET "/16 dev veth_src scope global"); 193 - SYS(fail, "ip route add " IP6_DST "/128 dev veth_src scope global"); 234 + SYS(fail, "ip route add " IP4_DST "/32 dev src scope global"); 235 + SYS(fail, "ip route add " IP4_NET "/16 dev src scope global"); 236 + SYS(fail, "ip route add " IP6_DST "/128 dev src scope global"); 194 237 195 - SYS(fail, "ip neigh add " IP4_DST " dev veth_src lladdr %s", 196 - veth_src_fwd_addr); 197 - SYS(fail, "ip neigh add " IP6_DST " dev veth_src lladdr %s", 198 - veth_src_fwd_addr); 238 + if (result->dev_mode == MODE_VETH) { 239 + SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s", 240 + src_fwd_addr); 241 + SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s", 242 + src_fwd_addr); 243 + } 199 244 200 245 close_netns(nstoken); 201 246 ··· 260 201 * needs v4 one in order to start ARP probing. IP4_NET route is added 261 202 * to the endpoints so that the ARP processing will reply. 262 203 */ 263 - SYS(fail, "ip addr add " IP4_SLL "/32 dev veth_src_fwd"); 264 - SYS(fail, "ip addr add " IP4_DLL "/32 dev veth_dst_fwd"); 265 - SYS(fail, "ip link set dev veth_src_fwd up"); 266 - SYS(fail, "ip link set dev veth_dst_fwd up"); 204 + SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd"); 205 + SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd"); 206 + SYS(fail, "ip link set dev src_fwd up"); 207 + SYS(fail, "ip link set dev dst_fwd up"); 267 208 268 - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_src_fwd scope global"); 269 - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_src_fwd scope global"); 270 - SYS(fail, "ip route add " IP4_DST "/32 dev veth_dst_fwd scope global"); 271 - SYS(fail, "ip route add " IP6_DST "/128 dev veth_dst_fwd scope global"); 209 + SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global"); 210 + SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global"); 211 + SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global"); 212 + SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global"); 272 213 273 214 close_netns(nstoken); 274 215 ··· 277 218 if (!ASSERT_OK_PTR(nstoken, "setns dst")) 278 219 goto fail; 279 220 280 - SYS(fail, "ip addr add " IP4_DST "/32 dev veth_dst"); 281 - SYS(fail, "ip addr add " IP6_DST "/128 dev veth_dst nodad"); 282 - SYS(fail, "ip link set dev veth_dst up"); 221 + SYS(fail, "ip addr add " IP4_DST "/32 dev dst"); 222 + SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad"); 223 + SYS(fail, "ip link set dev dst up"); 283 224 284 - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_dst scope global"); 285 - SYS(fail, "ip route add " IP4_NET "/16 dev veth_dst scope global"); 286 - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_dst scope global"); 225 + SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global"); 226 + SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global"); 227 + SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global"); 287 228 288 - SYS(fail, "ip neigh add " IP4_SRC " dev veth_dst lladdr " MAC_DST_FWD); 289 - SYS(fail, "ip neigh add " IP6_SRC " dev veth_dst lladdr " MAC_DST_FWD); 229 + if (result->dev_mode == MODE_VETH) { 230 + SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD); 231 + SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD); 232 + } 290 233 291 234 close_netns(nstoken); 292 235 ··· 354 293 const struct bpf_program *chk_prog, 355 294 const struct netns_setup_result *setup_result) 356 295 { 357 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); 358 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); 296 + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); 297 + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); 359 298 int err; 360 299 361 - /* tc qdisc add dev veth_src_fwd clsact */ 362 - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); 363 - /* tc filter add dev veth_src_fwd ingress bpf da src_prog */ 364 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, src_prog, 0); 365 - /* tc filter add dev veth_src_fwd egress bpf da chk_prog */ 366 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, chk_prog, 0); 300 + /* tc qdisc add dev src_fwd clsact */ 301 + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); 302 + /* tc filter add dev src_fwd ingress bpf da src_prog */ 303 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0); 304 + /* tc filter add dev src_fwd egress bpf da chk_prog */ 305 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0); 367 306 368 - /* tc qdisc add dev veth_dst_fwd clsact */ 369 - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); 370 - /* tc filter add dev veth_dst_fwd ingress bpf da dst_prog */ 371 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); 372 - /* tc filter add dev veth_dst_fwd egress bpf da chk_prog */ 373 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); 307 + /* tc qdisc add dev dst_fwd clsact */ 308 + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); 309 + /* tc filter add dev dst_fwd ingress bpf da dst_prog */ 310 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); 311 + /* tc filter add dev dst_fwd egress bpf da chk_prog */ 312 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); 374 313 375 314 return 0; 376 315 fail: ··· 600 539 static int netns_load_dtime_bpf(struct test_tc_dtime *skel, 601 540 const struct netns_setup_result *setup_result) 602 541 { 603 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); 604 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); 605 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src); 606 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst); 542 + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); 543 + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); 544 + LIBBPF_OPTS(bpf_tc_hook, qdisc_src); 545 + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst); 607 546 struct nstoken *nstoken; 608 547 int err; 609 548 ··· 611 550 nstoken = open_netns(NS_SRC); 612 551 if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC)) 613 552 return -1; 614 - /* tc qdisc add dev veth_src clsact */ 615 - QDISC_CLSACT_CREATE(&qdisc_veth_src, setup_result->ifindex_veth_src); 616 - /* tc filter add dev veth_src ingress bpf da ingress_host */ 617 - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); 618 - /* tc filter add dev veth_src egress bpf da egress_host */ 619 - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); 553 + /* tc qdisc add dev src clsact */ 554 + QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src); 555 + /* tc filter add dev src ingress bpf da ingress_host */ 556 + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); 557 + /* tc filter add dev src egress bpf da egress_host */ 558 + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); 620 559 close_netns(nstoken); 621 560 622 561 /* setup ns_dst tc progs */ 623 562 nstoken = open_netns(NS_DST); 624 563 if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST)) 625 564 return -1; 626 - /* tc qdisc add dev veth_dst clsact */ 627 - QDISC_CLSACT_CREATE(&qdisc_veth_dst, setup_result->ifindex_veth_dst); 628 - /* tc filter add dev veth_dst ingress bpf da ingress_host */ 629 - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); 630 - /* tc filter add dev veth_dst egress bpf da egress_host */ 631 - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); 565 + /* tc qdisc add dev dst clsact */ 566 + QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst); 567 + /* tc filter add dev dst ingress bpf da ingress_host */ 568 + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); 569 + /* tc filter add dev dst egress bpf da egress_host */ 570 + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); 632 571 close_netns(nstoken); 633 572 634 573 /* setup ns_fwd tc progs */ 635 574 nstoken = open_netns(NS_FWD); 636 575 if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD)) 637 576 return -1; 638 - /* tc qdisc add dev veth_dst_fwd clsact */ 639 - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); 640 - /* tc filter add dev veth_dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ 641 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, 577 + /* tc qdisc add dev dst_fwd clsact */ 578 + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); 579 + /* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ 580 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, 642 581 skel->progs.ingress_fwdns_prio100, 100); 643 - /* tc filter add dev veth_dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ 644 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, 582 + /* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ 583 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, 645 584 skel->progs.ingress_fwdns_prio101, 101); 646 - /* tc filter add dev veth_dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ 647 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, 585 + /* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ 586 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, 648 587 skel->progs.egress_fwdns_prio100, 100); 649 - /* tc filter add dev veth_dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ 650 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, 588 + /* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ 589 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, 651 590 skel->progs.egress_fwdns_prio101, 101); 652 591 653 - /* tc qdisc add dev veth_src_fwd clsact */ 654 - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); 655 - /* tc filter add dev veth_src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ 656 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, 592 + /* tc qdisc add dev src_fwd clsact */ 593 + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); 594 + /* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ 595 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, 657 596 skel->progs.ingress_fwdns_prio100, 100); 658 - /* tc filter add dev veth_src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ 659 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, 597 + /* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ 598 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, 660 599 skel->progs.ingress_fwdns_prio101, 101); 661 - /* tc filter add dev veth_src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ 662 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, 600 + /* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ 601 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, 663 602 skel->progs.egress_fwdns_prio100, 100); 664 - /* tc filter add dev veth_src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ 665 - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, 603 + /* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ 604 + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, 666 605 skel->progs.egress_fwdns_prio101, 101); 667 606 close_netns(nstoken); 668 607 return 0; ··· 838 777 if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open")) 839 778 return; 840 779 841 - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; 842 - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; 780 + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; 781 + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; 843 782 844 783 err = test_tc_dtime__load(skel); 845 784 if (!ASSERT_OK(err, "test_tc_dtime__load")) ··· 929 868 if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open")) 930 869 goto done; 931 870 932 - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; 933 - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; 871 + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; 872 + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; 934 873 935 874 err = test_tc_neigh__load(skel); 936 875 if (!ASSERT_OK(err, "test_tc_neigh__load")) ··· 965 904 if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) 966 905 goto done; 967 906 968 - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; 969 - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; 907 + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; 908 + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; 970 909 971 910 err = test_tc_peer__load(skel); 972 911 if (!ASSERT_OK(err, "test_tc_peer__load")) ··· 1057 996 static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) 1058 997 { 1059 998 LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd); 1060 - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); 999 + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); 1061 1000 struct test_tc_peer *skel = NULL; 1062 1001 struct nstoken *nstoken = NULL; 1063 1002 int err; ··· 1106 1045 goto fail; 1107 1046 1108 1047 skel->rodata->IFINDEX_SRC = ifindex; 1109 - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; 1048 + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; 1110 1049 1111 1050 err = test_tc_peer__load(skel); 1112 1051 if (!ASSERT_OK(err, "test_tc_peer__load")) ··· 1114 1053 1115 1054 /* Load "tc_src_l3" to the tun_fwd interface to redirect packets 1116 1055 * towards dst, and "tc_dst" to redirect packets 1117 - * and "tc_chk" on veth_dst_fwd to drop non-redirected packets. 1056 + * and "tc_chk" on dst_fwd to drop non-redirected packets. 1118 1057 */ 1119 1058 /* tc qdisc add dev tun_fwd clsact */ 1120 1059 QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex); 1121 1060 /* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */ 1122 1061 XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0); 1123 1062 1124 - /* tc qdisc add dev veth_dst_fwd clsact */ 1125 - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); 1126 - /* tc filter add dev veth_dst_fwd ingress bpf da tc_dst_l3 */ 1127 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); 1128 - /* tc filter add dev veth_dst_fwd egress bpf da tc_chk */ 1129 - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); 1063 + /* tc qdisc add dev dst_fwd clsact */ 1064 + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); 1065 + /* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */ 1066 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); 1067 + /* tc filter add dev dst_fwd egress bpf da tc_chk */ 1068 + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); 1130 1069 1131 1070 /* Setup route and neigh tables */ 1132 1071 SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24"); ··· 1135 1074 SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad"); 1136 1075 SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad"); 1137 1076 1138 - SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev veth_src scope global"); 1077 + SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global"); 1139 1078 SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD 1140 1079 " dev tun_src scope global"); 1141 - SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev veth_dst scope global"); 1142 - SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev veth_src scope global"); 1080 + SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global"); 1081 + SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global"); 1143 1082 SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD 1144 1083 " dev tun_src scope global"); 1145 - SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev veth_dst scope global"); 1084 + SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global"); 1146 1085 1147 - SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); 1148 - SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); 1086 + SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD); 1087 + SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD); 1149 1088 1150 1089 if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) 1151 1090 goto fail; ··· 1167 1106 close_netns(nstoken); 1168 1107 } 1169 1108 1170 - #define RUN_TEST(name) \ 1109 + #define RUN_TEST(name, mode) \ 1171 1110 ({ \ 1172 - struct netns_setup_result setup_result; \ 1111 + struct netns_setup_result setup_result = { .dev_mode = mode, }; \ 1173 1112 if (test__start_subtest(#name)) \ 1174 1113 if (ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \ 1175 1114 if (ASSERT_OK(netns_setup_links_and_routes(&setup_result), \ ··· 1183 1122 { 1184 1123 netns_setup_namespaces_nofail("delete"); 1185 1124 1186 - RUN_TEST(tc_redirect_peer); 1187 - RUN_TEST(tc_redirect_peer_l3); 1188 - RUN_TEST(tc_redirect_neigh); 1189 - RUN_TEST(tc_redirect_neigh_fib); 1190 - RUN_TEST(tc_redirect_dtime); 1125 + RUN_TEST(tc_redirect_peer, MODE_VETH); 1126 + RUN_TEST(tc_redirect_peer, MODE_NETKIT); 1127 + RUN_TEST(tc_redirect_peer_l3, MODE_VETH); 1128 + RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT); 1129 + RUN_TEST(tc_redirect_neigh, MODE_VETH); 1130 + RUN_TEST(tc_redirect_neigh_fib, MODE_VETH); 1131 + RUN_TEST(tc_redirect_dtime, MODE_VETH); 1191 1132 return NULL; 1192 1133 } 1193 1134

+2

tools/testing/selftests/bpf/prog_tests/verifier.c

··· 31 31 #include "verifier_helper_restricted.skel.h" 32 32 #include "verifier_helper_value_access.skel.h" 33 33 #include "verifier_int_ptr.skel.h" 34 + #include "verifier_iterating_callbacks.skel.h" 34 35 #include "verifier_jeq_infer_not_null.skel.h" 35 36 #include "verifier_ld_ind.skel.h" 36 37 #include "verifier_ldsx.skel.h" ··· 140 139 void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } 141 140 void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access); } 142 141 void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } 142 + void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } 143 143 void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } 144 144 void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } 145 145 void test_verifier_ldsx(void) { RUN(verifier_ldsx); }

+8 -5

tools/testing/selftests/bpf/progs/bpf_loop_bench.c

··· 15 15 return 0; 16 16 } 17 17 18 + static int outer_loop(__u32 index, void *data) 19 + { 20 + bpf_loop(nr_loops, empty_callback, NULL, 0); 21 + __sync_add_and_fetch(&hits, nr_loops); 22 + return 0; 23 + } 24 + 18 25 SEC("fentry/" SYS_PREFIX "sys_getpgid") 19 26 int benchmark(void *ctx) 20 27 { 21 - for (int i = 0; i < 1000; i++) { 22 - bpf_loop(nr_loops, empty_callback, NULL, 0); 23 - 24 - __sync_add_and_fetch(&hits, nr_loops); 25 - } 28 + bpf_loop(1000, outer_loop, NULL, 0); 26 29 return 0; 27 30 }

+1

tools/testing/selftests/bpf/progs/cb_refs.c

··· 33 33 if (!p) 34 34 return 0; 35 35 bpf_for_each_map_elem(&array_map, cb1, &p, 0); 36 + bpf_kfunc_call_test_release(p); 36 37 return 0; 37 38 } 38 39

+2

tools/testing/selftests/bpf/progs/exceptions_fail.c

··· 171 171 return 0; 172 172 bpf_spin_lock(&lock); 173 173 bpf_rbtree_add(&rbtree, &f->node, rbless); 174 + bpf_spin_unlock(&lock); 174 175 return 0; 175 176 } 176 177 ··· 215 214 if (!f) 216 215 return 0; 217 216 bpf_loop(5, subprog_cb_ref, NULL, 0); 217 + bpf_obj_drop(f); 218 218 return 0; 219 219 } 220 220

+48 -30

tools/testing/selftests/bpf/progs/strobemeta.h

··· 24 24 #define STACK_TABLE_EPOCH_SHIFT 20 25 25 #define STROBE_MAX_STR_LEN 1 26 26 #define STROBE_MAX_CFGS 32 27 + #define READ_MAP_VAR_PAYLOAD_CAP \ 28 + ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 27 29 #define STROBE_MAX_PAYLOAD \ 28 30 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 29 - STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 31 + STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP) 30 32 31 33 struct strobe_value_header { 32 34 /* ··· 357 355 size_t idx, void *tls_base, 358 356 struct strobe_value_generic *value, 359 357 struct strobemeta_payload *data, 360 - void *payload) 358 + size_t off) 361 359 { 362 360 void *location; 363 361 uint64_t len; ··· 368 366 return 0; 369 367 370 368 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 371 - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); 369 + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); 372 370 /* 373 371 * if bpf_probe_read_user_str returns error (<0), due to casting to 374 372 * unsinged int, it will become big number, so next check is ··· 380 378 return 0; 381 379 382 380 data->str_lens[idx] = len; 383 - return len; 381 + return off + len; 384 382 } 385 383 386 - static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, 387 - size_t idx, void *tls_base, 388 - struct strobe_value_generic *value, 389 - struct strobemeta_payload *data, 390 - void *payload) 384 + static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, 385 + size_t idx, void *tls_base, 386 + struct strobe_value_generic *value, 387 + struct strobemeta_payload *data, 388 + size_t off) 391 389 { 392 390 struct strobe_map_descr* descr = &data->map_descrs[idx]; 393 391 struct strobe_map_raw map; ··· 399 397 400 398 location = calc_location(&cfg->map_locs[idx], tls_base); 401 399 if (!location) 402 - return payload; 400 + return off; 403 401 404 402 bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); 405 403 if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) 406 - return payload; 404 + return off; 407 405 408 406 descr->id = map.id; 409 407 descr->cnt = map.cnt; ··· 412 410 data->req_meta_valid = 1; 413 411 } 414 412 415 - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); 413 + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); 416 414 if (len <= STROBE_MAX_STR_LEN) { 417 415 descr->tag_len = len; 418 - payload += len; 416 + off += len; 419 417 } 420 418 421 419 #ifdef NO_UNROLL ··· 428 426 break; 429 427 430 428 descr->key_lens[i] = 0; 431 - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 429 + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 432 430 map.entries[i].key); 433 431 if (len <= STROBE_MAX_STR_LEN) { 434 432 descr->key_lens[i] = len; 435 - payload += len; 433 + off += len; 436 434 } 437 435 descr->val_lens[i] = 0; 438 - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, 436 + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, 439 437 map.entries[i].val); 440 438 if (len <= STROBE_MAX_STR_LEN) { 441 439 descr->val_lens[i] = len; 442 - payload += len; 440 + off += len; 443 441 } 444 442 } 445 443 446 - return payload; 444 + return off; 447 445 } 448 446 449 447 #ifdef USE_BPF_LOOP ··· 457 455 struct strobemeta_payload *data; 458 456 void *tls_base; 459 457 struct strobemeta_cfg *cfg; 460 - void *payload; 458 + size_t payload_off; 461 459 /* value gets mutated */ 462 460 struct strobe_value_generic *value; 463 461 enum read_type type; 464 462 }; 465 463 466 - static int read_var_callback(__u32 index, struct read_var_ctx *ctx) 464 + static int read_var_callback(__u64 index, struct read_var_ctx *ctx) 467 465 { 466 + /* lose precision info for ctx->payload_off, verifier won't track 467 + * double xor, barrier_var() is needed to force clang keep both xors. 468 + */ 469 + ctx->payload_off ^= index; 470 + barrier_var(ctx->payload_off); 471 + ctx->payload_off ^= index; 468 472 switch (ctx->type) { 469 473 case READ_INT_VAR: 470 474 if (index >= STROBE_MAX_INTS) ··· 480 472 case READ_MAP_VAR: 481 473 if (index >= STROBE_MAX_MAPS) 482 474 return 1; 483 - ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, 484 - ctx->value, ctx->data, ctx->payload); 475 + if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) 476 + return 1; 477 + ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, 478 + ctx->value, ctx->data, ctx->payload_off); 485 479 break; 486 480 case READ_STR_VAR: 487 481 if (index >= STROBE_MAX_STRS) 488 482 return 1; 489 - ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, 490 - ctx->value, ctx->data, ctx->payload); 483 + if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) 484 + return 1; 485 + ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, 486 + ctx->value, ctx->data, ctx->payload_off); 491 487 break; 492 488 } 493 489 return 0; ··· 513 501 pid_t pid = bpf_get_current_pid_tgid() >> 32; 514 502 struct strobe_value_generic value = {0}; 515 503 struct strobemeta_cfg *cfg; 516 - void *tls_base, *payload; 504 + size_t payload_off; 505 + void *tls_base; 517 506 518 507 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 519 508 if (!cfg) ··· 522 509 523 510 data->int_vals_set_mask = 0; 524 511 data->req_meta_valid = 0; 525 - payload = data->payload; 512 + payload_off = 0; 526 513 /* 527 514 * we don't have struct task_struct definition, it should be: 528 515 * tls_base = (void *)task->thread.fsbase; ··· 535 522 .tls_base = tls_base, 536 523 .value = &value, 537 524 .data = data, 538 - .payload = payload, 525 + .payload_off = 0, 539 526 }; 540 527 int err; 541 528 ··· 553 540 err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); 554 541 if (err != STROBE_MAX_MAPS) 555 542 return NULL; 543 + 544 + payload_off = ctx.payload_off; 545 + /* this should not really happen, here only to satisfy verifer */ 546 + if (payload_off > sizeof(data->payload)) 547 + payload_off = sizeof(data->payload); 556 548 #else 557 549 #ifdef NO_UNROLL 558 550 #pragma clang loop unroll(disable) ··· 573 555 #pragma unroll 574 556 #endif /* NO_UNROLL */ 575 557 for (int i = 0; i < STROBE_MAX_STRS; ++i) { 576 - payload += read_str_var(cfg, i, tls_base, &value, data, payload); 558 + payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); 577 559 } 578 560 #ifdef NO_UNROLL 579 561 #pragma clang loop unroll(disable) ··· 581 563 #pragma unroll 582 564 #endif /* NO_UNROLL */ 583 565 for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 584 - payload = read_map_var(cfg, i, tls_base, &value, data, payload); 566 + payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); 585 567 } 586 568 #endif /* USE_BPF_LOOP */ 587 569 ··· 589 571 * return pointer right after end of payload, so it's possible to 590 572 * calculate exact amount of useful data that needs to be sent 591 573 */ 592 - return payload; 574 + return &data->payload[payload_off]; 593 575 } 594 576 595 577 SEC("raw_tracepoint/kfree_skb")

+242

tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/bpf.h> 4 + #include <bpf/bpf_helpers.h> 5 + #include "bpf_misc.h" 6 + 7 + struct { 8 + __uint(type, BPF_MAP_TYPE_ARRAY); 9 + __uint(max_entries, 8); 10 + __type(key, __u32); 11 + __type(value, __u64); 12 + } map SEC(".maps"); 13 + 14 + struct { 15 + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); 16 + __uint(max_entries, 8); 17 + } ringbuf SEC(".maps"); 18 + 19 + struct vm_area_struct; 20 + struct bpf_map; 21 + 22 + struct buf_context { 23 + char *buf; 24 + }; 25 + 26 + struct num_context { 27 + __u64 i; 28 + __u64 j; 29 + }; 30 + 31 + __u8 choice_arr[2] = { 0, 1 }; 32 + 33 + static int unsafe_on_2nd_iter_cb(__u32 idx, struct buf_context *ctx) 34 + { 35 + if (idx == 0) { 36 + ctx->buf = (char *)(0xDEAD); 37 + return 0; 38 + } 39 + 40 + if (bpf_probe_read_user(ctx->buf, 8, (void *)(0xBADC0FFEE))) 41 + return 1; 42 + 43 + return 0; 44 + } 45 + 46 + SEC("?raw_tp") 47 + __failure __msg("R1 type=scalar expected=fp") 48 + int unsafe_on_2nd_iter(void *unused) 49 + { 50 + char buf[4]; 51 + struct buf_context loop_ctx = { .buf = buf }; 52 + 53 + bpf_loop(100, unsafe_on_2nd_iter_cb, &loop_ctx, 0); 54 + return 0; 55 + } 56 + 57 + static int unsafe_on_zero_iter_cb(__u32 idx, struct num_context *ctx) 58 + { 59 + ctx->i = 0; 60 + return 0; 61 + } 62 + 63 + SEC("?raw_tp") 64 + __failure __msg("invalid access to map value, value_size=2 off=32 size=1") 65 + int unsafe_on_zero_iter(void *unused) 66 + { 67 + struct num_context loop_ctx = { .i = 32 }; 68 + 69 + bpf_loop(100, unsafe_on_zero_iter_cb, &loop_ctx, 0); 70 + return choice_arr[loop_ctx.i]; 71 + } 72 + 73 + static int widening_cb(__u32 idx, struct num_context *ctx) 74 + { 75 + ++ctx->i; 76 + return 0; 77 + } 78 + 79 + SEC("?raw_tp") 80 + __success 81 + int widening(void *unused) 82 + { 83 + struct num_context loop_ctx = { .i = 0, .j = 1 }; 84 + 85 + bpf_loop(100, widening_cb, &loop_ctx, 0); 86 + /* loop_ctx.j is not changed during callback iteration, 87 + * verifier should not apply widening to it. 88 + */ 89 + return choice_arr[loop_ctx.j]; 90 + } 91 + 92 + static int loop_detection_cb(__u32 idx, struct num_context *ctx) 93 + { 94 + for (;;) {} 95 + return 0; 96 + } 97 + 98 + SEC("?raw_tp") 99 + __failure __msg("infinite loop detected") 100 + int loop_detection(void *unused) 101 + { 102 + struct num_context loop_ctx = { .i = 0 }; 103 + 104 + bpf_loop(100, loop_detection_cb, &loop_ctx, 0); 105 + return 0; 106 + } 107 + 108 + static __always_inline __u64 oob_state_machine(struct num_context *ctx) 109 + { 110 + switch (ctx->i) { 111 + case 0: 112 + ctx->i = 1; 113 + break; 114 + case 1: 115 + ctx->i = 32; 116 + break; 117 + } 118 + return 0; 119 + } 120 + 121 + static __u64 for_each_map_elem_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) 122 + { 123 + return oob_state_machine(data); 124 + } 125 + 126 + SEC("?raw_tp") 127 + __failure __msg("invalid access to map value, value_size=2 off=32 size=1") 128 + int unsafe_for_each_map_elem(void *unused) 129 + { 130 + struct num_context loop_ctx = { .i = 0 }; 131 + 132 + bpf_for_each_map_elem(&map, for_each_map_elem_cb, &loop_ctx, 0); 133 + return choice_arr[loop_ctx.i]; 134 + } 135 + 136 + static __u64 ringbuf_drain_cb(struct bpf_dynptr *dynptr, void *data) 137 + { 138 + return oob_state_machine(data); 139 + } 140 + 141 + SEC("?raw_tp") 142 + __failure __msg("invalid access to map value, value_size=2 off=32 size=1") 143 + int unsafe_ringbuf_drain(void *unused) 144 + { 145 + struct num_context loop_ctx = { .i = 0 }; 146 + 147 + bpf_user_ringbuf_drain(&ringbuf, ringbuf_drain_cb, &loop_ctx, 0); 148 + return choice_arr[loop_ctx.i]; 149 + } 150 + 151 + static __u64 find_vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data) 152 + { 153 + return oob_state_machine(data); 154 + } 155 + 156 + SEC("?raw_tp") 157 + __failure __msg("invalid access to map value, value_size=2 off=32 size=1") 158 + int unsafe_find_vma(void *unused) 159 + { 160 + struct task_struct *task = bpf_get_current_task_btf(); 161 + struct num_context loop_ctx = { .i = 0 }; 162 + 163 + bpf_find_vma(task, 0, find_vma_cb, &loop_ctx, 0); 164 + return choice_arr[loop_ctx.i]; 165 + } 166 + 167 + static int iter_limit_cb(__u32 idx, struct num_context *ctx) 168 + { 169 + ctx->i++; 170 + return 0; 171 + } 172 + 173 + SEC("?raw_tp") 174 + __success 175 + int bpf_loop_iter_limit_ok(void *unused) 176 + { 177 + struct num_context ctx = { .i = 0 }; 178 + 179 + bpf_loop(1, iter_limit_cb, &ctx, 0); 180 + return choice_arr[ctx.i]; 181 + } 182 + 183 + SEC("?raw_tp") 184 + __failure __msg("invalid access to map value, value_size=2 off=2 size=1") 185 + int bpf_loop_iter_limit_overflow(void *unused) 186 + { 187 + struct num_context ctx = { .i = 0 }; 188 + 189 + bpf_loop(2, iter_limit_cb, &ctx, 0); 190 + return choice_arr[ctx.i]; 191 + } 192 + 193 + static int iter_limit_level2a_cb(__u32 idx, struct num_context *ctx) 194 + { 195 + ctx->i += 100; 196 + return 0; 197 + } 198 + 199 + static int iter_limit_level2b_cb(__u32 idx, struct num_context *ctx) 200 + { 201 + ctx->i += 10; 202 + return 0; 203 + } 204 + 205 + static int iter_limit_level1_cb(__u32 idx, struct num_context *ctx) 206 + { 207 + ctx->i += 1; 208 + bpf_loop(1, iter_limit_level2a_cb, ctx, 0); 209 + bpf_loop(1, iter_limit_level2b_cb, ctx, 0); 210 + return 0; 211 + } 212 + 213 + /* Check that path visiting every callback function once had been 214 + * reached by verifier. Variables 'ctx{1,2}i' below serve as flags, 215 + * with each decimal digit corresponding to a callback visit marker. 216 + */ 217 + SEC("socket") 218 + __success __retval(111111) 219 + int bpf_loop_iter_limit_nested(void *unused) 220 + { 221 + struct num_context ctx1 = { .i = 0 }; 222 + struct num_context ctx2 = { .i = 0 }; 223 + __u64 a, b, c; 224 + 225 + bpf_loop(1, iter_limit_level1_cb, &ctx1, 0); 226 + bpf_loop(1, iter_limit_level1_cb, &ctx2, 0); 227 + a = ctx1.i; 228 + b = ctx2.i; 229 + /* Force 'ctx1.i' and 'ctx2.i' precise. */ 230 + c = choice_arr[(a + b) % 2]; 231 + /* This makes 'c' zero, but neither clang nor verifier know it. */ 232 + c /= 10; 233 + /* Make sure that verifier does not visit 'impossible' states: 234 + * enumerate all possible callback visit masks. 235 + */ 236 + if (a != 0 && a != 1 && a != 11 && a != 101 && a != 111 && 237 + b != 0 && b != 1 && b != 11 && b != 101 && b != 111) 238 + asm volatile ("r0 /= 0;" ::: "r0"); 239 + return 1000 * a + b + c; 240 + } 241 + 242 + char _license[] SEC("license") = "GPL";

+72 -14

tools/testing/selftests/bpf/progs/verifier_subprog_precision.c

··· 119 119 120 120 SEC("?raw_tp") 121 121 __success __log_level(2) 122 + /* First simulated path does not include callback body, 123 + * r1 and r4 are always precise for bpf_loop() calls. 124 + */ 125 + __msg("9: (85) call bpf_loop#181") 126 + __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") 127 + __msg("mark_precise: frame0: parent state regs=r4 stack=:") 128 + __msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") 129 + __msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0") 130 + __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") 131 + __msg("mark_precise: frame0: parent state regs=r1 stack=:") 132 + __msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") 133 + __msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0") 134 + __msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0") 135 + __msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8") 136 + __msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6") 137 + __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") 138 + /* r6 precision propagation */ 122 139 __msg("14: (0f) r1 += r6") 123 - __msg("mark_precise: frame0: last_idx 14 first_idx 10") 140 + __msg("mark_precise: frame0: last_idx 14 first_idx 9") 124 141 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") 125 142 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") 126 143 __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") 127 144 __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") 128 - __msg("mark_precise: frame0: parent state regs=r0 stack=:") 129 - __msg("mark_precise: frame0: last_idx 18 first_idx 0") 130 - __msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit") 145 + __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") 146 + /* State entering callback body popped from states stack */ 147 + __msg("from 9 to 17: frame1:") 148 + __msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") 149 + __msg("17: (b7) r0 = 0") 150 + __msg("18: (95) exit") 151 + __msg("returning from callee:") 152 + __msg("to caller at 9:") 153 + __msg("frame 0: propagating r1,r4") 154 + __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") 155 + __msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit") 156 + __msg("from 18 to 9: safe") 131 157 __naked int callback_result_precise(void) 132 158 { 133 159 asm volatile ( ··· 259 233 260 234 SEC("?raw_tp") 261 235 __success __log_level(2) 236 + /* First simulated path does not include callback body */ 262 237 __msg("12: (0f) r1 += r6") 263 - __msg("mark_precise: frame0: last_idx 12 first_idx 10") 238 + __msg("mark_precise: frame0: last_idx 12 first_idx 9") 264 239 __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7") 265 240 __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4") 241 + __msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop") 266 242 __msg("mark_precise: frame0: parent state regs=r6 stack=:") 267 - __msg("mark_precise: frame0: last_idx 16 first_idx 0") 268 - __msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit") 269 - __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") 270 - __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181") 243 + __msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") 271 244 __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0") 272 245 __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0") 273 246 __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8") 274 247 __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1") 275 248 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") 249 + /* State entering callback body popped from states stack */ 250 + __msg("from 9 to 15: frame1:") 251 + __msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb") 252 + __msg("15: (b7) r0 = 0") 253 + __msg("16: (95) exit") 254 + __msg("returning from callee:") 255 + __msg("to caller at 9:") 256 + /* r1, r4 are always precise for bpf_loop(), 257 + * r6 was marked before backtracking to callback body. 258 + */ 259 + __msg("frame 0: propagating r1,r4,r6") 260 + __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") 261 + __msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit") 262 + __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") 263 + __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop") 264 + __msg("mark_precise: frame0: parent state regs= stack=:") 265 + __msg("from 16 to 9: safe") 276 266 __naked int parent_callee_saved_reg_precise_with_callback(void) 277 267 { 278 268 asm volatile ( ··· 415 373 416 374 SEC("?raw_tp") 417 375 __success __log_level(2) 376 + /* First simulated path does not include callback body */ 418 377 __msg("14: (0f) r1 += r6") 419 - __msg("mark_precise: frame0: last_idx 14 first_idx 11") 378 + __msg("mark_precise: frame0: last_idx 14 first_idx 10") 420 379 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") 421 380 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") 422 381 __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)") 382 + __msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop") 423 383 __msg("mark_precise: frame0: parent state regs= stack=-8:") 424 - __msg("mark_precise: frame0: last_idx 18 first_idx 0") 425 - __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") 426 - __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") 427 - __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") 384 + __msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10") 428 385 __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0") 429 386 __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0") 430 387 __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8") 431 388 __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6") 432 389 __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6") 433 390 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") 391 + /* State entering callback body popped from states stack */ 392 + __msg("from 10 to 17: frame1:") 393 + __msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") 394 + __msg("17: (b7) r0 = 0") 395 + __msg("18: (95) exit") 396 + __msg("returning from callee:") 397 + __msg("to caller at 10:") 398 + /* r1, r4 are always precise for bpf_loop(), 399 + * fp-8 was marked before backtracking to callback body. 400 + */ 401 + __msg("frame 0: propagating r1,r4,fp-8") 402 + __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1") 403 + __msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit") 404 + __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") 405 + __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") 406 + __msg("mark_precise: frame0: parent state regs= stack=:") 407 + __msg("from 18 to 10: safe") 434 408 __naked int parent_stack_slot_precise_with_callback(void) 435 409 { 436 410 asm volatile (

+52 -32

tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c

··· 53 53 #define DEFAULT_TTL 64 54 54 #define MAX_ALLOWED_PORTS 8 55 55 56 + #define MAX_PACKET_OFF 0xffff 57 + 56 58 #define swap(a, b) \ 57 59 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 58 60 ··· 185 183 } 186 184 187 185 struct tcpopt_context { 188 - __u8 *ptr; 189 - __u8 *end; 186 + void *data; 190 187 void *data_end; 191 188 __be32 *tsecr; 192 189 __u8 wscale; 193 190 bool option_timestamp; 194 191 bool option_sack; 192 + __u32 off; 195 193 }; 194 + 195 + static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) 196 + { 197 + __u64 off = ctx->off; 198 + __u8 *data; 199 + 200 + /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ 201 + if (off > MAX_PACKET_OFF - sz) 202 + return NULL; 203 + 204 + data = ctx->data + off; 205 + barrier_var(data); 206 + if (data + sz >= ctx->data_end) 207 + return NULL; 208 + 209 + ctx->off += sz; 210 + return data; 211 + } 196 212 197 213 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) 198 214 { 199 - __u8 opcode, opsize; 215 + __u8 *opcode, *opsize, *wscale, *tsecr; 216 + __u32 off = ctx->off; 200 217 201 - if (ctx->ptr >= ctx->end) 202 - return 1; 203 - if (ctx->ptr >= ctx->data_end) 218 + opcode = next(ctx, 1); 219 + if (!opcode) 204 220 return 1; 205 221 206 - opcode = ctx->ptr[0]; 207 - 208 - if (opcode == TCPOPT_EOL) 222 + if (*opcode == TCPOPT_EOL) 209 223 return 1; 210 - if (opcode == TCPOPT_NOP) { 211 - ++ctx->ptr; 224 + if (*opcode == TCPOPT_NOP) 212 225 return 0; 213 - } 214 226 215 - if (ctx->ptr + 1 >= ctx->end) 216 - return 1; 217 - if (ctx->ptr + 1 >= ctx->data_end) 218 - return 1; 219 - opsize = ctx->ptr[1]; 220 - if (opsize < 2) 227 + opsize = next(ctx, 1); 228 + if (!opsize || *opsize < 2) 221 229 return 1; 222 230 223 - if (ctx->ptr + opsize > ctx->end) 224 - return 1; 225 - 226 - switch (opcode) { 231 + switch (*opcode) { 227 232 case TCPOPT_WINDOW: 228 - if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) 229 - ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; 233 + wscale = next(ctx, 1); 234 + if (!wscale) 235 + return 1; 236 + if (*opsize == TCPOLEN_WINDOW) 237 + ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; 230 238 break; 231 239 case TCPOPT_TIMESTAMP: 232 - if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { 240 + tsecr = next(ctx, 4); 241 + if (!tsecr) 242 + return 1; 243 + if (*opsize == TCPOLEN_TIMESTAMP) { 233 244 ctx->option_timestamp = true; 234 245 /* Client's tsval becomes our tsecr. */ 235 - *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); 246 + *ctx->tsecr = get_unaligned((__be32 *)tsecr); 236 247 } 237 248 break; 238 249 case TCPOPT_SACK_PERM: 239 - if (opsize == TCPOLEN_SACK_PERM) 250 + if (*opsize == TCPOLEN_SACK_PERM) 240 251 ctx->option_sack = true; 241 252 break; 242 253 } 243 254 244 - ctx->ptr += opsize; 255 + ctx->off = off + *opsize; 245 256 246 257 return 0; 247 258 } ··· 271 256 272 257 static __always_inline bool tscookie_init(struct tcphdr *tcp_header, 273 258 __u16 tcp_len, __be32 *tsval, 274 - __be32 *tsecr, void *data_end) 259 + __be32 *tsecr, void *data, void *data_end) 275 260 { 276 261 struct tcpopt_context loop_ctx = { 277 - .ptr = (__u8 *)(tcp_header + 1), 278 - .end = (__u8 *)tcp_header + tcp_len, 262 + .data = data, 279 263 .data_end = data_end, 280 264 .tsecr = tsecr, 281 265 .wscale = TS_OPT_WSCALE_MASK, 282 266 .option_timestamp = false, 283 267 .option_sack = false, 268 + /* Note: currently verifier would track .off as unbound scalar. 269 + * In case if verifier would at some point get smarter and 270 + * compute bounded value for this var, beware that it might 271 + * hinder bpf_loop() convergence validation. 272 + */ 273 + .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, 284 274 }; 285 275 u32 cookie; 286 276 ··· 655 635 cookie = (__u32)value; 656 636 657 637 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, 658 - &tsopt_buf[0], &tsopt_buf[1], data_end)) 638 + &tsopt_buf[0], &tsopt_buf[1], data, data_end)) 659 639 tsopt = tsopt_buf; 660 640 661 641 /* Check that there is enough space for a SYNACK. It also covers

+1 -1

tools/testing/selftests/net/rtnetlink.sh

··· 859 859 860 860 861 861 run_cmd ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 862 - run_cmd ip -netns "$testns" link set dev $DEV_NS ups 862 + run_cmd ip -netns "$testns" link set dev $DEV_NS up 863 863 run_cmd ip -netns "$testns" link del "$DEV_NS" 864 864 865 865 # test external mode

+13 -6

tools/testing/vsock/vsock_test.c

··· 353 353 } 354 354 355 355 #define SOCK_BUF_SIZE (2 * 1024 * 1024) 356 - #define MAX_MSG_SIZE (32 * 1024) 356 + #define MAX_MSG_PAGES 4 357 357 358 358 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) 359 359 { 360 360 unsigned long curr_hash; 361 + size_t max_msg_size; 361 362 int page_size; 362 363 int msg_count; 363 364 int fd; ··· 374 373 375 374 curr_hash = 0; 376 375 page_size = getpagesize(); 377 - msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE; 376 + max_msg_size = MAX_MSG_PAGES * page_size; 377 + msg_count = SOCK_BUF_SIZE / max_msg_size; 378 378 379 379 for (int i = 0; i < msg_count; i++) { 380 380 size_t buf_size; ··· 385 383 /* Use "small" buffers and "big" buffers. */ 386 384 if (i & 1) 387 385 buf_size = page_size + 388 - (rand() % (MAX_MSG_SIZE - page_size)); 386 + (rand() % (max_msg_size - page_size)); 389 387 else 390 388 buf_size = 1 + (rand() % page_size); 391 389 ··· 431 429 unsigned long remote_hash; 432 430 unsigned long curr_hash; 433 431 int fd; 434 - char buf[MAX_MSG_SIZE]; 435 432 struct msghdr msg = {0}; 436 433 struct iovec iov = {0}; 437 434 ··· 458 457 control_writeln("SRVREADY"); 459 458 /* Wait, until peer sends whole data. */ 460 459 control_expectln("SENDDONE"); 461 - iov.iov_base = buf; 462 - iov.iov_len = sizeof(buf); 460 + iov.iov_len = MAX_MSG_PAGES * getpagesize(); 461 + iov.iov_base = malloc(iov.iov_len); 462 + if (!iov.iov_base) { 463 + perror("malloc"); 464 + exit(EXIT_FAILURE); 465 + } 466 + 463 467 msg.msg_iov = &iov; 464 468 msg.msg_iovlen = 1; 465 469 ··· 489 483 curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size); 490 484 } 491 485 486 + free(iov.iov_base); 492 487 close(fd); 493 488 remote_hash = control_readulong(); 494 489