Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-prevent-rps-table-overwrite-of-active-flows'

Krishna Kumar says:

====================
net: Prevent RPS table overwrite of active flows

This series splits the original RPS patch [1] into two patches for
net-next. It also addresses a kernel test robot warning by defining
rps_flow_is_active() only when aRFS is enabled. I tested v3 with
four builds and reboots: two for [PATCH 1/2] with aRFS enabled &
disabled, and two for [PATCH 2/2]. There are no code changes in v4
and v5, only documentation. Patch v6 has a one-line change to keep
the 'hash' field under #ifdef, and was test built with aRFS=on and
aRFS=off. The same two builds were done for v7, along with 15m load
testing with aRFS=on to ensure the new changes are correct.

The first patch prevents RPS table overwrite for active flows,
thereby improving aRFS stability.

The second patch caches hash & flow_id in get_rps_cpu() to avoid
recalculating them in set_rps_cpu().

[1] lore.kernel.org/netdev/20250708081516.53048-1-krikku@gmail.com/
[2] lore.kernel.org/netdev/20250729104109.1687418-1-krikku@gmail.com/
====================

Link: https://patch.msgid.link/20250825031005.3674864-1-krikku@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+68 -14
+5 -2
include/net/rps.h
··· 25 25 26 26 /* 27 27 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the 28 - * tail pointer for that CPU's input queue at the time of last enqueue, and 29 - * a hardware filter index. 28 + * tail pointer for that CPU's input queue at the time of last enqueue, a 29 + * hardware filter index, and the hash of the flow if aRFS is enabled. 30 30 */ 31 31 struct rps_dev_flow { 32 32 u16 cpu; 33 33 u16 filter; 34 34 unsigned int last_qtail; 35 + #ifdef CONFIG_RFS_ACCEL 36 + u32 hash; 37 + #endif 35 38 }; 36 39 #define RPS_NO_FILTER 0xffff 37 40
+60 -11
net/core/dev.c
··· 4849 4849 return hash_32(hash, flow_table->log); 4850 4850 } 4851 4851 4852 + #ifdef CONFIG_RFS_ACCEL 4853 + /** 4854 + * rps_flow_is_active - check whether the flow is recently active. 4855 + * @rflow: Specific flow to check activity. 4856 + * @flow_table: per-queue flowtable that @rflow belongs to. 4857 + * @cpu: CPU saved in @rflow. 4858 + * 4859 + * If the CPU has processed many packets since the flow's last activity 4860 + * (beyond 10 times the table size), the flow is considered stale. 4861 + * 4862 + * Return: true if flow was recently active. 4863 + */ 4864 + static bool rps_flow_is_active(struct rps_dev_flow *rflow, 4865 + struct rps_dev_flow_table *flow_table, 4866 + unsigned int cpu) 4867 + { 4868 + unsigned int flow_last_active; 4869 + unsigned int sd_input_head; 4870 + 4871 + if (cpu >= nr_cpu_ids) 4872 + return false; 4873 + 4874 + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); 4875 + flow_last_active = READ_ONCE(rflow->last_qtail); 4876 + 4877 + return (int)(sd_input_head - flow_last_active) < 4878 + (int)(10 << flow_table->log); 4879 + } 4880 + #endif 4881 + 4852 4882 static struct rps_dev_flow * 4853 4883 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 4854 - struct rps_dev_flow *rflow, u16 next_cpu) 4884 + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, 4885 + u32 flow_id) 4855 4886 { 4856 4887 if (next_cpu < nr_cpu_ids) { 4857 4888 u32 head; ··· 4890 4859 struct netdev_rx_queue *rxqueue; 4891 4860 struct rps_dev_flow_table *flow_table; 4892 4861 struct rps_dev_flow *old_rflow; 4862 + struct rps_dev_flow *tmp_rflow; 4863 + unsigned int tmp_cpu; 4893 4864 u16 rxq_index; 4894 - u32 flow_id; 4895 4865 int rc; 4896 4866 4897 4867 /* Should we steer this flow to a different hardware queue? 
*/ ··· 4907 4875 flow_table = rcu_dereference(rxqueue->rps_flow_table); 4908 4876 if (!flow_table) 4909 4877 goto out; 4910 - flow_id = rfs_slot(skb_get_hash(skb), flow_table); 4878 + 4879 + tmp_rflow = &flow_table->flows[flow_id]; 4880 + tmp_cpu = READ_ONCE(tmp_rflow->cpu); 4881 + 4882 + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { 4883 + if (rps_flow_is_active(tmp_rflow, flow_table, 4884 + tmp_cpu)) { 4885 + if (hash != READ_ONCE(tmp_rflow->hash) || 4886 + next_cpu == tmp_cpu) 4887 + goto out; 4888 + } 4889 + } 4890 + 4911 4891 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 4912 4892 rxq_index, flow_id); 4913 4893 if (rc < 0) 4914 4894 goto out; 4895 + 4915 4896 old_rflow = rflow; 4916 - rflow = &flow_table->flows[flow_id]; 4897 + rflow = tmp_rflow; 4917 4898 WRITE_ONCE(rflow->filter, rc); 4899 + WRITE_ONCE(rflow->hash, hash); 4900 + 4918 4901 if (old_rflow->filter == rc) 4919 4902 WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); 4920 4903 out: ··· 4955 4908 struct rps_dev_flow_table *flow_table; 4956 4909 struct rps_map *map; 4957 4910 int cpu = -1; 4911 + u32 flow_id; 4958 4912 u32 tcpu; 4959 4913 u32 hash; 4960 4914 ··· 5002 4954 /* OK, now we know there is a match, 5003 4955 * we can look at the local (per receive queue) flow table 5004 4956 */ 5005 - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; 4957 + flow_id = rfs_slot(hash, flow_table); 4958 + rflow = &flow_table->flows[flow_id]; 5006 4959 tcpu = rflow->cpu; 5007 4960 5008 4961 /* ··· 5022 4973 ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - 5023 4974 rflow->last_qtail)) >= 0)) { 5024 4975 tcpu = next_cpu; 5025 - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 4976 + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, 4977 + flow_id); 5026 4978 } 5027 4979 5028 4980 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { ··· 5067 5017 struct rps_dev_flow_table *flow_table; 5068 5018 struct rps_dev_flow *rflow; 5069 5019 bool expire = true; 5070 - unsigned int cpu; 5071 5020 5072 
5021 rcu_read_lock(); 5073 5022 flow_table = rcu_dereference(rxqueue->rps_flow_table); 5074 5023 if (flow_table && flow_id < (1UL << flow_table->log)) { 5024 + unsigned int cpu; 5025 + 5075 5026 rflow = &flow_table->flows[flow_id]; 5076 5027 cpu = READ_ONCE(rflow->cpu); 5077 - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && 5078 - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - 5079 - READ_ONCE(rflow->last_qtail)) < 5080 - (int)(10 << flow_table->log))) 5028 + if (READ_ONCE(rflow->filter) == filter_id && 5029 + rps_flow_is_active(rflow, flow_table, cpu)) 5081 5030 expire = false; 5082 5031 } 5083 5032 rcu_read_unlock();
+3 -1
net/core/net-sysfs.c
··· 1120 1120 return -ENOMEM; 1121 1121 1122 1122 table->log = ilog2(mask) + 1; 1123 - for (count = 0; count <= mask; count++) 1123 + for (count = 0; count <= mask; count++) { 1124 1124 table->flows[count].cpu = RPS_NO_CPU; 1125 + table->flows[count].filter = RPS_NO_FILTER; 1126 + } 1125 1127 } else { 1126 1128 table = NULL; 1127 1129 }