Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: rfs: hash function change

RFS is using two kinds of hash tables.

First one is controlled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
and using the N low order bits of the l4 hash is good enough.

Then each RX queue has its own hash table, controlled by
/sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X

The current hash function, using the X low order bits, is suboptimal,
because RSS is usually using Func(hash) = (hash % power_of_two);

For example, with 32 RX queues, 6 low order bits have no entropy
for a given queue.

Switch this hash function to hash_32(hash, log) to increase the
chances of using all possible slots and reduce collisions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250321171309.634100-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Eric Dumazet; committed by Jakub Kicinski.
f3483c8e 1952e19c

+12 -7
+1 -1
include/net/rps.h
··· 39 39 * The rps_dev_flow_table structure contains a table of flow mappings. 40 40 */ 41 41 struct rps_dev_flow_table { 42 - unsigned int mask; 42 + u8 log; 43 43 struct rcu_head rcu; 44 44 struct rps_dev_flow flows[]; 45 45 };
+9 -4
net/core/dev.c
··· 4751 4751 struct static_key_false rfs_needed __read_mostly; 4752 4752 EXPORT_SYMBOL(rfs_needed); 4753 4753 4754 + static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) 4755 + { 4756 + return hash_32(hash, flow_table->log); 4757 + } 4758 + 4754 4759 static struct rps_dev_flow * 4755 4760 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 4756 4761 struct rps_dev_flow *rflow, u16 next_cpu) ··· 4782 4777 flow_table = rcu_dereference(rxqueue->rps_flow_table); 4783 4778 if (!flow_table) 4784 4779 goto out; 4785 - flow_id = skb_get_hash(skb) & flow_table->mask; 4780 + flow_id = rfs_slot(skb_get_hash(skb), flow_table); 4786 4781 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 4787 4782 rxq_index, flow_id); 4788 4783 if (rc < 0) ··· 4861 4856 /* OK, now we know there is a match, 4862 4857 * we can look at the local (per receive queue) flow table 4863 4858 */ 4864 - rflow = &flow_table->flows[hash & flow_table->mask]; 4859 + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; 4865 4860 tcpu = rflow->cpu; 4866 4861 4867 4862 /* ··· 4928 4923 4929 4924 rcu_read_lock(); 4930 4925 flow_table = rcu_dereference(rxqueue->rps_flow_table); 4931 - if (flow_table && flow_id <= flow_table->mask) { 4926 + if (flow_table && flow_id < (1UL << flow_table->log)) { 4932 4927 rflow = &flow_table->flows[flow_id]; 4933 4928 cpu = READ_ONCE(rflow->cpu); 4934 4929 if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && 4935 4930 ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - 4936 4931 READ_ONCE(rflow->last_qtail)) < 4937 - (int)(10 * flow_table->mask))) 4932 + (int)(10 << flow_table->log))) 4938 4933 expire = false; 4939 4934 } 4940 4935 rcu_read_unlock();
+2 -2
net/core/net-sysfs.c
··· 1056 1056 rcu_read_lock(); 1057 1057 flow_table = rcu_dereference(queue->rps_flow_table); 1058 1058 if (flow_table) 1059 - val = (unsigned long)flow_table->mask + 1; 1059 + val = 1UL << flow_table->log; 1060 1060 rcu_read_unlock(); 1061 1061 1062 1062 return sysfs_emit(buf, "%lu\n", val); ··· 1109 1109 if (!table) 1110 1110 return -ENOMEM; 1111 1111 1112 - table->mask = mask; 1112 + table->log = ilog2(mask) + 1; 1113 1113 for (count = 0; count <= mask; count++) 1114 1114 table->flows[count].cpu = RPS_NO_CPU; 1115 1115 } else {