Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: openvswitch: fix to make sure flow_lookup() is not preempted

The flow_lookup() function uses per-CPU variables, so it must be called
with BH disabled. This is already the case in the general NAPI use path,
where local BH is disabled. However, the function is also called from the
netlink context. This patch makes sure that BH is disabled in the netlink
path as well.

In addition, u64_stats_update_begin() requires a lock to ensure there is
only a single writer, which was not guaranteed here. Making the stats
per-CPU and disabling NAPI (softirqs) ensures that there is always at most
one writer per counter.

Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
Reported-by: Juri Lelli <jlelli@redhat.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Link: https://lore.kernel.org/r/160295903253.7789.826736662555102345.stgit@ebuild
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Eelco Chaudron and committed by
Jakub Kicinski
f981fc3d f355a55f

+41 -25
+35 -23
net/openvswitch/flow_table.c
··· 175 175 176 176 static void __mask_array_destroy(struct mask_array *ma) 177 177 { 178 - free_percpu(ma->masks_usage_cntr); 178 + free_percpu(ma->masks_usage_stats); 179 179 kfree(ma); 180 180 } 181 181 ··· 199 199 ma->masks_usage_zero_cntr[i] = 0; 200 200 201 201 for_each_possible_cpu(cpu) { 202 - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, 203 - cpu); 202 + struct mask_array_stats *stats; 204 203 unsigned int start; 205 204 u64 counter; 206 205 206 + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); 207 207 do { 208 - start = u64_stats_fetch_begin_irq(&ma->syncp); 209 - counter = usage_counters[i]; 210 - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); 208 + start = u64_stats_fetch_begin_irq(&stats->syncp); 209 + counter = stats->usage_cntrs[i]; 210 + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 211 211 212 212 ma->masks_usage_zero_cntr[i] += counter; 213 213 } ··· 230 230 sizeof(struct sw_flow_mask *) * 231 231 size); 232 232 233 - new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, 234 - __alignof__(u64)); 235 - if (!new->masks_usage_cntr) { 233 + new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + 234 + sizeof(u64) * size, 235 + __alignof__(u64)); 236 + if (!new->masks_usage_stats) { 236 237 kfree(new); 237 238 return NULL; 238 239 } ··· 723 722 724 723 /* Flow lookup does full lookup on flow table. It starts with 725 724 * mask from index passed in *index. 725 + * This function MUST be called with BH disabled due to the use 726 + * of CPU specific variables. 
726 727 */ 727 728 static struct sw_flow *flow_lookup(struct flow_table *tbl, 728 729 struct table_instance *ti, ··· 734 731 u32 *n_cache_hit, 735 732 u32 *index) 736 733 { 737 - u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); 734 + struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); 738 735 struct sw_flow *flow; 739 736 struct sw_flow_mask *mask; 740 737 int i; ··· 744 741 if (mask) { 745 742 flow = masked_flow_lookup(ti, key, mask, n_mask_hit); 746 743 if (flow) { 747 - u64_stats_update_begin(&ma->syncp); 748 - usage_counters[*index]++; 749 - u64_stats_update_end(&ma->syncp); 744 + u64_stats_update_begin(&stats->syncp); 745 + stats->usage_cntrs[*index]++; 746 + u64_stats_update_end(&stats->syncp); 750 747 (*n_cache_hit)++; 751 748 return flow; 752 749 } ··· 765 762 flow = masked_flow_lookup(ti, key, mask, n_mask_hit); 766 763 if (flow) { /* Found */ 767 764 *index = i; 768 - u64_stats_update_begin(&ma->syncp); 769 - usage_counters[*index]++; 770 - u64_stats_update_end(&ma->syncp); 765 + u64_stats_update_begin(&stats->syncp); 766 + stats->usage_cntrs[*index]++; 767 + u64_stats_update_end(&stats->syncp); 771 768 return flow; 772 769 } 773 770 } ··· 853 850 struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); 854 851 u32 __always_unused n_mask_hit; 855 852 u32 __always_unused n_cache_hit; 853 + struct sw_flow *flow; 856 854 u32 index = 0; 857 855 858 - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); 856 + /* This function gets called trough the netlink interface and therefore 857 + * is preemptible. However, flow_lookup() function needs to be called 858 + * with BH disabled due to CPU specific variables. 
859 + */ 860 + local_bh_disable(); 861 + flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); 862 + local_bh_enable(); 863 + return flow; 859 864 } 860 865 861 866 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, ··· 1120 1109 1121 1110 for (i = 0; i < ma->max; i++) { 1122 1111 struct sw_flow_mask *mask; 1123 - unsigned int start; 1124 1112 int cpu; 1125 1113 1126 1114 mask = rcu_dereference_ovsl(ma->masks[i]); ··· 1130 1120 masks_and_count[i].counter = 0; 1131 1121 1132 1122 for_each_possible_cpu(cpu) { 1133 - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, 1134 - cpu); 1123 + struct mask_array_stats *stats; 1124 + unsigned int start; 1135 1125 u64 counter; 1136 1126 1127 + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); 1137 1128 do { 1138 - start = u64_stats_fetch_begin_irq(&ma->syncp); 1139 - counter = usage_counters[i]; 1140 - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); 1129 + start = u64_stats_fetch_begin_irq(&stats->syncp); 1130 + counter = stats->usage_cntrs[i]; 1131 + } while (u64_stats_fetch_retry_irq(&stats->syncp, 1132 + start)); 1141 1133 1142 1134 masks_and_count[i].counter += counter; 1143 1135 }
+6 -2
net/openvswitch/flow_table.h
··· 38 38 u64 counter; 39 39 }; 40 40 41 + struct mask_array_stats { 42 + struct u64_stats_sync syncp; 43 + u64 usage_cntrs[]; 44 + }; 45 + 41 46 struct mask_array { 42 47 struct rcu_head rcu; 43 48 int count, max; 44 - u64 __percpu *masks_usage_cntr; 49 + struct mask_array_stats __percpu *masks_usage_stats; 45 50 u64 *masks_usage_zero_cntr; 46 - struct u64_stats_sync syncp; 47 51 struct sw_flow_mask __rcu *masks[]; 48 52 }; 49 53