Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfrm: Fix RCU vs hash_resize_mutex lock inversion

xfrm_bydst_resize() calls synchronize_rcu() while holding
hash_resize_mutex. However, on PREEMPT_RT configurations,
xfrm_policy_lookup_bytype() may acquire that same mutex while running in
an RCU read side critical section. This results in a deadlock.

In fact, the scope of hash_resize_mutex goes well beyond what
xfrm_policy_lookup_bytype() needs, which is just to fetch a coherent and
stable policy for a given destination/direction, along with other details.

The lower level net->xfrm.xfrm_policy_lock, which among other things
protects per destination/direction references to policy entries, is
enough to serialize and benefit from priority inheritance against the
write side. As a bonus, it makes it officially a per network namespace
synchronization business where a policy table resize on namespace A
shouldn't block a policy lookup on namespace B.

Fixes: 77cc278f7b20 ("xfrm: policy: Use sequence counters with associated lock")
Cc: stable@vger.kernel.org
Cc: Ahmed S. Darwish <a.darwish@linutronix.de>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Varad Gautam <varad.gautam@suse.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

authored by

Frederic Weisbecker and committed by
Steffen Klassert
2580d3f4 eaf22826

+9 -9
+1
include/net/netns/xfrm.h
··· 74 74 #endif 75 75 spinlock_t xfrm_state_lock; 76 76 seqcount_spinlock_t xfrm_state_hash_generation; 77 + seqcount_spinlock_t xfrm_policy_hash_generation; 77 78 78 79 spinlock_t xfrm_policy_lock; 79 80 struct mutex xfrm_cfg_mutex;
+8 -9
net/xfrm/xfrm_policy.c
··· 155 155 __read_mostly; 156 156 157 157 static struct kmem_cache *xfrm_dst_cache __ro_after_init; 158 - static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation; 159 158 160 159 static struct rhashtable xfrm_policy_inexact_table; 161 160 static const struct rhashtable_params xfrm_pol_inexact_params; ··· 584 585 return; 585 586 586 587 spin_lock_bh(&net->xfrm.xfrm_policy_lock); 587 - write_seqcount_begin(&xfrm_policy_hash_generation); 588 + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); 588 589 589 590 odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, 590 591 lockdep_is_held(&net->xfrm.xfrm_policy_lock)); ··· 595 596 rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); 596 597 net->xfrm.policy_bydst[dir].hmask = nhashmask; 597 598 598 - write_seqcount_end(&xfrm_policy_hash_generation); 599 + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); 599 600 spin_unlock_bh(&net->xfrm.xfrm_policy_lock); 600 601 601 602 synchronize_rcu(); ··· 1244 1245 } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); 1245 1246 1246 1247 spin_lock_bh(&net->xfrm.xfrm_policy_lock); 1247 - write_seqcount_begin(&xfrm_policy_hash_generation); 1248 + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); 1248 1249 1249 1250 /* make sure that we can insert the indirect policies again before 1250 1251 * we start with destructive action. 
··· 1353 1354 1354 1355 out_unlock: 1355 1356 __xfrm_policy_inexact_flush(net); 1356 - write_seqcount_end(&xfrm_policy_hash_generation); 1357 + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); 1357 1358 spin_unlock_bh(&net->xfrm.xfrm_policy_lock); 1358 1359 1359 1360 mutex_unlock(&hash_resize_mutex); ··· 2094 2095 rcu_read_lock(); 2095 2096 retry: 2096 2097 do { 2097 - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); 2098 + sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); 2098 2099 chain = policy_hash_direct(net, daddr, saddr, family, dir); 2099 - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); 2100 + } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); 2100 2101 2101 2102 ret = NULL; 2102 2103 hlist_for_each_entry_rcu(pol, chain, bydst) { ··· 2127 2128 } 2128 2129 2129 2130 skip_inexact: 2130 - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) 2131 + if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) 2131 2132 goto retry; 2132 2133 2133 2134 if (ret && !xfrm_pol_hold_rcu(ret)) ··· 4083 4084 /* Initialize the per-net locks here */ 4084 4085 spin_lock_init(&net->xfrm.xfrm_state_lock); 4085 4086 spin_lock_init(&net->xfrm.xfrm_policy_lock); 4087 + seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); 4086 4088 mutex_init(&net->xfrm.xfrm_cfg_mutex); 4087 4089 4088 4090 rv = xfrm_statistics_init(net); ··· 4128 4128 { 4129 4129 register_pernet_subsys(&xfrm_net_ops); 4130 4130 xfrm_dev_init(); 4131 - seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex); 4132 4131 xfrm_input_init(); 4133 4132 4134 4133 #ifdef CONFIG_XFRM_ESPINTCP