Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sysctl-data-races'

Kuniyuki Iwashima says:

====================
sysctl: Fix data-races around ipv4_table.

A sysctl variable can be accessed concurrently, so there is always a chance
of a data race. All readers and writers therefore need some basic protection
to avoid load/store tearing.

The first half of this series changes some proc handlers used in ipv4_table
to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the
sysctl side. Then, the second half adds READ_ONCE() to the other readers
of ipv4_table.

Changes:
v2:
* Drop some changes that make backporting difficult
* First cleanup patch
* Lockless helpers and .proc_handler changes
* Drop the tracing part for .sysctl_mem
* Steve already posted a fix
* Drop int-to-bool change for cipso
* Should be posted to net-next later
* Drop proc_dobool() change
* Can be included in another series

v1: https://lore.kernel.org/netdev/20220706052130.16368-1-kuniyu@amazon.com/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+37 -26
+1 -1
Documentation/networking/ip-sysctl.rst
··· 1085 1085 cipso_cache_bucket_size - INTEGER 1086 1086 The CIPSO label cache consists of a fixed size hash table with each 1087 1087 hash bucket containing a number of cache entries. This variable limits 1088 - the number of entries in each hash bucket; the larger the value the 1088 + the number of entries in each hash bucket; the larger the value is, the 1089 1089 more CIPSO label mappings that can be cached. When the number of 1090 1090 entries in a given hash bucket reaches this limit adding new entries 1091 1091 causes the oldest entry in the bucket to be removed to make room.
+1 -1
include/net/sock.h
··· 1529 1529 /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ 1530 1530 static inline long sk_prot_mem_limits(const struct sock *sk, int index) 1531 1531 { 1532 - long val = sk->sk_prot->sysctl_mem[index]; 1532 + long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]); 1533 1533 1534 1534 #if PAGE_SIZE > SK_MEM_QUANTUM 1535 1535 val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+14 -11
kernel/sysctl.c
··· 446 446 if (*negp) { 447 447 if (*lvalp > (unsigned long) INT_MAX + 1) 448 448 return -EINVAL; 449 - *valp = -*lvalp; 449 + WRITE_ONCE(*valp, -*lvalp); 450 450 } else { 451 451 if (*lvalp > (unsigned long) INT_MAX) 452 452 return -EINVAL; 453 - *valp = *lvalp; 453 + WRITE_ONCE(*valp, *lvalp); 454 454 } 455 455 } else { 456 - int val = *valp; 456 + int val = READ_ONCE(*valp); 457 457 if (val < 0) { 458 458 *negp = true; 459 459 *lvalp = -(unsigned long)val; ··· 472 472 if (write) { 473 473 if (*lvalp > UINT_MAX) 474 474 return -EINVAL; 475 - *valp = *lvalp; 475 + WRITE_ONCE(*valp, *lvalp); 476 476 } else { 477 - unsigned int val = *valp; 477 + unsigned int val = READ_ONCE(*valp); 478 478 *lvalp = (unsigned long)val; 479 479 } 480 480 return 0; ··· 857 857 if ((param->min && *param->min > tmp) || 858 858 (param->max && *param->max < tmp)) 859 859 return -EINVAL; 860 - *valp = tmp; 860 + WRITE_ONCE(*valp, tmp); 861 861 } 862 862 863 863 return 0; ··· 923 923 (param->max && *param->max < tmp)) 924 924 return -ERANGE; 925 925 926 - *valp = tmp; 926 + WRITE_ONCE(*valp, tmp); 927 927 } 928 928 929 929 return 0; ··· 1090 1090 err = -EINVAL; 1091 1091 break; 1092 1092 } 1093 - *i = val; 1093 + WRITE_ONCE(*i, val); 1094 1094 } else { 1095 - val = convdiv * (*i) / convmul; 1095 + val = convdiv * READ_ONCE(*i) / convmul; 1096 1096 if (!first) 1097 1097 proc_put_char(&buffer, &left, '\t'); 1098 1098 proc_put_long(&buffer, &left, val, false); ··· 1173 1173 if (write) { 1174 1174 if (*lvalp > INT_MAX / HZ) 1175 1175 return 1; 1176 - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 1176 + if (*negp) 1177 + WRITE_ONCE(*valp, -*lvalp * HZ); 1178 + else 1179 + WRITE_ONCE(*valp, *lvalp * HZ); 1177 1180 } else { 1178 - int val = *valp; 1181 + int val = READ_ONCE(*valp); 1179 1182 unsigned long lval; 1180 1183 if (val < 0) { 1181 1184 *negp = true;
+7 -5
net/ipv4/cipso_ipv4.c
··· 239 239 struct cipso_v4_map_cache_entry *prev_entry = NULL; 240 240 u32 hash; 241 241 242 - if (!cipso_v4_cache_enabled) 242 + if (!READ_ONCE(cipso_v4_cache_enabled)) 243 243 return -ENOENT; 244 244 245 245 hash = cipso_v4_map_cache_hash(key, key_len); ··· 296 296 int cipso_v4_cache_add(const unsigned char *cipso_ptr, 297 297 const struct netlbl_lsm_secattr *secattr) 298 298 { 299 + int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); 299 300 int ret_val = -EPERM; 300 301 u32 bkt; 301 302 struct cipso_v4_map_cache_entry *entry = NULL; 302 303 struct cipso_v4_map_cache_entry *old_entry = NULL; 303 304 u32 cipso_ptr_len; 304 305 305 - if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0) 306 + if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) 306 307 return 0; 307 308 308 309 cipso_ptr_len = cipso_ptr[1]; ··· 323 322 324 323 bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); 325 324 spin_lock_bh(&cipso_v4_cache[bkt].lock); 326 - if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) { 325 + if (cipso_v4_cache[bkt].size < bkt_size) { 327 326 list_add(&entry->list, &cipso_v4_cache[bkt].list); 328 327 cipso_v4_cache[bkt].size += 1; 329 328 } else { ··· 1200 1199 /* This will send packets using the "optimized" format when 1201 1200 * possible as specified in section 3.4.2.6 of the 1202 1201 * CIPSO draft. */ 1203 - if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10) 1202 + if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && 1203 + ret_val <= 10) 1204 1204 tag_len = 14; 1205 1205 else 1206 1206 tag_len = 4 + ret_val; ··· 1605 1603 * all the CIPSO validations here but it doesn't 1606 1604 * really specify _exactly_ what we need to validate 1607 1605 * ... so, just make it a sysctl tunable. */ 1608 - if (cipso_v4_rbm_strictvalid) { 1606 + if (READ_ONCE(cipso_v4_rbm_strictvalid)) { 1609 1607 if (cipso_v4_map_lvl_valid(doi_def, 1610 1608 tag[3]) < 0) { 1611 1609 err_offset = opt_iter + 3;
+1 -1
net/ipv4/fib_trie.c
··· 498 498 tn = container_of(head, struct tnode, rcu)->kv; 499 499 } 500 500 501 - if (tnode_free_size >= sysctl_fib_sync_mem) { 501 + if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { 502 502 tnode_free_size = 0; 503 503 synchronize_rcu(); 504 504 }
+3 -2
net/ipv4/icmp.c
··· 253 253 spin_lock(&icmp_global.lock); 254 254 delta = min_t(u32, now - icmp_global.stamp, HZ); 255 255 if (delta >= HZ / 50) { 256 - incr = sysctl_icmp_msgs_per_sec * delta / HZ ; 256 + incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; 257 257 if (incr) 258 258 WRITE_ONCE(icmp_global.stamp, now); 259 259 } 260 - credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); 260 + credit = min_t(u32, icmp_global.credit + incr, 261 + READ_ONCE(sysctl_icmp_msgs_burst)); 261 262 if (credit) { 262 263 /* We want to use a credit of one in average, but need to randomize 263 264 * it for security reasons.
+8 -4
net/ipv4/inetpeer.c
··· 141 141 struct inet_peer *gc_stack[], 142 142 unsigned int gc_cnt) 143 143 { 144 + int peer_threshold, peer_maxttl, peer_minttl; 144 145 struct inet_peer *p; 145 146 __u32 delta, ttl; 146 147 int i; 147 148 148 - if (base->total >= inet_peer_threshold) 149 + peer_threshold = READ_ONCE(inet_peer_threshold); 150 + peer_maxttl = READ_ONCE(inet_peer_maxttl); 151 + peer_minttl = READ_ONCE(inet_peer_minttl); 152 + 153 + if (base->total >= peer_threshold) 149 154 ttl = 0; /* be aggressive */ 150 155 else 151 - ttl = inet_peer_maxttl 152 - - (inet_peer_maxttl - inet_peer_minttl) / HZ * 153 - base->total / inet_peer_threshold * HZ; 156 + ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * 157 + base->total / peer_threshold * HZ; 154 158 for (i = 0; i < gc_cnt; i++) { 155 159 p = gc_stack[i]; 156 160
+2 -1
net/ipv4/tcp.c
··· 2715 2715 2716 2716 static bool tcp_too_many_orphans(int shift) 2717 2717 { 2718 - return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans; 2718 + return READ_ONCE(tcp_orphan_cache) << shift > 2719 + READ_ONCE(sysctl_tcp_max_orphans); 2719 2720 } 2720 2721 2721 2722 bool tcp_check_oom(struct sock *sk, int shift)