Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipvs: do not disable bh for long time

We used a global BH disable in LOCAL_OUT hook.
Add _bh suffix to all places that need it and remove
the disabling from LOCAL_OUT and sync code.

Functions like ip_defrag need protection from
BH, so add it. As for nf_nat_mangle_tcp_packet, it needs
RCU lock.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

Authored by Julian Anastasov; committed by Pablo Neira Ayuso.
ac69269a ceec4c38

+64 -87
+2 -2
net/netfilter/ipvs/ip_vs_app.c
··· 352 352 unsigned int flag, __u32 seq, int diff) 353 353 { 354 354 /* spinlock is to keep updating cp->flags atomic */ 355 - spin_lock(&cp->lock); 355 + spin_lock_bh(&cp->lock); 356 356 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { 357 357 vseq->previous_delta = vseq->delta; 358 358 vseq->delta += diff; 359 359 vseq->init_seq = seq; 360 360 cp->flags |= flag; 361 361 } 362 - spin_unlock(&cp->lock); 362 + spin_unlock_bh(&cp->lock); 363 363 } 364 364 365 365 static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+15 -15
net/netfilter/ipvs/ip_vs_conn.c
··· 86 86 static struct ip_vs_aligned_lock 87 87 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 88 88 89 - static inline void ct_write_lock(unsigned int key) 89 + static inline void ct_write_lock_bh(unsigned int key) 90 90 { 91 - spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 91 + spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 92 92 } 93 93 94 - static inline void ct_write_unlock(unsigned int key) 94 + static inline void ct_write_unlock_bh(unsigned int key) 95 95 { 96 - spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 96 + spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 97 97 } 98 98 99 99 ··· 167 167 /* Hash by protocol, client address and port */ 168 168 hash = ip_vs_conn_hashkey_conn(cp); 169 169 170 - ct_write_lock(hash); 170 + ct_write_lock_bh(hash); 171 171 spin_lock(&cp->lock); 172 172 173 173 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { ··· 182 182 } 183 183 184 184 spin_unlock(&cp->lock); 185 - ct_write_unlock(hash); 185 + ct_write_unlock_bh(hash); 186 186 187 187 return ret; 188 188 } ··· 200 200 /* unhash it and decrease its reference counter */ 201 201 hash = ip_vs_conn_hashkey_conn(cp); 202 202 203 - ct_write_lock(hash); 203 + ct_write_lock_bh(hash); 204 204 spin_lock(&cp->lock); 205 205 206 206 if (cp->flags & IP_VS_CONN_F_HASHED) { ··· 212 212 ret = 0; 213 213 214 214 spin_unlock(&cp->lock); 215 - ct_write_unlock(hash); 215 + ct_write_unlock_bh(hash); 216 216 217 217 return ret; 218 218 } ··· 227 227 228 228 hash = ip_vs_conn_hashkey_conn(cp); 229 229 230 - ct_write_lock(hash); 230 + ct_write_lock_bh(hash); 231 231 spin_lock(&cp->lock); 232 232 233 233 if (cp->flags & IP_VS_CONN_F_HASHED) { ··· 242 242 ret = atomic_read(&cp->refcnt) ? 
false : true; 243 243 244 244 spin_unlock(&cp->lock); 245 - ct_write_unlock(hash); 245 + ct_write_unlock_bh(hash); 246 246 247 247 return ret; 248 248 } ··· 462 462 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 463 463 { 464 464 if (ip_vs_conn_unhash(cp)) { 465 - spin_lock(&cp->lock); 465 + spin_lock_bh(&cp->lock); 466 466 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 467 467 atomic_dec(&ip_vs_conn_no_cport_cnt); 468 468 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 469 469 cp->cport = cport; 470 470 } 471 - spin_unlock(&cp->lock); 471 + spin_unlock_bh(&cp->lock); 472 472 473 473 /* hash on new dport */ 474 474 ip_vs_conn_hash(cp); ··· 622 622 if (dest) { 623 623 struct ip_vs_proto_data *pd; 624 624 625 - spin_lock(&cp->lock); 625 + spin_lock_bh(&cp->lock); 626 626 if (cp->dest) { 627 - spin_unlock(&cp->lock); 627 + spin_unlock_bh(&cp->lock); 628 628 rcu_read_unlock(); 629 629 return; 630 630 } ··· 635 635 ip_vs_unbind_app(cp); 636 636 637 637 ip_vs_bind_dest(cp, dest); 638 - spin_unlock(&cp->lock); 638 + spin_unlock_bh(&cp->lock); 639 639 640 640 /* Update its packet transmitter */ 641 641 cp->packet_xmit = NULL;
+8 -29
net/netfilter/ipvs/ip_vs_core.c
··· 638 638 639 639 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) 640 640 { 641 - int err = ip_defrag(skb, user); 641 + int err; 642 642 643 + local_bh_disable(); 644 + err = ip_defrag(skb, user); 645 + local_bh_enable(); 643 646 if (!err) 644 647 ip_send_check(ip_hdr(skb)); 645 648 ··· 1220 1217 const struct net_device *in, const struct net_device *out, 1221 1218 int (*okfn)(struct sk_buff *)) 1222 1219 { 1223 - unsigned int verdict; 1224 - 1225 - /* Disable BH in LOCAL_OUT until all places are fixed */ 1226 - local_bh_disable(); 1227 - verdict = ip_vs_out(hooknum, skb, AF_INET); 1228 - local_bh_enable(); 1229 - return verdict; 1220 + return ip_vs_out(hooknum, skb, AF_INET); 1230 1221 } 1231 1222 1232 1223 #ifdef CONFIG_IP_VS_IPV6 ··· 1247 1250 const struct net_device *in, const struct net_device *out, 1248 1251 int (*okfn)(struct sk_buff *)) 1249 1252 { 1250 - unsigned int verdict; 1251 - 1252 - /* Disable BH in LOCAL_OUT until all places are fixed */ 1253 - local_bh_disable(); 1254 - verdict = ip_vs_out(hooknum, skb, AF_INET6); 1255 - local_bh_enable(); 1256 - return verdict; 1253 + return ip_vs_out(hooknum, skb, AF_INET6); 1257 1254 } 1258 1255 1259 1256 #endif ··· 1705 1714 const struct net_device *in, const struct net_device *out, 1706 1715 int (*okfn)(struct sk_buff *)) 1707 1716 { 1708 - unsigned int verdict; 1709 - 1710 - /* Disable BH in LOCAL_OUT until all places are fixed */ 1711 - local_bh_disable(); 1712 - verdict = ip_vs_in(hooknum, skb, AF_INET); 1713 - local_bh_enable(); 1714 - return verdict; 1717 + return ip_vs_in(hooknum, skb, AF_INET); 1715 1718 } 1716 1719 1717 1720 #ifdef CONFIG_IP_VS_IPV6 ··· 1764 1779 const struct net_device *in, const struct net_device *out, 1765 1780 int (*okfn)(struct sk_buff *)) 1766 1781 { 1767 - unsigned int verdict; 1768 - 1769 - /* Disable BH in LOCAL_OUT until all places are fixed */ 1770 - local_bh_disable(); 1771 - verdict = ip_vs_in(hooknum, skb, AF_INET6); 1772 - local_bh_enable(); 
1773 - return verdict; 1782 + return ip_vs_in(hooknum, skb, AF_INET6); 1774 1783 } 1775 1784 1776 1785 #endif
+2
net/netfilter/ipvs/ip_vs_ftp.c
··· 267 267 * hopefully it will succeed on the retransmitted 268 268 * packet. 269 269 */ 270 + rcu_read_lock(); 270 271 ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 271 272 iph->ihl * 4, 272 273 start-data, end-start, 273 274 buf, buf_len); 275 + rcu_read_unlock(); 274 276 if (ret) { 275 277 ip_vs_nfct_expect_related(skb, ct, n_cp, 276 278 IPPROTO_TCP, 0, 0);
+2 -2
net/netfilter/ipvs/ip_vs_lblc.c
··· 527 527 } 528 528 529 529 /* If we fail to create a cache entry, we'll just use the valid dest */ 530 - spin_lock(&svc->sched_lock); 530 + spin_lock_bh(&svc->sched_lock); 531 531 if (!tbl->dead) 532 532 ip_vs_lblc_new(tbl, &iph.daddr, dest); 533 - spin_unlock(&svc->sched_lock); 533 + spin_unlock_bh(&svc->sched_lock); 534 534 535 535 out: 536 536 IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
+6 -6
net/netfilter/ipvs/ip_vs_lblcr.c
··· 678 678 if (atomic_read(&en->set.size) > 1 && 679 679 time_after(jiffies, en->set.lastmod + 680 680 sysctl_lblcr_expiration(svc))) { 681 - spin_lock(&svc->sched_lock); 681 + spin_lock_bh(&svc->sched_lock); 682 682 if (atomic_read(&en->set.size) > 1) { 683 683 struct ip_vs_dest *m; 684 684 ··· 686 686 if (m) 687 687 ip_vs_dest_set_erase(&en->set, m); 688 688 } 689 - spin_unlock(&svc->sched_lock); 689 + spin_unlock_bh(&svc->sched_lock); 690 690 } 691 691 692 692 /* If the destination is not overloaded, use it */ ··· 701 701 } 702 702 703 703 /* Update our cache entry */ 704 - spin_lock(&svc->sched_lock); 704 + spin_lock_bh(&svc->sched_lock); 705 705 if (!tbl->dead) 706 706 ip_vs_dest_set_insert(&en->set, dest, true); 707 - spin_unlock(&svc->sched_lock); 707 + spin_unlock_bh(&svc->sched_lock); 708 708 goto out; 709 709 } 710 710 ··· 716 716 } 717 717 718 718 /* If we fail to create a cache entry, we'll just use the valid dest */ 719 - spin_lock(&svc->sched_lock); 719 + spin_lock_bh(&svc->sched_lock); 720 720 if (!tbl->dead) 721 721 ip_vs_lblcr_new(tbl, &iph.daddr, dest); 722 - spin_unlock(&svc->sched_lock); 722 + spin_unlock_bh(&svc->sched_lock); 723 723 724 724 out: 725 725 IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
+2 -2
net/netfilter/ipvs/ip_vs_proto_sctp.c
··· 994 994 sctp_state_transition(struct ip_vs_conn *cp, int direction, 995 995 const struct sk_buff *skb, struct ip_vs_proto_data *pd) 996 996 { 997 - spin_lock(&cp->lock); 997 + spin_lock_bh(&cp->lock); 998 998 set_sctp_state(pd, cp, direction, skb); 999 - spin_unlock(&cp->lock); 999 + spin_unlock_bh(&cp->lock); 1000 1000 } 1001 1001 1002 1002 static inline __u16 sctp_app_hashkey(__be16 port)
+4 -4
net/netfilter/ipvs/ip_vs_proto_tcp.c
··· 557 557 if (th == NULL) 558 558 return; 559 559 560 - spin_lock(&cp->lock); 560 + spin_lock_bh(&cp->lock); 561 561 set_tcp_state(pd, cp, direction, th); 562 - spin_unlock(&cp->lock); 562 + spin_unlock_bh(&cp->lock); 563 563 } 564 564 565 565 static inline __u16 tcp_app_hashkey(__be16 port) ··· 655 655 { 656 656 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); 657 657 658 - spin_lock(&cp->lock); 658 + spin_lock_bh(&cp->lock); 659 659 cp->state = IP_VS_TCP_S_LISTEN; 660 660 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] 661 661 : tcp_timeouts[IP_VS_TCP_S_LISTEN]); 662 - spin_unlock(&cp->lock); 662 + spin_unlock_bh(&cp->lock); 663 663 } 664 664 665 665 /* ---------------------------------------------
+3 -3
net/netfilter/ipvs/ip_vs_rr.c
··· 63 63 64 64 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); 65 65 66 - spin_lock(&svc->sched_lock); 66 + spin_lock_bh(&svc->sched_lock); 67 67 p = (struct list_head *) svc->sched_data; 68 68 last = dest = list_entry(p, struct ip_vs_dest, n_list); 69 69 ··· 85 85 } while (pass < 2 && p != &svc->destinations); 86 86 87 87 stop: 88 - spin_unlock(&svc->sched_lock); 88 + spin_unlock_bh(&svc->sched_lock); 89 89 ip_vs_scheduler_err(svc, "no destination available"); 90 90 return NULL; 91 91 92 92 out: 93 93 svc->sched_data = &dest->n_list; 94 - spin_unlock(&svc->sched_lock); 94 + spin_unlock_bh(&svc->sched_lock); 95 95 IP_VS_DBG_BUF(6, "RR: server %s:%u " 96 96 "activeconns %d refcnt %d weight %d\n", 97 97 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+10 -14
net/netfilter/ipvs/ip_vs_sync.c
··· 531 531 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 532 532 return; 533 533 534 - spin_lock(&ipvs->sync_buff_lock); 534 + spin_lock_bh(&ipvs->sync_buff_lock); 535 535 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 536 - spin_unlock(&ipvs->sync_buff_lock); 536 + spin_unlock_bh(&ipvs->sync_buff_lock); 537 537 return; 538 538 } 539 539 ··· 552 552 if (!buff) { 553 553 buff = ip_vs_sync_buff_create_v0(ipvs); 554 554 if (!buff) { 555 - spin_unlock(&ipvs->sync_buff_lock); 555 + spin_unlock_bh(&ipvs->sync_buff_lock); 556 556 pr_err("ip_vs_sync_buff_create failed.\n"); 557 557 return; 558 558 } ··· 590 590 sb_queue_tail(ipvs, ms); 591 591 ms->sync_buff = NULL; 592 592 } 593 - spin_unlock(&ipvs->sync_buff_lock); 593 + spin_unlock_bh(&ipvs->sync_buff_lock); 594 594 595 595 /* synchronize its controller if it has */ 596 596 cp = cp->control; ··· 641 641 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 642 642 } 643 643 644 - spin_lock(&ipvs->sync_buff_lock); 644 + spin_lock_bh(&ipvs->sync_buff_lock); 645 645 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 646 - spin_unlock(&ipvs->sync_buff_lock); 646 + spin_unlock_bh(&ipvs->sync_buff_lock); 647 647 return; 648 648 } 649 649 ··· 683 683 if (!buff) { 684 684 buff = ip_vs_sync_buff_create(ipvs); 685 685 if (!buff) { 686 - spin_unlock(&ipvs->sync_buff_lock); 686 + spin_unlock_bh(&ipvs->sync_buff_lock); 687 687 pr_err("ip_vs_sync_buff_create failed.\n"); 688 688 return; 689 689 } ··· 750 750 } 751 751 } 752 752 753 - spin_unlock(&ipvs->sync_buff_lock); 753 + spin_unlock_bh(&ipvs->sync_buff_lock); 754 754 755 755 control: 756 756 /* synchronize its controller if it has */ ··· 843 843 kfree(param->pe_data); 844 844 845 845 dest = cp->dest; 846 - spin_lock(&cp->lock); 846 + spin_lock_bh(&cp->lock); 847 847 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 848 848 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 849 849 if (flags & IP_VS_CONN_F_INACTIVE) { ··· 857 857 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 858 858 flags |= 
cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 859 859 cp->flags = flags; 860 - spin_unlock(&cp->lock); 860 + spin_unlock_bh(&cp->lock); 861 861 if (!dest) 862 862 ip_vs_try_bind_dest(cp); 863 863 } else { ··· 1689 1689 break; 1690 1690 } 1691 1691 1692 - /* disable bottom half, because it accesses the data 1693 - shared by softirq while getting/creating conns */ 1694 - local_bh_disable(); 1695 1692 ip_vs_process_message(tinfo->net, tinfo->buf, len); 1696 - local_bh_enable(); 1697 1693 } 1698 1694 } 1699 1695
+2 -2
net/netfilter/ipvs/ip_vs_wrr.c
··· 170 170 171 171 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); 172 172 173 - spin_lock(&svc->sched_lock); 173 + spin_lock_bh(&svc->sched_lock); 174 174 dest = mark->cl; 175 175 /* No available dests? */ 176 176 if (mark->mw == 0) ··· 222 222 mark->cl = dest; 223 223 224 224 out: 225 - spin_unlock(&svc->sched_lock); 225 + spin_unlock_bh(&svc->sched_lock); 226 226 return dest; 227 227 228 228 err_noavail:
+8 -8
net/netfilter/ipvs/ip_vs_xmit.c
··· 177 177 rt = (struct rtable *) dest_dst->dst_cache; 178 178 else { 179 179 dest_dst = ip_vs_dest_dst_alloc(); 180 - spin_lock(&dest->dst_lock); 180 + spin_lock_bh(&dest->dst_lock); 181 181 if (!dest_dst) { 182 182 __ip_vs_dst_set(dest, NULL, NULL, 0); 183 - spin_unlock(&dest->dst_lock); 183 + spin_unlock_bh(&dest->dst_lock); 184 184 goto err_unreach; 185 185 } 186 186 rt = do_output_route4(net, dest->addr.ip, rt_mode, 187 187 &dest_dst->dst_saddr.ip); 188 188 if (!rt) { 189 189 __ip_vs_dst_set(dest, NULL, NULL, 0); 190 - spin_unlock(&dest->dst_lock); 190 + spin_unlock_bh(&dest->dst_lock); 191 191 ip_vs_dest_dst_free(dest_dst); 192 192 goto err_unreach; 193 193 } 194 194 __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); 195 - spin_unlock(&dest->dst_lock); 195 + spin_unlock_bh(&dest->dst_lock); 196 196 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", 197 197 &dest->addr.ip, &dest_dst->dst_saddr.ip, 198 198 atomic_read(&rt->dst.__refcnt)); ··· 358 358 u32 cookie; 359 359 360 360 dest_dst = ip_vs_dest_dst_alloc(); 361 - spin_lock(&dest->dst_lock); 361 + spin_lock_bh(&dest->dst_lock); 362 362 if (!dest_dst) { 363 363 __ip_vs_dst_set(dest, NULL, NULL, 0); 364 - spin_unlock(&dest->dst_lock); 364 + spin_unlock_bh(&dest->dst_lock); 365 365 goto err_unreach; 366 366 } 367 367 dst = __ip_vs_route_output_v6(net, &dest->addr.in6, ··· 369 369 do_xfrm); 370 370 if (!dst) { 371 371 __ip_vs_dst_set(dest, NULL, NULL, 0); 372 - spin_unlock(&dest->dst_lock); 372 + spin_unlock_bh(&dest->dst_lock); 373 373 ip_vs_dest_dst_free(dest_dst); 374 374 goto err_unreach; 375 375 } 376 376 rt = (struct rt6_info *) dst; 377 377 cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; 378 378 __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); 379 - spin_unlock(&dest->dst_lock); 379 + spin_unlock_bh(&dest->dst_lock); 380 380 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", 381 381 &dest->addr.in6, &dest_dst->dst_saddr.in6, 382 382 atomic_read(&rt->dst.__refcnt));