Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bonding: delete migrated IP addresses from the rlb hash table

Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).

In certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.

The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.

This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.

The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).

I added three new members to struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.

When an incoming ARP packet arrives at rlb_arp_recv(),
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.

To avoid confusion, I renamed these existing fields (the first two
are in struct rlb_client_info, the third is in struct alb_bond_info):
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head

(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jiri Bohac and committed by
David S. Miller
e53665c6 567b871e

+184 -40
+157 -34
drivers/net/bonding/bond_alb.c
··· 84 84 85 85 /* Forward declaration */ 86 86 static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]); 87 + static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp); 88 + static void rlb_src_unlink(struct bonding *bond, u32 index); 89 + static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, 90 + u32 ip_dst_hash); 87 91 88 92 static inline u8 _simple_hash(const u8 *hash_start, int hash_size) 89 93 { ··· 358 354 if (!arp) 359 355 goto out; 360 356 357 + /* We received an ARP from arp->ip_src. 358 + * We might have used this IP address previously (on the bonding host 359 + * itself or on a system that is bridged together with the bond). 360 + * However, if arp->mac_src is different than what is stored in 361 + * rx_hashtbl, some other host is now using the IP and we must prevent 362 + * sending out client updates with this IP address and the old MAC 363 + * address. 364 + * Clean up all hash table entries that have this address as ip_src but 365 + * have a different mac_src. 
366 + */ 367 + rlb_purge_src_ip(bond, arp); 368 + 361 369 if (arp->op_code == htons(ARPOP_REPLY)) { 362 370 /* update rx hash table for this ARP */ 363 371 rlb_update_entry_from_arp(bond, arp); ··· 448 432 _lock_rx_hashtbl_bh(bond); 449 433 450 434 rx_hash_table = bond_info->rx_hashtbl; 451 - index = bond_info->rx_hashtbl_head; 435 + index = bond_info->rx_hashtbl_used_head; 452 436 for (; index != RLB_NULL_INDEX; index = next_index) { 453 - next_index = rx_hash_table[index].next; 437 + next_index = rx_hash_table[index].used_next; 454 438 if (rx_hash_table[index].slave == slave) { 455 439 struct slave *assigned_slave = rlb_next_rx_slave(bond); 456 440 ··· 535 519 536 520 _lock_rx_hashtbl_bh(bond); 537 521 538 - hash_index = bond_info->rx_hashtbl_head; 539 - for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) { 522 + hash_index = bond_info->rx_hashtbl_used_head; 523 + for (; hash_index != RLB_NULL_INDEX; 524 + hash_index = client_info->used_next) { 540 525 client_info = &(bond_info->rx_hashtbl[hash_index]); 541 526 if (client_info->ntt) { 542 527 rlb_update_client(client_info); ··· 565 548 566 549 _lock_rx_hashtbl_bh(bond); 567 550 568 - hash_index = bond_info->rx_hashtbl_head; 569 - for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) { 551 + hash_index = bond_info->rx_hashtbl_used_head; 552 + for (; hash_index != RLB_NULL_INDEX; 553 + hash_index = client_info->used_next) { 570 554 client_info = &(bond_info->rx_hashtbl[hash_index]); 571 555 572 556 if ((client_info->slave == slave) && ··· 596 578 597 579 _lock_rx_hashtbl(bond); 598 580 599 - hash_index = bond_info->rx_hashtbl_head; 600 - for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) { 581 + hash_index = bond_info->rx_hashtbl_used_head; 582 + for (; hash_index != RLB_NULL_INDEX; 583 + hash_index = client_info->used_next) { 601 584 client_info = &(bond_info->rx_hashtbl[hash_index]); 602 585 603 586 if (!client_info->slave) { ··· 644 625 /* update mac address from 
arp */ 645 626 memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN); 646 627 } 628 + memcpy(client_info->mac_src, arp->mac_src, ETH_ALEN); 647 629 648 630 assigned_slave = client_info->slave; 649 631 if (assigned_slave) { ··· 667 647 assigned_slave = rlb_next_rx_slave(bond); 668 648 669 649 if (assigned_slave) { 650 + if (!(client_info->assigned && 651 + client_info->ip_src == arp->ip_src)) { 652 + /* ip_src is going to be updated, 653 + * fix the src hash list 654 + */ 655 + u32 hash_src = _simple_hash((u8 *)&arp->ip_src, 656 + sizeof(arp->ip_src)); 657 + rlb_src_unlink(bond, hash_index); 658 + rlb_src_link(bond, hash_src, hash_index); 659 + } 660 + 670 661 client_info->ip_src = arp->ip_src; 671 662 client_info->ip_dst = arp->ip_dst; 672 663 /* arp->mac_dst is broadcast for arp reqeusts. ··· 685 654 * upon receiving an arp reply. 686 655 */ 687 656 memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN); 657 + memcpy(client_info->mac_src, arp->mac_src, ETH_ALEN); 688 658 client_info->slave = assigned_slave; 689 659 690 660 if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) { ··· 701 669 } 702 670 703 671 if (!client_info->assigned) { 704 - u32 prev_tbl_head = bond_info->rx_hashtbl_head; 705 - bond_info->rx_hashtbl_head = hash_index; 706 - client_info->next = prev_tbl_head; 672 + u32 prev_tbl_head = bond_info->rx_hashtbl_used_head; 673 + bond_info->rx_hashtbl_used_head = hash_index; 674 + client_info->used_next = prev_tbl_head; 707 675 if (prev_tbl_head != RLB_NULL_INDEX) { 708 - bond_info->rx_hashtbl[prev_tbl_head].prev = 676 + bond_info->rx_hashtbl[prev_tbl_head].used_prev = 709 677 hash_index; 710 678 } 711 679 client_info->assigned = 1; ··· 778 746 _lock_rx_hashtbl_bh(bond); 779 747 780 748 ntt = 0; 781 - hash_index = bond_info->rx_hashtbl_head; 782 - for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) { 749 + hash_index = bond_info->rx_hashtbl_used_head; 750 + for (; hash_index != RLB_NULL_INDEX; 751 + hash_index = 
client_info->used_next) { 783 752 client_info = &(bond_info->rx_hashtbl[hash_index]); 784 753 assigned_slave = rlb_next_rx_slave(bond); 785 754 if (assigned_slave && (client_info->slave != assigned_slave)) { ··· 798 765 } 799 766 800 767 /* Caller must hold rx_hashtbl lock */ 768 + static void rlb_init_table_entry_dst(struct rlb_client_info *entry) 769 + { 770 + entry->used_next = RLB_NULL_INDEX; 771 + entry->used_prev = RLB_NULL_INDEX; 772 + entry->assigned = 0; 773 + entry->slave = NULL; 774 + entry->tag = 0; 775 + } 776 + static void rlb_init_table_entry_src(struct rlb_client_info *entry) 777 + { 778 + entry->src_first = RLB_NULL_INDEX; 779 + entry->src_prev = RLB_NULL_INDEX; 780 + entry->src_next = RLB_NULL_INDEX; 781 + } 782 + 801 783 static void rlb_init_table_entry(struct rlb_client_info *entry) 802 784 { 803 785 memset(entry, 0, sizeof(struct rlb_client_info)); 804 - entry->next = RLB_NULL_INDEX; 805 - entry->prev = RLB_NULL_INDEX; 786 + rlb_init_table_entry_dst(entry); 787 + rlb_init_table_entry_src(entry); 788 + } 789 + 790 + static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index) 791 + { 792 + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 793 + u32 next_index = bond_info->rx_hashtbl[index].used_next; 794 + u32 prev_index = bond_info->rx_hashtbl[index].used_prev; 795 + 796 + if (index == bond_info->rx_hashtbl_used_head) 797 + bond_info->rx_hashtbl_used_head = next_index; 798 + if (prev_index != RLB_NULL_INDEX) 799 + bond_info->rx_hashtbl[prev_index].used_next = next_index; 800 + if (next_index != RLB_NULL_INDEX) 801 + bond_info->rx_hashtbl[next_index].used_prev = prev_index; 802 + } 803 + 804 + /* unlink a rlb hash table entry from the src list */ 805 + static void rlb_src_unlink(struct bonding *bond, u32 index) 806 + { 807 + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 808 + u32 next_index = bond_info->rx_hashtbl[index].src_next; 809 + u32 prev_index = bond_info->rx_hashtbl[index].src_prev; 810 + 811 + 
bond_info->rx_hashtbl[index].src_next = RLB_NULL_INDEX; 812 + bond_info->rx_hashtbl[index].src_prev = RLB_NULL_INDEX; 813 + 814 + if (next_index != RLB_NULL_INDEX) 815 + bond_info->rx_hashtbl[next_index].src_prev = prev_index; 816 + 817 + if (prev_index == RLB_NULL_INDEX) 818 + return; 819 + 820 + /* is prev_index pointing to the head of this list? */ 821 + if (bond_info->rx_hashtbl[prev_index].src_first == index) 822 + bond_info->rx_hashtbl[prev_index].src_first = next_index; 823 + else 824 + bond_info->rx_hashtbl[prev_index].src_next = next_index; 825 + 826 + } 827 + 828 + static void rlb_delete_table_entry(struct bonding *bond, u32 index) 829 + { 830 + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 831 + struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); 832 + 833 + rlb_delete_table_entry_dst(bond, index); 834 + rlb_init_table_entry_dst(entry); 835 + 836 + rlb_src_unlink(bond, index); 837 + } 838 + 839 + /* add the rx_hashtbl[ip_dst_hash] entry to the list 840 + * of entries with identical ip_src_hash 841 + */ 842 + static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash) 843 + { 844 + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 845 + u32 next; 846 + 847 + bond_info->rx_hashtbl[ip_dst_hash].src_prev = ip_src_hash; 848 + next = bond_info->rx_hashtbl[ip_src_hash].src_first; 849 + bond_info->rx_hashtbl[ip_dst_hash].src_next = next; 850 + if (next != RLB_NULL_INDEX) 851 + bond_info->rx_hashtbl[next].src_prev = ip_dst_hash; 852 + bond_info->rx_hashtbl[ip_src_hash].src_first = ip_dst_hash; 853 + } 854 + 855 + /* deletes all rx_hashtbl entries with arp->ip_src if their mac_src does 856 + * not match arp->mac_src */ 857 + static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp) 858 + { 859 + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); 860 + u32 ip_src_hash = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src)); 861 + u32 index; 862 + 863 + _lock_rx_hashtbl_bh(bond); 864 + 865 + 
index = bond_info->rx_hashtbl[ip_src_hash].src_first; 866 + while (index != RLB_NULL_INDEX) { 867 + struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); 868 + u32 next_index = entry->src_next; 869 + if (entry->ip_src == arp->ip_src && 870 + !ether_addr_equal_64bits(arp->mac_src, entry->mac_src)) 871 + rlb_delete_table_entry(bond, index); 872 + index = next_index; 873 + } 874 + _unlock_rx_hashtbl_bh(bond); 806 875 } 807 876 808 877 static int rlb_initialize(struct bonding *bond) ··· 922 787 923 788 bond_info->rx_hashtbl = new_hashtbl; 924 789 925 - bond_info->rx_hashtbl_head = RLB_NULL_INDEX; 790 + bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; 926 791 927 792 for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) { 928 793 rlb_init_table_entry(bond_info->rx_hashtbl + i); ··· 944 809 945 810 kfree(bond_info->rx_hashtbl); 946 811 bond_info->rx_hashtbl = NULL; 947 - bond_info->rx_hashtbl_head = RLB_NULL_INDEX; 812 + bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; 948 813 949 814 _unlock_rx_hashtbl_bh(bond); 950 815 } ··· 956 821 957 822 _lock_rx_hashtbl_bh(bond); 958 823 959 - curr_index = bond_info->rx_hashtbl_head; 824 + curr_index = bond_info->rx_hashtbl_used_head; 960 825 while (curr_index != RLB_NULL_INDEX) { 961 826 struct rlb_client_info *curr = &(bond_info->rx_hashtbl[curr_index]); 962 - u32 next_index = bond_info->rx_hashtbl[curr_index].next; 963 - u32 prev_index = bond_info->rx_hashtbl[curr_index].prev; 827 + u32 next_index = bond_info->rx_hashtbl[curr_index].used_next; 964 828 965 - if (curr->tag && (curr->vlan_id == vlan_id)) { 966 - if (curr_index == bond_info->rx_hashtbl_head) { 967 - bond_info->rx_hashtbl_head = next_index; 968 - } 969 - if (prev_index != RLB_NULL_INDEX) { 970 - bond_info->rx_hashtbl[prev_index].next = next_index; 971 - } 972 - if (next_index != RLB_NULL_INDEX) { 973 - bond_info->rx_hashtbl[next_index].prev = prev_index; 974 - } 975 - 976 - rlb_init_table_entry(curr); 977 - } 829 + if (curr->tag && (curr->vlan_id == vlan_id)) 830 + 
rlb_delete_table_entry(bond, curr_index); 978 831 979 832 curr_index = next_index; 980 833 }
+24 -4
drivers/net/bonding/bond_alb.h
··· 94 94 95 95 /* ------------------------------------------------------------------------- 96 96 * struct rlb_client_info contains all info related to a specific rx client 97 - * connection. This is the Clients Hash Table entry struct 97 + * connection. This is the Clients Hash Table entry struct. 98 + * Note that this is not a proper hash table; if a new client's IP address 99 + * hash collides with an existing client entry, the old entry is replaced. 100 + * 101 + * There is a linked list (linked by the used_next and used_prev members) 102 + * linking all the used entries of the hash table. This allows updating 103 + * all the clients without walking over all the unused elements of the table. 104 + * 105 + * There are also linked lists of entries with identical hash(ip_src). These 106 + * allow cleaning up the table from ip_src<->mac_src associations that have 107 + * become outdated and would cause sending out invalid ARP updates to the 108 + * network. These are linked by the (src_next and src_prev members). 
98 109 * ------------------------------------------------------------------------- 99 110 */ 100 111 struct rlb_client_info { 101 112 __be32 ip_src; /* the server IP address */ 102 113 __be32 ip_dst; /* the client IP address */ 114 + u8 mac_src[ETH_ALEN]; /* the server MAC address */ 103 115 u8 mac_dst[ETH_ALEN]; /* the client MAC address */ 104 - u32 next; /* The next Hash table entry index */ 105 - u32 prev; /* The previous Hash table entry index */ 116 + 117 + /* list of used hash table entries, starting at rx_hashtbl_used_head */ 118 + u32 used_next; 119 + u32 used_prev; 120 + 121 + /* ip_src based hashing */ 122 + u32 src_next; /* next entry with same hash(ip_src) */ 123 + u32 src_prev; /* prev entry with same hash(ip_src) */ 124 + u32 src_first; /* first entry with hash(ip_src) == this entry's index */ 125 + 106 126 u8 assigned; /* checking whether this entry is assigned */ 107 127 u8 ntt; /* flag - need to transmit client info */ 108 128 struct slave *slave; /* the slave assigned to this client */ ··· 151 131 int rlb_enabled; 152 132 struct rlb_client_info *rx_hashtbl; /* Receive hash table */ 153 133 spinlock_t rx_hashtbl_lock; 154 - u32 rx_hashtbl_head; 134 + u32 rx_hashtbl_used_head; 155 135 u8 rx_ntt; /* flag - need to transmit 156 136 * to all rx clients 157 137 */
+3 -2
drivers/net/bonding/bond_debugfs.c
··· 31 31 32 32 spin_lock_bh(&(BOND_ALB_INFO(bond).rx_hashtbl_lock)); 33 33 34 - hash_index = bond_info->rx_hashtbl_head; 35 - for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) { 34 + hash_index = bond_info->rx_hashtbl_used_head; 35 + for (; hash_index != RLB_NULL_INDEX; 36 + hash_index = client_info->used_next) { 36 37 client_info = &(bond_info->rx_hashtbl[hash_index]); 37 38 seq_printf(m, "%-15pI4 %-15pI4 %-17pM %s\n", 38 39 &client_info->ip_src,