Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tipc: add name distributor resiliency queue

TIPC name table updates are distributed asynchronously in a cluster,
entailing a risk of certain race conditions. E.g., if two nodes
simultaneously issue conflicting (overlapping) publications, this may
not be detected until both publications have reached a third node, in
which case one of the publications will be silently dropped on that
node. Hence, we end up with an inconsistent name table.

In most cases this conflict is just a temporary race, e.g., one
node is issuing a publication under the assumption that a previous,
conflicting, publication has already been withdrawn by the other node.
However, because of the (RTT-related) distributed update delay, this
may not yet hold true on all nodes. The symptom of this failure is a
syslog message: "tipc: Cannot publish {%u,%u,%u}, overlap error".

In this commit we add a resiliency queue at the receiving end of
the name table distributor. When insertion of an arriving publication
fails, we retain it in this queue for a short amount of time, assuming
that another update will arrive very soon and clear the conflict. If that
happens, we insert the publication; otherwise we drop it.

The (configurable) retention value defaults to 2000 ms. Since experience
shows that the situation described above is extremely rare, there is no
risk that the queue will accumulate any large number of items.

Signed-off-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Erik Hugne and committed by
David S. Miller
a5325ae5 f4ad8a4b

+95 -7
+16
Documentation/sysctl/net.txt
··· 241 241 6. TIPC 242 242 ------------------------------------------------------- 243 243 244 + tipc_rmem 245 + ---------- 246 + 244 247 The TIPC protocol now has a tunable for the receive memory, similar to the 245 248 tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max) 246 249 ··· 255 252 are scaled (shifted) versions of that same value. Note that the min value 256 253 is not at this point in time used in any meaningful way, but the triplet is 257 254 preserved in order to be consistent with things like tcp_rmem. 255 + 256 + named_timeout 257 + -------------- 258 + 259 + TIPC name table updates are distributed asynchronously in a cluster, without 260 + any form of transaction handling. This means that different race scenarios are 261 + possible. One such is that a name withdrawal sent out by one node and received 262 + by another node may arrive after a second, overlapping name publication already 263 + has been accepted from a third node, although the conflicting updates 264 + originally may have been issued in the correct sequential order. 265 + If named_timeout is nonzero, failed topology updates will be placed on a defer 266 + queue until another event arrives that clears the error, or until the timeout 267 + expires. Value is in milliseconds.
+1
net/tipc/core.h
··· 81 81 extern int tipc_max_ports __read_mostly; 82 82 extern int tipc_net_id __read_mostly; 83 83 extern int sysctl_tipc_rmem[3] __read_mostly; 84 + extern int sysctl_tipc_named_timeout __read_mostly; 84 85 85 86 /* 86 87 * Other global variables
+66 -3
net/tipc/name_distr.c
··· 1 1 /* 2 2 * net/tipc/name_distr.c: TIPC name distribution code 3 3 * 4 - * Copyright (c) 2000-2006, Ericsson AB 4 + * Copyright (c) 2000-2006, 2014, Ericsson AB 5 5 * Copyright (c) 2005, 2010-2011, Wind River Systems 6 6 * All rights reserved. 7 7 * ··· 70 70 &publ_node /* publ_lists[TIPC_NODE_SCOPE] */ 71 71 }; 72 72 73 + 74 + int sysctl_tipc_named_timeout __read_mostly = 2000; 75 + 76 + /** 77 + * struct tipc_dist_queue - queue holding deferred name table updates 78 + */ 79 + static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue); 80 + 81 + struct distr_queue_item { 82 + struct distr_item i; 83 + u32 dtype; 84 + u32 node; 85 + unsigned long expires; 86 + struct list_head next; 87 + }; 73 88 74 89 /** 75 90 * publ_to_item - add publication info to a publication message ··· 314 299 } 315 300 316 301 /** 302 + * tipc_named_add_backlog - add a failed name table update to the backlog 303 + * 304 + */ 305 + static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) 306 + { 307 + struct distr_queue_item *e; 308 + unsigned long now = get_jiffies_64(); 309 + 310 + e = kzalloc(sizeof(*e), GFP_ATOMIC); 311 + if (!e) 312 + return; 313 + e->dtype = type; 314 + e->node = node; 315 + e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); 316 + memcpy(e, i, sizeof(*i)); 317 + list_add_tail(&e->next, &tipc_dist_queue); 318 + } 319 + 320 + /** 321 + * tipc_named_process_backlog - try to process any pending name table updates 322 + * from the network. 
323 + */ 324 + void tipc_named_process_backlog(void) 325 + { 326 + struct distr_queue_item *e, *tmp; 327 + char addr[16]; 328 + unsigned long now = get_jiffies_64(); 329 + 330 + list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) { 331 + if (time_after(e->expires, now)) { 332 + if (!tipc_update_nametbl(&e->i, e->node, e->dtype)) 333 + continue; 334 + } else { 335 + tipc_addr_string_fill(addr, e->node); 336 + pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", 337 + e->dtype, ntohl(e->i.type), 338 + ntohl(e->i.lower), 339 + ntohl(e->i.upper), 340 + addr, ntohl(e->i.key)); 341 + } 342 + list_del(&e->next); 343 + kfree(e); 344 + } 345 + } 346 + 347 + /** 317 348 * tipc_named_rcv - process name table update message sent by another node 318 349 */ 319 350 void tipc_named_rcv(struct sk_buff *buf) ··· 367 306 struct tipc_msg *msg = buf_msg(buf); 368 307 struct distr_item *item = (struct distr_item *)msg_data(msg); 369 308 u32 count = msg_data_sz(msg) / ITEM_SIZE; 309 + u32 node = msg_orignode(msg); 370 310 371 311 write_lock_bh(&tipc_nametbl_lock); 372 312 while (count--) { 373 - tipc_update_nametbl(item, msg_orignode(msg), 374 - msg_type(msg)); 313 + if (!tipc_update_nametbl(item, node, msg_type(msg))) 314 + tipc_named_add_backlog(item, msg_type(msg), node); 375 315 item++; 376 316 } 317 + tipc_named_process_backlog(); 377 318 write_unlock_bh(&tipc_nametbl_lock); 378 319 kfree_skb(buf); 379 320 }
+1
net/tipc/name_distr.h
··· 73 73 void tipc_named_node_up(u32 dnode); 74 74 void tipc_named_rcv(struct sk_buff *buf); 75 75 void tipc_named_reinit(void); 76 + void tipc_named_process_backlog(void); 76 77 77 78 #endif
+4 -4
net/tipc/name_table.c
··· 261 261 262 262 /* Lower end overlaps existing entry => need an exact match */ 263 263 if ((sseq->lower != lower) || (sseq->upper != upper)) { 264 - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", 265 - type, lower, upper); 266 264 return NULL; 267 265 } 268 266 ··· 282 284 /* Fail if upper end overlaps into an existing entry */ 283 285 if ((inspos < nseq->first_free) && 284 286 (upper >= nseq->sseqs[inspos].lower)) { 285 - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", 286 - type, lower, upper); 287 287 return NULL; 288 288 } 289 289 ··· 673 677 if (likely(publ)) { 674 678 table.local_publ_count++; 675 679 buf = tipc_named_publish(publ); 680 + /* Any pending external events? */ 681 + tipc_named_process_backlog(); 676 682 } 677 683 write_unlock_bh(&tipc_nametbl_lock); 678 684 ··· 696 698 if (likely(publ)) { 697 699 table.local_publ_count--; 698 700 buf = tipc_named_withdraw(publ); 701 + /* Any pending external events? */ 702 + tipc_named_process_backlog(); 699 703 write_unlock_bh(&tipc_nametbl_lock); 700 704 list_del_init(&publ->pport_list); 701 705 kfree(publ);
+7
net/tipc/sysctl.c
··· 47 47 .mode = 0644, 48 48 .proc_handler = proc_dointvec, 49 49 }, 50 + { 51 + .procname = "named_timeout", 52 + .data = &sysctl_tipc_named_timeout, 53 + .maxlen = sizeof(sysctl_tipc_named_timeout), 54 + .mode = 0644, 55 + .proc_handler = proc_dointvec, 56 + }, 50 57 {} 51 58 }; 52 59