Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dlm: use rsbtbl as resource directory

Remove the dir hash table (dirtbl), and use
the rsb hash table (rsbtbl) as the resource
directory. The dirtbl has always been an
unnecessary duplication of the information
already held in the rsbtbl.

This improves efficiency by using a single rsbtbl
lookup in many cases where both rsbtbl and dirtbl
lookups were needed previously.

This eliminates the need to handle cases of rsbtbl
and dirtbl being out of sync.

In many cases there will be memory savings because
the dir hash table no longer exists.

Signed-off-by: David Teigland <teigland@redhat.com>

+1227 -600
-7
fs/dlm/config.c
··· 96 96 unsigned int cl_tcp_port; 97 97 unsigned int cl_buffer_size; 98 98 unsigned int cl_rsbtbl_size; 99 - unsigned int cl_dirtbl_size; 100 99 unsigned int cl_recover_timer; 101 100 unsigned int cl_toss_secs; 102 101 unsigned int cl_scan_secs; ··· 112 113 CLUSTER_ATTR_TCP_PORT = 0, 113 114 CLUSTER_ATTR_BUFFER_SIZE, 114 115 CLUSTER_ATTR_RSBTBL_SIZE, 115 - CLUSTER_ATTR_DIRTBL_SIZE, 116 116 CLUSTER_ATTR_RECOVER_TIMER, 117 117 CLUSTER_ATTR_TOSS_SECS, 118 118 CLUSTER_ATTR_SCAN_SECS, ··· 187 189 CLUSTER_ATTR(tcp_port, 1); 188 190 CLUSTER_ATTR(buffer_size, 1); 189 191 CLUSTER_ATTR(rsbtbl_size, 1); 190 - CLUSTER_ATTR(dirtbl_size, 1); 191 192 CLUSTER_ATTR(recover_timer, 1); 192 193 CLUSTER_ATTR(toss_secs, 1); 193 194 CLUSTER_ATTR(scan_secs, 1); ··· 201 204 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 202 205 [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, 203 206 [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, 204 - [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, 205 207 [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, 206 208 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, 207 209 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, ··· 474 478 cl->cl_tcp_port = dlm_config.ci_tcp_port; 475 479 cl->cl_buffer_size = dlm_config.ci_buffer_size; 476 480 cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; 477 - cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; 478 481 cl->cl_recover_timer = dlm_config.ci_recover_timer; 479 482 cl->cl_toss_secs = dlm_config.ci_toss_secs; 480 483 cl->cl_scan_secs = dlm_config.ci_scan_secs; ··· 1045 1050 #define DEFAULT_TCP_PORT 21064 1046 1051 #define DEFAULT_BUFFER_SIZE 4096 1047 1052 #define DEFAULT_RSBTBL_SIZE 1024 1048 - #define DEFAULT_DIRTBL_SIZE 1024 1049 1053 #define DEFAULT_RECOVER_TIMER 5 1050 1054 #define DEFAULT_TOSS_SECS 10 1051 1055 #define DEFAULT_SCAN_SECS 5 ··· 1060 1066 .ci_tcp_port = DEFAULT_TCP_PORT, 1061 1067 .ci_buffer_size = DEFAULT_BUFFER_SIZE, 1062 
1068 .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, 1063 - .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, 1064 1069 .ci_recover_timer = DEFAULT_RECOVER_TIMER, 1065 1070 .ci_toss_secs = DEFAULT_TOSS_SECS, 1066 1071 .ci_scan_secs = DEFAULT_SCAN_SECS,
-1
fs/dlm/config.h
··· 27 27 int ci_tcp_port; 28 28 int ci_buffer_size; 29 29 int ci_rsbtbl_size; 30 - int ci_dirtbl_size; 31 30 int ci_recover_timer; 32 31 int ci_toss_secs; 33 32 int ci_scan_secs;
+96 -7
fs/dlm/debug_fs.c
··· 344 344 return rv; 345 345 } 346 346 347 + static int print_format4(struct dlm_rsb *r, struct seq_file *s) 348 + { 349 + int our_nodeid = dlm_our_nodeid(); 350 + int print_name = 1; 351 + int i, rv; 352 + 353 + lock_rsb(r); 354 + 355 + rv = seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ", 356 + r, 357 + r->res_nodeid, 358 + r->res_master_nodeid, 359 + r->res_dir_nodeid, 360 + our_nodeid, 361 + r->res_toss_time, 362 + r->res_flags, 363 + r->res_length); 364 + if (rv) 365 + goto out; 366 + 367 + for (i = 0; i < r->res_length; i++) { 368 + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) 369 + print_name = 0; 370 + } 371 + 372 + seq_printf(s, "%s", print_name ? "str " : "hex"); 373 + 374 + for (i = 0; i < r->res_length; i++) { 375 + if (print_name) 376 + seq_printf(s, "%c", r->res_name[i]); 377 + else 378 + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); 379 + } 380 + rv = seq_printf(s, "\n"); 381 + out: 382 + unlock_rsb(r); 383 + return rv; 384 + } 385 + 347 386 struct rsbtbl_iter { 348 387 struct dlm_rsb *rsb; 349 388 unsigned bucket; ··· 421 382 } 422 383 rv = print_format3(ri->rsb, seq); 423 384 break; 385 + case 4: 386 + if (ri->header) { 387 + seq_printf(seq, "version 4 rsb 2\n"); 388 + ri->header = 0; 389 + } 390 + rv = print_format4(ri->rsb, seq); 391 + break; 424 392 } 425 393 426 394 return rv; ··· 436 390 static const struct seq_operations format1_seq_ops; 437 391 static const struct seq_operations format2_seq_ops; 438 392 static const struct seq_operations format3_seq_ops; 393 + static const struct seq_operations format4_seq_ops; 439 394 440 395 static void *table_seq_start(struct seq_file *seq, loff_t *pos) 441 396 { 397 + struct rb_root *tree; 442 398 struct rb_node *node; 443 399 struct dlm_ls *ls = seq->private; 444 400 struct rsbtbl_iter *ri; 445 401 struct dlm_rsb *r; 446 402 loff_t n = *pos; 447 403 unsigned bucket, entry; 404 + int toss = (seq->op == &format4_seq_ops); 448 405 449 406 bucket = n >> 32; 450 407 entry = n & ((1LL 
<< 32) - 1); ··· 466 417 ri->format = 2; 467 418 if (seq->op == &format3_seq_ops) 468 419 ri->format = 3; 420 + if (seq->op == &format4_seq_ops) 421 + ri->format = 4; 422 + 423 + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; 469 424 470 425 spin_lock(&ls->ls_rsbtbl[bucket].lock); 471 - if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { 472 - for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node; 473 - node = rb_next(node)) { 426 + if (!RB_EMPTY_ROOT(tree)) { 427 + for (node = rb_first(tree); node; node = rb_next(node)) { 474 428 r = rb_entry(node, struct dlm_rsb, res_hashnode); 475 429 if (!entry--) { 476 430 dlm_hold_rsb(r); ··· 501 449 kfree(ri); 502 450 return NULL; 503 451 } 452 + tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; 504 453 505 454 spin_lock(&ls->ls_rsbtbl[bucket].lock); 506 - if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { 507 - node = rb_first(&ls->ls_rsbtbl[bucket].keep); 455 + if (!RB_EMPTY_ROOT(tree)) { 456 + node = rb_first(tree); 508 457 r = rb_entry(node, struct dlm_rsb, res_hashnode); 509 458 dlm_hold_rsb(r); 510 459 ri->rsb = r; ··· 522 469 { 523 470 struct dlm_ls *ls = seq->private; 524 471 struct rsbtbl_iter *ri = iter_ptr; 472 + struct rb_root *tree; 525 473 struct rb_node *next; 526 474 struct dlm_rsb *r, *rp; 527 475 loff_t n = *pos; 528 476 unsigned bucket; 477 + int toss = (seq->op == &format4_seq_ops); 529 478 530 479 bucket = n >> 32; 531 480 ··· 566 511 kfree(ri); 567 512 return NULL; 568 513 } 514 + tree = toss ? 
&ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; 569 515 570 516 spin_lock(&ls->ls_rsbtbl[bucket].lock); 571 - if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { 572 - next = rb_first(&ls->ls_rsbtbl[bucket].keep); 517 + if (!RB_EMPTY_ROOT(tree)) { 518 + next = rb_first(tree); 573 519 r = rb_entry(next, struct dlm_rsb, res_hashnode); 574 520 dlm_hold_rsb(r); 575 521 ri->rsb = r; ··· 614 558 .show = table_seq_show, 615 559 }; 616 560 561 + static const struct seq_operations format4_seq_ops = { 562 + .start = table_seq_start, 563 + .next = table_seq_next, 564 + .stop = table_seq_stop, 565 + .show = table_seq_show, 566 + }; 567 + 617 568 static const struct file_operations format1_fops; 618 569 static const struct file_operations format2_fops; 619 570 static const struct file_operations format3_fops; 571 + static const struct file_operations format4_fops; 620 572 621 573 static int table_open(struct inode *inode, struct file *file) 622 574 { ··· 637 573 ret = seq_open(file, &format2_seq_ops); 638 574 else if (file->f_op == &format3_fops) 639 575 ret = seq_open(file, &format3_seq_ops); 576 + else if (file->f_op == &format4_fops) 577 + ret = seq_open(file, &format4_seq_ops); 640 578 641 579 if (ret) 642 580 return ret; ··· 665 599 }; 666 600 667 601 static const struct file_operations format3_fops = { 602 + .owner = THIS_MODULE, 603 + .open = table_open, 604 + .read = seq_read, 605 + .llseek = seq_lseek, 606 + .release = seq_release 607 + }; 608 + 609 + static const struct file_operations format4_fops = { 668 610 .owner = THIS_MODULE, 669 611 .open = table_open, 670 612 .read = seq_read, ··· 726 652 debugfs_remove(ls->ls_debug_locks_dentry); 727 653 if (ls->ls_debug_all_dentry) 728 654 debugfs_remove(ls->ls_debug_all_dentry); 655 + if (ls->ls_debug_toss_dentry) 656 + debugfs_remove(ls->ls_debug_toss_dentry); 729 657 } 730 658 731 659 int dlm_create_debug_file(struct dlm_ls *ls) ··· 768 692 ls, 769 693 &format3_fops); 770 694 if (!ls->ls_debug_all_dentry) 695 + 
goto fail; 696 + 697 + /* format 4 */ 698 + 699 + memset(name, 0, sizeof(name)); 700 + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name); 701 + 702 + ls->ls_debug_toss_dentry = debugfs_create_file(name, 703 + S_IFREG | S_IRUGO, 704 + dlm_root, 705 + ls, 706 + &format4_fops); 707 + if (!ls->ls_debug_toss_dentry) 771 708 goto fail; 772 709 773 710 memset(name, 0, sizeof(name));
+69 -218
fs/dlm/dir.c
··· 23 23 #include "lock.h" 24 24 #include "dir.h" 25 25 26 - 27 - static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) 28 - { 29 - spin_lock(&ls->ls_recover_list_lock); 30 - list_add(&de->list, &ls->ls_recover_list); 31 - spin_unlock(&ls->ls_recover_list_lock); 32 - } 33 - 34 - static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) 35 - { 36 - int found = 0; 37 - struct dlm_direntry *de; 38 - 39 - spin_lock(&ls->ls_recover_list_lock); 40 - list_for_each_entry(de, &ls->ls_recover_list, list) { 41 - if (de->length == len) { 42 - list_del(&de->list); 43 - de->master_nodeid = 0; 44 - memset(de->name, 0, len); 45 - found = 1; 46 - break; 47 - } 48 - } 49 - spin_unlock(&ls->ls_recover_list_lock); 50 - 51 - if (!found) 52 - de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); 53 - return de; 54 - } 55 - 56 - void dlm_clear_free_entries(struct dlm_ls *ls) 57 - { 58 - struct dlm_direntry *de; 59 - 60 - spin_lock(&ls->ls_recover_list_lock); 61 - while (!list_empty(&ls->ls_recover_list)) { 62 - de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, 63 - list); 64 - list_del(&de->list); 65 - kfree(de); 66 - } 67 - spin_unlock(&ls->ls_recover_list_lock); 68 - } 69 - 70 26 /* 71 27 * We use the upper 16 bits of the hash value to select the directory node. 72 28 * Low bits are used for distribution of rsb's among hash buckets on each node. 
··· 34 78 35 79 int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) 36 80 { 37 - struct list_head *tmp; 38 - struct dlm_member *memb = NULL; 39 - uint32_t node, n = 0; 40 - int nodeid; 81 + uint32_t node; 41 82 42 - if (ls->ls_num_nodes == 1) { 43 - nodeid = dlm_our_nodeid(); 44 - goto out; 45 - } 46 - 47 - if (ls->ls_node_array) { 83 + if (ls->ls_num_nodes == 1) 84 + return dlm_our_nodeid(); 85 + else { 48 86 node = (hash >> 16) % ls->ls_total_weight; 49 - nodeid = ls->ls_node_array[node]; 50 - goto out; 87 + return ls->ls_node_array[node]; 51 88 } 52 - 53 - /* make_member_array() failed to kmalloc ls_node_array... */ 54 - 55 - node = (hash >> 16) % ls->ls_num_nodes; 56 - 57 - list_for_each(tmp, &ls->ls_nodes) { 58 - if (n++ != node) 59 - continue; 60 - memb = list_entry(tmp, struct dlm_member, list); 61 - break; 62 - } 63 - 64 - DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n", 65 - ls->ls_num_nodes, n, node);); 66 - nodeid = memb->nodeid; 67 - out: 68 - return nodeid; 69 89 } 70 90 71 91 int dlm_dir_nodeid(struct dlm_rsb *r) 72 92 { 73 - return dlm_hash2nodeid(r->res_ls, r->res_hash); 93 + return r->res_dir_nodeid; 74 94 } 75 95 76 - static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) 96 + void dlm_recover_dir_nodeid(struct dlm_ls *ls) 77 97 { 78 - uint32_t val; 98 + struct dlm_rsb *r; 79 99 80 - val = jhash(name, len, 0); 81 - val &= (ls->ls_dirtbl_size - 1); 82 - 83 - return val; 84 - } 85 - 86 - static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) 87 - { 88 - uint32_t bucket; 89 - 90 - bucket = dir_hash(ls, de->name, de->length); 91 - list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 92 - } 93 - 94 - static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, 95 - int namelen, uint32_t bucket) 96 - { 97 - struct dlm_direntry *de; 98 - 99 - list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { 100 - if (de->length == namelen && !memcmp(name, de->name, namelen)) 101 - goto out; 100 + 
down_read(&ls->ls_root_sem); 101 + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 102 + r->res_dir_nodeid = dlm_hash2nodeid(ls, r->res_hash); 102 103 } 103 - de = NULL; 104 - out: 105 - return de; 106 - } 107 - 108 - void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen) 109 - { 110 - struct dlm_direntry *de; 111 - uint32_t bucket; 112 - 113 - bucket = dir_hash(ls, name, namelen); 114 - 115 - spin_lock(&ls->ls_dirtbl[bucket].lock); 116 - 117 - de = search_bucket(ls, name, namelen, bucket); 118 - 119 - if (!de) { 120 - log_error(ls, "remove fr %u none", nodeid); 121 - goto out; 122 - } 123 - 124 - if (de->master_nodeid != nodeid) { 125 - log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); 126 - goto out; 127 - } 128 - 129 - list_del(&de->list); 130 - kfree(de); 131 - out: 132 - spin_unlock(&ls->ls_dirtbl[bucket].lock); 133 - } 134 - 135 - void dlm_dir_clear(struct dlm_ls *ls) 136 - { 137 - struct list_head *head; 138 - struct dlm_direntry *de; 139 - int i; 140 - 141 - DLM_ASSERT(list_empty(&ls->ls_recover_list), ); 142 - 143 - for (i = 0; i < ls->ls_dirtbl_size; i++) { 144 - spin_lock(&ls->ls_dirtbl[i].lock); 145 - head = &ls->ls_dirtbl[i].list; 146 - while (!list_empty(head)) { 147 - de = list_entry(head->next, struct dlm_direntry, list); 148 - list_del(&de->list); 149 - put_free_de(ls, de); 150 - } 151 - spin_unlock(&ls->ls_dirtbl[i].lock); 152 - } 104 + up_read(&ls->ls_root_sem); 153 105 } 154 106 155 107 int dlm_recover_directory(struct dlm_ls *ls) 156 108 { 157 109 struct dlm_member *memb; 158 - struct dlm_direntry *de; 159 110 char *b, *last_name = NULL; 160 - int error = -ENOMEM, last_len, count = 0; 111 + int error = -ENOMEM, last_len, nodeid, result; 161 112 uint16_t namelen; 113 + unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; 162 114 163 115 log_debug(ls, "dlm_recover_directory"); 164 116 165 117 if (dlm_no_directory(ls)) 166 118 goto out_status; 167 119 168 - dlm_dir_clear(ls); 
169 - 170 120 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); 171 121 if (!last_name) 172 122 goto out; 173 123 174 124 list_for_each_entry(memb, &ls->ls_nodes, list) { 125 + if (memb->nodeid == dlm_our_nodeid()) 126 + continue; 127 + 175 128 memset(last_name, 0, DLM_RESNAME_MAXLEN); 176 129 last_len = 0; 177 130 ··· 95 230 if (error) 96 231 goto out_free; 97 232 98 - schedule(); 233 + cond_resched(); 99 234 100 235 /* 101 236 * pick namelen/name pairs out of received buffer ··· 132 267 if (namelen > DLM_RESNAME_MAXLEN) 133 268 goto out_free; 134 269 135 - error = -ENOMEM; 136 - de = get_free_de(ls, namelen); 137 - if (!de) 270 + error = dlm_master_lookup(ls, memb->nodeid, 271 + b, namelen, 272 + DLM_LU_RECOVER_DIR, 273 + &nodeid, &result); 274 + if (error) { 275 + log_error(ls, "recover_dir lookup %d", 276 + error); 138 277 goto out_free; 278 + } 139 279 140 - de->master_nodeid = memb->nodeid; 141 - de->length = namelen; 280 + /* The name was found in rsbtbl, but the 281 + * master nodeid is different from 282 + * memb->nodeid which says it is the master. 283 + * This should not happen. */ 284 + 285 + if (result == DLM_LU_MATCH && 286 + nodeid != memb->nodeid) { 287 + count_bad++; 288 + log_error(ls, "recover_dir lookup %d " 289 + "nodeid %d memb %d bad %u", 290 + result, nodeid, memb->nodeid, 291 + count_bad); 292 + print_hex_dump_bytes("dlm_recover_dir ", 293 + DUMP_PREFIX_NONE, 294 + b, namelen); 295 + } 296 + 297 + /* The name was found in rsbtbl, and the 298 + * master nodeid matches memb->nodeid. */ 299 + 300 + if (result == DLM_LU_MATCH && 301 + nodeid == memb->nodeid) { 302 + count_match++; 303 + } 304 + 305 + /* The name was not found in rsbtbl and was 306 + * added with memb->nodeid as the master. 
*/ 307 + 308 + if (result == DLM_LU_ADD) { 309 + count_add++; 310 + } 311 + 142 312 last_len = namelen; 143 - memcpy(de->name, b, namelen); 144 313 memcpy(last_name, b, namelen); 145 314 b += namelen; 146 315 left -= namelen; 147 - 148 - add_entry_to_hash(ls, de); 149 316 count++; 150 317 } 151 318 } 152 - done: 319 + done: 153 320 ; 154 321 } 155 322 156 323 out_status: 157 324 error = 0; 158 - log_debug(ls, "dlm_recover_directory %d entries", count); 325 + dlm_set_recover_status(ls, DLM_RS_DIR); 326 + 327 + log_debug(ls, "dlm_recover_directory %u in %u new", 328 + count, count_add); 159 329 out_free: 160 330 kfree(last_name); 161 331 out: 162 - dlm_clear_free_entries(ls); 163 332 return error; 164 - } 165 - 166 - static int get_entry(struct dlm_ls *ls, int nodeid, char *name, 167 - int namelen, int *r_nodeid) 168 - { 169 - struct dlm_direntry *de, *tmp; 170 - uint32_t bucket; 171 - 172 - bucket = dir_hash(ls, name, namelen); 173 - 174 - spin_lock(&ls->ls_dirtbl[bucket].lock); 175 - de = search_bucket(ls, name, namelen, bucket); 176 - if (de) { 177 - *r_nodeid = de->master_nodeid; 178 - spin_unlock(&ls->ls_dirtbl[bucket].lock); 179 - if (*r_nodeid == nodeid) 180 - return -EEXIST; 181 - return 0; 182 - } 183 - 184 - spin_unlock(&ls->ls_dirtbl[bucket].lock); 185 - 186 - if (namelen > DLM_RESNAME_MAXLEN) 187 - return -EINVAL; 188 - 189 - de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); 190 - if (!de) 191 - return -ENOMEM; 192 - 193 - de->master_nodeid = nodeid; 194 - de->length = namelen; 195 - memcpy(de->name, name, namelen); 196 - 197 - spin_lock(&ls->ls_dirtbl[bucket].lock); 198 - tmp = search_bucket(ls, name, namelen, bucket); 199 - if (tmp) { 200 - kfree(de); 201 - de = tmp; 202 - } else { 203 - list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 204 - } 205 - *r_nodeid = de->master_nodeid; 206 - spin_unlock(&ls->ls_dirtbl[bucket].lock); 207 - return 0; 208 - } 209 - 210 - int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int 
namelen, 211 - int *r_nodeid) 212 - { 213 - return get_entry(ls, nodeid, name, namelen, r_nodeid); 214 333 } 215 334 216 335 static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) ··· 207 358 bucket = hash & (ls->ls_rsbtbl_size - 1); 208 359 209 360 spin_lock(&ls->ls_rsbtbl[bucket].lock); 210 - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, 0, &r); 361 + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, &r); 211 362 if (rv) 212 363 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss, 213 - name, len, 0, &r); 364 + name, len, &r); 214 365 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 215 366 216 367 if (!rv) ··· 220 371 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 221 372 if (len == r->res_length && !memcmp(name, r->res_name, len)) { 222 373 up_read(&ls->ls_root_sem); 223 - log_error(ls, "find_rsb_root revert to root_list %s", 374 + log_debug(ls, "find_rsb_root revert to root_list %s", 224 375 r->res_name); 225 376 return r; 226 377 } ··· 278 429 be_namelen = cpu_to_be16(0); 279 430 memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); 280 431 offset += sizeof(__be16); 432 + ls->ls_recover_dir_sent_msg++; 281 433 goto out; 282 434 } 283 435 ··· 287 437 offset += sizeof(__be16); 288 438 memcpy(outbuf + offset, r->res_name, r->res_length); 289 439 offset += r->res_length; 440 + ls->ls_recover_dir_sent_res++; 290 441 } 291 442 292 443 /* ··· 300 449 be_namelen = cpu_to_be16(0xFFFF); 301 450 memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); 302 451 offset += sizeof(__be16); 452 + ls->ls_recover_dir_sent_msg++; 303 453 } 304 - 305 454 out: 306 455 up_read(&ls->ls_root_sem); 307 456 }
+1 -6
fs/dlm/dir.h
··· 14 14 #ifndef __DIR_DOT_H__ 15 15 #define __DIR_DOT_H__ 16 16 17 - 18 17 int dlm_dir_nodeid(struct dlm_rsb *rsb); 19 18 int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); 20 - void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len); 21 - void dlm_dir_clear(struct dlm_ls *ls); 22 - void dlm_clear_free_entries(struct dlm_ls *ls); 19 + void dlm_recover_dir_nodeid(struct dlm_ls *ls); 23 20 int dlm_recover_directory(struct dlm_ls *ls); 24 - int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, 25 - int *r_nodeid); 26 21 void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 27 22 char *outbuf, int outlen, int nodeid); 28 23
+27 -19
fs/dlm/dlm_internal.h
··· 55 55 struct dlm_rsb; 56 56 struct dlm_member; 57 57 struct dlm_rsbtable; 58 - struct dlm_dirtable; 59 - struct dlm_direntry; 60 58 struct dlm_recover; 61 59 struct dlm_header; 62 60 struct dlm_message; ··· 95 97 } \ 96 98 } 97 99 98 - 99 - struct dlm_direntry { 100 - struct list_head list; 101 - uint32_t master_nodeid; 102 - uint16_t length; 103 - char name[1]; 104 - }; 105 - 106 - struct dlm_dirtable { 107 - struct list_head list; 108 - spinlock_t lock; 109 - }; 110 100 111 101 struct dlm_rsbtable { 112 102 struct rb_root keep; ··· 269 283 }; 270 284 }; 271 285 286 + /* 287 + * res_master_nodeid is "normal": 0 is unset/invalid, non-zero is the real 288 + * nodeid, even when nodeid is our_nodeid. 289 + * 290 + * res_nodeid is "odd": -1 is unset/invalid, zero means our_nodeid, 291 + * greater than zero when another nodeid. 292 + * 293 + * (TODO: remove res_nodeid and only use res_master_nodeid) 294 + */ 272 295 273 296 struct dlm_rsb { 274 297 struct dlm_ls *res_ls; /* the lockspace */ ··· 286 291 unsigned long res_flags; 287 292 int res_length; /* length of rsb name */ 288 293 int res_nodeid; 294 + int res_master_nodeid; 295 + int res_dir_nodeid; 289 296 uint32_t res_lvbseq; 290 297 uint32_t res_hash; 291 298 uint32_t res_bucket; /* rsbtbl */ ··· 310 313 char res_name[DLM_RESNAME_MAXLEN+1]; 311 314 }; 312 315 316 + /* dlm_master_lookup() flags */ 317 + 318 + #define DLM_LU_RECOVER_DIR 1 319 + #define DLM_LU_RECOVER_MASTER 2 320 + 321 + /* dlm_master_lookup() results */ 322 + 323 + #define DLM_LU_MATCH 1 324 + #define DLM_LU_ADD 2 325 + 313 326 /* find_rsb() flags */ 314 327 315 - #define R_MASTER 1 /* only return rsb if it's a master */ 316 - #define R_CREATE 2 /* create/add rsb if not found */ 328 + #define R_REQUEST 0x00000001 329 + #define R_RECEIVE_REQUEST 0x00000002 330 + #define R_RECEIVE_RECOVER 0x00000004 317 331 318 332 /* rsb_flags */ 319 333 ··· 517 509 struct dlm_rsbtable *ls_rsbtbl; 518 510 uint32_t ls_rsbtbl_size; 519 511 520 - struct 
dlm_dirtable *ls_dirtbl; 521 - uint32_t ls_dirtbl_size; 522 - 523 512 struct mutex ls_waiters_mutex; 524 513 struct list_head ls_waiters; /* lkbs needing a reply */ 525 514 ··· 550 545 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 551 546 struct dentry *ls_debug_locks_dentry; /* debugfs */ 552 547 struct dentry *ls_debug_all_dentry; /* debugfs */ 548 + struct dentry *ls_debug_toss_dentry; /* debugfs */ 553 549 554 550 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 555 551 int ls_uevent_result; ··· 579 573 struct mutex ls_requestqueue_mutex; 580 574 struct dlm_rcom *ls_recover_buf; 581 575 int ls_recover_nodeid; /* for debugging */ 576 + unsigned int ls_recover_dir_sent_res; /* for log info */ 577 + unsigned int ls_recover_dir_sent_msg; /* for log info */ 582 578 unsigned int ls_recover_locks_in; /* for log info */ 583 579 uint64_t ls_rcom_seq; 584 580 spinlock_t ls_rcom_spin;
+831 -215
fs/dlm/lock.c
··· 90 90 static int receive_extralen(struct dlm_message *ms); 91 91 static void do_purge(struct dlm_ls *ls, int nodeid, int pid); 92 92 static void del_timeout(struct dlm_lkb *lkb); 93 + static void toss_rsb(struct kref *kref); 93 94 94 95 /* 95 96 * Lock compatibilty matrix - thanks Steve ··· 171 170 172 171 static void dlm_print_rsb(struct dlm_rsb *r) 173 172 { 174 - printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", 175 - r->res_nodeid, r->res_flags, r->res_first_lkid, 176 - r->res_recover_locks_count, r->res_name); 173 + printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x " 174 + "rlc %d name %s\n", 175 + r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid, 176 + r->res_flags, r->res_first_lkid, r->res_recover_locks_count, 177 + r->res_name); 177 178 } 178 179 179 180 void dlm_dump_rsb(struct dlm_rsb *r) ··· 330 327 * Basic operations on rsb's and lkb's 331 328 */ 332 329 330 + /* This is only called to add a reference when the code already holds 331 + a valid reference to the rsb, so there's no need for locking. */ 332 + 333 + static inline void hold_rsb(struct dlm_rsb *r) 334 + { 335 + kref_get(&r->res_ref); 336 + } 337 + 338 + void dlm_hold_rsb(struct dlm_rsb *r) 339 + { 340 + hold_rsb(r); 341 + } 342 + 343 + /* When all references to the rsb are gone it's transferred to 344 + the tossed list for later disposal. 
*/ 345 + 346 + static void put_rsb(struct dlm_rsb *r) 347 + { 348 + struct dlm_ls *ls = r->res_ls; 349 + uint32_t bucket = r->res_bucket; 350 + 351 + spin_lock(&ls->ls_rsbtbl[bucket].lock); 352 + kref_put(&r->res_ref, toss_rsb); 353 + spin_unlock(&ls->ls_rsbtbl[bucket].lock); 354 + } 355 + 356 + void dlm_put_rsb(struct dlm_rsb *r) 357 + { 358 + put_rsb(r); 359 + } 360 + 333 361 static int pre_rsb_struct(struct dlm_ls *ls) 334 362 { 335 363 struct dlm_rsb *r1, *r2; ··· 445 411 } 446 412 447 413 int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, 448 - unsigned int flags, struct dlm_rsb **r_ret) 414 + struct dlm_rsb **r_ret) 449 415 { 450 416 struct rb_node *node = tree->rb_node; 451 417 struct dlm_rsb *r; 452 - int error = 0; 453 418 int rc; 454 419 455 420 while (node) { ··· 465 432 return -EBADR; 466 433 467 434 found: 468 - if (r->res_nodeid && (flags & R_MASTER)) 469 - error = -ENOTBLK; 470 435 *r_ret = r; 471 - return error; 436 + return 0; 472 437 } 473 438 474 439 static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) ··· 498 467 return 0; 499 468 } 500 469 501 - static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, 502 - unsigned int flags, struct dlm_rsb **r_ret) 503 - { 504 - struct dlm_rsb *r; 505 - int error; 506 - 507 - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r); 508 - if (!error) { 509 - kref_get(&r->res_ref); 510 - goto out; 511 - } 512 - if (error == -ENOTBLK) 513 - goto out; 514 - 515 - error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); 516 - if (error) 517 - goto out; 518 - 519 - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); 520 - error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); 521 - if (error) 522 - return error; 523 - 524 - if (dlm_no_directory(ls)) 525 - goto out; 526 - 527 - if (r->res_nodeid == -1) { 528 - rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 529 - r->res_first_lkid = 0; 530 - } else if (r->res_nodeid > 0) { 531 - rsb_set_flag(r, 
RSB_MASTER_UNCERTAIN); 532 - r->res_first_lkid = 0; 533 - } else { 534 - DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r);); 535 - DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),); 536 - } 537 - out: 538 - *r_ret = r; 539 - return error; 540 - } 541 - 542 470 /* 543 471 * Find rsb in rsbtbl and potentially create/add one 544 472 * ··· 510 520 * Searching for an rsb means looking through both the normal list and toss 511 521 * list. When found on the toss list the rsb is moved to the normal list with 512 522 * ref count of 1; when found on normal list the ref count is incremented. 523 + * 524 + * rsb's on the keep list are being used locally and refcounted. 525 + * rsb's on the toss list are not being used locally, and are not refcounted. 526 + * 527 + * The toss list rsb's were either 528 + * - previously used locally but not any more (were on keep list, then 529 + * moved to toss list when last refcount dropped) 530 + * - created and put on toss list as a directory record for a lookup 531 + * (we are the dir node for the res, but are not using the res right now, 532 + * but some other node is) 533 + * 534 + * The purpose of find_rsb() is to return a refcounted rsb for local use. 535 + * So, if the given rsb is on the toss list, it is moved to the keep list 536 + * before being returned. 537 + * 538 + * toss_rsb() happens when all local usage of the rsb is done, i.e. no 539 + * more refcounts exist, so the rsb is moved from the keep list to the 540 + * toss list. 541 + * 542 + * rsb's on both keep and toss lists are used for doing a name to master 543 + * lookups. rsb's that are in use locally (and being refcounted) are on 544 + * the keep list, rsb's that are not in use locally (not refcounted) and 545 + * only exist for name/master lookups are on the toss list. 546 + * 547 + * rsb's on the toss list who's dir_nodeid is not local can have stale 548 + * name/master mappings. 
/* So, remote requests on such rsb's can potentially return with an error,
 * which means the mapping is stale and needs to be updated with a new
 * lookup.  (The idea behind MASTER UNCERTAIN and first_lkid is to keep
 * only a single outstanding request on an rsb while that rsb has a
 * potentially stale master.)
 */

/*
 * find_rsb_dir - find or create an rsb when the lockspace uses a
 * resource directory (the rsbtbl itself now serves as the directory).
 *
 * Looks for the named resource first on the "keep" (active) tree, then
 * on the "toss" (inactive) tree of bucket b, and finally creates a new
 * rsb when allowed.  from_nodeid/flags describe who is asking:
 * a local request (R_REQUEST), a remote request from the dir node, or a
 * remote request from some other node (R_RECEIVE_REQUEST).
 *
 * Returns 0 with *r_ret holding a referenced rsb, or a negative errno
 * (-EBADR not found and not allowed to create, -ENOTBLK we are not
 * master, -EINVAL/-ENOMEM from allocation paths).
 */

static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
			uint32_t hash, uint32_t b,
			int dir_nodeid, int from_nodeid,
			unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r = NULL;
	int our_nodeid = dlm_our_nodeid();
	int from_local = 0;
	int from_other = 0;
	int from_dir = 0;
	int create = 0;
	int error;

	/* classify the caller: local request, request from the dir node,
	   or request from another (non-dir) node */
	if (flags & R_RECEIVE_REQUEST) {
		if (from_nodeid == dir_nodeid)
			from_dir = 1;
		else
			from_other = 1;
	} else if (flags & R_REQUEST) {
		from_local = 1;
	}

	/*
	 * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so
	 * from_nodeid has sent us a lock in dlm_recover_locks, believing
	 * we're the new master.  Our local recovery may not have set
	 * res_master_nodeid to our_nodeid yet, so allow either.  Don't
	 * create the rsb; dlm_recover_process_copy() will handle EBADR
	 * by resending.
	 *
	 * If someone sends us a request, we are the dir node, and we do
	 * not find the rsb anywhere, then recreate it.  This happens if
	 * someone sends us a request after we have removed/freed an rsb
	 * from our toss list.  (They sent a request instead of lookup
	 * because they are using an rsb from their toss list.)
	 */

	if (from_local || from_dir ||
	    (from_other && (dir_nodeid == our_nodeid))) {
		create = 1;
	}

 retry:
	if (create) {
		/* pre-allocate outside the spinlock so get_rsb_struct
		   below cannot block while the bucket lock is held */
		error = pre_rsb_struct(ls);
		if (error < 0)
			goto out;
	}

	spin_lock(&ls->ls_rsbtbl[b].lock);

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (error)
		goto do_toss;

	/*
	 * rsb is active, so we can't check master_nodeid without lock_rsb.
	 */

	kref_get(&r->res_ref);
	error = 0;
	goto out_unlock;


 do_toss:
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto do_new;

	/*
	 * rsb found inactive (master_nodeid may be out of date unless
	 * we are the dir_nodeid or were the master)  No other thread
	 * is using this rsb because it's on the toss list, so we can
	 * look at or update res_master_nodeid without lock_rsb.
	 */

	if ((r->res_master_nodeid != our_nodeid) && from_other) {
		/* our rsb was not master, and another node (not the dir node)
		   has sent us a request */
		log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
			  from_nodeid, r->res_master_nodeid, dir_nodeid,
			  r->res_name);
		error = -ENOTBLK;
		goto out_unlock;
	}

	if ((r->res_master_nodeid != our_nodeid) && from_dir) {
		/* don't think this should ever happen */
		log_error(ls, "find_rsb toss from_dir %d master %d",
			  from_nodeid, r->res_master_nodeid);
		dlm_print_rsb(r);
		/* fix it and go on */
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	}

	if (from_local && (r->res_master_nodeid != our_nodeid)) {
		/* Because we have held no locks on this rsb,
		   res_master_nodeid could have become stale. */
		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	}

	/* reactivate: move the rsb from the toss tree to the keep tree */
	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
	goto out_unlock;


 do_new:
	/*
	 * rsb not found
	 */

	if (error == -EBADR && !create)
		goto out_unlock;

	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		/* pre-allocated rsb was consumed by another thread;
		   drop the lock and refill */
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = dir_nodeid;
	kref_init(&r->res_ref);

	if (from_dir) {
		/* want to see how often this happens */
		log_debug(ls, "find_rsb new from_dir %d recreate %s",
			  from_nodeid, r->res_name);
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
		goto out_add;
	}

	if (from_other && (dir_nodeid != our_nodeid)) {
		/* should never happen */
		log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
			  from_nodeid, dir_nodeid, our_nodeid, r->res_name);
		dlm_free_rsb(r);
		error = -ENOTBLK;
		goto out_unlock;
	}

	if (from_other) {
		log_debug(ls, "find_rsb new from_other %d dir %d %s",
			  from_nodeid, dir_nodeid, r->res_name);
	}

	if (dir_nodeid == our_nodeid) {
		/* When we are the dir nodeid, we can set the master
		   node immediately */
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
	} else {
		/* set_master will send_lookup to dir_nodeid */
		r->res_master_nodeid = 0;
		r->res_nodeid = -1;
	}

 out_add:
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
 out:
	*r_ret = r;
	return error;
}
/* During recovery, other nodes can send us new MSTCPY locks (from
   dlm_recover_locks) before we've made ourself master (in
   dlm_recover_masters). */

/*
 * find_rsb_nodir - find or create an rsb when the lockspace has no
 * resource directory; the master is always computed from the name hash
 * (dir_nodeid), so a new rsb's master can be set immediately.
 */

static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
			  uint32_t hash, uint32_t b,
			  int dir_nodeid, int from_nodeid,
			  unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r = NULL;
	int our_nodeid = dlm_our_nodeid();
	int recover = (flags & R_RECEIVE_RECOVER);
	int error;

 retry:
	/* pre-allocate outside the spinlock so the allocation in
	   get_rsb_struct below cannot block with the lock held */
	error = pre_rsb_struct(ls);
	if (error < 0)
		goto out;

	spin_lock(&ls->ls_rsbtbl[b].lock);

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (error)
		goto do_toss;

	/*
	 * rsb is active, so we can't check master_nodeid without lock_rsb.
	 */

	kref_get(&r->res_ref);
	goto out_unlock;


 do_toss:
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto do_new;

	/*
	 * rsb found inactive.  No other thread is using this rsb because
	 * it's on the toss list, so we can look at or update
	 * res_master_nodeid without lock_rsb.
	 */

	if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
		/* our rsb is not master, and another node has sent us a
		   request; this should never happen */
		log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
			  from_nodeid, r->res_master_nodeid, dir_nodeid);
		dlm_print_rsb(r);
		error = -ENOTBLK;
		goto out_unlock;
	}

	if (!recover && (r->res_master_nodeid != our_nodeid) &&
	    (dir_nodeid == our_nodeid)) {
		/* our rsb is not master, and we are dir; may as well fix it;
		   this should never happen */
		log_error(ls, "find_rsb toss our %d master %d dir %d",
			  our_nodeid, r->res_master_nodeid, dir_nodeid);
		dlm_print_rsb(r);
		r->res_master_nodeid = our_nodeid;
		r->res_nodeid = 0;
	}

	/* reactivate: move the rsb from the toss tree to the keep tree */
	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
	goto out_unlock;


 do_new:
	/*
	 * rsb not found
	 */

	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = dir_nodeid;
	/* with no directory, the hash-derived dir node is the master */
	r->res_master_nodeid = dir_nodeid;
	r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
	kref_init(&r->res_ref);

	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
 out:
	*r_ret = r;
	return error;
}

/*
 * find_rsb - common entry point; hashes the name, picks the bucket and
 * dir node, then dispatches to the dir/nodir variant.
 */

static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
		    unsigned int flags, struct dlm_rsb **r_ret)
{
	uint32_t hash, b;
	int dir_nodeid;

	if (len > DLM_RESNAME_MAXLEN)
		return -EINVAL;

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	dir_nodeid = dlm_hash2nodeid(ls, hash);

	if (dlm_no_directory(ls))
		return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
				      from_nodeid, flags, r_ret);
	else
		return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
				    from_nodeid, flags, r_ret);
}

/* we have received a request and found that res_master_nodeid != our_nodeid,
   so we need to return an error or make ourself the master */

static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
				  int from_nodeid)
{
	if (dlm_no_directory(ls)) {
		/* with no directory the master is fixed by the hash, so a
		   request for an rsb we don't master is always an error */
		log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
			  from_nodeid, r->res_master_nodeid,
			  r->res_dir_nodeid);
		dlm_print_rsb(r);
		return -ENOTBLK;
	}

	if (from_nodeid != r->res_dir_nodeid) {
		/* our rsb is not master, and another node (not the dir node)
		   has sent us a request.  this is much more common when our
		   master_nodeid is zero, so limit debug to non-zero. */

		if (r->res_master_nodeid) {
			log_debug(ls, "validate master from_other %d master %d "
				  "dir %d first %x %s", from_nodeid,
				  r->res_master_nodeid, r->res_dir_nodeid,
				  r->res_first_lkid, r->res_name);
		}
		return -ENOTBLK;
	} else {
		/* our rsb is not master, but the dir nodeid has sent us a
		   request; this could happen with master 0 / res_nodeid -1 */

		if (r->res_master_nodeid) {
			log_error(ls, "validate master from_dir %d master %d "
				  "first %x %s",
				  from_nodeid, r->res_master_nodeid,
				  r->res_first_lkid, r->res_name);
		}

		/* the dir node is authoritative: become the master */
		r->res_master_nodeid = dlm_our_nodeid();
		r->res_nodeid = 0;
		return 0;
	}
}
/*
 * We're the dir node for this res and another node wants to know the
 * master nodeid.  During normal operation (non recovery) this is only
 * called from receive_lookup(); master lookups when the local node is
 * the dir node are done by find_rsb().
 *
 * normal operation, we are the dir node for a resource
 * . _request_lock
 * . set_master
 * . send_lookup
 * . receive_lookup
 * . dlm_master_lookup flags 0
 *
 * recover directory, we are rebuilding dir for all resources
 * . dlm_recover_directory
 * . dlm_rcom_names
 *   remote node sends back the rsb names it is master of and we are dir of
 * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
 *   we either create new rsb setting remote node as master, or find existing
 *   rsb and set master to be the remote node.
 *
 * recover masters, we are finding the new master for resources
 * . dlm_recover_masters
 * . recover_master
 * . dlm_send_rcom_lookup
 * . receive_rcom_lookup
 * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
 */

int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
		      unsigned int flags, int *r_nodeid, int *result)
{
	struct dlm_rsb *r = NULL;
	uint32_t hash, b;
	int from_master = (flags & DLM_LU_RECOVER_DIR);
	int fix_master = (flags & DLM_LU_RECOVER_MASTER);
	int our_nodeid = dlm_our_nodeid();
	int dir_nodeid, error, toss_list = 0;

	if (len > DLM_RESNAME_MAXLEN)
		return -EINVAL;

	if (from_nodeid == our_nodeid) {
		/* local lookups go through find_rsb(), never here */
		log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
			  our_nodeid, flags);
		return -EINVAL;
	}

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	dir_nodeid = dlm_hash2nodeid(ls, hash);
	if (dir_nodeid != our_nodeid) {
		/* the sender believes we are the dir node but the hash
		   says otherwise */
		log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
			  from_nodeid, dir_nodeid, our_nodeid, hash,
			  ls->ls_num_nodes);
		*r_nodeid = -1;
		return -EINVAL;
	}

 retry:
	/* pre-allocate so the not_found path cannot block under the lock */
	error = pre_rsb_struct(ls);
	if (error < 0)
		return error;

	spin_lock(&ls->ls_rsbtbl[b].lock);
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (!error) {
		/* because the rsb is active, we need to lock_rsb before
		   checking/changing re_master_nodeid */

		hold_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		lock_rsb(r);
		goto found;
	}

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto not_found;

	/* because the rsb is inactive (on toss list), it's not refcounted
	   and lock_rsb is not used, but is protected by the rsbtbl lock */

	toss_list = 1;
 found:
	if (r->res_dir_nodeid != our_nodeid) {
		/* should not happen, but may as well fix it and carry on */
		log_error(ls, "dlm_master_lookup res_dir %d our %d %s",
			  r->res_dir_nodeid, our_nodeid, r->res_name);
		r->res_dir_nodeid = our_nodeid;
	}

	if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
		/* Recovery uses this function to set a new master when
		   the previous master failed.  Setting NEW_MASTER will
		   force dlm_recover_masters to call recover_master on this
		   rsb even though the res_nodeid is no longer removed. */

		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
		rsb_set_flag(r, RSB_NEW_MASTER);

		if (toss_list) {
			/* I don't think we should ever find it on toss list. */
			log_error(ls, "dlm_master_lookup fix_master on toss");
			dlm_dump_rsb(r);
		}
	}

	if (from_master && (r->res_master_nodeid != from_nodeid)) {
		/* this will happen if from_nodeid became master during
		   a previous recovery cycle, and we aborted the previous
		   cycle before recovering this master value */

		log_limit(ls, "dlm_master_lookup from_master %d "
			  "master_nodeid %d res_nodeid %d first %x %s",
			  from_nodeid, r->res_master_nodeid, r->res_nodeid,
			  r->res_first_lkid, r->res_name);

		if (r->res_master_nodeid == our_nodeid) {
			log_error(ls, "from_master %d our_master", from_nodeid);
			dlm_dump_rsb(r);
			dlm_send_rcom_lookup_dump(r, from_nodeid);
			goto out_found;
		}

		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
		rsb_set_flag(r, RSB_NEW_MASTER);
	}

	if (!r->res_master_nodeid) {
		/* this will happen if recovery happens while we're looking
		   up the master for this rsb */

		log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s",
			  from_nodeid, r->res_first_lkid, r->res_name);
		r->res_master_nodeid = from_nodeid;
		r->res_nodeid = from_nodeid;
	}

	if (!from_master && !fix_master &&
	    (r->res_master_nodeid == from_nodeid)) {
		/* this can happen when the master sends remove, the dir node
		   finds the rsb on the keep list and ignores the remove,
		   and the former master sends a lookup */

		log_limit(ls, "dlm_master_lookup from master %d flags %x "
			  "first %x %s", from_nodeid, flags,
			  r->res_first_lkid, r->res_name);
	}

 out_found:
	*r_nodeid = r->res_master_nodeid;
	if (result)
		*result = DLM_LU_MATCH;

	if (toss_list) {
		r->res_toss_time = jiffies;
		/* the rsb was inactive (on toss list) */
		spin_unlock(&ls->ls_rsbtbl[b].lock);
	} else {
		/* the rsb was active */
		unlock_rsb(r);
		put_rsb(r);
	}
	return 0;

 not_found:
	/* no rsb anywhere: create a directory record on the toss list
	   with the requesting node recorded as master */
	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = our_nodeid;
	r->res_master_nodeid = from_nodeid;
	r->res_nodeid = from_nodeid;
	kref_init(&r->res_ref);
	r->res_toss_time = jiffies;

	error = rsb_insert(r, &ls->ls_rsbtbl[b].toss);
	if (error) {
		/* should never happen */
		dlm_free_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}

	if (result)
		*result = DLM_LU_ADD;
	*r_nodeid = from_nodeid;
	error = 0;
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}
*/ 1104 - 1105 - static inline void hold_rsb(struct dlm_rsb *r) 608 + void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) 1106 609 { 1107 - kref_get(&r->res_ref); 1108 - } 610 + struct dlm_rsb *r = NULL; 611 + uint32_t hash, b; 612 + int error; 1109 613 1110 - void dlm_hold_rsb(struct dlm_rsb *r) 1111 - { 1112 - hold_rsb(r); 614 + hash = jhash(name, len, 0); 615 + b = hash & (ls->ls_rsbtbl_size - 1); 616 + 617 + spin_lock(&ls->ls_rsbtbl[b].lock); 618 + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); 619 + if (!error) 620 + goto out_dump; 621 + 622 + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); 623 + if (error) 624 + goto out; 625 + out_dump: 626 + dlm_dump_rsb(r); 627 + out: 628 + spin_unlock(&ls->ls_rsbtbl[b].lock); 1113 629 } 1114 630 1115 631 static void toss_rsb(struct kref *kref) ··· 1136 632 dlm_free_lvb(r->res_lvbptr); 1137 633 r->res_lvbptr = NULL; 1138 634 } 1139 - } 1140 - 1141 - /* When all references to the rsb are gone it's transferred to 1142 - the tossed list for later disposal. 
*/ 1143 - 1144 - static void put_rsb(struct dlm_rsb *r) 1145 - { 1146 - struct dlm_ls *ls = r->res_ls; 1147 - uint32_t bucket = r->res_bucket; 1148 - 1149 - spin_lock(&ls->ls_rsbtbl[bucket].lock); 1150 - kref_put(&r->res_ref, toss_rsb); 1151 - spin_unlock(&ls->ls_rsbtbl[bucket].lock); 1152 - } 1153 - 1154 - void dlm_put_rsb(struct dlm_rsb *r) 1155 - { 1156 - put_rsb(r); 1157 635 } 1158 636 1159 637 /* See comment for unhold_lkb */ ··· 1624 1138 return error; 1625 1139 } 1626 1140 1627 - static void dir_remove(struct dlm_rsb *r) 1628 - { 1629 - int to_nodeid; 1630 - 1631 - if (dlm_no_directory(r->res_ls)) 1632 - return; 1633 - 1634 - to_nodeid = dlm_dir_nodeid(r); 1635 - if (to_nodeid != dlm_our_nodeid()) 1636 - send_remove(r); 1637 - else 1638 - dlm_dir_remove_entry(r->res_ls, to_nodeid, 1639 - r->res_name, r->res_length); 1640 - } 1641 - 1642 1141 /* FIXME: make this more efficient */ 1643 1142 1644 1143 static int shrink_bucket(struct dlm_ls *ls, int b) 1645 1144 { 1646 1145 struct rb_node *n; 1647 1146 struct dlm_rsb *r; 1147 + int our_nodeid = dlm_our_nodeid(); 1648 1148 int count = 0, found; 1649 1149 1650 1150 for (;;) { ··· 1638 1166 spin_lock(&ls->ls_rsbtbl[b].lock); 1639 1167 for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) { 1640 1168 r = rb_entry(n, struct dlm_rsb, res_hashnode); 1169 + 1170 + /* If we're the directory record for this rsb, and 1171 + we're not the master of it, then we need to wait 1172 + for the master node to send us a dir remove for 1173 + before removing the dir record. 
*/ 1174 + 1175 + if (!dlm_no_directory(ls) && !is_master(r) && 1176 + (dlm_dir_nodeid(r) == our_nodeid)) { 1177 + continue; 1178 + } 1179 + 1641 1180 if (!time_after_eq(jiffies, r->res_toss_time + 1642 1181 dlm_config.ci_toss_secs * HZ)) 1643 1182 continue; ··· 1665 1182 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); 1666 1183 spin_unlock(&ls->ls_rsbtbl[b].lock); 1667 1184 1668 - if (is_master(r)) 1669 - dir_remove(r); 1185 + /* We're the master of this rsb but we're not 1186 + the directory record, so we need to tell the 1187 + dir node to remove the dir record. */ 1188 + 1189 + if (!dlm_no_directory(ls) && is_master(r) && 1190 + (dlm_dir_nodeid(r) != our_nodeid)) { 1191 + send_remove(r); 1192 + } 1193 + 1670 1194 dlm_free_rsb(r); 1671 1195 count++; 1672 1196 } else { ··· 2568 2078 2569 2079 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 2570 2080 { 2571 - struct dlm_ls *ls = r->res_ls; 2572 - int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 2081 + int our_nodeid = dlm_our_nodeid(); 2573 2082 2574 2083 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 2575 2084 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); ··· 2582 2093 return 1; 2583 2094 } 2584 2095 2585 - if (r->res_nodeid == 0) { 2096 + if (r->res_master_nodeid == our_nodeid) { 2586 2097 lkb->lkb_nodeid = 0; 2587 2098 return 0; 2588 2099 } 2589 2100 2590 - if (r->res_nodeid > 0) { 2591 - lkb->lkb_nodeid = r->res_nodeid; 2101 + if (r->res_master_nodeid) { 2102 + lkb->lkb_nodeid = r->res_master_nodeid; 2592 2103 return 0; 2593 2104 } 2594 2105 2595 - DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r);); 2596 - 2597 - dir_nodeid = dlm_dir_nodeid(r); 2598 - 2599 - if (dir_nodeid != our_nodeid) { 2600 - r->res_first_lkid = lkb->lkb_id; 2601 - send_lookup(r, lkb); 2602 - return 1; 2603 - } 2604 - 2605 - for (i = 0; i < 2; i++) { 2606 - /* It's possible for dlm_scand to remove an old rsb for 2607 - this same resource from the toss list, us to create 2608 - a new one, look up the master locally, 
and find it 2609 - already exists just before dlm_scand does the 2610 - dir_remove() on the previous rsb. */ 2611 - 2612 - error = dlm_dir_lookup(ls, our_nodeid, r->res_name, 2613 - r->res_length, &ret_nodeid); 2614 - if (!error) 2615 - break; 2616 - log_debug(ls, "dir_lookup error %d %s", error, r->res_name); 2617 - schedule(); 2618 - } 2619 - if (error && error != -EEXIST) 2620 - return error; 2621 - 2622 - if (ret_nodeid == our_nodeid) { 2623 - r->res_first_lkid = 0; 2106 + if (dlm_dir_nodeid(r) == our_nodeid) { 2107 + /* This is a somewhat unusual case; find_rsb will usually 2108 + have set res_master_nodeid when dir nodeid is local, but 2109 + there are cases where we become the dir node after we've 2110 + past find_rsb and go through _request_lock again. 2111 + confirm_master() or process_lookup_list() needs to be 2112 + called after this. */ 2113 + log_debug(r->res_ls, "set_master %x self master %d dir %d %s", 2114 + lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid, 2115 + r->res_name); 2116 + r->res_master_nodeid = our_nodeid; 2624 2117 r->res_nodeid = 0; 2625 2118 lkb->lkb_nodeid = 0; 2626 - } else { 2627 - r->res_first_lkid = lkb->lkb_id; 2628 - r->res_nodeid = ret_nodeid; 2629 - lkb->lkb_nodeid = ret_nodeid; 2119 + return 0; 2630 2120 } 2631 - return 0; 2121 + 2122 + r->res_first_lkid = lkb->lkb_id; 2123 + send_lookup(r, lkb); 2124 + return 1; 2632 2125 } 2633 2126 2634 2127 static void process_lookup_list(struct dlm_rsb *r) ··· 3055 2584 } 3056 2585 3057 2586 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 3058 - 2587 + 3059 2588 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 3060 2589 { 3061 2590 int error; ··· 3179 2708 3180 2709 error = validate_lock_args(ls, lkb, args); 3181 2710 if (error) 3182 - goto out; 2711 + return error; 3183 2712 3184 - error = find_rsb(ls, name, len, R_CREATE, &r); 2713 + error = find_rsb(ls, name, len, 0, R_REQUEST, &r); 3185 2714 if (error) 3186 - goto out; 2715 + return error; 3187 2716 3188 
2717 lock_rsb(r); 3189 2718 ··· 3194 2723 3195 2724 unlock_rsb(r); 3196 2725 put_rsb(r); 3197 - 3198 - out: 3199 2726 return error; 3200 2727 } 3201 2728 ··· 3875 3406 { 3876 3407 struct dlm_lkb *lkb; 3877 3408 struct dlm_rsb *r; 3409 + int from_nodeid; 3878 3410 int error, namelen; 3411 + 3412 + from_nodeid = ms->m_header.h_nodeid; 3879 3413 3880 3414 error = create_lkb(ls, &lkb); 3881 3415 if (error) ··· 3892 3420 goto fail; 3893 3421 } 3894 3422 3423 + /* The dir node is the authority on whether we are the master 3424 + for this rsb or not, so if the master sends us a request, we should 3425 + recreate the rsb if we've destroyed it. This race happens when we 3426 + send a remove message to the dir node at the same time that the dir 3427 + node sends us a request for the rsb. */ 3428 + 3895 3429 namelen = receive_extralen(ms); 3896 3430 3897 - error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r); 3431 + error = find_rsb(ls, ms->m_extra, namelen, from_nodeid, 3432 + R_RECEIVE_REQUEST, &r); 3898 3433 if (error) { 3899 3434 __put_lkb(ls, lkb); 3900 3435 goto fail; 3901 3436 } 3902 3437 3903 3438 lock_rsb(r); 3439 + 3440 + if (r->res_master_nodeid != dlm_our_nodeid()) { 3441 + error = validate_master_nodeid(ls, r, from_nodeid); 3442 + if (error) { 3443 + unlock_rsb(r); 3444 + put_rsb(r); 3445 + __put_lkb(ls, lkb); 3446 + goto fail; 3447 + } 3448 + } 3904 3449 3905 3450 attach_lkb(r, lkb); 3906 3451 error = do_request(r, lkb); ··· 3934 3445 return 0; 3935 3446 3936 3447 fail: 3448 + /* TODO: instead of returning ENOTBLK, add the lkb to res_lookup 3449 + and do this receive_request again from process_lookup_list once 3450 + we get the lookup reply. This would avoid a many repeated 3451 + ENOTBLK request failures when the lookup reply designating us 3452 + as master is delayed. */ 3453 + 3454 + /* We could repeatedly return -EBADR here if our send_remove() is 3455 + delayed in being sent/arriving/being processed on the dir node. 
static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
	int len, error, ret_nodeid, from_nodeid, our_nodeid;

	from_nodeid = ms->m_header.h_nodeid;
	our_nodeid = dlm_our_nodeid();

	len = receive_extralen(ms);

	/* we are the dir node for this name; resolve (or record) the
	   master via the rsbtbl-based directory */
	error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0,
				  &ret_nodeid, NULL);

	/* Optimization: we're master so treat lookup as a request */
	if (!error && ret_nodeid == our_nodeid) {
		receive_request(ls, ms);
		return;
	}
	send_lookup_reply(ls, ms, ret_nodeid, error);
}

static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
{
	char name[DLM_RESNAME_MAXLEN+1];
	struct dlm_rsb *r;
	uint32_t hash, b;
	int rv, len, dir_nodeid, from_nodeid;

	from_nodeid = ms->m_header.h_nodeid;

	len = receive_extralen(ms);

	/* validate the remote-supplied length before copying the name */
	if (len > DLM_RESNAME_MAXLEN) {
		log_error(ls, "receive_remove from %d bad len %d",
			  from_nodeid, len);
		return;
	}

	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
	if (dir_nodeid != dlm_our_nodeid()) {
		log_error(ls, "receive_remove from %d bad nodeid %d",
			  from_nodeid, dir_nodeid);
		return;
	}

	/* Look for name on rsbtbl.toss, if it's there, kill it.
	   If it's on rsbtbl.keep, it's being used, and we should ignore this
	   message.  This is an expected race between the dir node sending a
	   request to the master node at the same time as the master node sends
	   a remove to the dir node.  The resolution to that race is for the
	   dir node to ignore the remove message, and the master node to
	   recreate the master rsb when it gets a request from the dir node for
	   an rsb it doesn't have. */

	memset(name, 0, sizeof(name));
	memcpy(name, ms->m_extra, len);

	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	spin_lock(&ls->ls_rsbtbl[b].lock);

	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (rv) {
		/* verify the rsb is on keep list per comment above */
		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
		if (rv) {
			/* should not happen */
			log_error(ls, "receive_remove from %d not found %s",
				  from_nodeid, name);
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			return;
		}
		if (r->res_master_nodeid != from_nodeid) {
			/* should not happen */
			log_error(ls, "receive_remove keep from %d master %d",
				  from_nodeid, r->res_master_nodeid);
			dlm_print_rsb(r);
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			return;
		}

		/* active rsb: deliberately ignore the remove (see race
		   comment above) */
		log_debug(ls, "receive_remove from %d master %d first %x %s",
			  from_nodeid, r->res_master_nodeid, r->res_first_lkid,
			  name);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		return;
	}

	if (r->res_master_nodeid != from_nodeid) {
		/* only the recorded master may remove the dir record */
		log_error(ls, "receive_remove toss from %d master %d",
			  from_nodeid, r->res_master_nodeid);
		dlm_print_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		return;
	}

	if (kref_put(&r->res_ref, kill_rsb)) {
		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		dlm_free_rsb(r);
	} else {
		log_error(ls, "receive_remove from %d rsb ref error",
			  from_nodeid);
		dlm_print_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
	}
}
ms->m_header.h_nodeid; 4276 3710 4277 3711 error = find_lkb(ls, ms->m_remid, &lkb); 4278 3712 if (error) ··· 4291 3723 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 4292 3724 if (error) { 4293 3725 log_error(ls, "receive_request_reply %x remote %d %x result %d", 4294 - lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, 4295 - ms->m_result); 3726 + lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result); 4296 3727 dlm_dump_rsb(r); 4297 3728 goto out; 4298 3729 } ··· 4299 3732 /* Optimization: the dir node was also the master, so it took our 4300 3733 lookup as a request and sent request reply instead of lookup reply */ 4301 3734 if (mstype == DLM_MSG_LOOKUP) { 4302 - r->res_nodeid = ms->m_header.h_nodeid; 4303 - lkb->lkb_nodeid = r->res_nodeid; 3735 + r->res_master_nodeid = from_nodeid; 3736 + r->res_nodeid = from_nodeid; 3737 + lkb->lkb_nodeid = from_nodeid; 4304 3738 } 4305 3739 4306 3740 /* this is the value returned from do_request() on the master */ ··· 4335 3767 case -EBADR: 4336 3768 case -ENOTBLK: 4337 3769 /* find_rsb failed to find rsb or rsb wasn't master */ 4338 - log_debug(ls, "receive_request_reply %x %x master diff %d %d", 4339 - lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result); 4340 - r->res_nodeid = -1; 4341 - lkb->lkb_nodeid = -1; 3770 + log_limit(ls, "receive_request_reply %x from %d %d " 3771 + "master %d dir %d first %x %s", lkb->lkb_id, 3772 + from_nodeid, result, r->res_master_nodeid, 3773 + r->res_dir_nodeid, r->res_first_lkid, r->res_name); 3774 + 3775 + if (r->res_dir_nodeid != dlm_our_nodeid() && 3776 + r->res_master_nodeid != dlm_our_nodeid()) { 3777 + /* cause _request_lock->set_master->send_lookup */ 3778 + r->res_master_nodeid = 0; 3779 + r->res_nodeid = -1; 3780 + lkb->lkb_nodeid = -1; 3781 + } 4342 3782 4343 3783 if (is_overlap(lkb)) { 4344 3784 /* we'll ignore error in cancel/unlock reply */ 4345 3785 queue_cast_overlap(r, lkb); 4346 3786 confirm_master(r, result); 4347 3787 unhold_lkb(lkb); /* undoes create_lkb() */ 4348 - } 
else 3788 + } else { 4349 3789 _request_lock(r, lkb); 3790 + 3791 + if (r->res_master_nodeid == dlm_our_nodeid()) 3792 + confirm_master(r, 0); 3793 + } 4350 3794 break; 4351 3795 4352 3796 default: ··· 4574 3994 struct dlm_lkb *lkb; 4575 3995 struct dlm_rsb *r; 4576 3996 int error, ret_nodeid; 3997 + int do_lookup_list = 0; 4577 3998 4578 3999 error = find_lkb(ls, ms->m_lkid, &lkb); 4579 4000 if (error) { ··· 4582 4001 return; 4583 4002 } 4584 4003 4585 - /* ms->m_result is the value returned by dlm_dir_lookup on dir node 4004 + /* ms->m_result is the value returned by dlm_master_lookup on dir node 4586 4005 FIXME: will a non-zero error ever be returned? */ 4587 4006 4588 4007 r = lkb->lkb_resource; ··· 4594 4013 goto out; 4595 4014 4596 4015 ret_nodeid = ms->m_nodeid; 4016 + 4017 + /* We sometimes receive a request from the dir node for this 4018 + rsb before we've received the dir node's loookup_reply for it. 4019 + The request from the dir node implies we're the master, so we set 4020 + ourself as master in receive_request_reply, and verify here that 4021 + we are indeed the master. 
*/ 4022 + 4023 + if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) { 4024 + /* This should never happen */ 4025 + log_error(ls, "receive_lookup_reply %x from %d ret %d " 4026 + "master %d dir %d our %d first %x %s", 4027 + lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid, 4028 + r->res_master_nodeid, r->res_dir_nodeid, 4029 + dlm_our_nodeid(), r->res_first_lkid, r->res_name); 4030 + } 4031 + 4597 4032 if (ret_nodeid == dlm_our_nodeid()) { 4033 + r->res_master_nodeid = ret_nodeid; 4598 4034 r->res_nodeid = 0; 4599 - ret_nodeid = 0; 4035 + do_lookup_list = 1; 4600 4036 r->res_first_lkid = 0; 4037 + } else if (ret_nodeid == -1) { 4038 + /* the remote node doesn't believe it's the dir node */ 4039 + log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid", 4040 + lkb->lkb_id, ms->m_header.h_nodeid); 4041 + r->res_master_nodeid = 0; 4042 + r->res_nodeid = -1; 4043 + lkb->lkb_nodeid = -1; 4601 4044 } else { 4602 - /* set_master() will copy res_nodeid to lkb_nodeid */ 4045 + /* set_master() will set lkb_nodeid from r */ 4046 + r->res_master_nodeid = ret_nodeid; 4603 4047 r->res_nodeid = ret_nodeid; 4604 4048 } 4605 4049 ··· 4639 4033 _request_lock(r, lkb); 4640 4034 4641 4035 out_list: 4642 - if (!ret_nodeid) 4036 + if (do_lookup_list) 4643 4037 process_lookup_list(r); 4644 4038 out: 4645 4039 unlock_rsb(r); ··· 4653 4047 int error = 0, noent = 0; 4654 4048 4655 4049 if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { 4656 - log_debug(ls, "ignore non-member message %d from %d %x %x %d", 4050 + log_limit(ls, "receive %d from non-member %d %x %x %d", 4657 4051 ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, 4658 4052 ms->m_remid, ms->m_result); 4659 4053 return; ··· 4780 4174 int nodeid) 4781 4175 { 4782 4176 if (dlm_locking_stopped(ls)) { 4177 + /* If we were a member of this lockspace, left, and rejoined, 4178 + other nodes may still be sending us messages from the 4179 + lockspace generation before we left. 
*/ 4180 + if (!ls->ls_generation) { 4181 + log_limit(ls, "receive %d from %d ignore old gen", 4182 + ms->m_type, nodeid); 4183 + return; 4184 + } 4185 + 4783 4186 dlm_add_requestqueue(ls, nodeid, ms); 4784 4187 } else { 4785 4188 dlm_wait_requestqueue(ls); ··· 5413 4798 struct dlm_rsb *r; 5414 4799 struct dlm_lkb *lkb; 5415 4800 uint32_t remid = 0; 4801 + int from_nodeid = rc->rc_header.h_nodeid; 5416 4802 int error; 5417 4803 5418 4804 if (rl->rl_parent_lkid) { ··· 5431 4815 we make ourselves master, dlm_recover_masters() won't touch the 5432 4816 MSTCPY locks we've received early. */ 5433 4817 5434 - error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 0, &r); 4818 + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 4819 + from_nodeid, R_RECEIVE_RECOVER, &r); 5435 4820 if (error) 5436 4821 goto out; 5437 4822 5438 - if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { 5439 - log_error(ls, "dlm_recover_master_copy remote %d %x not dir", 5440 - rc->rc_header.h_nodeid, remid); 5441 - error = -EBADR; 5442 - put_rsb(r); 5443 - goto out; 5444 - } 5445 - 5446 4823 lock_rsb(r); 5447 4824 5448 - lkb = search_remid(r, rc->rc_header.h_nodeid, remid); 4825 + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { 4826 + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", 4827 + from_nodeid, remid); 4828 + error = -EBADR; 4829 + goto out_unlock; 4830 + } 4831 + 4832 + lkb = search_remid(r, from_nodeid, remid); 5449 4833 if (lkb) { 5450 4834 error = -EEXIST; 5451 4835 goto out_remid; ··· 5482 4866 out: 5483 4867 if (error && error != -EEXIST) 5484 4868 log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", 5485 - rc->rc_header.h_nodeid, remid, error); 4869 + from_nodeid, remid, error); 5486 4870 rl->rl_result = cpu_to_le32(error); 5487 4871 return error; 5488 4872 }
+4 -1
fs/dlm/lock.h
··· 14 14 #define __LOCK_DOT_H__ 15 15 16 16 void dlm_dump_rsb(struct dlm_rsb *r); 17 + void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len); 17 18 void dlm_print_lkb(struct dlm_lkb *lkb); 18 19 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, 19 20 uint32_t saved_seq); ··· 29 28 void dlm_scan_waiters(struct dlm_ls *ls); 30 29 void dlm_scan_timeout(struct dlm_ls *ls); 31 30 void dlm_adjust_timeouts(struct dlm_ls *ls); 31 + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, 32 + unsigned int flags, int *r_nodeid, int *result); 32 33 33 34 int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, 34 - unsigned int flags, struct dlm_rsb **r_ret); 35 + struct dlm_rsb **r_ret); 35 36 36 37 void dlm_recover_purge(struct dlm_ls *ls); 37 38 void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+1 -22
fs/dlm/lockspace.c
··· 509 509 idr_init(&ls->ls_lkbidr); 510 510 spin_lock_init(&ls->ls_lkbidr_spin); 511 511 512 - size = dlm_config.ci_dirtbl_size; 513 - ls->ls_dirtbl_size = size; 514 - 515 - ls->ls_dirtbl = vmalloc(sizeof(struct dlm_dirtable) * size); 516 - if (!ls->ls_dirtbl) 517 - goto out_lkbfree; 518 - for (i = 0; i < size; i++) { 519 - INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); 520 - spin_lock_init(&ls->ls_dirtbl[i].lock); 521 - } 522 - 523 512 INIT_LIST_HEAD(&ls->ls_waiters); 524 513 mutex_init(&ls->ls_waiters_mutex); 525 514 INIT_LIST_HEAD(&ls->ls_orphans); ··· 556 567 557 568 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); 558 569 if (!ls->ls_recover_buf) 559 - goto out_dirfree; 570 + goto out_lkbfree; 560 571 561 572 ls->ls_slot = 0; 562 573 ls->ls_num_slots = 0; ··· 637 648 list_del(&ls->ls_list); 638 649 spin_unlock(&lslist_lock); 639 650 kfree(ls->ls_recover_buf); 640 - out_dirfree: 641 - vfree(ls->ls_dirtbl); 642 651 out_lkbfree: 643 652 idr_destroy(&ls->ls_lkbidr); 644 653 vfree(ls->ls_rsbtbl); ··· 766 779 kfree(ls->ls_recover_buf); 767 780 768 781 /* 769 - * Free direntry structs. 770 - */ 771 - 772 - dlm_dir_clear(ls); 773 - vfree(ls->ls_dirtbl); 774 - 775 - /* 776 782 * Free all lkb's in idr 777 783 */ 778 784 ··· 806 826 807 827 dlm_purge_requestqueue(ls); 808 828 kfree(ls->ls_recover_args); 809 - dlm_clear_free_entries(ls); 810 829 dlm_clear_members(ls); 811 830 dlm_clear_members_gone(ls); 812 831 kfree(ls->ls_node_array);
+111 -34
fs/dlm/rcom.c
··· 23 23 #include "memory.h" 24 24 #include "lock.h" 25 25 #include "util.h" 26 - #include "member.h" 27 - 28 26 29 27 static int rcom_response(struct dlm_ls *ls) 30 28 { ··· 273 275 struct dlm_rcom *rc; 274 276 struct dlm_mhandle *mh; 275 277 int error = 0; 276 - int max_size = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); 277 278 278 279 ls->ls_recover_nodeid = nodeid; 279 - 280 - if (nodeid == dlm_our_nodeid()) { 281 - ls->ls_recover_buf->rc_header.h_length = 282 - dlm_config.ci_buffer_size; 283 - dlm_copy_master_names(ls, last_name, last_len, 284 - ls->ls_recover_buf->rc_buf, 285 - max_size, nodeid); 286 - goto out; 287 - } 288 280 289 281 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); 290 282 if (error) ··· 332 344 return error; 333 345 } 334 346 347 + int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid) 348 + { 349 + struct dlm_rcom *rc; 350 + struct dlm_mhandle *mh; 351 + struct dlm_ls *ls = r->res_ls; 352 + int error; 353 + 354 + error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length, 355 + &rc, &mh); 356 + if (error) 357 + goto out; 358 + memcpy(rc->rc_buf, r->res_name, r->res_length); 359 + rc->rc_id = 0xFFFFFFFF; 360 + 361 + send_rcom(ls, mh, rc); 362 + out: 363 + return error; 364 + } 365 + 335 366 static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) 336 367 { 337 368 struct dlm_rcom *rc; ··· 362 355 if (error) 363 356 return; 364 357 365 - error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid); 358 + if (rc_in->rc_id == 0xFFFFFFFF) { 359 + log_error(ls, "receive_rcom_lookup dump from %d", nodeid); 360 + dlm_dump_rsb_name(ls, rc_in->rc_buf, len); 361 + return; 362 + } 363 + 364 + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, 365 + DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); 366 366 if (error) 367 367 ret_nodeid = error; 368 368 rc->rc_result = ret_nodeid; ··· 500 486 return 0; 501 487 } 502 488 489 + /* 490 + * Ignore messages for stage Y before we set 491 
+ * recover_status bit for stage X: 492 + * 493 + * recover_status = 0 494 + * 495 + * dlm_recover_members() 496 + * - send nothing 497 + * - recv nothing 498 + * - ignore NAMES, NAMES_REPLY 499 + * - ignore LOOKUP, LOOKUP_REPLY 500 + * - ignore LOCK, LOCK_REPLY 501 + * 502 + * recover_status |= NODES 503 + * 504 + * dlm_recover_members_wait() 505 + * 506 + * dlm_recover_directory() 507 + * - send NAMES 508 + * - recv NAMES_REPLY 509 + * - ignore LOOKUP, LOOKUP_REPLY 510 + * - ignore LOCK, LOCK_REPLY 511 + * 512 + * recover_status |= DIR 513 + * 514 + * dlm_recover_directory_wait() 515 + * 516 + * dlm_recover_masters() 517 + * - send LOOKUP 518 + * - recv LOOKUP_REPLY 519 + * 520 + * dlm_recover_locks() 521 + * - send LOCKS 522 + * - recv LOCKS_REPLY 523 + * 524 + * recover_status |= LOCKS 525 + * 526 + * dlm_recover_locks_wait() 527 + * 528 + * recover_status |= DONE 529 + */ 530 + 503 531 /* Called by dlm_recv; corresponds to dlm_receive_message() but special 504 532 recovery-only comms are sent through here. 
*/ 505 533 506 534 void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) 507 535 { 508 536 int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); 509 - int stop, reply = 0, lock = 0; 537 + int stop, reply = 0, names = 0, lookup = 0, lock = 0; 510 538 uint32_t status; 511 539 uint64_t seq; 512 540 513 541 switch (rc->rc_type) { 542 + case DLM_RCOM_STATUS_REPLY: 543 + reply = 1; 544 + break; 545 + case DLM_RCOM_NAMES: 546 + names = 1; 547 + break; 548 + case DLM_RCOM_NAMES_REPLY: 549 + names = 1; 550 + reply = 1; 551 + break; 552 + case DLM_RCOM_LOOKUP: 553 + lookup = 1; 554 + break; 555 + case DLM_RCOM_LOOKUP_REPLY: 556 + lookup = 1; 557 + reply = 1; 558 + break; 514 559 case DLM_RCOM_LOCK: 515 560 lock = 1; 516 561 break; ··· 577 504 lock = 1; 578 505 reply = 1; 579 506 break; 580 - case DLM_RCOM_STATUS_REPLY: 581 - case DLM_RCOM_NAMES_REPLY: 582 - case DLM_RCOM_LOOKUP_REPLY: 583 - reply = 1; 584 507 }; 585 508 586 509 spin_lock(&ls->ls_recover_lock); ··· 585 516 seq = ls->ls_recover_seq; 586 517 spin_unlock(&ls->ls_recover_lock); 587 518 588 - if ((stop && (rc->rc_type != DLM_RCOM_STATUS)) || 589 - (reply && (rc->rc_seq_reply != seq)) || 590 - (lock && !(status & DLM_RS_DIR))) { 591 - log_limit(ls, "dlm_receive_rcom ignore msg %d " 592 - "from %d %llu %llu recover seq %llu sts %x gen %u", 593 - rc->rc_type, 594 - nodeid, 595 - (unsigned long long)rc->rc_seq, 596 - (unsigned long long)rc->rc_seq_reply, 597 - (unsigned long long)seq, 598 - status, ls->ls_generation); 599 - goto out; 600 - } 519 + if (stop && (rc->rc_type != DLM_RCOM_STATUS)) 520 + goto ignore; 521 + 522 + if (reply && (rc->rc_seq_reply != seq)) 523 + goto ignore; 524 + 525 + if (!(status & DLM_RS_NODES) && (names || lookup || lock)) 526 + goto ignore; 527 + 528 + if (!(status & DLM_RS_DIR) && (lookup || lock)) 529 + goto ignore; 601 530 602 531 switch (rc->rc_type) { 603 532 case DLM_RCOM_STATUS: ··· 637 570 default: 638 571 log_error(ls, "receive_rcom bad type %d", 
rc->rc_type); 639 572 } 640 - out: 573 + return; 574 + 575 + ignore: 576 + log_limit(ls, "dlm_receive_rcom ignore msg %d " 577 + "from %d %llu %llu recover seq %llu sts %x gen %u", 578 + rc->rc_type, 579 + nodeid, 580 + (unsigned long long)rc->rc_seq, 581 + (unsigned long long)rc->rc_seq_reply, 582 + (unsigned long long)seq, 583 + status, ls->ls_generation); 641 584 return; 642 585 Eshort: 643 - log_error(ls, "recovery message %x from %d is too short", 644 - rc->rc_type, nodeid); 586 + log_error(ls, "recovery message %d from %d is too short", 587 + rc->rc_type, nodeid); 645 588 } 646 589
+1
fs/dlm/rcom.h
··· 17 17 int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); 18 18 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 19 19 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 20 + int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid); 20 21 int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 21 22 void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); 22 23 int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
+77 -63
fs/dlm/recover.c
··· 361 361 * rsb's to consider. 362 362 */ 363 363 364 - static void set_new_master(struct dlm_rsb *r, int nodeid) 364 + static void set_new_master(struct dlm_rsb *r) 365 365 { 366 - r->res_nodeid = nodeid; 367 366 set_master_lkbs(r); 368 367 rsb_set_flag(r, RSB_NEW_MASTER); 369 368 rsb_set_flag(r, RSB_NEW_MASTER2); ··· 371 372 /* 372 373 * We do async lookups on rsb's that need new masters. The rsb's 373 374 * waiting for a lookup reply are kept on the recover_list. 375 + * 376 + * Another node recovering the master may have sent us a rcom lookup, 377 + * and our dlm_master_lookup() set it as the new master, along with 378 + * NEW_MASTER so that we'll recover it here (this implies dir_nodeid 379 + * equals our_nodeid below). 374 380 */ 375 381 376 - static int recover_master(struct dlm_rsb *r) 382 + static int recover_master(struct dlm_rsb *r, unsigned int *count) 377 383 { 378 384 struct dlm_ls *ls = r->res_ls; 379 - int error, ret_nodeid; 380 - int our_nodeid = dlm_our_nodeid(); 381 - int dir_nodeid = dlm_dir_nodeid(r); 385 + int our_nodeid, dir_nodeid; 386 + int is_removed = 0; 387 + int error; 388 + 389 + if (is_master(r)) 390 + return 0; 391 + 392 + is_removed = dlm_is_removed(ls, r->res_nodeid); 393 + 394 + if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER)) 395 + return 0; 396 + 397 + our_nodeid = dlm_our_nodeid(); 398 + dir_nodeid = dlm_dir_nodeid(r); 382 399 383 400 if (dir_nodeid == our_nodeid) { 384 - error = dlm_dir_lookup(ls, our_nodeid, r->res_name, 385 - r->res_length, &ret_nodeid); 386 - if (error) 387 - log_error(ls, "recover dir lookup error %d", error); 401 + if (is_removed) { 402 + r->res_master_nodeid = our_nodeid; 403 + r->res_nodeid = 0; 404 + } 388 405 389 - if (ret_nodeid == our_nodeid) 390 - ret_nodeid = 0; 391 - lock_rsb(r); 392 - set_new_master(r, ret_nodeid); 393 - unlock_rsb(r); 406 + /* set master of lkbs to ourself when is_removed, or to 407 + another new master which we set along with NEW_MASTER 408 + in dlm_master_lookup */ 409 + 
set_new_master(r); 410 + error = 0; 394 411 } else { 395 412 recover_list_add(r); 396 413 error = dlm_send_rcom_lookup(r, dir_nodeid); 397 414 } 398 415 416 + (*count)++; 399 417 return error; 400 418 } 401 419 ··· 431 415 * resent. 432 416 */ 433 417 434 - static int recover_master_static(struct dlm_rsb *r) 418 + static int recover_master_static(struct dlm_rsb *r, unsigned int *count) 435 419 { 436 420 int dir_nodeid = dlm_dir_nodeid(r); 437 421 int new_master = dir_nodeid; ··· 439 423 if (dir_nodeid == dlm_our_nodeid()) 440 424 new_master = 0; 441 425 442 - lock_rsb(r); 443 426 dlm_purge_mstcpy_locks(r); 444 - set_new_master(r, new_master); 445 - unlock_rsb(r); 446 - return 1; 427 + r->res_master_nodeid = dir_nodeid; 428 + r->res_nodeid = new_master; 429 + set_new_master(r); 430 + (*count)++; 431 + return 0; 447 432 } 448 433 449 434 /* ··· 460 443 int dlm_recover_masters(struct dlm_ls *ls) 461 444 { 462 445 struct dlm_rsb *r; 463 - int error = 0, count = 0; 446 + unsigned int total = 0; 447 + unsigned int count = 0; 448 + int nodir = dlm_no_directory(ls); 449 + int error; 464 450 465 451 log_debug(ls, "dlm_recover_masters"); 466 452 ··· 475 455 goto out; 476 456 } 477 457 478 - if (dlm_no_directory(ls)) 479 - count += recover_master_static(r); 480 - else if (!is_master(r) && 481 - (dlm_is_removed(ls, r->res_nodeid) || 482 - rsb_flag(r, RSB_NEW_MASTER))) { 483 - recover_master(r); 484 - count++; 485 - } 458 + lock_rsb(r); 459 + if (nodir) 460 + error = recover_master_static(r, &count); 461 + else 462 + error = recover_master(r, &count); 463 + unlock_rsb(r); 464 + cond_resched(); 465 + total++; 486 466 487 - schedule(); 467 + if (error) { 468 + up_read(&ls->ls_root_sem); 469 + goto out; 470 + } 488 471 } 489 472 up_read(&ls->ls_root_sem); 490 473 491 - log_debug(ls, "dlm_recover_masters %d resources", count); 474 + log_debug(ls, "dlm_recover_masters %u of %u", count, total); 492 475 493 476 error = dlm_wait_function(ls, &recover_list_empty); 494 477 out: ··· 503 
480 int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) 504 481 { 505 482 struct dlm_rsb *r; 506 - int nodeid; 483 + int ret_nodeid, new_master; 507 484 508 485 r = recover_list_find(ls, rc->rc_id); 509 486 if (!r) { ··· 512 489 goto out; 513 490 } 514 491 515 - nodeid = rc->rc_result; 516 - if (nodeid == dlm_our_nodeid()) 517 - nodeid = 0; 492 + ret_nodeid = rc->rc_result; 493 + 494 + if (ret_nodeid == dlm_our_nodeid()) 495 + new_master = 0; 496 + else 497 + new_master = ret_nodeid; 518 498 519 499 lock_rsb(r); 520 - set_new_master(r, nodeid); 500 + r->res_master_nodeid = ret_nodeid; 501 + r->res_nodeid = new_master; 502 + set_new_master(r); 521 503 unlock_rsb(r); 522 504 recover_list_del(r); 523 505 ··· 819 791 dlm_hold_rsb(r); 820 792 } 821 793 822 - /* If we're using a directory, add tossed rsbs to the root 823 - list; they'll have entries created in the new directory, 824 - but no other recovery steps should do anything with them. */ 825 - 826 - if (dlm_no_directory(ls)) { 827 - spin_unlock(&ls->ls_rsbtbl[i].lock); 828 - continue; 829 - } 830 - 831 - for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) { 832 - r = rb_entry(n, struct dlm_rsb, res_hashnode); 833 - list_add(&r->res_root_list, &ls->ls_root_list); 834 - dlm_hold_rsb(r); 835 - } 794 + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss)) 795 + log_error(ls, "dlm_create_root_list toss not empty"); 836 796 spin_unlock(&ls->ls_rsbtbl[i].lock); 837 797 } 838 798 out: ··· 840 824 up_write(&ls->ls_root_sem); 841 825 } 842 826 843 - /* If not using a directory, clear the entire toss list, there's no benefit to 844 - caching the master value since it's fixed. If we are using a dir, keep the 845 - rsb's we're the master of. Recovery will add them to the root list and from 846 - there they'll be entered in the rebuilt directory. 
*/ 847 - 848 - void dlm_clear_toss_list(struct dlm_ls *ls) 827 + void dlm_clear_toss(struct dlm_ls *ls) 849 828 { 850 829 struct rb_node *n, *next; 851 - struct dlm_rsb *rsb; 830 + struct dlm_rsb *r; 831 + unsigned int count = 0; 852 832 int i; 853 833 854 834 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 855 835 spin_lock(&ls->ls_rsbtbl[i].lock); 856 836 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { 857 - next = rb_next(n);; 858 - rsb = rb_entry(n, struct dlm_rsb, res_hashnode); 859 - if (dlm_no_directory(ls) || !is_master(rsb)) { 860 - rb_erase(n, &ls->ls_rsbtbl[i].toss); 861 - dlm_free_rsb(rsb); 862 - } 837 + next = rb_next(n); 838 + r = rb_entry(n, struct dlm_rsb, res_hashnode); 839 + rb_erase(n, &ls->ls_rsbtbl[i].toss); 840 + dlm_free_rsb(r); 841 + count++; 863 842 } 864 843 spin_unlock(&ls->ls_rsbtbl[i].lock); 865 844 } 845 + 846 + if (count) 847 + log_debug(ls, "dlm_clear_toss %u done", count); 866 848 } 867 849
+1 -1
fs/dlm/recover.h
··· 27 27 void dlm_recovered_lock(struct dlm_rsb *r); 28 28 int dlm_create_root_list(struct dlm_ls *ls); 29 29 void dlm_release_root_list(struct dlm_ls *ls); 30 - void dlm_clear_toss_list(struct dlm_ls *ls); 30 + void dlm_clear_toss(struct dlm_ls *ls); 31 31 void dlm_recover_rsbs(struct dlm_ls *ls); 32 32 33 33 #endif /* __RECOVER_DOT_H__ */
+8 -6
fs/dlm/recoverd.c
··· 60 60 61 61 dlm_callback_suspend(ls); 62 62 63 - /* 64 - * Free non-master tossed rsb's. Master rsb's are kept on toss 65 - * list and put on root list to be included in resdir recovery. 66 - */ 67 - 68 - dlm_clear_toss_list(ls); 63 + dlm_clear_toss(ls); 69 64 70 65 /* 71 66 * This list of root rsb's will be the basis of most of the recovery ··· 79 84 goto fail; 80 85 } 81 86 87 + dlm_recover_dir_nodeid(ls); 88 + 89 + ls->ls_recover_dir_sent_res = 0; 90 + ls->ls_recover_dir_sent_msg = 0; 82 91 ls->ls_recover_locks_in = 0; 83 92 84 93 dlm_set_recover_status(ls, DLM_RS_NODES); ··· 113 114 log_debug(ls, "dlm_recover_directory_wait error %d", error); 114 115 goto fail; 115 116 } 117 + 118 + log_debug(ls, "dlm_recover_directory %u out %u messages", 119 + ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); 116 120 117 121 /* 118 122 * We may have outstanding operations that are waiting for a reply from