Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dlm: fixes for nodir mode

The "nodir" mode (statically assign master nodes instead
of using the resource directory) has always been highly
experimental, and never seriously used. This commit
fixes a number of problems, making nodir much more usable.

- Major change to recovery: recover all locks and restart
all in-progress operations after recovery. In some
cases it's not possible to know which in-progress locks
to recover, so recover all. (Most require recovery
in nodir mode anyway since rehashing changes most
master nodes.)

- Change the way nodir mode is enabled, from a command
line mount arg passed through gfs2, into a sysfs
file managed by dlm_controld, consistent with the
other config settings.

- Allow recovering MSTCPY locks on an rsb that has not
yet been turned into a master copy.

- Ignore RCOM_LOCK and RCOM_LOCK_REPLY recovery messages
from a previous, aborted recovery cycle. Base this
on the local recovery status not being in the state
where any nodes should be sending LOCK messages for the
current recovery cycle.

- Hold rsb lock around dlm_purge_mstcpy_locks() because it
may run concurrently with dlm_recover_master_copy().

- Maintain highbast on process-copy lkb's (in addition to
the master as is usual), because the lkb can switch
back and forth between being a master and being a
process copy as the master node changes in recovery.

- When recovering MSTCPY locks, flag rsb's that have
non-empty convert or waiting queues for granting
at the end of recovery. (Rename flag from LOCKS_PURGED
to RECOVER_GRANT and similar for the recovery function,
because it's not only resources with purged locks
that need a grant attempt.)

- Replace a couple of unnecessary assertion panics with
error messages.

Signed-off-by: David Teigland <teigland@redhat.com>

+311 -179
+2 -1
fs/dlm/ast.c
··· 310 310 } 311 311 mutex_unlock(&ls->ls_cb_mutex); 312 312 313 - log_debug(ls, "dlm_callback_resume %d", count); 313 + if (count) 314 + log_debug(ls, "dlm_callback_resume %d", count); 314 315 } 315 316
+6 -2
fs/dlm/dlm_internal.h
··· 271 271 ktime_t lkb_last_cast_time; /* for debugging */ 272 272 ktime_t lkb_last_bast_time; /* for debugging */ 273 273 274 + uint64_t lkb_recover_seq; /* from ls_recover_seq */ 275 + 274 276 char *lkb_lvbptr; 275 277 struct dlm_lksb *lkb_lksb; /* caller's status block */ 276 278 void (*lkb_astfn) (void *astparam); ··· 327 325 RSB_NEW_MASTER, 328 326 RSB_NEW_MASTER2, 329 327 RSB_RECOVER_CONVERT, 330 - RSB_LOCKS_PURGED, 328 + RSB_RECOVER_GRANT, 331 329 }; 332 330 333 331 static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) ··· 573 571 struct mutex ls_requestqueue_mutex; 574 572 struct dlm_rcom *ls_recover_buf; 575 573 int ls_recover_nodeid; /* for debugging */ 574 + unsigned int ls_recover_locks_in; /* for log info */ 576 575 uint64_t ls_rcom_seq; 577 576 spinlock_t ls_rcom_spin; 578 577 struct list_head ls_recover_list; ··· 600 597 #define LSFL_UEVENT_WAIT 5 601 598 #define LSFL_TIMEWARN 6 602 599 #define LSFL_CB_DELAY 7 600 + #define LSFL_NODIR 8 603 601 604 602 /* much of this is just saving user space pointers associated with the 605 603 lock that we pass back to the user lib with an ast */ ··· 648 644 649 645 static inline int dlm_no_directory(struct dlm_ls *ls) 650 646 { 651 - return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; 647 + return test_bit(LSFL_NODIR, &ls->ls_flags); 652 648 } 653 649 654 650 int dlm_netlink_init(void);
+204 -96
fs/dlm/lock.c
··· 161 161 void dlm_print_lkb(struct dlm_lkb *lkb) 162 162 { 163 163 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " 164 - "sts %d rq %d gr %d wait_type %d wait_nodeid %d\n", 164 + "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", 165 165 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 166 166 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 167 - lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid); 167 + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, 168 + (unsigned long long)lkb->lkb_recover_seq); 168 169 } 169 170 170 171 static void dlm_print_rsb(struct dlm_rsb *r) ··· 252 251 253 252 static inline int is_master_copy(struct dlm_lkb *lkb) 254 253 { 255 - if (lkb->lkb_flags & DLM_IFL_MSTCPY) 256 - DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb);); 257 254 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; 258 255 } 259 256 ··· 1518 1519 } 1519 1520 1520 1521 lkb->lkb_rqmode = DLM_LOCK_IV; 1522 + lkb->lkb_highbast = 0; 1521 1523 } 1522 1524 1523 1525 static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1524 1526 { 1525 1527 set_lvb_lock(r, lkb); 1526 1528 _grant_lock(r, lkb); 1527 - lkb->lkb_highbast = 0; 1528 1529 } 1529 1530 1530 1531 static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, ··· 1886 1887 /* Returns the highest requested mode of all blocked conversions; sets 1887 1888 cw if there's a blocked conversion to DLM_LOCK_CW. 
*/ 1888 1889 1889 - static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) 1890 + static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, 1891 + unsigned int *count) 1890 1892 { 1891 1893 struct dlm_lkb *lkb, *s; 1892 1894 int hi, demoted, quit, grant_restart, demote_restart; ··· 1906 1906 if (can_be_granted(r, lkb, 0, &deadlk)) { 1907 1907 grant_lock_pending(r, lkb); 1908 1908 grant_restart = 1; 1909 + if (count) 1910 + (*count)++; 1909 1911 continue; 1910 1912 } 1911 1913 ··· 1941 1939 return max_t(int, high, hi); 1942 1940 } 1943 1941 1944 - static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw) 1942 + static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, 1943 + unsigned int *count) 1945 1944 { 1946 1945 struct dlm_lkb *lkb, *s; 1947 1946 1948 1947 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { 1949 - if (can_be_granted(r, lkb, 0, NULL)) 1948 + if (can_be_granted(r, lkb, 0, NULL)) { 1950 1949 grant_lock_pending(r, lkb); 1951 - else { 1950 + if (count) 1951 + (*count)++; 1952 + } else { 1952 1953 high = max_t(int, lkb->lkb_rqmode, high); 1953 1954 if (lkb->lkb_rqmode == DLM_LOCK_CW) 1954 1955 *cw = 1; ··· 1980 1975 return 0; 1981 1976 } 1982 1977 1983 - static void grant_pending_locks(struct dlm_rsb *r) 1978 + static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) 1984 1979 { 1985 1980 struct dlm_lkb *lkb, *s; 1986 1981 int high = DLM_LOCK_IV; 1987 1982 int cw = 0; 1988 1983 1989 - DLM_ASSERT(is_master(r), dlm_dump_rsb(r);); 1984 + if (!is_master(r)) { 1985 + log_print("grant_pending_locks r nodeid %d", r->res_nodeid); 1986 + dlm_dump_rsb(r); 1987 + return; 1988 + } 1990 1989 1991 - high = grant_pending_convert(r, high, &cw); 1992 - high = grant_pending_wait(r, high, &cw); 1990 + high = grant_pending_convert(r, high, &cw, count); 1991 + high = grant_pending_wait(r, high, &cw, count); 1993 1992 1994 1993 if (high == DLM_LOCK_IV) 1995 1994 return; ··· 2529 2520 before 
we try again to grant this one. */ 2530 2521 2531 2522 if (is_demoted(lkb)) { 2532 - grant_pending_convert(r, DLM_LOCK_IV, NULL); 2523 + grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); 2533 2524 if (_can_be_granted(r, lkb, 1)) { 2534 2525 grant_lock(r, lkb); 2535 2526 queue_cast(r, lkb, 0); ··· 2557 2548 { 2558 2549 switch (error) { 2559 2550 case 0: 2560 - grant_pending_locks(r); 2551 + grant_pending_locks(r, NULL); 2561 2552 /* grant_pending_locks also sends basts */ 2562 2553 break; 2563 2554 case -EAGAIN: ··· 2580 2571 static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, 2581 2572 int error) 2582 2573 { 2583 - grant_pending_locks(r); 2574 + grant_pending_locks(r, NULL); 2584 2575 } 2585 2576 2586 2577 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ ··· 2601 2592 int error) 2602 2593 { 2603 2594 if (error) 2604 - grant_pending_locks(r); 2595 + grant_pending_locks(r, NULL); 2605 2596 } 2606 2597 2607 2598 /* ··· 3461 3452 goto fail; 3462 3453 3463 3454 if (lkb->lkb_remid != ms->m_lkid) { 3464 - log_error(ls, "receive_convert %x remid %x remote %d %x", 3465 - lkb->lkb_id, lkb->lkb_remid, 3455 + log_error(ls, "receive_convert %x remid %x recover_seq %llu " 3456 + "remote %d %x", lkb->lkb_id, lkb->lkb_remid, 3457 + (unsigned long long)lkb->lkb_recover_seq, 3466 3458 ms->m_header.h_nodeid, ms->m_lkid); 3467 3459 error = -ENOENT; 3468 3460 goto fail; ··· 3641 3631 goto out; 3642 3632 3643 3633 queue_bast(r, lkb, ms->m_bastmode); 3634 + lkb->lkb_highbast = ms->m_bastmode; 3644 3635 out: 3645 3636 unlock_rsb(r); 3646 3637 put_rsb(r); ··· 3721 3710 3722 3711 mstype = lkb->lkb_wait_type; 3723 3712 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); 3724 - if (error) 3713 + if (error) { 3714 + log_error(ls, "receive_request_reply %x remote %d %x result %d", 3715 + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, 3716 + ms->m_result); 3717 + dlm_dump_rsb(r); 3725 3718 goto out; 3719 + } 3726 3720 3727 3721 /* Optimization: the dir node was 
also the master, so it took our 3728 3722 lookup as a request and sent request reply instead of lookup reply */ ··· 4138 4122 * happen in normal usage for the async messages and cancel, so 4139 4123 * only use log_debug for them. 4140 4124 * 4141 - * Other errors are expected and normal. 4125 + * Some errors are expected and normal. 4142 4126 */ 4143 4127 4144 4128 if (error == -ENOENT && noent) { 4145 - log_debug(ls, "receive %d no %x remote %d %x seq %u", 4129 + log_debug(ls, "receive %d no %x remote %d %x saved_seq %u", 4146 4130 ms->m_type, ms->m_remid, ms->m_header.h_nodeid, 4147 4131 ms->m_lkid, saved_seq); 4148 4132 } else if (error == -ENOENT) { 4149 - log_error(ls, "receive %d no %x remote %d %x seq %u", 4133 + log_error(ls, "receive %d no %x remote %d %x saved_seq %u", 4150 4134 ms->m_type, ms->m_remid, ms->m_header.h_nodeid, 4151 4135 ms->m_lkid, saved_seq); 4152 4136 4153 4137 if (ms->m_type == DLM_MSG_CONVERT) 4154 4138 dlm_dump_rsb_hash(ls, ms->m_hash); 4139 + } 4140 + 4141 + if (error == -EINVAL) { 4142 + log_error(ls, "receive %d inval from %d lkid %x remid %x " 4143 + "saved_seq %u", 4144 + ms->m_type, ms->m_header.h_nodeid, 4145 + ms->m_lkid, ms->m_remid, saved_seq); 4155 4146 } 4156 4147 } 4157 4148 ··· 4223 4200 4224 4201 ls = dlm_find_lockspace_global(hd->h_lockspace); 4225 4202 if (!ls) { 4226 - if (dlm_config.ci_log_debug) 4227 - log_print("invalid lockspace %x from %d cmd %d type %d", 4228 - hd->h_lockspace, nodeid, hd->h_cmd, type); 4203 + if (dlm_config.ci_log_debug) { 4204 + printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " 4205 + "%u from %d cmd %d type %d\n", 4206 + hd->h_lockspace, nodeid, hd->h_cmd, type); 4207 + } 4229 4208 4230 4209 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) 4231 4210 dlm_send_ls_not_ready(nodeid, &p->rcom); ··· 4278 4253 static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, 4279 4254 int dir_nodeid) 4280 4255 { 4256 + if (dlm_no_directory(ls)) 4257 + return 1; 4258 + 4281 4259 if 
(dlm_is_removed(ls, lkb->lkb_wait_nodeid)) 4282 - return 1; 4283 - 4284 - if (!dlm_no_directory(ls)) 4285 - return 0; 4286 - 4287 - if (dir_nodeid == dlm_our_nodeid()) 4288 - return 1; 4289 - 4290 - if (dir_nodeid != lkb->lkb_wait_nodeid) 4291 4260 return 1; 4292 4261 4293 4262 return 0; ··· 4538 4519 return error; 4539 4520 } 4540 4521 4541 - static void purge_queue(struct dlm_rsb *r, struct list_head *queue, 4542 - int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) 4522 + static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, 4523 + struct list_head *list) 4543 4524 { 4544 - struct dlm_ls *ls = r->res_ls; 4545 4525 struct dlm_lkb *lkb, *safe; 4546 4526 4547 - list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { 4548 - if (test(ls, lkb)) { 4549 - rsb_set_flag(r, RSB_LOCKS_PURGED); 4550 - del_lkb(r, lkb); 4551 - /* this put should free the lkb */ 4552 - if (!dlm_put_lkb(lkb)) 4553 - log_error(ls, "purged lkb not released"); 4554 - } 4527 + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { 4528 + if (!is_master_copy(lkb)) 4529 + continue; 4530 + 4531 + /* don't purge lkbs we've added in recover_master_copy for 4532 + the current recovery seq */ 4533 + 4534 + if (lkb->lkb_recover_seq == ls->ls_recover_seq) 4535 + continue; 4536 + 4537 + del_lkb(r, lkb); 4538 + 4539 + /* this put should free the lkb */ 4540 + if (!dlm_put_lkb(lkb)) 4541 + log_error(ls, "purged mstcpy lkb not released"); 4555 4542 } 4556 - } 4557 - 4558 - static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) 4559 - { 4560 - return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); 4561 - } 4562 - 4563 - static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) 4564 - { 4565 - return is_master_copy(lkb); 4566 - } 4567 - 4568 - static void purge_dead_locks(struct dlm_rsb *r) 4569 - { 4570 - purge_queue(r, &r->res_grantqueue, &purge_dead_test); 4571 - purge_queue(r, &r->res_convertqueue, &purge_dead_test); 4572 - purge_queue(r, 
&r->res_waitqueue, &purge_dead_test); 4573 4543 } 4574 4544 4575 4545 void dlm_purge_mstcpy_locks(struct dlm_rsb *r) 4576 4546 { 4577 - purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); 4578 - purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); 4579 - purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test); 4547 + struct dlm_ls *ls = r->res_ls; 4548 + 4549 + purge_mstcpy_list(ls, r, &r->res_grantqueue); 4550 + purge_mstcpy_list(ls, r, &r->res_convertqueue); 4551 + purge_mstcpy_list(ls, r, &r->res_waitqueue); 4552 + } 4553 + 4554 + static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, 4555 + struct list_head *list, 4556 + int nodeid_gone, unsigned int *count) 4557 + { 4558 + struct dlm_lkb *lkb, *safe; 4559 + 4560 + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { 4561 + if (!is_master_copy(lkb)) 4562 + continue; 4563 + 4564 + if ((lkb->lkb_nodeid == nodeid_gone) || 4565 + dlm_is_removed(ls, lkb->lkb_nodeid)) { 4566 + 4567 + del_lkb(r, lkb); 4568 + 4569 + /* this put should free the lkb */ 4570 + if (!dlm_put_lkb(lkb)) 4571 + log_error(ls, "purged dead lkb not released"); 4572 + 4573 + rsb_set_flag(r, RSB_RECOVER_GRANT); 4574 + 4575 + (*count)++; 4576 + } 4577 + } 4580 4578 } 4581 4579 4582 4580 /* Get rid of locks held by nodes that are gone. 
*/ 4583 4581 4584 - int dlm_purge_locks(struct dlm_ls *ls) 4582 + void dlm_recover_purge(struct dlm_ls *ls) 4585 4583 { 4586 4584 struct dlm_rsb *r; 4585 + struct dlm_member *memb; 4586 + int nodes_count = 0; 4587 + int nodeid_gone = 0; 4588 + unsigned int lkb_count = 0; 4587 4589 4588 - log_debug(ls, "dlm_purge_locks"); 4590 + /* cache one removed nodeid to optimize the common 4591 + case of a single node removed */ 4592 + 4593 + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 4594 + nodes_count++; 4595 + nodeid_gone = memb->nodeid; 4596 + } 4597 + 4598 + if (!nodes_count) 4599 + return; 4589 4600 4590 4601 down_write(&ls->ls_root_sem); 4591 4602 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 4592 4603 hold_rsb(r); 4593 4604 lock_rsb(r); 4594 - if (is_master(r)) 4595 - purge_dead_locks(r); 4605 + if (is_master(r)) { 4606 + purge_dead_list(ls, r, &r->res_grantqueue, 4607 + nodeid_gone, &lkb_count); 4608 + purge_dead_list(ls, r, &r->res_convertqueue, 4609 + nodeid_gone, &lkb_count); 4610 + purge_dead_list(ls, r, &r->res_waitqueue, 4611 + nodeid_gone, &lkb_count); 4612 + } 4596 4613 unlock_rsb(r); 4597 4614 unhold_rsb(r); 4598 - 4599 - schedule(); 4615 + cond_resched(); 4600 4616 } 4601 4617 up_write(&ls->ls_root_sem); 4602 4618 4603 - return 0; 4619 + if (lkb_count) 4620 + log_debug(ls, "dlm_recover_purge %u locks for %u nodes", 4621 + lkb_count, nodes_count); 4604 4622 } 4605 4623 4606 - static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) 4624 + static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) 4607 4625 { 4608 4626 struct rb_node *n; 4609 - struct dlm_rsb *r, *r_ret = NULL; 4627 + struct dlm_rsb *r; 4610 4628 4611 4629 spin_lock(&ls->ls_rsbtbl[bucket].lock); 4612 4630 for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { 4613 4631 r = rb_entry(n, struct dlm_rsb, res_hashnode); 4614 - if (!rsb_flag(r, RSB_LOCKS_PURGED)) 4632 + 4633 + if (!rsb_flag(r, RSB_RECOVER_GRANT)) 4634 + continue; 4635 + 
rsb_clear_flag(r, RSB_RECOVER_GRANT); 4636 + if (!is_master(r)) 4615 4637 continue; 4616 4638 hold_rsb(r); 4617 - rsb_clear_flag(r, RSB_LOCKS_PURGED); 4618 - r_ret = r; 4619 - break; 4639 + spin_unlock(&ls->ls_rsbtbl[bucket].lock); 4640 + return r; 4620 4641 } 4621 4642 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 4622 - return r_ret; 4643 + return NULL; 4623 4644 } 4624 4645 4625 - void dlm_grant_after_purge(struct dlm_ls *ls) 4646 + /* 4647 + * Attempt to grant locks on resources that we are the master of. 4648 + * Locks may have become grantable during recovery because locks 4649 + * from departed nodes have been purged (or not rebuilt), allowing 4650 + * previously blocked locks to now be granted. The subset of rsb's 4651 + * we are interested in are those with lkb's on either the convert or 4652 + * waiting queues. 4653 + * 4654 + * Simplest would be to go through each master rsb and check for non-empty 4655 + * convert or waiting queues, and attempt to grant on those rsbs. 4656 + * Checking the queues requires lock_rsb, though, for which we'd need 4657 + * to release the rsbtbl lock. This would make iterating through all 4658 + * rsb's very inefficient. So, we rely on earlier recovery routines 4659 + * to set RECOVER_GRANT on any rsb's that we should attempt to grant 4660 + * locks for. 
4661 + */ 4662 + 4663 + void dlm_recover_grant(struct dlm_ls *ls) 4626 4664 { 4627 4665 struct dlm_rsb *r; 4628 4666 int bucket = 0; 4667 + unsigned int count = 0; 4668 + unsigned int rsb_count = 0; 4669 + unsigned int lkb_count = 0; 4629 4670 4630 4671 while (1) { 4631 - r = find_purged_rsb(ls, bucket); 4672 + r = find_grant_rsb(ls, bucket); 4632 4673 if (!r) { 4633 4674 if (bucket == ls->ls_rsbtbl_size - 1) 4634 4675 break; 4635 4676 bucket++; 4636 4677 continue; 4637 4678 } 4679 + rsb_count++; 4680 + count = 0; 4638 4681 lock_rsb(r); 4639 - if (is_master(r)) { 4640 - grant_pending_locks(r); 4641 - confirm_master(r, 0); 4642 - } 4682 + grant_pending_locks(r, &count); 4683 + lkb_count += count; 4684 + confirm_master(r, 0); 4643 4685 unlock_rsb(r); 4644 4686 put_rsb(r); 4645 - schedule(); 4687 + cond_resched(); 4646 4688 } 4689 + 4690 + if (lkb_count) 4691 + log_debug(ls, "dlm_recover_grant %u locks on %u resources", 4692 + lkb_count, rsb_count); 4647 4693 } 4648 4694 4649 4695 static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, ··· 4807 4723 4808 4724 remid = le32_to_cpu(rl->rl_lkid); 4809 4725 4810 - error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 4811 - R_MASTER, &r); 4726 + /* In general we expect the rsb returned to be R_MASTER, but we don't 4727 + have to require it. Recovery of masters on one node can overlap 4728 + recovery of locks on another node, so one node can send us MSTCPY 4729 + locks before we've made ourselves master of this rsb. We can still 4730 + add new MSTCPY locks that we receive here without any harm; when 4731 + we make ourselves master, dlm_recover_masters() won't touch the 4732 + MSTCPY locks we've received early. 
*/ 4733 + 4734 + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 0, &r); 4812 4735 if (error) 4813 4736 goto out; 4737 + 4738 + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { 4739 + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", 4740 + rc->rc_header.h_nodeid, remid); 4741 + error = -EBADR; 4742 + put_rsb(r); 4743 + goto out; 4744 + } 4814 4745 4815 4746 lock_rsb(r); 4816 4747 ··· 4848 4749 attach_lkb(r, lkb); 4849 4750 add_lkb(r, lkb, rl->rl_status); 4850 4751 error = 0; 4752 + ls->ls_recover_locks_in++; 4753 + 4754 + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) 4755 + rsb_set_flag(r, RSB_RECOVER_GRANT); 4851 4756 4852 4757 out_remid: 4853 4758 /* this is the new value returned to the lock holder for 4854 4759 saving in its process-copy lkb */ 4855 4760 rl->rl_remid = cpu_to_le32(lkb->lkb_id); 4761 + 4762 + lkb->lkb_recover_seq = ls->ls_recover_seq; 4856 4763 4857 4764 out_unlock: 4858 4765 unlock_rsb(r); ··· 4891 4786 return error; 4892 4787 } 4893 4788 4894 - if (!is_process_copy(lkb)) { 4895 - log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", 4896 - lkid, rc->rc_header.h_nodeid, remid, result); 4897 - dlm_print_lkb(lkb); 4898 - return -EINVAL; 4899 - } 4900 - 4901 4789 r = lkb->lkb_resource; 4902 4790 hold_rsb(r); 4903 4791 lock_rsb(r); 4792 + 4793 + if (!is_process_copy(lkb)) { 4794 + log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", 4795 + lkid, rc->rc_header.h_nodeid, remid, result); 4796 + dlm_dump_rsb(r); 4797 + unlock_rsb(r); 4798 + put_rsb(r); 4799 + dlm_put_lkb(lkb); 4800 + return -EINVAL; 4801 + } 4904 4802 4905 4803 switch (result) { 4906 4804 case -EBADR:
+2 -2
fs/dlm/lock.h
··· 32 32 int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, 33 33 unsigned int flags, struct dlm_rsb **r_ret); 34 34 35 - int dlm_purge_locks(struct dlm_ls *ls); 35 + void dlm_recover_purge(struct dlm_ls *ls); 36 36 void dlm_purge_mstcpy_locks(struct dlm_rsb *r); 37 - void dlm_grant_after_purge(struct dlm_ls *ls); 37 + void dlm_recover_grant(struct dlm_ls *ls); 38 38 int dlm_recover_waiters_post(struct dlm_ls *ls); 39 39 void dlm_recover_waiters_pre(struct dlm_ls *ls); 40 40 int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+20
fs/dlm/lockspace.c
··· 74 74 return len; 75 75 } 76 76 77 + static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) 78 + { 79 + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls)); 80 + } 81 + 82 + static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) 83 + { 84 + int val = simple_strtoul(buf, NULL, 0); 85 + if (val == 1) 86 + set_bit(LSFL_NODIR, &ls->ls_flags); 87 + return len; 88 + } 89 + 77 90 static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) 78 91 { 79 92 uint32_t status = dlm_recover_status(ls); ··· 120 107 .store = dlm_id_store 121 108 }; 122 109 110 + static struct dlm_attr dlm_attr_nodir = { 111 + .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR}, 112 + .show = dlm_nodir_show, 113 + .store = dlm_nodir_store 114 + }; 115 + 123 116 static struct dlm_attr dlm_attr_recover_status = { 124 117 .attr = {.name = "recover_status", .mode = S_IRUGO}, 125 118 .show = dlm_recover_status_show ··· 140 121 &dlm_attr_control.attr, 141 122 &dlm_attr_event.attr, 142 123 &dlm_attr_id.attr, 124 + &dlm_attr_nodir.attr, 143 125 &dlm_attr_recover_status.attr, 144 126 &dlm_attr_recover_nodeid.attr, 145 127 NULL,
+17 -6
fs/dlm/rcom.c
··· 492 492 void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) 493 493 { 494 494 int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); 495 - int stop, reply = 0; 495 + int stop, reply = 0, lock = 0; 496 + uint32_t status; 496 497 uint64_t seq; 497 498 498 499 switch (rc->rc_type) { 500 + case DLM_RCOM_LOCK: 501 + lock = 1; 502 + break; 503 + case DLM_RCOM_LOCK_REPLY: 504 + lock = 1; 505 + reply = 1; 506 + break; 499 507 case DLM_RCOM_STATUS_REPLY: 500 508 case DLM_RCOM_NAMES_REPLY: 501 509 case DLM_RCOM_LOOKUP_REPLY: 502 - case DLM_RCOM_LOCK_REPLY: 503 510 reply = 1; 504 511 }; 505 512 506 513 spin_lock(&ls->ls_recover_lock); 514 + status = ls->ls_recover_status; 507 515 stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 508 516 seq = ls->ls_recover_seq; 509 517 spin_unlock(&ls->ls_recover_lock); 510 518 511 519 if ((stop && (rc->rc_type != DLM_RCOM_STATUS)) || 512 - (reply && (rc->rc_seq_reply != seq))) { 520 + (reply && (rc->rc_seq_reply != seq)) || 521 + (lock && !(status & DLM_RS_DIR))) { 513 522 log_limit(ls, "dlm_receive_rcom ignore msg %d " 514 - "from %d %llu %llu seq %llu", 515 - rc->rc_type, nodeid, 523 + "from %d %llu %llu recover seq %llu sts %x gen %u", 524 + rc->rc_type, 525 + nodeid, 516 526 (unsigned long long)rc->rc_seq, 517 527 (unsigned long long)rc->rc_seq_reply, 518 - (unsigned long long)seq); 528 + (unsigned long long)seq, 529 + status, ls->ls_generation); 519 530 goto out; 520 531 } 521 532
+42 -31
fs/dlm/recover.c
··· 339 339 { 340 340 struct dlm_lkb *lkb; 341 341 342 - list_for_each_entry(lkb, queue, lkb_statequeue) 343 - if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) 342 + list_for_each_entry(lkb, queue, lkb_statequeue) { 343 + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { 344 344 lkb->lkb_nodeid = nodeid; 345 + lkb->lkb_remid = 0; 346 + } 347 + } 345 348 } 346 349 347 350 static void set_master_lkbs(struct dlm_rsb *r) ··· 357 354 /* 358 355 * Propagate the new master nodeid to locks 359 356 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. 360 - * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which 357 + * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which 361 358 * rsb's to consider. 362 359 */ 363 360 364 361 static void set_new_master(struct dlm_rsb *r, int nodeid) 365 362 { 366 - lock_rsb(r); 367 363 r->res_nodeid = nodeid; 368 364 set_master_lkbs(r); 369 365 rsb_set_flag(r, RSB_NEW_MASTER); 370 366 rsb_set_flag(r, RSB_NEW_MASTER2); 371 - unlock_rsb(r); 372 367 } 373 368 374 369 /* ··· 377 376 static int recover_master(struct dlm_rsb *r) 378 377 { 379 378 struct dlm_ls *ls = r->res_ls; 380 - int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 381 - 382 - dir_nodeid = dlm_dir_nodeid(r); 379 + int error, ret_nodeid; 380 + int our_nodeid = dlm_our_nodeid(); 381 + int dir_nodeid = dlm_dir_nodeid(r); 383 382 384 383 if (dir_nodeid == our_nodeid) { 385 384 error = dlm_dir_lookup(ls, our_nodeid, r->res_name, ··· 389 388 390 389 if (ret_nodeid == our_nodeid) 391 390 ret_nodeid = 0; 391 + lock_rsb(r); 392 392 set_new_master(r, ret_nodeid); 393 + unlock_rsb(r); 393 394 } else { 394 395 recover_list_add(r); 395 396 error = dlm_send_rcom_lookup(r, dir_nodeid); ··· 401 398 } 402 399 403 400 /* 404 - * When not using a directory, most resource names will hash to a new static 405 - * master nodeid and the resource will need to be remastered. 
401 + * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. 402 + * This is necessary because recovery can be started, aborted and restarted, 403 + * causing the master nodeid to briefly change during the aborted recovery, and 404 + * change back to the original value in the second recovery. The MSTCPY locks 405 + * may or may not have been purged during the aborted recovery. Another node 406 + * with an outstanding request in waiters list and a request reply saved in the 407 + * requestqueue, cannot know whether it should ignore the reply and resend the 408 + * request, or accept the reply and complete the request. It must do the 409 + * former if the remote node purged MSTCPY locks, and it must do the later if 410 + * the remote node did not. This is solved by always purging MSTCPY locks, in 411 + * which case, the request reply would always be ignored and the request 412 + * resent. 406 413 */ 407 414 408 415 static int recover_master_static(struct dlm_rsb *r) 409 416 { 410 - int master = dlm_dir_nodeid(r); 417 + int dir_nodeid = dlm_dir_nodeid(r); 418 + int new_master = dir_nodeid; 411 419 412 - if (master == dlm_our_nodeid()) 413 - master = 0; 420 + if (dir_nodeid == dlm_our_nodeid()) 421 + new_master = 0; 414 422 415 - if (r->res_nodeid != master) { 416 - if (is_master(r)) 417 - dlm_purge_mstcpy_locks(r); 418 - set_new_master(r, master); 419 - return 1; 420 - } 421 - return 0; 423 + lock_rsb(r); 424 + dlm_purge_mstcpy_locks(r); 425 + set_new_master(r, new_master); 426 + unlock_rsb(r); 427 + return 1; 422 428 } 423 429 424 430 /* ··· 493 481 if (nodeid == dlm_our_nodeid()) 494 482 nodeid = 0; 495 483 484 + lock_rsb(r); 496 485 set_new_master(r, nodeid); 486 + unlock_rsb(r); 497 487 recover_list_del(r); 498 488 499 489 if (recover_list_empty(ls)) ··· 570 556 struct dlm_rsb *r; 571 557 int error, count = 0; 572 558 573 - log_debug(ls, "dlm_recover_locks"); 574 - 575 559 down_read(&ls->ls_root_sem); 576 560 list_for_each_entry(r, 
&ls->ls_root_list, res_root_list) { 577 561 if (is_master(r)) { ··· 596 584 } 597 585 up_read(&ls->ls_root_sem); 598 586 599 - log_debug(ls, "dlm_recover_locks %d locks", count); 587 + log_debug(ls, "dlm_recover_locks %d out", count); 600 588 601 589 error = dlm_wait_function(ls, &recover_list_empty); 602 590 out: ··· 733 721 } 734 722 735 723 /* We've become the new master for this rsb and waiting/converting locks may 736 - need to be granted in dlm_grant_after_purge() due to locks that may have 724 + need to be granted in dlm_recover_grant() due to locks that may have 737 725 existed from a removed node. */ 738 726 739 - static void set_locks_purged(struct dlm_rsb *r) 727 + static void recover_grant(struct dlm_rsb *r) 740 728 { 741 729 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) 742 - rsb_set_flag(r, RSB_LOCKS_PURGED); 730 + rsb_set_flag(r, RSB_RECOVER_GRANT); 743 731 } 744 732 745 733 void dlm_recover_rsbs(struct dlm_ls *ls) 746 734 { 747 735 struct dlm_rsb *r; 748 - int count = 0; 749 - 750 - log_debug(ls, "dlm_recover_rsbs"); 736 + unsigned int count = 0; 751 737 752 738 down_read(&ls->ls_root_sem); 753 739 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { ··· 754 744 if (rsb_flag(r, RSB_RECOVER_CONVERT)) 755 745 recover_conversion(r); 756 746 if (rsb_flag(r, RSB_NEW_MASTER2)) 757 - set_locks_purged(r); 747 + recover_grant(r); 758 748 recover_lvb(r); 759 749 count++; 760 750 } ··· 764 754 } 765 755 up_read(&ls->ls_root_sem); 766 756 767 - log_debug(ls, "dlm_recover_rsbs %d rsbs", count); 757 + if (count) 758 + log_debug(ls, "dlm_recover_rsbs %d done", count); 768 759 } 769 760 770 761 /* Create a single list of all root rsb's to be used during recovery */
+7 -2
fs/dlm/recoverd.c
··· 84 84 goto fail; 85 85 } 86 86 87 + ls->ls_recover_locks_in = 0; 88 + 87 89 dlm_set_recover_status(ls, DLM_RS_NODES); 88 90 89 91 error = dlm_recover_members_wait(ls); ··· 132 130 * Clear lkb's for departed nodes. 133 131 */ 134 132 135 - dlm_purge_locks(ls); 133 + dlm_recover_purge(ls); 136 134 137 135 /* 138 136 * Get new master nodeid's for rsb's that were mastered on ··· 162 160 log_debug(ls, "dlm_recover_locks_wait error %d", error); 163 161 goto fail; 164 162 } 163 + 164 + log_debug(ls, "dlm_recover_locks %u in", 165 + ls->ls_recover_locks_in); 165 166 166 167 /* 167 168 * Finalize state in master rsb's now that all locks can be ··· 230 225 goto fail; 231 226 } 232 227 233 - dlm_grant_after_purge(ls); 228 + dlm_recover_grant(ls); 234 229 235 230 log_debug(ls, "dlm_recover %llu generation %u done: %u ms", 236 231 (unsigned long long)rv->seq, ls->ls_generation,
+10 -29
fs/dlm/requestqueue.c
··· 65 65 int dlm_process_requestqueue(struct dlm_ls *ls) 66 66 { 67 67 struct rq_entry *e; 68 + struct dlm_message *ms; 68 69 int error = 0; 69 70 70 71 mutex_lock(&ls->ls_requestqueue_mutex); ··· 78 77 } 79 78 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); 80 79 mutex_unlock(&ls->ls_requestqueue_mutex); 80 + 81 + ms = &e->request; 82 + 83 + log_limit(ls, "dlm_process_requestqueue msg %d from %d " 84 + "lkid %x remid %x result %d seq %u", 85 + ms->m_type, ms->m_header.h_nodeid, 86 + ms->m_lkid, ms->m_remid, ms->m_result, 87 + e->recover_seq); 81 88 82 89 dlm_receive_message_saved(ls, &e->request, e->recover_seq); 83 90 ··· 149 140 if (!dlm_no_directory(ls)) 150 141 return 0; 151 142 152 - /* with no directory, the master is likely to change as a part of 153 - recovery; requests to/from the defunct master need to be purged */ 154 - 155 - switch (type) { 156 - case DLM_MSG_REQUEST: 157 - case DLM_MSG_CONVERT: 158 - case DLM_MSG_UNLOCK: 159 - case DLM_MSG_CANCEL: 160 - /* we're no longer the master of this resource, the sender 161 - will resend to the new master (see waiter_needs_recovery) */ 162 - 163 - if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid()) 164 - return 1; 165 - break; 166 - 167 - case DLM_MSG_REQUEST_REPLY: 168 - case DLM_MSG_CONVERT_REPLY: 169 - case DLM_MSG_UNLOCK_REPLY: 170 - case DLM_MSG_CANCEL_REPLY: 171 - case DLM_MSG_GRANT: 172 - /* this reply is from the former master of the resource, 173 - we'll resend to the new master if needed */ 174 - 175 - if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid) 176 - return 1; 177 - break; 178 - } 179 - 180 - return 0; 143 + return 1; 181 144 } 182 145 183 146 void dlm_purge_requestqueue(struct dlm_ls *ls)
-1
fs/gfs2/incore.h
··· 556 556 struct lm_lockstruct { 557 557 int ls_jid; 558 558 unsigned int ls_first; 559 - unsigned int ls_nodir; 560 559 const struct lm_lockops *ls_ops; 561 560 dlm_lockspace_t *ls_dlm; 562 561
-2
fs/gfs2/lock_dlm.c
··· 1209 1209 fsname++; 1210 1210 1211 1211 flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; 1212 - if (ls->ls_nodir) 1213 - flags |= DLM_LSFL_NODIR; 1214 1212 1215 1213 /* 1216 1214 * create/join lockspace
+1 -6
fs/gfs2/ops_fstype.c
··· 994 994 ls->ls_jid = option; 995 995 break; 996 996 case Opt_id: 997 + case Opt_nodir: 997 998 /* Obsolete, but left for backward compat purposes */ 998 999 break; 999 1000 case Opt_first: ··· 1002 1001 if (ret || (option != 0 && option != 1)) 1003 1002 goto hostdata_error; 1004 1003 ls->ls_first = option; 1005 - break; 1006 - case Opt_nodir: 1007 - ret = match_int(&tmp[0], &option); 1008 - if (ret || (option != 0 && option != 1)) 1009 - goto hostdata_error; 1010 - ls->ls_nodir = option; 1011 1004 break; 1012 1005 case Opt_err: 1013 1006 default:
-1
include/linux/dlm.h
··· 67 67 68 68 /* dlm_new_lockspace() flags */ 69 69 70 - #define DLM_LSFL_NODIR 0x00000001 71 70 #define DLM_LSFL_TIMEWARN 0x00000002 72 71 #define DLM_LSFL_FS 0x00000004 73 72 #define DLM_LSFL_NEWEXCL 0x00000008