Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'upstream-linus' of master.kernel.org:/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of master.kernel.org:/pub/scm/linux/kernel/git/mfasheh/ocfs2: (22 commits)
configfs: Zero terminate data in configfs attribute writes.
[PATCH] ocfs2 heartbeat: clean up bio submission code
ocfs2: introduce sc->sc_send_lock to protect outbound messages
[PATCH] ocfs2: drop INET from Kconfig, not needed
ocfs2_dlm: Add timeout to dlm join domain
ocfs2_dlm: Silence some messages during join domain
ocfs2_dlm: disallow a domain join if node maps mismatch
ocfs2_dlm: Ensure correct ordering of set/clear refmap bit on lockres
ocfs2: Binds listener to the configured ip address
ocfs2_dlm: Calling post handler function in assert master handler
ocfs2: Added post handler callable function in o2net message handler
ocfs2_dlm: Cookies in locks not being printed correctly in error messages
ocfs2_dlm: Silence a failed convert
ocfs2_dlm: wake up sleepers on the lockres waitqueue
ocfs2_dlm: Dlm dispatch was stopping too early
ocfs2_dlm: Drop inflight refmap even if no locks found on the lockres
ocfs2_dlm: Flush dlm workqueue before starting to migrate
ocfs2_dlm: Fix migrate lockres handler queue scanning
ocfs2_dlm: Make dlmunlock() wait for migration to complete
ocfs2_dlm: Fixes race between migrate and dirty
...

+1209 -466
-1
fs/Kconfig
··· 426 426 select CONFIGFS_FS 427 427 select JBD 428 428 select CRC32 429 - select INET 430 429 help 431 430 OCFS2 is a general purpose extent based shared disk cluster file 432 431 system with many similarities to ext3. It supports 64 bit inode
+6 -3
fs/configfs/file.c
··· 162 162 int error; 163 163 164 164 if (!buffer->page) 165 - buffer->page = (char *)get_zeroed_page(GFP_KERNEL); 165 + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); 166 166 if (!buffer->page) 167 167 return -ENOMEM; 168 168 169 - if (count > PAGE_SIZE) 170 - count = PAGE_SIZE; 169 + if (count >= PAGE_SIZE) 170 + count = PAGE_SIZE - 1; 171 171 error = copy_from_user(buffer->page,buf,count); 172 172 buffer->needs_read_fill = 1; 173 + /* if buf is assumed to contain a string, terminate it by \0, 174 + * so e.g. sscanf() can scan the string easily */ 175 + buffer->page[count] = 0; 173 176 return error ? -EFAULT : count; 174 177 } 175 178
+31 -127
fs/ocfs2/cluster/heartbeat.c
··· 184 184 flush_scheduled_work(); 185 185 } 186 186 187 - static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, 188 - unsigned int num_ios) 187 + static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) 189 188 { 190 - atomic_set(&wc->wc_num_reqs, num_ios); 189 + atomic_set(&wc->wc_num_reqs, 1); 191 190 init_completion(&wc->wc_io_complete); 192 191 wc->wc_error = 0; 193 192 } ··· 211 212 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; 212 213 213 214 blk_run_address_space(mapping); 215 + o2hb_bio_wait_dec(wc, 1); 214 216 215 217 wait_for_completion(&wc->wc_io_complete); 216 218 } ··· 231 231 return 1; 232 232 233 233 o2hb_bio_wait_dec(wc, 1); 234 + bio_put(bio); 234 235 return 0; 235 236 } 236 237 ··· 239 238 * start_slot. */ 240 239 static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, 241 240 struct o2hb_bio_wait_ctxt *wc, 242 - unsigned int start_slot, 243 - unsigned int num_slots) 241 + unsigned int *current_slot, 242 + unsigned int max_slots) 244 243 { 245 - int i, nr_vecs, len, first_page, last_page; 244 + int len, current_page; 246 245 unsigned int vec_len, vec_start; 247 246 unsigned int bits = reg->hr_block_bits; 248 247 unsigned int spp = reg->hr_slots_per_page; 248 + unsigned int cs = *current_slot; 249 249 struct bio *bio; 250 250 struct page *page; 251 - 252 - nr_vecs = (num_slots + spp - 1) / spp; 253 251 254 252 /* Testing has shown this allocation to take long enough under 255 253 * GFP_KERNEL that the local node can get fenced. It would be 256 254 * nicest if we could pre-allocate these bios and avoid this 257 255 * all together. */ 258 - bio = bio_alloc(GFP_ATOMIC, nr_vecs); 256 + bio = bio_alloc(GFP_ATOMIC, 16); 259 257 if (!bio) { 260 258 mlog(ML_ERROR, "Could not alloc slots BIO!\n"); 261 259 bio = ERR_PTR(-ENOMEM); ··· 262 262 } 263 263 264 264 /* Must put everything in 512 byte sectors for the bio... 
*/ 265 - bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); 265 + bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); 266 266 bio->bi_bdev = reg->hr_bdev; 267 267 bio->bi_private = wc; 268 268 bio->bi_end_io = o2hb_bio_end_io; 269 269 270 - first_page = start_slot / spp; 271 - last_page = first_page + nr_vecs; 272 - vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; 273 - for(i = first_page; i < last_page; i++) { 274 - page = reg->hr_slot_data[i]; 270 + vec_start = (cs << bits) % PAGE_CACHE_SIZE; 271 + while(cs < max_slots) { 272 + current_page = cs / spp; 273 + page = reg->hr_slot_data[current_page]; 275 274 276 - vec_len = PAGE_CACHE_SIZE; 277 - /* last page might be short */ 278 - if (((i + 1) * spp) > (start_slot + num_slots)) 279 - vec_len = ((num_slots + start_slot) % spp) << bits; 280 - vec_len -= vec_start; 275 + vec_len = min(PAGE_CACHE_SIZE, 276 + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); 281 277 282 278 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", 283 - i, vec_len, vec_start); 279 + current_page, vec_len, vec_start); 284 280 285 281 len = bio_add_page(bio, page, vec_len, vec_start); 286 - if (len != vec_len) { 287 - bio_put(bio); 288 - bio = ERR_PTR(-EIO); 282 + if (len != vec_len) break; 289 283 290 - mlog(ML_ERROR, "Error adding page to bio i = %d, " 291 - "vec_len = %u, len = %d\n, start = %u\n", 292 - i, vec_len, len, vec_start); 293 - goto bail; 294 - } 295 - 284 + cs += vec_len / (PAGE_CACHE_SIZE/spp); 296 285 vec_start = 0; 297 286 } 298 287 299 288 bail: 289 + *current_slot = cs; 300 290 return bio; 301 - } 302 - 303 - /* 304 - * Compute the maximum number of sectors the bdev can handle in one bio, 305 - * as a power of two. 306 - * 307 - * Stolen from oracleasm, thanks Joel! 
308 - */ 309 - static int compute_max_sectors(struct block_device *bdev) 310 - { 311 - int max_pages, max_sectors, pow_two_sectors; 312 - 313 - struct request_queue *q; 314 - 315 - q = bdev_get_queue(bdev); 316 - max_pages = q->max_sectors >> (PAGE_SHIFT - 9); 317 - if (max_pages > BIO_MAX_PAGES) 318 - max_pages = BIO_MAX_PAGES; 319 - if (max_pages > q->max_phys_segments) 320 - max_pages = q->max_phys_segments; 321 - if (max_pages > q->max_hw_segments) 322 - max_pages = q->max_hw_segments; 323 - max_pages--; /* Handle I/Os that straddle a page */ 324 - 325 - if (max_pages) { 326 - max_sectors = max_pages << (PAGE_SHIFT - 9); 327 - } else { 328 - /* If BIO contains 1 or less than 1 page. */ 329 - max_sectors = q->max_sectors; 330 - } 331 - /* Why is fls() 1-based???? */ 332 - pow_two_sectors = 1 << (fls(max_sectors) - 1); 333 - 334 - return pow_two_sectors; 335 - } 336 - 337 - static inline void o2hb_compute_request_limits(struct o2hb_region *reg, 338 - unsigned int num_slots, 339 - unsigned int *num_bios, 340 - unsigned int *slots_per_bio) 341 - { 342 - unsigned int max_sectors, io_sectors; 343 - 344 - max_sectors = compute_max_sectors(reg->hr_bdev); 345 - 346 - io_sectors = num_slots << (reg->hr_block_bits - 9); 347 - 348 - *num_bios = (io_sectors + max_sectors - 1) / max_sectors; 349 - *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); 350 - 351 - mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. 
This " 352 - "device can handle %u sectors of I/O\n", io_sectors, num_slots, 353 - max_sectors); 354 - mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", 355 - *num_bios, *slots_per_bio); 356 291 } 357 292 358 293 static int o2hb_read_slots(struct o2hb_region *reg, 359 294 unsigned int max_slots) 360 295 { 361 - unsigned int num_bios, slots_per_bio, start_slot, num_slots; 362 - int i, status; 296 + unsigned int current_slot=0; 297 + int status; 363 298 struct o2hb_bio_wait_ctxt wc; 364 - struct bio **bios; 365 299 struct bio *bio; 366 300 367 - o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); 301 + o2hb_bio_wait_init(&wc); 368 302 369 - bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); 370 - if (!bios) { 371 - status = -ENOMEM; 372 - mlog_errno(status); 373 - return status; 374 - } 375 - 376 - o2hb_bio_wait_init(&wc, num_bios); 377 - 378 - num_slots = slots_per_bio; 379 - for(i = 0; i < num_bios; i++) { 380 - start_slot = i * slots_per_bio; 381 - 382 - /* adjust num_slots at last bio */ 383 - if (max_slots < (start_slot + num_slots)) 384 - num_slots = max_slots - start_slot; 385 - 386 - bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); 303 + while(current_slot < max_slots) { 304 + bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots); 387 305 if (IS_ERR(bio)) { 388 - o2hb_bio_wait_dec(&wc, num_bios - i); 389 - 390 306 status = PTR_ERR(bio); 391 307 mlog_errno(status); 392 308 goto bail_and_wait; 393 309 } 394 - bios[i] = bio; 395 310 311 + atomic_inc(&wc.wc_num_reqs); 396 312 submit_bio(READ, bio); 397 313 } 398 314 ··· 319 403 if (wc.wc_error && !status) 320 404 status = wc.wc_error; 321 405 322 - if (bios) { 323 - for(i = 0; i < num_bios; i++) 324 - if (bios[i]) 325 - bio_put(bios[i]); 326 - kfree(bios); 327 - } 328 - 329 406 return status; 330 407 } 331 408 332 409 static int o2hb_issue_node_write(struct o2hb_region *reg, 333 - struct bio **write_bio, 334 410 struct o2hb_bio_wait_ctxt *write_wc) 335 411 
{ 336 412 int status; 337 413 unsigned int slot; 338 414 struct bio *bio; 339 415 340 - o2hb_bio_wait_init(write_wc, 1); 416 + o2hb_bio_wait_init(write_wc); 341 417 342 418 slot = o2nm_this_node(); 343 419 344 - bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); 420 + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); 345 421 if (IS_ERR(bio)) { 346 422 status = PTR_ERR(bio); 347 423 mlog_errno(status); 348 424 goto bail; 349 425 } 350 426 427 + atomic_inc(&write_wc->wc_num_reqs); 351 428 submit_bio(WRITE, bio); 352 429 353 - *write_bio = bio; 354 430 status = 0; 355 431 bail: 356 432 return status; ··· 734 826 { 735 827 int i, ret, highest_node, change = 0; 736 828 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 737 - struct bio *write_bio; 738 829 struct o2hb_bio_wait_ctxt write_wc; 739 830 740 831 ret = o2nm_configured_node_map(configured_nodes, ··· 771 864 772 865 /* And fire off the write. Note that we don't wait on this I/O 773 866 * until later. */ 774 - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 867 + ret = o2hb_issue_node_write(reg, &write_wc); 775 868 if (ret < 0) { 776 869 mlog_errno(ret); 777 870 return ret; ··· 789 882 * people we find in our steady state have seen us. 790 883 */ 791 884 o2hb_wait_on_io(reg, &write_wc); 792 - bio_put(write_bio); 793 885 if (write_wc.wc_error) { 794 886 /* Do not re-arm the write timeout on I/O error - we 795 887 * can't be sure that the new block ever made it to ··· 849 943 { 850 944 int i, ret; 851 945 struct o2hb_region *reg = data; 852 - struct bio *write_bio; 853 946 struct o2hb_bio_wait_ctxt write_wc; 854 947 struct timeval before_hb, after_hb; 855 948 unsigned int elapsed_msec; ··· 898 993 * 899 994 * XXX: Should we skip this on unclean_stop? 
*/ 900 995 o2hb_prepare_block(reg, 0); 901 - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 996 + ret = o2hb_issue_node_write(reg, &write_wc); 902 997 if (ret == 0) { 903 998 o2hb_wait_on_io(reg, &write_wc); 904 - bio_put(write_bio); 905 999 } else { 906 1000 mlog_errno(ret); 907 1001 }
+27 -8
fs/ocfs2/cluster/tcp.c
··· 556 556 sk->sk_data_ready = o2net_data_ready; 557 557 sk->sk_state_change = o2net_state_change; 558 558 559 + mutex_init(&sc->sc_send_lock); 560 + 559 561 write_unlock_bh(&sk->sk_callback_lock); 560 562 } 561 563 ··· 690 688 * be given to the handler if their payload is longer than the max. */ 691 689 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 692 690 o2net_msg_handler_func *func, void *data, 691 + o2net_post_msg_handler_func *post_func, 693 692 struct list_head *unreg_list) 694 693 { 695 694 struct o2net_msg_handler *nmh = NULL; ··· 725 722 726 723 nmh->nh_func = func; 727 724 nmh->nh_func_data = data; 725 + nmh->nh_post_func = post_func; 728 726 nmh->nh_msg_type = msg_type; 729 727 nmh->nh_max_len = max_len; 730 728 nmh->nh_key = key; ··· 860 856 ssize_t ret; 861 857 862 858 859 + mutex_lock(&sc->sc_send_lock); 863 860 ret = sc->sc_sock->ops->sendpage(sc->sc_sock, 864 861 virt_to_page(kmalloced_virt), 865 862 (long)kmalloced_virt & ~PAGE_MASK, 866 863 size, MSG_DONTWAIT); 864 + mutex_unlock(&sc->sc_send_lock); 867 865 if (ret != size) { 868 866 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 869 867 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); ··· 980 974 981 975 /* finally, convert the message header to network byte-order 982 976 * and send */ 977 + mutex_lock(&sc->sc_send_lock); 983 978 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, 984 979 sizeof(struct o2net_msg) + caller_bytes); 980 + mutex_unlock(&sc->sc_send_lock); 985 981 msglog(msg, "sending returned %d\n", ret); 986 982 if (ret < 0) { 987 983 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); ··· 1057 1049 int ret = 0, handler_status; 1058 1050 enum o2net_system_error syserr; 1059 1051 struct o2net_msg_handler *nmh = NULL; 1052 + void *ret_data = NULL; 1060 1053 1061 1054 msglog(hdr, "processing message\n"); 1062 1055 ··· 1110 1101 sc->sc_msg_type = be16_to_cpu(hdr->msg_type); 1111 1102 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + 
1112 1103 be16_to_cpu(hdr->data_len), 1113 - nmh->nh_func_data); 1104 + nmh->nh_func_data, &ret_data); 1114 1105 do_gettimeofday(&sc->sc_tv_func_stop); 1115 1106 1116 1107 out_respond: 1117 1108 /* this destroys the hdr, so don't use it after this */ 1109 + mutex_lock(&sc->sc_send_lock); 1118 1110 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, 1119 1111 handler_status); 1112 + mutex_unlock(&sc->sc_send_lock); 1120 1113 hdr = NULL; 1121 1114 mlog(0, "sending handler status %d, syserr %d returned %d\n", 1122 1115 handler_status, syserr, ret); 1116 + 1117 + if (nmh) { 1118 + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); 1119 + if (nmh->nh_post_func) 1120 + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, 1121 + ret_data); 1122 + } 1123 1123 1124 1124 out: 1125 1125 if (nmh) ··· 1813 1795 ready(sk, bytes); 1814 1796 } 1815 1797 1816 - static int o2net_open_listening_sock(__be16 port) 1798 + static int o2net_open_listening_sock(__be32 addr, __be16 port) 1817 1799 { 1818 1800 struct socket *sock = NULL; 1819 1801 int ret; 1820 1802 struct sockaddr_in sin = { 1821 1803 .sin_family = PF_INET, 1822 - .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, 1804 + .sin_addr = { .s_addr = (__force u32)addr }, 1823 1805 .sin_port = (__force u16)port, 1824 1806 }; 1825 1807 ··· 1842 1824 sock->sk->sk_reuse = 1; 1843 1825 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 1844 1826 if (ret < 0) { 1845 - mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", 1846 - ntohs(port), ret); 1827 + mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, " 1828 + "ret=%d\n", NIPQUAD(addr), ntohs(port), ret); 1847 1829 goto out; 1848 1830 } 1849 1831 1850 1832 ret = sock->ops->listen(sock, 64); 1851 1833 if (ret < 0) { 1852 - mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", 1853 - ntohs(port), ret); 1834 + mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n", 1835 + NIPQUAD(addr), ntohs(port), ret); 1854 1836 } 1855 1837 1856 
1838 out: ··· 1883 1865 return -ENOMEM; /* ? */ 1884 1866 } 1885 1867 1886 - ret = o2net_open_listening_sock(node->nd_ipv4_port); 1868 + ret = o2net_open_listening_sock(node->nd_ipv4_address, 1869 + node->nd_ipv4_port); 1887 1870 if (ret) { 1888 1871 destroy_workqueue(o2net_wq); 1889 1872 o2net_wq = NULL;
+5 -1
fs/ocfs2/cluster/tcp.h
··· 50 50 __u8 buf[0]; 51 51 }; 52 52 53 - typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); 53 + typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, 54 + void **ret_data); 55 + typedef void (o2net_post_msg_handler_func)(int status, void *data, 56 + void *ret_data); 54 57 55 58 #define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) 56 59 ··· 102 99 103 100 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, 104 101 o2net_msg_handler_func *func, void *data, 102 + o2net_post_msg_handler_func *post_func, 105 103 struct list_head *unreg_list); 106 104 void o2net_unregister_handler_list(struct list_head *list); 107 105
+11 -1
fs/ocfs2/cluster/tcp_internal.h
··· 38 38 * locking semantics of the file system using the protocol. It should 39 39 * be somewhere else, I'm sure, but right now it isn't. 40 40 * 41 + * New in version 7: 42 + * - DLM join domain includes the live nodemap 43 + * 44 + * New in version 6: 45 + * - DLM lockres remote refcount fixes. 46 + * 41 47 * New in version 5: 42 48 * - Network timeout checking protocol 43 49 * ··· 57 51 * - full 64 bit i_size in the metadata lock lvbs 58 52 * - introduction of "rw" lock and pushing meta/data locking down 59 53 */ 60 - #define O2NET_PROTOCOL_VERSION 5ULL 54 + #define O2NET_PROTOCOL_VERSION 7ULL 61 55 struct o2net_handshake { 62 56 __be64 protocol_version; 63 57 __be64 connector_id; ··· 155 149 struct timeval sc_tv_func_stop; 156 150 u32 sc_msg_key; 157 151 u16 sc_msg_type; 152 + 153 + struct mutex sc_send_lock; 158 154 }; 159 155 160 156 struct o2net_msg_handler { ··· 166 158 u32 nh_key; 167 159 o2net_msg_handler_func *nh_func; 168 160 o2net_msg_handler_func *nh_func_data; 161 + o2net_post_msg_handler_func 162 + *nh_post_func; 169 163 struct kref nh_kref; 170 164 struct list_head nh_unregister_item; 171 165 };
+8 -6
fs/ocfs2/dlm/dlmast.c
··· 263 263 264 264 265 265 266 - int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) 266 + int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, 267 + void **ret_data) 267 268 { 268 269 int ret; 269 270 unsigned int locklen; ··· 312 311 past->type != DLM_BAST) { 313 312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 314 313 "name=%.*s\n", past->type, 315 - dlm_get_lock_cookie_node(cookie), 316 - dlm_get_lock_cookie_seq(cookie), 314 + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 315 + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 317 316 locklen, name); 318 317 ret = DLM_IVLOCKID; 319 318 goto leave; ··· 324 323 mlog(0, "got %sast for unknown lockres! " 325 324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 326 325 past->type == DLM_AST ? "" : "b", 327 - dlm_get_lock_cookie_node(cookie), 328 - dlm_get_lock_cookie_seq(cookie), 326 + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 327 + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 329 328 locklen, name, locklen); 330 329 ret = DLM_IVLOCKID; 331 330 goto leave; ··· 370 369 371 370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 372 371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 373 - dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), 372 + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 373 + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 374 374 locklen, name, locklen); 375 375 376 376 ret = DLM_NORMAL;
+109 -21
fs/ocfs2/dlm/dlmcommon.h
··· 180 180 unsigned ignore_higher:1; 181 181 }; 182 182 183 + struct dlm_deref_lockres_priv 184 + { 185 + struct dlm_lock_resource *deref_res; 186 + u8 deref_node; 187 + }; 183 188 184 189 struct dlm_work_item 185 190 { ··· 196 191 struct dlm_request_all_locks_priv ral; 197 192 struct dlm_mig_lockres_priv ml; 198 193 struct dlm_assert_master_priv am; 194 + struct dlm_deref_lockres_priv dl; 199 195 } u; 200 196 }; 201 197 ··· 228 222 #define DLM_LOCK_RES_DIRTY 0x00000008 229 223 #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 230 224 #define DLM_LOCK_RES_MIGRATING 0x00000020 225 + #define DLM_LOCK_RES_DROPPING_REF 0x00000040 226 + #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 227 + #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 231 228 232 229 /* max milliseconds to wait to sync up a network failure with a node death */ 233 230 #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) ··· 274 265 u8 owner; //node which owns the lock resource, or unknown 275 266 u16 state; 276 267 char lvb[DLM_LVB_LEN]; 268 + unsigned int inflight_locks; 269 + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 277 270 }; 278 271 279 272 struct dlm_migratable_lock ··· 378 367 DLM_CONVERT_LOCK_MSG, /* 504 */ 379 368 DLM_PROXY_AST_MSG, /* 505 */ 380 369 DLM_UNLOCK_LOCK_MSG, /* 506 */ 381 - DLM_UNUSED_MSG2, /* 507 */ 370 + DLM_DEREF_LOCKRES_MSG, /* 507 */ 382 371 DLM_MIGRATE_REQUEST_MSG, /* 508 */ 383 372 DLM_MIG_LOCKRES_MSG, /* 509 */ 384 373 DLM_QUERY_JOIN_MSG, /* 510 */ ··· 428 417 u8 name[O2NM_MAX_NAME_LEN]; 429 418 }; 430 419 420 + #define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 421 + #define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 422 + 431 423 #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 432 424 #define DLM_ASSERT_MASTER_REQUERY 0x00000002 433 425 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 ··· 443 429 444 430 u8 name[O2NM_MAX_NAME_LEN]; 445 431 }; 432 + 433 + #define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 446 434 447 435 struct dlm_migrate_request 448 436 { ··· 625 609 }; 626 
610 627 611 612 + #define BITS_PER_BYTE 8 613 + #define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) 614 + 628 615 struct dlm_query_join_request 629 616 { 630 617 u8 node_idx; 631 618 u8 pad1[2]; 632 619 u8 name_len; 633 620 u8 domain[O2NM_MAX_NAME_LEN]; 621 + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; 634 622 }; 635 623 636 624 struct dlm_assert_joined ··· 666 646 u8 flags; 667 647 u8 pad1; 668 648 __be32 pad2; 649 + }; 650 + 651 + struct dlm_deref_lockres 652 + { 653 + u32 pad1; 654 + u16 pad2; 655 + u8 node_idx; 656 + u8 namelen; 657 + 658 + u8 name[O2NM_MAX_NAME_LEN]; 669 659 }; 670 660 671 661 static inline enum dlm_status ··· 718 688 void dlm_lock_attach_lockres(struct dlm_lock *lock, 719 689 struct dlm_lock_resource *res); 720 690 721 - int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); 722 - int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); 723 - int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); 691 + int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, 692 + void **ret_data); 693 + int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, 694 + void **ret_data); 695 + int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, 696 + void **ret_data); 724 697 725 698 void dlm_revert_pending_convert(struct dlm_lock_resource *res, 726 699 struct dlm_lock *lock); 727 700 void dlm_revert_pending_lock(struct dlm_lock_resource *res, 728 701 struct dlm_lock *lock); 729 702 730 - int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); 703 + int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, 704 + void **ret_data); 731 705 void dlm_commit_pending_cancel(struct dlm_lock_resource *res, 732 706 struct dlm_lock *lock); 733 707 void dlm_commit_pending_unlock(struct dlm_lock_resource *res, ··· 755 721 struct dlm_lock_resource *res); 756 722 void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, 757 723 struct 
dlm_lock_resource *res); 758 - void dlm_purge_lockres(struct dlm_ctxt *dlm, 759 - struct dlm_lock_resource *lockres); 760 724 static inline void dlm_lockres_get(struct dlm_lock_resource *res) 761 725 { 762 726 /* This is called on every lookup, so it might be worth ··· 765 733 void __dlm_unhash_lockres(struct dlm_lock_resource *res); 766 734 void __dlm_insert_lockres(struct dlm_ctxt *dlm, 767 735 struct dlm_lock_resource *res); 736 + struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 737 + const char *name, 738 + unsigned int len, 739 + unsigned int hash); 768 740 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 769 741 const char *name, 770 742 unsigned int len, ··· 788 752 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 789 753 const char *name, 790 754 unsigned int namelen); 755 + 756 + #define dlm_lockres_set_refmap_bit(bit,res) \ 757 + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 758 + #define dlm_lockres_clear_refmap_bit(bit,res) \ 759 + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 760 + 761 + static inline void __dlm_lockres_set_refmap_bit(int bit, 762 + struct dlm_lock_resource *res, 763 + const char *file, 764 + int line) 765 + { 766 + //printk("%s:%d:%.*s: setting bit %d\n", file, line, 767 + // res->lockname.len, res->lockname.name, bit); 768 + set_bit(bit, res->refmap); 769 + } 770 + 771 + static inline void __dlm_lockres_clear_refmap_bit(int bit, 772 + struct dlm_lock_resource *res, 773 + const char *file, 774 + int line) 775 + { 776 + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, 777 + // res->lockname.len, res->lockname.name, bit); 778 + clear_bit(bit, res->refmap); 779 + } 780 + 781 + void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 782 + struct dlm_lock_resource *res, 783 + const char *file, 784 + int line); 785 + void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 786 + struct dlm_lock_resource *res, 787 + int new_lockres, 788 + const char 
*file, 789 + int line); 790 + #define dlm_lockres_drop_inflight_ref(d,r) \ 791 + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) 792 + #define dlm_lockres_grab_inflight_ref(d,r) \ 793 + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) 794 + #define dlm_lockres_grab_inflight_ref_new(d,r) \ 795 + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) 791 796 792 797 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 793 798 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); ··· 878 801 void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); 879 802 void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); 880 803 881 - int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 882 - int dlm_migrate_lockres(struct dlm_ctxt *dlm, 883 - struct dlm_lock_resource *res, 884 - u8 target); 804 + int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 885 805 int dlm_finish_migration(struct dlm_ctxt *dlm, 886 806 struct dlm_lock_resource *res, 887 807 u8 old_master); ··· 886 812 struct dlm_lock_resource *res); 887 813 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); 888 814 889 - int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); 890 - int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); 891 - int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); 892 - int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); 893 - int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); 894 - int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); 895 - int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); 896 - int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); 897 - int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); 815 + int dlm_master_request_handler(struct o2net_msg *msg, 
u32 len, void *data, 816 + void **ret_data); 817 + int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 818 + void **ret_data); 819 + void dlm_assert_master_post_handler(int status, void *data, void *ret_data); 820 + int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 821 + void **ret_data); 822 + int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 823 + void **ret_data); 824 + int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 825 + void **ret_data); 826 + int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, 827 + void **ret_data); 828 + int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, 829 + void **ret_data); 830 + int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, 831 + void **ret_data); 832 + int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, 833 + void **ret_data); 834 + int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, 835 + void **ret_data); 898 836 int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 899 837 u8 nodenum, u8 *real_master); 900 838 ··· 942 856 int dlm_init_mle_cache(void); 943 857 void dlm_destroy_mle_cache(void); 944 858 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); 859 + int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, 860 + struct dlm_lock_resource *res); 945 861 void dlm_clean_master_list(struct dlm_ctxt *dlm, 946 862 u8 dead_node); 947 863 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 948 - 864 + int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 949 865 int __dlm_lockres_unused(struct dlm_lock_resource *res); 950 866 951 867 static inline const char * dlm_lock_mode_name(int mode)
+14 -26
fs/ocfs2/dlm/dlmconvert.c
··· 286 286 __dlm_print_one_lock_resource(res); 287 287 mlog(ML_ERROR, "converting a remote lock that is already " 288 288 "converting! (cookie=%u:%llu, conv=%d)\n", 289 - dlm_get_lock_cookie_node(lock->ml.cookie), 290 - dlm_get_lock_cookie_seq(lock->ml.cookie), 289 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 290 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 291 291 lock->ml.convert_type); 292 292 status = DLM_DENIED; 293 293 goto bail; ··· 418 418 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, 419 419 * status from __dlmconvert_master 420 420 */ 421 - int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) 421 + int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, 422 + void **ret_data) 422 423 { 423 424 struct dlm_ctxt *dlm = data; 424 425 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; ··· 429 428 struct dlm_lockstatus *lksb; 430 429 enum dlm_status status = DLM_NORMAL; 431 430 u32 flags; 432 - int call_ast = 0, kick_thread = 0, ast_reserved = 0; 431 + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; 433 432 434 433 if (!dlm_grab(dlm)) { 435 434 dlm_error(DLM_REJECTED); ··· 480 479 } 481 480 lock = NULL; 482 481 } 483 - if (!lock) { 484 - __dlm_print_one_lock_resource(res); 485 - list_for_each(iter, &res->granted) { 486 - lock = list_entry(iter, struct dlm_lock, list); 487 - if (lock->ml.node == cnv->node_idx) { 488 - mlog(ML_ERROR, "There is something here " 489 - "for node %u, lock->ml.cookie=%llu, " 490 - "cnv->cookie=%llu\n", cnv->node_idx, 491 - (unsigned long long)lock->ml.cookie, 492 - (unsigned long long)cnv->cookie); 493 - break; 494 - } 495 - } 496 - lock = NULL; 497 - } 498 482 spin_unlock(&res->spinlock); 499 483 if (!lock) { 500 484 status = DLM_IVLOCKID; 501 - dlm_error(status); 485 + mlog(ML_ERROR, "did not find lock to convert on grant queue! 
" 486 + "cookie=%u:%llu\n", 487 + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), 488 + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); 489 + __dlm_print_one_lock_resource(res); 502 490 goto leave; 503 491 } 504 492 ··· 514 524 cnv->requested_type, 515 525 &call_ast, &kick_thread); 516 526 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 527 + wake = 1; 517 528 } 518 529 spin_unlock(&res->spinlock); 530 + if (wake) 531 + wake_up(&res->wq); 519 532 520 533 if (status != DLM_NORMAL) { 521 534 if (status != DLM_NOTQUEUED) ··· 527 534 } 528 535 529 536 leave: 530 - if (!lock) 531 - mlog(ML_ERROR, "did not find lock to convert on grant queue! " 532 - "cookie=%u:%llu\n", 533 - dlm_get_lock_cookie_node(cnv->cookie), 534 - dlm_get_lock_cookie_seq(cnv->cookie)); 535 - else 537 + if (lock) 536 538 dlm_lock_put(lock); 537 539 538 540 /* either queue the ast or release it, if reserved */
+24 -6
fs/ocfs2/dlm/dlmdebug.c
··· 53 53 spin_unlock(&res->spinlock); 54 54 } 55 55 56 + static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) 57 + { 58 + int bit; 59 + assert_spin_locked(&res->spinlock); 60 + 61 + mlog(ML_NOTICE, " refmap nodes: [ "); 62 + bit = 0; 63 + while (1) { 64 + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 65 + if (bit >= O2NM_MAX_NODES) 66 + break; 67 + printk("%u ", bit); 68 + bit++; 69 + } 70 + printk("], inflight=%u\n", res->inflight_locks); 71 + } 72 + 56 73 void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 57 74 { 58 75 struct list_head *iter2; ··· 82 65 res->owner, res->state); 83 66 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", 84 67 res->last_used, list_empty(&res->purge) ? "no" : "yes"); 68 + dlm_print_lockres_refmap(res); 85 69 mlog(ML_NOTICE, " granted queue: \n"); 86 70 list_for_each(iter2, &res->granted) { 87 71 lock = list_entry(iter2, struct dlm_lock, list); ··· 90 72 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 91 73 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 92 74 lock->ml.type, lock->ml.convert_type, lock->ml.node, 93 - dlm_get_lock_cookie_node(lock->ml.cookie), 94 - dlm_get_lock_cookie_seq(lock->ml.cookie), 75 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 76 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 95 77 list_empty(&lock->ast_list) ? 'y' : 'n', 96 78 lock->ast_pending ? 'y' : 'n', 97 79 list_empty(&lock->bast_list) ? 'y' : 'n', ··· 105 87 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 106 88 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 107 89 lock->ml.type, lock->ml.convert_type, lock->ml.node, 108 - dlm_get_lock_cookie_node(lock->ml.cookie), 109 - dlm_get_lock_cookie_seq(lock->ml.cookie), 90 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 91 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 110 92 list_empty(&lock->ast_list) ? 'y' : 'n', 111 93 lock->ast_pending ? 
'y' : 'n', 112 94 list_empty(&lock->bast_list) ? 'y' : 'n', ··· 120 102 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " 121 103 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 122 104 lock->ml.type, lock->ml.convert_type, lock->ml.node, 123 - dlm_get_lock_cookie_node(lock->ml.cookie), 124 - dlm_get_lock_cookie_seq(lock->ml.cookie), 105 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 106 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 125 107 list_empty(&lock->ast_list) ? 'y' : 'n', 126 108 lock->ast_pending ? 'y' : 'n', 127 109 list_empty(&lock->bast_list) ? 'y' : 'n',
+192 -57
fs/ocfs2/dlm/dlmdomain.c
··· 48 48 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 49 49 #include "cluster/masklog.h" 50 50 51 + /* 52 + * ocfs2 node maps are array of long int, which limits to send them freely 53 + * across the wire due to endianness issues. To workaround this, we convert 54 + * long ints to byte arrays. Following 3 routines are helper functions to 55 + * set/test/copy bits within those array of bytes 56 + */ 57 + static inline void byte_set_bit(u8 nr, u8 map[]) 58 + { 59 + map[nr >> 3] |= (1UL << (nr & 7)); 60 + } 61 + 62 + static inline int byte_test_bit(u8 nr, u8 map[]) 63 + { 64 + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; 65 + } 66 + 67 + static inline void byte_copymap(u8 dmap[], unsigned long smap[], 68 + unsigned int sz) 69 + { 70 + unsigned int nn; 71 + 72 + if (!sz) 73 + return; 74 + 75 + memset(dmap, 0, ((sz + 7) >> 3)); 76 + for (nn = 0 ; nn < sz; nn++) 77 + if (test_bit(nn, smap)) 78 + byte_set_bit(nn, dmap); 79 + } 80 + 51 81 static void dlm_free_pagevec(void **vec, int pages) 52 82 { 53 83 while (pages--) ··· 125 95 126 96 #define DLM_DOMAIN_BACKOFF_MS 200 127 97 128 - static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); 129 - static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); 130 - static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); 131 - static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); 98 + static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, 99 + void **ret_data); 100 + static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, 101 + void **ret_data); 102 + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 103 + void **ret_data); 104 + static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 105 + void **ret_data); 132 106 133 107 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 134 108 ··· 159 125 
hlist_add_head(&res->hash_node, bucket); 160 126 } 161 127 162 - struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 163 - const char *name, 164 - unsigned int len, 165 - unsigned int hash) 128 + struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 129 + const char *name, 130 + unsigned int len, 131 + unsigned int hash) 166 132 { 167 133 struct hlist_head *bucket; 168 134 struct hlist_node *list; ··· 186 152 return res; 187 153 } 188 154 return NULL; 155 + } 156 + 157 + /* intended to be called by functions which do not care about lock 158 + * resources which are being purged (most net _handler functions). 159 + * this will return NULL for any lock resource which is found but 160 + * currently in the process of dropping its mastery reference. 161 + * use __dlm_lookup_lockres_full when you need the lock resource 162 + * regardless (e.g. dlm_get_lock_resource) */ 163 + struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 164 + const char *name, 165 + unsigned int len, 166 + unsigned int hash) 167 + { 168 + struct dlm_lock_resource *res = NULL; 169 + 170 + mlog_entry("%.*s\n", len, name); 171 + 172 + assert_spin_locked(&dlm->spinlock); 173 + 174 + res = __dlm_lookup_lockres_full(dlm, name, len, hash); 175 + if (res) { 176 + spin_lock(&res->spinlock); 177 + if (res->state & DLM_LOCK_RES_DROPPING_REF) { 178 + spin_unlock(&res->spinlock); 179 + dlm_lockres_put(res); 180 + return NULL; 181 + } 182 + spin_unlock(&res->spinlock); 183 + } 184 + 185 + return res; 189 186 } 190 187 191 188 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, ··· 395 330 wake_up(&dlm_domain_events); 396 331 } 397 332 398 - static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) 333 + static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) 399 334 { 400 - int i; 335 + int i, num, n, ret = 0; 401 336 struct dlm_lock_resource *res; 337 + struct hlist_node *iter; 338 + struct hlist_head *bucket; 339 + int dropped; 402 340 403 
341 mlog(0, "Migrating locks from domain %s\n", dlm->name); 404 - restart: 342 + 343 + num = 0; 405 344 spin_lock(&dlm->spinlock); 406 345 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 407 - while (!hlist_empty(dlm_lockres_hash(dlm, i))) { 408 - res = hlist_entry(dlm_lockres_hash(dlm, i)->first, 409 - struct dlm_lock_resource, hash_node); 410 - /* need reference when manually grabbing lockres */ 346 + redo_bucket: 347 + n = 0; 348 + bucket = dlm_lockres_hash(dlm, i); 349 + iter = bucket->first; 350 + while (iter) { 351 + n++; 352 + res = hlist_entry(iter, struct dlm_lock_resource, 353 + hash_node); 411 354 dlm_lockres_get(res); 412 - /* this should unhash the lockres 413 - * and exit with dlm->spinlock */ 414 - mlog(0, "purging res=%p\n", res); 415 - if (dlm_lockres_is_dirty(dlm, res)) { 416 - /* HACK! this should absolutely go. 417 - * need to figure out why some empty 418 - * lockreses are still marked dirty */ 419 - mlog(ML_ERROR, "lockres %.*s dirty!\n", 420 - res->lockname.len, res->lockname.name); 355 + /* migrate, if necessary. this will drop the dlm 356 + * spinlock and retake it if it does migration. 
*/ 357 + dropped = dlm_empty_lockres(dlm, res); 421 358 422 - spin_unlock(&dlm->spinlock); 423 - dlm_kick_thread(dlm, res); 424 - wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 425 - dlm_lockres_put(res); 426 - goto restart; 427 - } 428 - dlm_purge_lockres(dlm, res); 359 + spin_lock(&res->spinlock); 360 + __dlm_lockres_calc_usage(dlm, res); 361 + iter = res->hash_node.next; 362 + spin_unlock(&res->spinlock); 363 + 429 364 dlm_lockres_put(res); 365 + 366 + cond_resched_lock(&dlm->spinlock); 367 + 368 + if (dropped) 369 + goto redo_bucket; 430 370 } 371 + num += n; 372 + mlog(0, "%s: touched %d lockreses in bucket %d " 373 + "(tot=%d)\n", dlm->name, n, i, num); 431 374 } 432 375 spin_unlock(&dlm->spinlock); 376 + wake_up(&dlm->dlm_thread_wq); 433 377 378 + /* let the dlm thread take care of purging, keep scanning until 379 + * nothing remains in the hash */ 380 + if (num) { 381 + mlog(0, "%s: %d lock resources in hash last pass\n", 382 + dlm->name, num); 383 + ret = -EAGAIN; 384 + } 434 385 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); 386 + return ret; 435 387 } 436 388 437 389 static int dlm_no_joining_node(struct dlm_ctxt *dlm) ··· 500 418 printk("\n"); 501 419 } 502 420 503 - static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) 421 + static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 422 + void **ret_data) 504 423 { 505 424 struct dlm_ctxt *dlm = data; 506 425 unsigned int node; ··· 654 571 /* We changed dlm state, notify the thread */ 655 572 dlm_kick_thread(dlm, NULL); 656 573 657 - dlm_migrate_all_locks(dlm); 574 + while (dlm_migrate_all_locks(dlm)) { 575 + mlog(0, "%s: more migration to do\n", dlm->name); 576 + } 658 577 dlm_mark_domain_leaving(dlm); 659 578 dlm_leave_domain(dlm); 660 579 dlm_complete_dlm_shutdown(dlm); ··· 665 580 } 666 581 EXPORT_SYMBOL_GPL(dlm_unregister_domain); 667 582 668 - static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) 583 + 
static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, 584 + void **ret_data) 669 585 { 670 586 struct dlm_query_join_request *query; 671 587 enum dlm_query_join_response response; 672 588 struct dlm_ctxt *dlm = NULL; 589 + u8 nodenum; 673 590 674 591 query = (struct dlm_query_join_request *) msg->buf; 675 592 ··· 695 608 696 609 spin_lock(&dlm_domain_lock); 697 610 dlm = __dlm_lookup_domain_full(query->domain, query->name_len); 611 + if (!dlm) 612 + goto unlock_respond; 613 + 614 + /* 615 + * There is a small window where the joining node may not see the 616 + * node(s) that just left but still part of the cluster. DISALLOW 617 + * join request if joining node has different node map. 618 + */ 619 + nodenum=0; 620 + while (nodenum < O2NM_MAX_NODES) { 621 + if (test_bit(nodenum, dlm->domain_map)) { 622 + if (!byte_test_bit(nodenum, query->node_map)) { 623 + mlog(0, "disallow join as node %u does not " 624 + "have node %u in its nodemap\n", 625 + query->node_idx, nodenum); 626 + response = JOIN_DISALLOW; 627 + goto unlock_respond; 628 + } 629 + } 630 + nodenum++; 631 + } 632 + 698 633 /* Once the dlm ctxt is marked as leaving then we don't want 699 634 * to be put in someone's domain map. 700 635 * Also, explicitly disallow joining at certain troublesome ··· 735 626 /* Disallow parallel joins. 
*/ 736 627 response = JOIN_DISALLOW; 737 628 } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { 738 - mlog(ML_NOTICE, "node %u trying to join, but recovery " 629 + mlog(0, "node %u trying to join, but recovery " 739 630 "is ongoing.\n", bit); 740 631 response = JOIN_DISALLOW; 741 632 } else if (test_bit(bit, dlm->recovery_map)) { 742 - mlog(ML_NOTICE, "node %u trying to join, but it " 633 + mlog(0, "node %u trying to join, but it " 743 634 "still needs recovery.\n", bit); 744 635 response = JOIN_DISALLOW; 745 636 } else if (test_bit(bit, dlm->domain_map)) { 746 - mlog(ML_NOTICE, "node %u trying to join, but it " 637 + mlog(0, "node %u trying to join, but it " 747 638 "is still in the domain! needs recovery?\n", 748 639 bit); 749 640 response = JOIN_DISALLOW; ··· 758 649 759 650 spin_unlock(&dlm->spinlock); 760 651 } 652 + unlock_respond: 761 653 spin_unlock(&dlm_domain_lock); 762 654 763 655 respond: ··· 767 657 return response; 768 658 } 769 659 770 - static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) 660 + static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, 661 + void **ret_data) 771 662 { 772 663 struct dlm_assert_joined *assert; 773 664 struct dlm_ctxt *dlm = NULL; ··· 805 694 return 0; 806 695 } 807 696 808 - static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) 697 + static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 698 + void **ret_data) 809 699 { 810 700 struct dlm_cancel_join *cancel; 811 701 struct dlm_ctxt *dlm = NULL; ··· 907 795 join_msg.node_idx = dlm->node_num; 908 796 join_msg.name_len = strlen(dlm->name); 909 797 memcpy(join_msg.domain, dlm->name, join_msg.name_len); 798 + 799 + /* copy live node map to join message */ 800 + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 910 801 911 802 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 912 803 sizeof(join_msg), node, &retval); ··· 1151 1036 
status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, 1152 1037 sizeof(struct dlm_master_request), 1153 1038 dlm_master_request_handler, 1154 - dlm, &dlm->dlm_domain_handlers); 1039 + dlm, NULL, &dlm->dlm_domain_handlers); 1155 1040 if (status) 1156 1041 goto bail; 1157 1042 1158 1043 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, 1159 1044 sizeof(struct dlm_assert_master), 1160 1045 dlm_assert_master_handler, 1161 - dlm, &dlm->dlm_domain_handlers); 1046 + dlm, dlm_assert_master_post_handler, 1047 + &dlm->dlm_domain_handlers); 1162 1048 if (status) 1163 1049 goto bail; 1164 1050 1165 1051 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, 1166 1052 sizeof(struct dlm_create_lock), 1167 1053 dlm_create_lock_handler, 1168 - dlm, &dlm->dlm_domain_handlers); 1054 + dlm, NULL, &dlm->dlm_domain_handlers); 1169 1055 if (status) 1170 1056 goto bail; 1171 1057 1172 1058 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, 1173 1059 DLM_CONVERT_LOCK_MAX_LEN, 1174 1060 dlm_convert_lock_handler, 1175 - dlm, &dlm->dlm_domain_handlers); 1061 + dlm, NULL, &dlm->dlm_domain_handlers); 1176 1062 if (status) 1177 1063 goto bail; 1178 1064 1179 1065 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, 1180 1066 DLM_UNLOCK_LOCK_MAX_LEN, 1181 1067 dlm_unlock_lock_handler, 1182 - dlm, &dlm->dlm_domain_handlers); 1068 + dlm, NULL, &dlm->dlm_domain_handlers); 1183 1069 if (status) 1184 1070 goto bail; 1185 1071 1186 1072 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, 1187 1073 DLM_PROXY_AST_MAX_LEN, 1188 1074 dlm_proxy_ast_handler, 1189 - dlm, &dlm->dlm_domain_handlers); 1075 + dlm, NULL, &dlm->dlm_domain_handlers); 1190 1076 if (status) 1191 1077 goto bail; 1192 1078 1193 1079 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, 1194 1080 sizeof(struct dlm_exit_domain), 1195 1081 dlm_exit_domain_handler, 1196 - dlm, &dlm->dlm_domain_handlers); 1082 + dlm, NULL, &dlm->dlm_domain_handlers); 1083 + if 
(status) 1084 + goto bail; 1085 + 1086 + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, 1087 + sizeof(struct dlm_deref_lockres), 1088 + dlm_deref_lockres_handler, 1089 + dlm, NULL, &dlm->dlm_domain_handlers); 1197 1090 if (status) 1198 1091 goto bail; 1199 1092 1200 1093 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, 1201 1094 sizeof(struct dlm_migrate_request), 1202 1095 dlm_migrate_request_handler, 1203 - dlm, &dlm->dlm_domain_handlers); 1096 + dlm, NULL, &dlm->dlm_domain_handlers); 1204 1097 if (status) 1205 1098 goto bail; 1206 1099 1207 1100 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, 1208 1101 DLM_MIG_LOCKRES_MAX_LEN, 1209 1102 dlm_mig_lockres_handler, 1210 - dlm, &dlm->dlm_domain_handlers); 1103 + dlm, NULL, &dlm->dlm_domain_handlers); 1211 1104 if (status) 1212 1105 goto bail; 1213 1106 1214 1107 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, 1215 1108 sizeof(struct dlm_master_requery), 1216 1109 dlm_master_requery_handler, 1217 - dlm, &dlm->dlm_domain_handlers); 1110 + dlm, NULL, &dlm->dlm_domain_handlers); 1218 1111 if (status) 1219 1112 goto bail; 1220 1113 1221 1114 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, 1222 1115 sizeof(struct dlm_lock_request), 1223 1116 dlm_request_all_locks_handler, 1224 - dlm, &dlm->dlm_domain_handlers); 1117 + dlm, NULL, &dlm->dlm_domain_handlers); 1225 1118 if (status) 1226 1119 goto bail; 1227 1120 1228 1121 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, 1229 1122 sizeof(struct dlm_reco_data_done), 1230 1123 dlm_reco_data_done_handler, 1231 - dlm, &dlm->dlm_domain_handlers); 1124 + dlm, NULL, &dlm->dlm_domain_handlers); 1232 1125 if (status) 1233 1126 goto bail; 1234 1127 1235 1128 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, 1236 1129 sizeof(struct dlm_begin_reco), 1237 1130 dlm_begin_reco_handler, 1238 - dlm, &dlm->dlm_domain_handlers); 1131 + dlm, NULL, &dlm->dlm_domain_handlers); 1239 1132 
if (status) 1240 1133 goto bail; 1241 1134 1242 1135 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, 1243 1136 sizeof(struct dlm_finalize_reco), 1244 1137 dlm_finalize_reco_handler, 1245 - dlm, &dlm->dlm_domain_handlers); 1138 + dlm, NULL, &dlm->dlm_domain_handlers); 1246 1139 if (status) 1247 1140 goto bail; 1248 1141 ··· 1264 1141 static int dlm_join_domain(struct dlm_ctxt *dlm) 1265 1142 { 1266 1143 int status; 1144 + unsigned int backoff; 1145 + unsigned int total_backoff = 0; 1267 1146 1268 1147 BUG_ON(!dlm); 1269 1148 ··· 1297 1172 } 1298 1173 1299 1174 do { 1300 - unsigned int backoff; 1301 1175 status = dlm_try_to_join_domain(dlm); 1302 1176 1303 1177 /* If we're racing another node to the join, then we 1304 1178 * need to back off temporarily and let them 1305 1179 * complete. */ 1180 + #define DLM_JOIN_TIMEOUT_MSECS 90000 1306 1181 if (status == -EAGAIN) { 1307 1182 if (signal_pending(current)) { 1308 1183 status = -ERESTARTSYS; 1184 + goto bail; 1185 + } 1186 + 1187 + if (total_backoff > 1188 + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { 1189 + status = -ERESTARTSYS; 1190 + mlog(ML_NOTICE, "Timed out joining dlm domain " 1191 + "%s after %u msecs\n", dlm->name, 1192 + jiffies_to_msecs(total_backoff)); 1309 1193 goto bail; 1310 1194 } 1311 1195 ··· 1327 1193 */ 1328 1194 backoff = (unsigned int)(jiffies & 0x3); 1329 1195 backoff *= DLM_DOMAIN_BACKOFF_MS; 1196 + total_backoff += backoff; 1330 1197 mlog(0, "backoff %d\n", backoff); 1331 1198 msleep(backoff); 1332 1199 } ··· 1556 1421 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, 1557 1422 sizeof(struct dlm_query_join_request), 1558 1423 dlm_query_join_handler, 1559 - NULL, &dlm_join_handlers); 1424 + NULL, NULL, &dlm_join_handlers); 1560 1425 if (status) 1561 1426 goto bail; 1562 1427 1563 1428 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, 1564 1429 sizeof(struct dlm_assert_joined), 1565 1430 dlm_assert_joined_handler, 1566 - NULL, 
&dlm_join_handlers); 1431 + NULL, NULL, &dlm_join_handlers); 1567 1432 if (status) 1568 1433 goto bail; 1569 1434 1570 1435 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, 1571 1436 sizeof(struct dlm_cancel_join), 1572 1437 dlm_cancel_join_handler, 1573 - NULL, &dlm_join_handlers); 1438 + NULL, NULL, &dlm_join_handlers); 1574 1439 1575 1440 bail: 1576 1441 if (status < 0)
+6 -1
fs/ocfs2/dlm/dlmlock.c
··· 163 163 kick_thread = 1; 164 164 } 165 165 } 166 + /* reduce the inflight count, this may result in the lockres 167 + * being purged below during calc_usage */ 168 + if (lock->ml.node == dlm->node_num) 169 + dlm_lockres_drop_inflight_ref(dlm, res); 166 170 167 171 spin_unlock(&res->spinlock); 168 172 wake_up(&res->wq); ··· 441 437 * held on exit: none 442 438 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED 443 439 */ 444 - int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) 440 + int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, 441 + void **ret_data) 445 442 { 446 443 struct dlm_ctxt *dlm = data; 447 444 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
+510 -69
fs/ocfs2/dlm/dlmmaster.c
··· 99 99 int idx); 100 100 101 101 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 102 - static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, 103 - unsigned int namelen, void *nodemap, 104 - u32 flags); 102 + static int dlm_do_assert_master(struct dlm_ctxt *dlm, 103 + struct dlm_lock_resource *res, 104 + void *nodemap, u32 flags); 105 + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); 105 106 106 107 static inline int dlm_mle_equal(struct dlm_ctxt *dlm, 107 108 struct dlm_master_list_entry *mle, ··· 238 237 struct dlm_master_list_entry **mle, 239 238 char *name, unsigned int namelen); 240 239 241 - static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); 240 + static int dlm_do_master_request(struct dlm_lock_resource *res, 241 + struct dlm_master_list_entry *mle, int to); 242 242 243 243 244 244 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, ··· 689 687 INIT_LIST_HEAD(&res->purge); 690 688 atomic_set(&res->asts_reserved, 0); 691 689 res->migration_pending = 0; 690 + res->inflight_locks = 0; 692 691 693 692 kref_init(&res->refs); 694 693 ··· 703 700 res->last_used = 0; 704 701 705 702 memset(res->lvb, 0, DLM_LVB_LEN); 703 + memset(res->refmap, 0, sizeof(res->refmap)); 706 704 } 707 705 708 706 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, ··· 724 720 725 721 dlm_init_lockres(dlm, res, name, namelen); 726 722 return res; 723 + } 724 + 725 + void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 726 + struct dlm_lock_resource *res, 727 + int new_lockres, 728 + const char *file, 729 + int line) 730 + { 731 + if (!new_lockres) 732 + assert_spin_locked(&res->spinlock); 733 + 734 + if (!test_bit(dlm->node_num, res->refmap)) { 735 + BUG_ON(res->inflight_locks != 0); 736 + dlm_lockres_set_refmap_bit(dlm->node_num, res); 737 + } 738 + res->inflight_locks++; 739 + mlog(0, "%s:%.*s: inflight++: now %u\n", 740 + dlm->name, res->lockname.len, 
res->lockname.name, 741 + res->inflight_locks); 742 + } 743 + 744 + void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 745 + struct dlm_lock_resource *res, 746 + const char *file, 747 + int line) 748 + { 749 + assert_spin_locked(&res->spinlock); 750 + 751 + BUG_ON(res->inflight_locks == 0); 752 + res->inflight_locks--; 753 + mlog(0, "%s:%.*s: inflight--: now %u\n", 754 + dlm->name, res->lockname.len, res->lockname.name, 755 + res->inflight_locks); 756 + if (res->inflight_locks == 0) 757 + dlm_lockres_clear_refmap_bit(dlm->node_num, res); 758 + wake_up(&res->wq); 727 759 } 728 760 729 761 /* ··· 792 752 unsigned int hash; 793 753 int tries = 0; 794 754 int bit, wait_on_recovery = 0; 755 + int drop_inflight_if_nonlocal = 0; 795 756 796 757 BUG_ON(!lockid); 797 758 ··· 802 761 803 762 lookup: 804 763 spin_lock(&dlm->spinlock); 805 - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); 764 + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 806 765 if (tmpres) { 766 + int dropping_ref = 0; 767 + 768 + spin_lock(&tmpres->spinlock); 769 + if (tmpres->owner == dlm->node_num) { 770 + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 771 + dlm_lockres_grab_inflight_ref(dlm, tmpres); 772 + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 773 + dropping_ref = 1; 774 + spin_unlock(&tmpres->spinlock); 807 775 spin_unlock(&dlm->spinlock); 776 + 777 + /* wait until done messaging the master, drop our ref to allow 778 + * the lockres to be purged, start over. 
*/ 779 + if (dropping_ref) { 780 + spin_lock(&tmpres->spinlock); 781 + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); 782 + spin_unlock(&tmpres->spinlock); 783 + dlm_lockres_put(tmpres); 784 + tmpres = NULL; 785 + goto lookup; 786 + } 787 + 808 788 mlog(0, "found in hash!\n"); 809 789 if (res) 810 790 dlm_lockres_put(res); ··· 855 793 spin_lock(&res->spinlock); 856 794 dlm_change_lockres_owner(dlm, res, dlm->node_num); 857 795 __dlm_insert_lockres(dlm, res); 796 + dlm_lockres_grab_inflight_ref(dlm, res); 858 797 spin_unlock(&res->spinlock); 859 798 spin_unlock(&dlm->spinlock); 860 799 /* lockres still marked IN_PROGRESS */ ··· 868 805 /* if we found a block, wait for lock to be mastered by another node */ 869 806 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); 870 807 if (blocked) { 808 + int mig; 871 809 if (mle->type == DLM_MLE_MASTER) { 872 810 mlog(ML_ERROR, "master entry for nonexistent lock!\n"); 873 811 BUG(); 874 - } else if (mle->type == DLM_MLE_MIGRATION) { 875 - /* migration is in progress! */ 876 - /* the good news is that we now know the 877 - * "current" master (mle->master). */ 878 - 812 + } 813 + mig = (mle->type == DLM_MLE_MIGRATION); 814 + /* if there is a migration in progress, let the migration 815 + * finish before continuing. we can wait for the absence 816 + * of the MIGRATION mle: either the migrate finished or 817 + * one of the nodes died and the mle was cleaned up. 818 + * if there is a BLOCK here, but it already has a master 819 + * set, we are too late. the master does not have a ref 820 + * for us in the refmap. detach the mle and drop it. 821 + * either way, go back to the top and start over. */ 822 + if (mig || mle->master != O2NM_MAX_NODES) { 823 + BUG_ON(mig && mle->master == dlm->node_num); 824 + /* we arrived too late. the master does not 825 + * have a ref for us. retry. */ 826 + mlog(0, "%s:%.*s: late on %s\n", 827 + dlm->name, namelen, lockid, 828 + mig ? 
"MIGRATION" : "BLOCK"); 879 829 spin_unlock(&dlm->master_lock); 880 - assert_spin_locked(&dlm->spinlock); 881 - 882 - /* set the lockres owner and hash it */ 883 - spin_lock(&res->spinlock); 884 - dlm_set_lockres_owner(dlm, res, mle->master); 885 - __dlm_insert_lockres(dlm, res); 886 - spin_unlock(&res->spinlock); 887 830 spin_unlock(&dlm->spinlock); 888 831 889 832 /* master is known, detach */ 890 - dlm_mle_detach_hb_events(dlm, mle); 833 + if (!mig) 834 + dlm_mle_detach_hb_events(dlm, mle); 891 835 dlm_put_mle(mle); 892 836 mle = NULL; 893 - goto wake_waiters; 837 + /* this is lame, but we cant wait on either 838 + * the mle or lockres waitqueue here */ 839 + if (mig) 840 + msleep(100); 841 + goto lookup; 894 842 } 895 843 } else { 896 844 /* go ahead and try to master lock on this node */ ··· 932 858 933 859 /* finally add the lockres to its hash bucket */ 934 860 __dlm_insert_lockres(dlm, res); 861 + /* since this lockres is new it doesnt not require the spinlock */ 862 + dlm_lockres_grab_inflight_ref_new(dlm, res); 863 + 864 + /* if this node does not become the master make sure to drop 865 + * this inflight reference below */ 866 + drop_inflight_if_nonlocal = 1; 867 + 935 868 /* get an extra ref on the mle in case this is a BLOCK 936 869 * if so, the creator of the BLOCK may try to put the last 937 870 * ref at this time in the assert master handler, so we ··· 991 910 ret = -EINVAL; 992 911 dlm_node_iter_init(mle->vote_map, &iter); 993 912 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 994 - ret = dlm_do_master_request(mle, nodenum); 913 + ret = dlm_do_master_request(res, mle, nodenum); 995 914 if (ret < 0) 996 915 mlog_errno(ret); 997 916 if (mle->master != O2NM_MAX_NODES) { ··· 1041 960 1042 961 wake_waiters: 1043 962 spin_lock(&res->spinlock); 963 + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) 964 + dlm_lockres_drop_inflight_ref(dlm, res); 1044 965 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1045 966 spin_unlock(&res->spinlock); 1046 
967 wake_up(&res->wq); ··· 1081 998 /* this will cause the master to re-assert across 1082 999 * the whole cluster, freeing up mles */ 1083 1000 if (res->owner != dlm->node_num) { 1084 - ret = dlm_do_master_request(mle, res->owner); 1001 + ret = dlm_do_master_request(res, mle, res->owner); 1085 1002 if (ret < 0) { 1086 1003 /* give recovery a chance to run */ 1087 1004 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); ··· 1145 1062 * now tell other nodes that I am 1146 1063 * mastering this. */ 1147 1064 mle->master = dlm->node_num; 1065 + /* ref was grabbed in get_lock_resource 1066 + * will be dropped in dlmlock_master */ 1148 1067 assert = 1; 1149 1068 sleep = 0; 1150 1069 } ··· 1172 1087 (atomic_read(&mle->woken) == 1), 1173 1088 timeo); 1174 1089 if (res->owner == O2NM_MAX_NODES) { 1175 - mlog(0, "waiting again\n"); 1090 + mlog(0, "%s:%.*s: waiting again\n", dlm->name, 1091 + res->lockname.len, res->lockname.name); 1176 1092 goto recheck; 1177 1093 } 1178 1094 mlog(0, "done waiting, master is %u\n", res->owner); ··· 1186 1100 m = dlm->node_num; 1187 1101 mlog(0, "about to master %.*s here, this=%u\n", 1188 1102 res->lockname.len, res->lockname.name, m); 1189 - ret = dlm_do_assert_master(dlm, res->lockname.name, 1190 - res->lockname.len, mle->vote_map, 0); 1103 + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); 1191 1104 if (ret) { 1192 1105 /* This is a failure in the network path, 1193 1106 * not in the response to the assert_master ··· 1202 1117 1203 1118 /* set the lockres owner */ 1204 1119 spin_lock(&res->spinlock); 1120 + /* mastery reference obtained either during 1121 + * assert_master_handler or in get_lock_resource */ 1205 1122 dlm_change_lockres_owner(dlm, res, m); 1206 1123 spin_unlock(&res->spinlock); 1207 1124 ··· 1370 1283 * 1371 1284 */ 1372 1285 1373 - static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) 1286 + static int dlm_do_master_request(struct dlm_lock_resource *res, 1287 + struct 
dlm_master_list_entry *mle, int to) 1374 1288 { 1375 1289 struct dlm_ctxt *dlm = mle->dlm; 1376 1290 struct dlm_master_request request; ··· 1427 1339 case DLM_MASTER_RESP_YES: 1428 1340 set_bit(to, mle->response_map); 1429 1341 mlog(0, "node %u is the master, response=YES\n", to); 1342 + mlog(0, "%s:%.*s: master node %u now knows I have a " 1343 + "reference\n", dlm->name, res->lockname.len, 1344 + res->lockname.name, to); 1430 1345 mle->master = to; 1431 1346 break; 1432 1347 case DLM_MASTER_RESP_NO: ··· 1470 1379 * 1471 1380 * if possible, TRIM THIS DOWN!!! 1472 1381 */ 1473 - int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) 1382 + int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, 1383 + void **ret_data) 1474 1384 { 1475 1385 u8 response = DLM_MASTER_RESP_MAYBE; 1476 1386 struct dlm_ctxt *dlm = data; ··· 1509 1417 1510 1418 /* take care of the easy cases up front */ 1511 1419 spin_lock(&res->spinlock); 1512 - if (res->state & DLM_LOCK_RES_RECOVERING) { 1420 + if (res->state & (DLM_LOCK_RES_RECOVERING| 1421 + DLM_LOCK_RES_MIGRATING)) { 1513 1422 spin_unlock(&res->spinlock); 1514 1423 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 1515 - "being recovered\n"); 1424 + "being recovered/migrated\n"); 1516 1425 response = DLM_MASTER_RESP_ERROR; 1517 1426 if (mle) 1518 1427 kmem_cache_free(dlm_mle_cache, mle); ··· 1521 1428 } 1522 1429 1523 1430 if (res->owner == dlm->node_num) { 1431 + mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1432 + dlm->name, namelen, name, request->node_idx); 1433 + dlm_lockres_set_refmap_bit(request->node_idx, res); 1524 1434 spin_unlock(&res->spinlock); 1525 - // mlog(0, "this node is the master\n"); 1526 1435 response = DLM_MASTER_RESP_YES; 1527 1436 if (mle) 1528 1437 kmem_cache_free(dlm_mle_cache, mle); ··· 1572 1477 mlog(0, "node %u is master, but trying to migrate to " 1573 1478 "node %u.\n", tmpmle->master, tmpmle->new_master); 1574 1479 if (tmpmle->master == dlm->node_num) { 
1575 - response = DLM_MASTER_RESP_YES; 1576 1480 mlog(ML_ERROR, "no owner on lockres, but this " 1577 1481 "node is trying to migrate it to %u?!\n", 1578 1482 tmpmle->new_master); ··· 1588 1494 * go back and clean the mles on any 1589 1495 * other nodes */ 1590 1496 dispatch_assert = 1; 1497 + dlm_lockres_set_refmap_bit(request->node_idx, res); 1498 + mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1499 + dlm->name, namelen, name, 1500 + request->node_idx); 1591 1501 } else 1592 1502 response = DLM_MASTER_RESP_NO; 1593 1503 } else { ··· 1705 1607 * can periodically run all locks owned by this node 1706 1608 * and re-assert across the cluster... 1707 1609 */ 1708 - static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, 1709 - unsigned int namelen, void *nodemap, 1710 - u32 flags) 1610 + int dlm_do_assert_master(struct dlm_ctxt *dlm, 1611 + struct dlm_lock_resource *res, 1612 + void *nodemap, u32 flags) 1711 1613 { 1712 1614 struct dlm_assert_master assert; 1713 1615 int to, tmpret; 1714 1616 struct dlm_node_iter iter; 1715 1617 int ret = 0; 1716 1618 int reassert; 1619 + const char *lockname = res->lockname.name; 1620 + unsigned int namelen = res->lockname.len; 1717 1621 1718 1622 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 1623 + 1624 + spin_lock(&res->spinlock); 1625 + res->state |= DLM_LOCK_RES_SETREF_INPROG; 1626 + spin_unlock(&res->spinlock); 1627 + 1719 1628 again: 1720 1629 reassert = 0; 1721 1630 ··· 1752 1647 mlog(0, "link to %d went down!\n", to); 1753 1648 /* any nonzero status return will do */ 1754 1649 ret = tmpret; 1650 + r = 0; 1755 1651 } else if (r < 0) { 1756 1652 /* ok, something horribly messed. kill thyself. 
*/ 1757 1653 mlog(ML_ERROR,"during assert master of %.*s to %u, " ··· 1767 1661 spin_unlock(&dlm->master_lock); 1768 1662 spin_unlock(&dlm->spinlock); 1769 1663 BUG(); 1770 - } else if (r == EAGAIN) { 1664 + } 1665 + 1666 + if (r & DLM_ASSERT_RESPONSE_REASSERT && 1667 + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { 1668 + mlog(ML_ERROR, "%.*s: very strange, " 1669 + "master MLE but no lockres on %u\n", 1670 + namelen, lockname, to); 1671 + } 1672 + 1673 + if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1771 1674 mlog(0, "%.*s: node %u create mles on other " 1772 1675 "nodes and requests a re-assert\n", 1773 1676 namelen, lockname, to); 1774 1677 reassert = 1; 1775 1678 } 1679 + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { 1680 + mlog(0, "%.*s: node %u has a reference to this " 1681 + "lockres, set the bit in the refmap\n", 1682 + namelen, lockname, to); 1683 + spin_lock(&res->spinlock); 1684 + dlm_lockres_set_refmap_bit(to, res); 1685 + spin_unlock(&res->spinlock); 1686 + } 1776 1687 } 1777 1688 1778 1689 if (reassert) 1779 1690 goto again; 1691 + 1692 + spin_lock(&res->spinlock); 1693 + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 1694 + spin_unlock(&res->spinlock); 1695 + wake_up(&res->wq); 1780 1696 1781 1697 return ret; 1782 1698 } ··· 1812 1684 * 1813 1685 * if possible, TRIM THIS DOWN!!! 
1814 1686 */ 1815 - int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) 1687 + int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 1688 + void **ret_data) 1816 1689 { 1817 1690 struct dlm_ctxt *dlm = data; 1818 1691 struct dlm_master_list_entry *mle = NULL; ··· 1822 1693 char *name; 1823 1694 unsigned int namelen, hash; 1824 1695 u32 flags; 1825 - int master_request = 0; 1696 + int master_request = 0, have_lockres_ref = 0; 1826 1697 int ret = 0; 1827 1698 1828 1699 if (!dlm_grab(dlm)) ··· 1980 1851 spin_unlock(&mle->spinlock); 1981 1852 1982 1853 if (res) { 1854 + int wake = 0; 1983 1855 spin_lock(&res->spinlock); 1984 1856 if (mle->type == DLM_MLE_MIGRATION) { 1985 1857 mlog(0, "finishing off migration of lockres %.*s, " ··· 1988 1858 res->lockname.len, res->lockname.name, 1989 1859 dlm->node_num, mle->new_master); 1990 1860 res->state &= ~DLM_LOCK_RES_MIGRATING; 1861 + wake = 1; 1991 1862 dlm_change_lockres_owner(dlm, res, mle->new_master); 1992 1863 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1993 1864 } else { 1994 1865 dlm_change_lockres_owner(dlm, res, mle->master); 1995 1866 } 1996 1867 spin_unlock(&res->spinlock); 1868 + have_lockres_ref = 1; 1869 + if (wake) 1870 + wake_up(&res->wq); 1997 1871 } 1998 1872 1999 1873 /* master is known, detach if not already detached. ··· 2047 1913 2048 1914 done: 2049 1915 ret = 0; 2050 - if (res) 2051 - dlm_lockres_put(res); 1916 + if (res) { 1917 + spin_lock(&res->spinlock); 1918 + res->state |= DLM_LOCK_RES_SETREF_INPROG; 1919 + spin_unlock(&res->spinlock); 1920 + *ret_data = (void *)res; 1921 + } 2052 1922 dlm_put(dlm); 2053 1923 if (master_request) { 2054 1924 mlog(0, "need to tell master to reassert\n"); 2055 - ret = EAGAIN; // positive. negative would shoot down the node. 1925 + /* positive. negative would shoot down the node. 
*/ 1926 + ret |= DLM_ASSERT_RESPONSE_REASSERT; 1927 + if (!have_lockres_ref) { 1928 + mlog(ML_ERROR, "strange, got assert from %u, MASTER " 1929 + "mle present here for %s:%.*s, but no lockres!\n", 1930 + assert->node_idx, dlm->name, namelen, name); 1931 + } 1932 + } 1933 + if (have_lockres_ref) { 1934 + /* let the master know we have a reference to the lockres */ 1935 + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; 1936 + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", 1937 + dlm->name, namelen, name, assert->node_idx); 2056 1938 } 2057 1939 return ret; 2058 1940 ··· 2079 1929 __dlm_print_one_lock_resource(res); 2080 1930 spin_unlock(&res->spinlock); 2081 1931 spin_unlock(&dlm->spinlock); 2082 - dlm_lockres_put(res); 1932 + *ret_data = (void *)res; 2083 1933 dlm_put(dlm); 2084 1934 return -EINVAL; 1935 + } 1936 + 1937 + void dlm_assert_master_post_handler(int status, void *data, void *ret_data) 1938 + { 1939 + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; 1940 + 1941 + if (ret_data) { 1942 + spin_lock(&res->spinlock); 1943 + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 1944 + spin_unlock(&res->spinlock); 1945 + wake_up(&res->wq); 1946 + dlm_lockres_put(res); 1947 + } 1948 + return; 2085 1949 } 2086 1950 2087 1951 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, ··· 2187 2023 * even if one or more nodes die */ 2188 2024 mlog(0, "worker about to master %.*s here, this=%u\n", 2189 2025 res->lockname.len, res->lockname.name, dlm->node_num); 2190 - ret = dlm_do_assert_master(dlm, res->lockname.name, 2191 - res->lockname.len, 2192 - nodemap, flags); 2026 + ret = dlm_do_assert_master(dlm, res, nodemap, flags); 2193 2027 if (ret < 0) { 2194 2028 /* no need to restart, we are done */ 2195 2029 if (!dlm_is_host_down(ret)) ··· 2259 2097 return ret; 2260 2098 } 2261 2099 2100 + /* 2101 + * DLM_DEREF_LOCKRES_MSG 2102 + */ 2103 + 2104 + int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2105 + { 2106 + struct dlm_deref_lockres 
deref; 2107 + int ret = 0, r; 2108 + const char *lockname; 2109 + unsigned int namelen; 2110 + 2111 + lockname = res->lockname.name; 2112 + namelen = res->lockname.len; 2113 + BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2114 + 2115 + mlog(0, "%s:%.*s: sending deref to %d\n", 2116 + dlm->name, namelen, lockname, res->owner); 2117 + memset(&deref, 0, sizeof(deref)); 2118 + deref.node_idx = dlm->node_num; 2119 + deref.namelen = namelen; 2120 + memcpy(deref.name, lockname, namelen); 2121 + 2122 + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2123 + &deref, sizeof(deref), res->owner, &r); 2124 + if (ret < 0) 2125 + mlog_errno(ret); 2126 + else if (r < 0) { 2127 + /* BAD. other node says I did not have a ref. */ 2128 + mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2129 + "(master=%u) got %d.\n", dlm->name, namelen, 2130 + lockname, res->owner, r); 2131 + dlm_print_one_lock_resource(res); 2132 + BUG(); 2133 + } 2134 + return ret; 2135 + } 2136 + 2137 + int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2138 + void **ret_data) 2139 + { 2140 + struct dlm_ctxt *dlm = data; 2141 + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; 2142 + struct dlm_lock_resource *res = NULL; 2143 + char *name; 2144 + unsigned int namelen; 2145 + int ret = -EINVAL; 2146 + u8 node; 2147 + unsigned int hash; 2148 + struct dlm_work_item *item; 2149 + int cleared = 0; 2150 + int dispatch = 0; 2151 + 2152 + if (!dlm_grab(dlm)) 2153 + return 0; 2154 + 2155 + name = deref->name; 2156 + namelen = deref->namelen; 2157 + node = deref->node_idx; 2158 + 2159 + if (namelen > DLM_LOCKID_NAME_MAX) { 2160 + mlog(ML_ERROR, "Invalid name length!"); 2161 + goto done; 2162 + } 2163 + if (deref->node_idx >= O2NM_MAX_NODES) { 2164 + mlog(ML_ERROR, "Invalid node number: %u\n", node); 2165 + goto done; 2166 + } 2167 + 2168 + hash = dlm_lockid_hash(name, namelen); 2169 + 2170 + spin_lock(&dlm->spinlock); 2171 + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); 
2172 + if (!res) { 2173 + spin_unlock(&dlm->spinlock); 2174 + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", 2175 + dlm->name, namelen, name); 2176 + goto done; 2177 + } 2178 + spin_unlock(&dlm->spinlock); 2179 + 2180 + spin_lock(&res->spinlock); 2181 + if (res->state & DLM_LOCK_RES_SETREF_INPROG) 2182 + dispatch = 1; 2183 + else { 2184 + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2185 + if (test_bit(node, res->refmap)) { 2186 + dlm_lockres_clear_refmap_bit(node, res); 2187 + cleared = 1; 2188 + } 2189 + } 2190 + spin_unlock(&res->spinlock); 2191 + 2192 + if (!dispatch) { 2193 + if (cleared) 2194 + dlm_lockres_calc_usage(dlm, res); 2195 + else { 2196 + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2197 + "but it is already dropped!\n", dlm->name, 2198 + res->lockname.len, res->lockname.name, node); 2199 + __dlm_print_one_lock_resource(res); 2200 + } 2201 + ret = 0; 2202 + goto done; 2203 + } 2204 + 2205 + item = kzalloc(sizeof(*item), GFP_NOFS); 2206 + if (!item) { 2207 + ret = -ENOMEM; 2208 + mlog_errno(ret); 2209 + goto done; 2210 + } 2211 + 2212 + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); 2213 + item->u.dl.deref_res = res; 2214 + item->u.dl.deref_node = node; 2215 + 2216 + spin_lock(&dlm->work_lock); 2217 + list_add_tail(&item->list, &dlm->work_list); 2218 + spin_unlock(&dlm->work_lock); 2219 + 2220 + queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2221 + return 0; 2222 + 2223 + done: 2224 + if (res) 2225 + dlm_lockres_put(res); 2226 + dlm_put(dlm); 2227 + 2228 + return ret; 2229 + } 2230 + 2231 + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2232 + { 2233 + struct dlm_ctxt *dlm; 2234 + struct dlm_lock_resource *res; 2235 + u8 node; 2236 + u8 cleared = 0; 2237 + 2238 + dlm = item->dlm; 2239 + res = item->u.dl.deref_res; 2240 + node = item->u.dl.deref_node; 2241 + 2242 + spin_lock(&res->spinlock); 2243 + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2244 + if (test_bit(node, res->refmap)) { 
2245 + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2246 + dlm_lockres_clear_refmap_bit(node, res); 2247 + cleared = 1; 2248 + } 2249 + spin_unlock(&res->spinlock); 2250 + 2251 + if (cleared) { 2252 + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2253 + dlm->name, res->lockname.len, res->lockname.name, node); 2254 + dlm_lockres_calc_usage(dlm, res); 2255 + } else { 2256 + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 2257 + "but it is already dropped!\n", dlm->name, 2258 + res->lockname.len, res->lockname.name, node); 2259 + __dlm_print_one_lock_resource(res); 2260 + } 2261 + 2262 + dlm_lockres_put(res); 2263 + } 2264 + 2262 2265 2263 2266 /* 2264 2267 * DLM_MIGRATE_LOCKRES 2265 2268 */ 2266 2269 2267 2270 2268 - int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 2269 - u8 target) 2271 + static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2272 + struct dlm_lock_resource *res, 2273 + u8 target) 2270 2274 { 2271 2275 struct dlm_master_list_entry *mle = NULL; 2272 2276 struct dlm_master_list_entry *oldmle = NULL; ··· 2444 2116 struct list_head *queue, *iter; 2445 2117 int i; 2446 2118 struct dlm_lock *lock; 2447 - int empty = 1; 2119 + int empty = 1, wake = 0; 2448 2120 2449 2121 if (!dlm_grab(dlm)) 2450 2122 return -EINVAL; ··· 2569 2241 res->lockname.name, target); 2570 2242 spin_lock(&res->spinlock); 2571 2243 res->state &= ~DLM_LOCK_RES_MIGRATING; 2244 + wake = 1; 2572 2245 spin_unlock(&res->spinlock); 2573 2246 ret = -EINVAL; 2574 2247 } ··· 2597 2268 * the lockres 2598 2269 */ 2599 2270 2271 + /* now that remote nodes are spinning on the MIGRATING flag, 2272 + * ensure that all assert_master work is flushed. */ 2273 + flush_workqueue(dlm->dlm_worker); 2600 2274 2601 2275 /* get an extra reference on the mle. 
2602 2276 * otherwise the assert_master from the new ··· 2628 2296 dlm_put_mle_inuse(mle); 2629 2297 spin_lock(&res->spinlock); 2630 2298 res->state &= ~DLM_LOCK_RES_MIGRATING; 2299 + wake = 1; 2631 2300 spin_unlock(&res->spinlock); 2632 2301 goto leave; 2633 2302 } ··· 2655 2322 res->owner == target) 2656 2323 break; 2657 2324 2658 - mlog(0, "timed out during migration\n"); 2325 + mlog(0, "%s:%.*s: timed out during migration\n", 2326 + dlm->name, res->lockname.len, res->lockname.name); 2659 2327 /* avoid hang during shutdown when migrating lockres 2660 2328 * to a node which also goes down */ 2661 2329 if (dlm_is_node_dead(dlm, target)) { ··· 2664 2330 "target %u is no longer up, restarting\n", 2665 2331 dlm->name, res->lockname.len, 2666 2332 res->lockname.name, target); 2667 - ret = -ERESTARTSYS; 2333 + ret = -EINVAL; 2334 + /* migration failed, detach and clean up mle */ 2335 + dlm_mle_detach_hb_events(dlm, mle); 2336 + dlm_put_mle(mle); 2337 + dlm_put_mle_inuse(mle); 2338 + spin_lock(&res->spinlock); 2339 + res->state &= ~DLM_LOCK_RES_MIGRATING; 2340 + wake = 1; 2341 + spin_unlock(&res->spinlock); 2342 + goto leave; 2668 2343 } 2669 - } 2670 - if (ret == -ERESTARTSYS) { 2671 - /* migration failed, detach and clean up mle */ 2672 - dlm_mle_detach_hb_events(dlm, mle); 2673 - dlm_put_mle(mle); 2674 - dlm_put_mle_inuse(mle); 2675 - spin_lock(&res->spinlock); 2676 - res->state &= ~DLM_LOCK_RES_MIGRATING; 2677 - spin_unlock(&res->spinlock); 2678 - goto leave; 2679 - } 2680 - /* TODO: if node died: stop, clean up, return error */ 2344 + } else 2345 + mlog(0, "%s:%.*s: caught signal during migration\n", 2346 + dlm->name, res->lockname.len, res->lockname.name); 2681 2347 } 2682 2348 2683 2349 /* all done, set the owner, clear the flag */ ··· 2700 2366 if (ret < 0) 2701 2367 dlm_kick_thread(dlm, res); 2702 2368 2369 + /* wake up waiters if the MIGRATING flag got set 2370 + * but migration failed */ 2371 + if (wake) 2372 + wake_up(&res->wq); 2373 + 2703 2374 /* TODO: 
cleanup */ 2704 2375 if (mres) 2705 2376 free_page((unsigned long)mres); ··· 2713 2374 2714 2375 mlog(0, "returning %d\n", ret); 2715 2376 return ret; 2377 + } 2378 + 2379 + #define DLM_MIGRATION_RETRY_MS 100 2380 + 2381 + /* Should be called only after beginning the domain leave process. 2382 + * There should not be any remaining locks on nonlocal lock resources, 2383 + * and there should be no local locks left on locally mastered resources. 2384 + * 2385 + * Called with the dlm spinlock held, may drop it to do migration, but 2386 + * will re-acquire before exit. 2387 + * 2388 + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2389 + int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2390 + { 2391 + int ret; 2392 + int lock_dropped = 0; 2393 + 2394 + if (res->owner != dlm->node_num) { 2395 + if (!__dlm_lockres_unused(res)) { 2396 + mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2397 + "trying to free this but locks remain\n", 2398 + dlm->name, res->lockname.len, res->lockname.name); 2399 + } 2400 + goto leave; 2401 + } 2402 + 2403 + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
*/ 2404 + spin_unlock(&dlm->spinlock); 2405 + lock_dropped = 1; 2406 + while (1) { 2407 + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2408 + if (ret >= 0) 2409 + break; 2410 + if (ret == -ENOTEMPTY) { 2411 + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", 2412 + res->lockname.len, res->lockname.name); 2413 + BUG(); 2414 + } 2415 + 2416 + mlog(0, "lockres %.*s: migrate failed, " 2417 + "retrying\n", res->lockname.len, 2418 + res->lockname.name); 2419 + msleep(DLM_MIGRATION_RETRY_MS); 2420 + } 2421 + spin_lock(&dlm->spinlock); 2422 + leave: 2423 + return lock_dropped; 2716 2424 } 2717 2425 2718 2426 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) ··· 2791 2405 return can_proceed; 2792 2406 } 2793 2407 2794 - int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2408 + static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, 2409 + struct dlm_lock_resource *res) 2795 2410 { 2796 2411 int ret; 2797 2412 spin_lock(&res->spinlock); ··· 2821 2434 __dlm_lockres_reserve_ast(res); 2822 2435 spin_unlock(&res->spinlock); 2823 2436 2824 - /* now flush all the pending asts.. 
hang out for a bit */ 2437 + /* now flush all the pending asts */ 2825 2438 dlm_kick_thread(dlm, res); 2439 + /* before waiting on DIRTY, block processes which may 2440 + * try to dirty the lockres before MIGRATING is set */ 2441 + spin_lock(&res->spinlock); 2442 + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); 2443 + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; 2444 + spin_unlock(&res->spinlock); 2445 + /* now wait on any pending asts and the DIRTY state */ 2826 2446 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 2827 2447 dlm_lockres_release_ast(dlm, res); 2828 2448 ··· 2855 2461 mlog(0, "trying again...\n"); 2856 2462 goto again; 2857 2463 } 2464 + /* now that we are sure the MIGRATING state is there, drop 2465 + * the unneded state which blocked threads trying to DIRTY */ 2466 + spin_lock(&res->spinlock); 2467 + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); 2468 + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); 2469 + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; 2470 + spin_unlock(&res->spinlock); 2858 2471 2859 2472 /* did the target go down or die? 
*/ 2860 2473 spin_lock(&dlm->spinlock); ··· 2891 2490 { 2892 2491 struct list_head *iter, *iter2; 2893 2492 struct list_head *queue = &res->granted; 2894 - int i; 2493 + int i, bit; 2895 2494 struct dlm_lock *lock; 2896 2495 2897 2496 assert_spin_locked(&res->spinlock); ··· 2909 2508 BUG_ON(!list_empty(&lock->bast_list)); 2910 2509 BUG_ON(lock->ast_pending); 2911 2510 BUG_ON(lock->bast_pending); 2511 + dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2912 2512 list_del_init(&lock->list); 2913 2513 dlm_lock_put(lock); 2914 2514 } 2915 2515 } 2916 2516 queue++; 2517 + } 2518 + bit = 0; 2519 + while (1) { 2520 + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 2521 + if (bit >= O2NM_MAX_NODES) 2522 + break; 2523 + /* do not clear the local node reference, if there is a 2524 + * process holding this, let it drop the ref itself */ 2525 + if (bit != dlm->node_num) { 2526 + mlog(0, "%s:%.*s: node %u had a ref to this " 2527 + "migrating lockres, clearing\n", dlm->name, 2528 + res->lockname.len, res->lockname.name, bit); 2529 + dlm_lockres_clear_refmap_bit(bit, res); 2530 + } 2531 + bit++; 2917 2532 } 2918 2533 } 2919 2534 ··· 3018 2601 mlog(0, "migrate request (node %u) returned %d!\n", 3019 2602 nodenum, status); 3020 2603 ret = status; 2604 + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { 2605 + /* during the migration request we short-circuited 2606 + * the mastery of the lockres. make sure we have 2607 + * a mastery ref for nodenum */ 2608 + mlog(0, "%s:%.*s: need ref for node %u\n", 2609 + dlm->name, res->lockname.len, res->lockname.name, 2610 + nodenum); 2611 + spin_lock(&res->spinlock); 2612 + dlm_lockres_set_refmap_bit(nodenum, res); 2613 + spin_unlock(&res->spinlock); 3021 2614 } 3022 2615 } 3023 2616 ··· 3046 2619 * we will have no mle in the list to start with. now we can add an mle for 3047 2620 * the migration and this should be the only one found for those scanning the 3048 2621 * list. 
*/ 3049 - int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) 2622 + int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 2623 + void **ret_data) 3050 2624 { 3051 2625 struct dlm_ctxt *dlm = data; 3052 2626 struct dlm_lock_resource *res = NULL; ··· 3173 2745 /* remove it from the list so that only one 3174 2746 * mle will be found */ 3175 2747 list_del_init(&tmp->list); 3176 - __dlm_mle_detach_hb_events(dlm, mle); 2748 + /* this was obviously WRONG. mle is uninited here. should be tmp. */ 2749 + __dlm_mle_detach_hb_events(dlm, tmp); 2750 + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 2751 + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 2752 + "telling master to get ref for cleared out mle " 2753 + "during migration\n", dlm->name, namelen, name, 2754 + master, new_master); 3177 2755 } 3178 2756 spin_unlock(&tmp->spinlock); 3179 2757 } ··· 3187 2753 /* now add a migration mle to the tail of the list */ 3188 2754 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); 3189 2755 mle->new_master = new_master; 2756 + /* the new master will be sending an assert master for this. 2757 + * at that point we will get the refmap reference */ 3190 2758 mle->master = master; 3191 2759 /* do this for consistency with other mle types */ 3192 2760 set_bit(new_master, mle->maybe_map); ··· 3338 2902 clear_bit(dlm->node_num, iter.node_map); 3339 2903 spin_unlock(&dlm->spinlock); 3340 2904 2905 + /* ownership of the lockres is changing. 
account for the 2906 + * mastery reference here since old_master will briefly have 2907 + * a reference after the migration completes */ 2908 + spin_lock(&res->spinlock); 2909 + dlm_lockres_set_refmap_bit(old_master, res); 2910 + spin_unlock(&res->spinlock); 2911 + 3341 2912 mlog(0, "now time to do a migrate request to other nodes\n"); 3342 2913 ret = dlm_do_migrate_request(dlm, res, old_master, 3343 2914 dlm->node_num, &iter); ··· 3357 2914 res->lockname.len, res->lockname.name); 3358 2915 /* this call now finishes out the nodemap 3359 2916 * even if one or more nodes die */ 3360 - ret = dlm_do_assert_master(dlm, res->lockname.name, 3361 - res->lockname.len, iter.node_map, 2917 + ret = dlm_do_assert_master(dlm, res, iter.node_map, 3362 2918 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3363 2919 if (ret < 0) { 3364 2920 /* no longer need to retry. all living nodes contacted. */ ··· 3369 2927 set_bit(old_master, iter.node_map); 3370 2928 mlog(0, "doing assert master of %.*s back to %u\n", 3371 2929 res->lockname.len, res->lockname.name, old_master); 3372 - ret = dlm_do_assert_master(dlm, res->lockname.name, 3373 - res->lockname.len, iter.node_map, 2930 + ret = dlm_do_assert_master(dlm, res, iter.node_map, 3374 2931 DLM_ASSERT_MASTER_FINISH_MIGRATION); 3375 2932 if (ret < 0) { 3376 2933 mlog(0, "assert master to original master failed "
+154 -28
fs/ocfs2/dlm/dlmrecovery.c
··· 163 163 dlm_workfunc_t *workfunc; 164 164 int tot=0; 165 165 166 - if (!dlm_joined(dlm)) 167 - return; 168 - 169 166 spin_lock(&dlm->work_lock); 170 167 list_splice_init(&dlm->work_list, &tmp_list); 171 168 spin_unlock(&dlm->work_lock); ··· 818 821 819 822 } 820 823 821 - int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) 824 + int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, 825 + void **ret_data) 822 826 { 823 827 struct dlm_ctxt *dlm = data; 824 828 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; ··· 976 978 } 977 979 978 980 979 - int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) 981 + int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, 982 + void **ret_data) 980 983 { 981 984 struct dlm_ctxt *dlm = data; 982 985 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; ··· 1128 1129 if (total_locks == mres_total_locks) 1129 1130 mres->flags |= DLM_MRES_ALL_DONE; 1130 1131 1132 + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", 1133 + dlm->name, res->lockname.len, res->lockname.name, 1134 + orig_flags & DLM_MRES_MIGRATION ? 
"migrate" : "recovery", 1135 + send_to); 1136 + 1131 1137 /* send it */ 1132 1138 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, 1133 1139 sz, send_to, &status); ··· 1217 1213 return 0; 1218 1214 } 1219 1215 1216 + static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, 1217 + struct dlm_migratable_lockres *mres) 1218 + { 1219 + struct dlm_lock dummy; 1220 + memset(&dummy, 0, sizeof(dummy)); 1221 + dummy.ml.cookie = 0; 1222 + dummy.ml.type = LKM_IVMODE; 1223 + dummy.ml.convert_type = LKM_IVMODE; 1224 + dummy.ml.highest_blocked = LKM_IVMODE; 1225 + dummy.lksb = NULL; 1226 + dummy.ml.node = dlm->node_num; 1227 + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); 1228 + } 1229 + 1230 + static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, 1231 + struct dlm_migratable_lock *ml, 1232 + u8 *nodenum) 1233 + { 1234 + if (unlikely(ml->cookie == 0 && 1235 + ml->type == LKM_IVMODE && 1236 + ml->convert_type == LKM_IVMODE && 1237 + ml->highest_blocked == LKM_IVMODE && 1238 + ml->list == DLM_BLOCKED_LIST)) { 1239 + *nodenum = ml->node; 1240 + return 1; 1241 + } 1242 + return 0; 1243 + } 1220 1244 1221 1245 int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 1222 1246 struct dlm_migratable_lockres *mres, ··· 1292 1260 goto error; 1293 1261 } 1294 1262 } 1263 + if (total_locks == 0) { 1264 + /* send a dummy lock to indicate a mastery reference only */ 1265 + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", 1266 + dlm->name, res->lockname.len, res->lockname.name, 1267 + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : 1268 + "migration"); 1269 + dlm_add_dummy_lock(dlm, mres); 1270 + } 1295 1271 /* flush any remaining locks */ 1296 1272 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); 1297 1273 if (ret < 0) ··· 1333 1293 * do we spin? 
returning an error only delays the problem really 1334 1294 */ 1335 1295 1336 - int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) 1296 + int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 1297 + void **ret_data) 1337 1298 { 1338 1299 struct dlm_ctxt *dlm = data; 1339 1300 struct dlm_migratable_lockres *mres = ··· 1423 1382 spin_lock(&res->spinlock); 1424 1383 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 1425 1384 spin_unlock(&res->spinlock); 1385 + wake_up(&res->wq); 1426 1386 1427 1387 /* add an extra ref for just-allocated lockres 1428 1388 * otherwise the lockres will be purged immediately */ 1429 1389 dlm_lockres_get(res); 1430 - 1431 1390 } 1432 1391 1433 1392 /* at this point we have allocated everything we need, 1434 1393 * and we have a hashed lockres with an extra ref and 1435 1394 * the proper res->state flags. */ 1436 1395 ret = 0; 1396 + spin_lock(&res->spinlock); 1397 + /* drop this either when master requery finds a different master 1398 + * or when a lock is added by the recovery worker */ 1399 + dlm_lockres_grab_inflight_ref(dlm, res); 1437 1400 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { 1438 1401 /* migration cannot have an unknown master */ 1439 1402 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); ··· 1445 1400 "unknown owner.. 
will need to requery: " 1446 1401 "%.*s\n", mres->lockname_len, mres->lockname); 1447 1402 } else { 1448 - spin_lock(&res->spinlock); 1403 + /* take a reference now to pin the lockres, drop it 1404 + * when locks are added in the worker */ 1449 1405 dlm_change_lockres_owner(dlm, res, dlm->node_num); 1450 - spin_unlock(&res->spinlock); 1451 1406 } 1407 + spin_unlock(&res->spinlock); 1452 1408 1453 1409 /* queue up work for dlm_mig_lockres_worker */ 1454 1410 dlm_grab(dlm); /* get an extra ref for the work item */ ··· 1505 1459 "this node will take it.\n", 1506 1460 res->lockname.len, res->lockname.name); 1507 1461 } else { 1462 + spin_lock(&res->spinlock); 1463 + dlm_lockres_drop_inflight_ref(dlm, res); 1464 + spin_unlock(&res->spinlock); 1508 1465 mlog(0, "master needs to respond to sender " 1509 1466 "that node %u still owns %.*s\n", 1510 1467 real_master, res->lockname.len, ··· 1627 1578 /* this function cannot error, so unless the sending 1628 1579 * or receiving of the message failed, the owner can 1629 1580 * be trusted */ 1630 - int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) 1581 + int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, 1582 + void **ret_data) 1631 1583 { 1632 1584 struct dlm_ctxt *dlm = data; 1633 1585 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; ··· 1710 1660 { 1711 1661 struct dlm_migratable_lock *ml; 1712 1662 struct list_head *queue; 1663 + struct list_head *tmpq = NULL; 1713 1664 struct dlm_lock *newlock = NULL; 1714 1665 struct dlm_lockstatus *lksb = NULL; 1715 1666 int ret = 0; 1716 - int i, bad; 1667 + int i, j, bad; 1717 1668 struct list_head *iter; 1718 1669 struct dlm_lock *lock = NULL; 1670 + u8 from = O2NM_MAX_NODES; 1671 + unsigned int added = 0; 1719 1672 1720 1673 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1721 1674 for (i=0; i<mres->num_locks; i++) { 1722 1675 ml = &(mres->ml[i]); 1676 + 1677 + if (dlm_is_dummy_lock(dlm, ml, 
&from)) { 1678 + /* placeholder, just need to set the refmap bit */ 1679 + BUG_ON(mres->num_locks != 1); 1680 + mlog(0, "%s:%.*s: dummy lock for %u\n", 1681 + dlm->name, mres->lockname_len, mres->lockname, 1682 + from); 1683 + spin_lock(&res->spinlock); 1684 + dlm_lockres_set_refmap_bit(from, res); 1685 + spin_unlock(&res->spinlock); 1686 + added++; 1687 + break; 1688 + } 1723 1689 BUG_ON(ml->highest_blocked != LKM_IVMODE); 1724 1690 newlock = NULL; 1725 1691 lksb = NULL; 1726 1692 1727 1693 queue = dlm_list_num_to_pointer(res, ml->list); 1694 + tmpq = NULL; 1728 1695 1729 1696 /* if the lock is for the local node it needs to 1730 1697 * be moved to the proper location within the queue. ··· 1751 1684 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); 1752 1685 1753 1686 spin_lock(&res->spinlock); 1754 - list_for_each(iter, queue) { 1755 - lock = list_entry (iter, struct dlm_lock, list); 1756 - if (lock->ml.cookie != ml->cookie) 1757 - lock = NULL; 1758 - else 1687 + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1688 + tmpq = dlm_list_idx_to_ptr(res, j); 1689 + list_for_each(iter, tmpq) { 1690 + lock = list_entry (iter, struct dlm_lock, list); 1691 + if (lock->ml.cookie != ml->cookie) 1692 + lock = NULL; 1693 + else 1694 + break; 1695 + } 1696 + if (lock) 1759 1697 break; 1760 1698 } 1761 1699 ··· 1770 1698 u64 c = ml->cookie; 1771 1699 mlog(ML_ERROR, "could not find local lock " 1772 1700 "with cookie %u:%llu!\n", 1773 - dlm_get_lock_cookie_node(c), 1774 - dlm_get_lock_cookie_seq(c)); 1701 + dlm_get_lock_cookie_node(be64_to_cpu(c)), 1702 + dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1703 + __dlm_print_one_lock_resource(res); 1775 1704 BUG(); 1776 1705 } 1777 1706 BUG_ON(lock->ml.node != ml->node); 1707 + 1708 + if (tmpq != queue) { 1709 + mlog(0, "lock was on %u instead of %u for %.*s\n", 1710 + j, ml->list, res->lockname.len, res->lockname.name); 1711 + spin_unlock(&res->spinlock); 1712 + continue; 1713 + } 1778 1714 1779 1715 /* see NOTE above about why we do 
not update 1780 1716 * to match the master here */ ··· 1791 1711 /* do not alter lock refcount. switching lists. */ 1792 1712 list_move_tail(&lock->list, queue); 1793 1713 spin_unlock(&res->spinlock); 1714 + added++; 1794 1715 1795 1716 mlog(0, "just reordered a local lock!\n"); 1796 1717 continue; ··· 1880 1799 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1881 1800 "exists on this lockres!\n", dlm->name, 1882 1801 res->lockname.len, res->lockname.name, 1883 - dlm_get_lock_cookie_node(c), 1884 - dlm_get_lock_cookie_seq(c)); 1802 + dlm_get_lock_cookie_node(be64_to_cpu(c)), 1803 + dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1885 1804 1886 1805 mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " 1887 1806 "node=%u, cookie=%u:%llu, queue=%d\n", 1888 1807 ml->type, ml->convert_type, ml->node, 1889 - dlm_get_lock_cookie_node(ml->cookie), 1890 - dlm_get_lock_cookie_seq(ml->cookie), 1808 + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), 1809 + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), 1891 1810 ml->list); 1892 1811 1893 1812 __dlm_print_one_lock_resource(res); ··· 1898 1817 if (!bad) { 1899 1818 dlm_lock_get(newlock); 1900 1819 list_add_tail(&newlock->list, queue); 1820 + mlog(0, "%s:%.*s: added lock for node %u, " 1821 + "setting refmap bit\n", dlm->name, 1822 + res->lockname.len, res->lockname.name, ml->node); 1823 + dlm_lockres_set_refmap_bit(ml->node, res); 1824 + added++; 1901 1825 } 1902 1826 spin_unlock(&res->spinlock); 1903 1827 } 1904 1828 mlog(0, "done running all the locks\n"); 1905 1829 1906 1830 leave: 1831 + /* balance the ref taken when the work was queued */ 1832 + spin_lock(&res->spinlock); 1833 + dlm_lockres_drop_inflight_ref(dlm, res); 1834 + spin_unlock(&res->spinlock); 1835 + 1907 1836 if (ret < 0) { 1908 1837 mlog_errno(ret); 1909 1838 if (newlock) ··· 2026 1935 if (res->owner == dead_node) { 2027 1936 list_del_init(&res->recovering); 2028 1937 spin_lock(&res->spinlock); 1938 + /* new_master has our reference from 1939 + * the lock state sent 
during recovery */ 2029 1940 dlm_change_lockres_owner(dlm, res, new_master); 2030 1941 res->state &= ~DLM_LOCK_RES_RECOVERING; 2031 - if (!__dlm_lockres_unused(res)) 1942 + if (__dlm_lockres_has_locks(res)) 2032 1943 __dlm_dirty_lockres(dlm, res); 2033 1944 spin_unlock(&res->spinlock); 2034 1945 wake_up(&res->wq); ··· 2070 1977 dlm_lockres_put(res); 2071 1978 } 2072 1979 spin_lock(&res->spinlock); 1980 + /* new_master has our reference from 1981 + * the lock state sent during recovery */ 2073 1982 dlm_change_lockres_owner(dlm, res, new_master); 2074 1983 res->state &= ~DLM_LOCK_RES_RECOVERING; 2075 - if (!__dlm_lockres_unused(res)) 1984 + if (__dlm_lockres_has_locks(res)) 2076 1985 __dlm_dirty_lockres(dlm, res); 2077 1986 spin_unlock(&res->spinlock); 2078 1987 wake_up(&res->wq); ··· 2143 2048 { 2144 2049 struct list_head *iter, *tmpiter; 2145 2050 struct dlm_lock *lock; 2051 + unsigned int freed = 0; 2146 2052 2147 2053 /* this node is the lockres master: 2148 2054 * 1) remove any stale locks for the dead node ··· 2158 2062 if (lock->ml.node == dead_node) { 2159 2063 list_del_init(&lock->list); 2160 2064 dlm_lock_put(lock); 2065 + freed++; 2161 2066 } 2162 2067 } 2163 2068 list_for_each_safe(iter, tmpiter, &res->converting) { ··· 2166 2069 if (lock->ml.node == dead_node) { 2167 2070 list_del_init(&lock->list); 2168 2071 dlm_lock_put(lock); 2072 + freed++; 2169 2073 } 2170 2074 } 2171 2075 list_for_each_safe(iter, tmpiter, &res->blocked) { ··· 2174 2076 if (lock->ml.node == dead_node) { 2175 2077 list_del_init(&lock->list); 2176 2078 dlm_lock_put(lock); 2079 + freed++; 2177 2080 } 2081 + } 2082 + 2083 + if (freed) { 2084 + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " 2085 + "dropping ref from lockres\n", dlm->name, 2086 + res->lockname.len, res->lockname.name, freed, dead_node); 2087 + BUG_ON(!test_bit(dead_node, res->refmap)); 2088 + dlm_lockres_clear_refmap_bit(dead_node, res); 2089 + } else if (test_bit(dead_node, res->refmap)) { 2090 + mlog(0, "%s:%.*s: 
dead node %u had a ref, but had " 2091 + "no locks and had not purged before dying\n", dlm->name, 2092 + res->lockname.len, res->lockname.name, dead_node); 2093 + dlm_lockres_clear_refmap_bit(dead_node, res); 2178 2094 } 2179 2095 2180 2096 /* do not kick thread yet */ ··· 2253 2141 spin_lock(&res->spinlock); 2254 2142 /* zero the lvb if necessary */ 2255 2143 dlm_revalidate_lvb(dlm, res, dead_node); 2256 - if (res->owner == dead_node) 2144 + if (res->owner == dead_node) { 2145 + if (res->state & DLM_LOCK_RES_DROPPING_REF) 2146 + mlog(0, "%s:%.*s: owned by " 2147 + "dead node %u, this node was " 2148 + "dropping its ref when it died. " 2149 + "continue, dropping the flag.\n", 2150 + dlm->name, res->lockname.len, 2151 + res->lockname.name, dead_node); 2152 + 2153 + /* the wake_up for this will happen when the 2154 + * RECOVERING flag is dropped later */ 2155 + res->state &= ~DLM_LOCK_RES_DROPPING_REF; 2156 + 2257 2157 dlm_move_lockres_to_recovery_list(dlm, res); 2258 - else if (res->owner == dlm->node_num) { 2158 + } else if (res->owner == dlm->node_num) { 2259 2159 dlm_free_dead_locks(dlm, res, dead_node); 2260 2160 __dlm_lockres_calc_usage(dlm, res); 2261 2161 } ··· 2604 2480 return ret; 2605 2481 } 2606 2482 2607 - int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) 2483 + int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, 2484 + void **ret_data) 2608 2485 { 2609 2486 struct dlm_ctxt *dlm = data; 2610 2487 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; ··· 2733 2608 return ret; 2734 2609 } 2735 2610 2736 - int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) 2611 + int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, 2612 + void **ret_data) 2737 2613 { 2738 2614 struct dlm_ctxt *dlm = data; 2739 2615 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+98 -102
fs/ocfs2/dlm/dlmthread.c
··· 54 54 #include "cluster/masklog.h" 55 55 56 56 static int dlm_thread(void *data); 57 - static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, 58 - struct dlm_lock_resource *lockres); 59 - 60 57 static void dlm_flush_asts(struct dlm_ctxt *dlm); 61 58 62 59 #define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) ··· 79 82 current->state = TASK_RUNNING; 80 83 } 81 84 82 - 83 - int __dlm_lockres_unused(struct dlm_lock_resource *res) 85 + int __dlm_lockres_has_locks(struct dlm_lock_resource *res) 84 86 { 85 87 if (list_empty(&res->granted) && 86 88 list_empty(&res->converting) && 87 - list_empty(&res->blocked) && 88 - list_empty(&res->dirty)) 89 - return 1; 89 + list_empty(&res->blocked)) 90 + return 0; 91 + return 1; 92 + } 93 + 94 + /* "unused": the lockres has no locks, is not on the dirty list, 95 + * has no inflight locks (in the gap between mastery and acquiring 96 + * the first lock), and has no bits in its refmap. 97 + * truly ready to be freed. */ 98 + int __dlm_lockres_unused(struct dlm_lock_resource *res) 99 + { 100 + if (!__dlm_lockres_has_locks(res) && 101 + (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 102 + /* try not to scan the bitmap unless the first two 103 + * conditions are already true */ 104 + int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 105 + if (bit >= O2NM_MAX_NODES) { 106 + /* since the bit for dlm->node_num is not 107 + * set, inflight_locks better be zero */ 108 + BUG_ON(res->inflight_locks != 0); 109 + return 1; 110 + } 111 + } 90 112 return 0; 91 113 } 92 114 ··· 122 106 assert_spin_locked(&res->spinlock); 123 107 124 108 if (__dlm_lockres_unused(res)){ 125 - /* For now, just keep any resource we master */ 126 - if (res->owner == dlm->node_num) 127 - { 128 - if (!list_empty(&res->purge)) { 129 - mlog(0, "we master %s:%.*s, but it is on " 130 - "the purge list. 
Removing\n", 131 - dlm->name, res->lockname.len, 132 - res->lockname.name); 133 - list_del_init(&res->purge); 134 - dlm->purge_count--; 135 - } 136 - return; 137 - } 138 - 139 109 if (list_empty(&res->purge)) { 140 - mlog(0, "putting lockres %.*s from purge list\n", 141 - res->lockname.len, res->lockname.name); 110 + mlog(0, "putting lockres %.*s:%p onto purge list\n", 111 + res->lockname.len, res->lockname.name, res); 142 112 143 113 res->last_used = jiffies; 114 + dlm_lockres_get(res); 144 115 list_add_tail(&res->purge, &dlm->purge_list); 145 116 dlm->purge_count++; 146 - 147 - /* if this node is not the owner, there is 148 - * no way to keep track of who the owner could be. 149 - * unhash it to avoid serious problems. */ 150 - if (res->owner != dlm->node_num) { 151 - mlog(0, "%s:%.*s: doing immediate " 152 - "purge of lockres owned by %u\n", 153 - dlm->name, res->lockname.len, 154 - res->lockname.name, res->owner); 155 - 156 - dlm_purge_lockres_now(dlm, res); 157 - } 158 117 } 159 118 } else if (!list_empty(&res->purge)) { 160 - mlog(0, "removing lockres %.*s from purge list, " 161 - "owner=%u\n", res->lockname.len, res->lockname.name, 162 - res->owner); 119 + mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", 120 + res->lockname.len, res->lockname.name, res, res->owner); 163 121 164 122 list_del_init(&res->purge); 123 + dlm_lockres_put(res); 165 124 dlm->purge_count--; 166 125 } 167 126 } ··· 154 163 spin_unlock(&dlm->spinlock); 155 164 } 156 165 157 - /* TODO: Eventual API: Called with the dlm spinlock held, may drop it 158 - * to do migration, but will re-acquire before exit. 
*/ 159 - void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) 166 + static int dlm_purge_lockres(struct dlm_ctxt *dlm, 167 + struct dlm_lock_resource *res) 160 168 { 161 169 int master; 162 - int ret; 170 + int ret = 0; 163 171 164 - spin_lock(&lockres->spinlock); 165 - master = lockres->owner == dlm->node_num; 166 - spin_unlock(&lockres->spinlock); 167 - 168 - mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, 169 - lockres->lockname.name, master); 170 - 171 - /* Non master is the easy case -- no migration required, just 172 - * quit. */ 172 + spin_lock(&res->spinlock); 173 + if (!__dlm_lockres_unused(res)) { 174 + spin_unlock(&res->spinlock); 175 + mlog(0, "%s:%.*s: tried to purge but not unused\n", 176 + dlm->name, res->lockname.len, res->lockname.name); 177 + return -ENOTEMPTY; 178 + } 179 + master = (res->owner == dlm->node_num); 173 180 if (!master) 174 - goto finish; 181 + res->state |= DLM_LOCK_RES_DROPPING_REF; 182 + spin_unlock(&res->spinlock); 175 183 176 - /* Wheee! Migrate lockres here! 
*/ 177 - spin_unlock(&dlm->spinlock); 178 - again: 184 + mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 185 + res->lockname.name, master); 179 186 180 - ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); 181 - if (ret == -ENOTEMPTY) { 182 - mlog(ML_ERROR, "lockres %.*s still has local locks!\n", 183 - lockres->lockname.len, lockres->lockname.name); 184 - 185 - BUG(); 186 - } else if (ret < 0) { 187 - mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", 188 - lockres->lockname.len, lockres->lockname.name); 189 - msleep(100); 190 - goto again; 187 + if (!master) { 188 + spin_lock(&res->spinlock); 189 + /* This ensures that clear refmap is sent after the set */ 190 + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 191 + spin_unlock(&res->spinlock); 192 + /* drop spinlock to do messaging, retake below */ 193 + spin_unlock(&dlm->spinlock); 194 + /* clear our bit from the master's refmap, ignore errors */ 195 + ret = dlm_drop_lockres_ref(dlm, res); 196 + if (ret < 0) { 197 + mlog_errno(ret); 198 + if (!dlm_is_host_down(ret)) 199 + BUG(); 200 + } 201 + mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 202 + dlm->name, res->lockname.len, res->lockname.name, ret); 203 + spin_lock(&dlm->spinlock); 191 204 } 192 205 193 - spin_lock(&dlm->spinlock); 194 - 195 - finish: 196 - if (!list_empty(&lockres->purge)) { 197 - list_del_init(&lockres->purge); 206 + if (!list_empty(&res->purge)) { 207 + mlog(0, "removing lockres %.*s:%p from purgelist, " 208 + "master = %d\n", res->lockname.len, res->lockname.name, 209 + res, master); 210 + list_del_init(&res->purge); 211 + dlm_lockres_put(res); 198 212 dlm->purge_count--; 199 213 } 200 - __dlm_unhash_lockres(lockres); 201 - } 214 + __dlm_unhash_lockres(res); 202 215 203 - /* make an unused lockres go away immediately. 204 - * as soon as the dlm spinlock is dropped, this lockres 205 - * will not be found. kfree still happens on last put. 
*/ 206 - static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, 207 - struct dlm_lock_resource *lockres) 208 - { 209 - assert_spin_locked(&dlm->spinlock); 210 - assert_spin_locked(&lockres->spinlock); 211 - 212 - BUG_ON(!__dlm_lockres_unused(lockres)); 213 - 214 - if (!list_empty(&lockres->purge)) { 215 - list_del_init(&lockres->purge); 216 - dlm->purge_count--; 216 + /* lockres is not in the hash now. drop the flag and wake up 217 + * any processes waiting in dlm_get_lock_resource. */ 218 + if (!master) { 219 + spin_lock(&res->spinlock); 220 + res->state &= ~DLM_LOCK_RES_DROPPING_REF; 221 + spin_unlock(&res->spinlock); 222 + wake_up(&res->wq); 217 223 } 218 - __dlm_unhash_lockres(lockres); 224 + return 0; 219 225 } 220 226 221 227 static void dlm_run_purge_list(struct dlm_ctxt *dlm, ··· 256 268 break; 257 269 } 258 270 271 + mlog(0, "removing lockres %.*s:%p from purgelist\n", 272 + lockres->lockname.len, lockres->lockname.name, lockres); 259 273 list_del_init(&lockres->purge); 274 + dlm_lockres_put(lockres); 260 275 dlm->purge_count--; 261 276 262 277 /* This may drop and reacquire the dlm spinlock if it 263 278 * has to do migration. 
*/ 264 279 mlog(0, "calling dlm_purge_lockres!\n"); 265 - dlm_purge_lockres(dlm, lockres); 280 + if (dlm_purge_lockres(dlm, lockres)) 281 + BUG(); 266 282 mlog(0, "DONE calling dlm_purge_lockres!\n"); 267 283 268 284 /* Avoid adding any scheduling latencies */ ··· 459 467 assert_spin_locked(&res->spinlock); 460 468 461 469 /* don't shuffle secondary queues */ 462 - if ((res->owner == dlm->node_num) && 463 - !(res->state & DLM_LOCK_RES_DIRTY)) { 464 - /* ref for dirty_list */ 465 - dlm_lockres_get(res); 466 - list_add_tail(&res->dirty, &dlm->dirty_list); 467 - res->state |= DLM_LOCK_RES_DIRTY; 470 + if ((res->owner == dlm->node_num)) { 471 + if (res->state & (DLM_LOCK_RES_MIGRATING | 472 + DLM_LOCK_RES_BLOCK_DIRTY)) 473 + return; 474 + 475 + if (list_empty(&res->dirty)) { 476 + /* ref for dirty_list */ 477 + dlm_lockres_get(res); 478 + list_add_tail(&res->dirty, &dlm->dirty_list); 479 + res->state |= DLM_LOCK_RES_DIRTY; 480 + } 468 481 } 469 482 } 470 483 ··· 648 651 dlm_lockres_get(res); 649 652 650 653 spin_lock(&res->spinlock); 651 - res->state &= ~DLM_LOCK_RES_DIRTY; 654 + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ 652 655 list_del_init(&res->dirty); 653 656 spin_unlock(&res->spinlock); 654 657 spin_unlock(&dlm->spinlock); ··· 672 675 /* it is now ok to move lockreses in these states 673 676 * to the dirty list, assuming that they will only be 674 677 * dirty for a short while. 
*/ 678 + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 675 679 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 676 - DLM_LOCK_RES_MIGRATING | 677 680 DLM_LOCK_RES_RECOVERING)) { 678 681 /* move it to the tail and keep going */ 682 + res->state &= ~DLM_LOCK_RES_DIRTY; 679 683 spin_unlock(&res->spinlock); 680 684 mlog(0, "delaying list shuffling for in-" 681 685 "progress lockres %.*s, state=%d\n", ··· 697 699 698 700 /* called while holding lockres lock */ 699 701 dlm_shuffle_lists(dlm, res); 702 + res->state &= ~DLM_LOCK_RES_DIRTY; 700 703 spin_unlock(&res->spinlock); 701 704 702 705 dlm_lockres_calc_usage(dlm, res); ··· 708 709 /* if the lock was in-progress, stick 709 710 * it on the back of the list */ 710 711 if (delay) { 711 - /* ref for dirty_list */ 712 - dlm_lockres_get(res); 713 712 spin_lock(&res->spinlock); 714 - list_add_tail(&res->dirty, &dlm->dirty_list); 715 - res->state |= DLM_LOCK_RES_DIRTY; 713 + __dlm_dirty_lockres(dlm, res); 716 714 spin_unlock(&res->spinlock); 717 715 } 718 716 dlm_lockres_put(res);
+10 -5
fs/ocfs2/dlm/dlmunlock.c
··· 147 147 goto leave; 148 148 } 149 149 150 + if (res->state & DLM_LOCK_RES_MIGRATING) { 151 + status = DLM_MIGRATING; 152 + goto leave; 153 + } 150 154 151 155 /* see above for what the spec says about 152 156 * LKM_CANCEL and the lock queue state */ ··· 248 244 /* this should always be coupled with list removal */ 249 245 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); 250 246 mlog(0, "lock %u:%llu should be gone now! refs=%d\n", 251 - dlm_get_lock_cookie_node(lock->ml.cookie), 252 - dlm_get_lock_cookie_seq(lock->ml.cookie), 247 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 248 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 253 249 atomic_read(&lock->lock_refs.refcount)-1); 254 250 dlm_lock_put(lock); 255 251 } ··· 383 379 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, 384 380 * return value from dlmunlock_master 385 381 */ 386 - int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) 382 + int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, 383 + void **ret_data) 387 384 { 388 385 struct dlm_ctxt *dlm = data; 389 386 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; ··· 507 502 if (!found) 508 503 mlog(ML_ERROR, "failed to find lock to unlock! " 509 504 "cookie=%u:%llu\n", 510 - dlm_get_lock_cookie_node(unlock->cookie), 511 - dlm_get_lock_cookie_seq(unlock->cookie)); 505 + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), 506 + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); 512 507 else 513 508 dlm_lock_put(lock); 514 509
+4 -4
fs/ocfs2/vote.c
··· 887 887 888 888 static int ocfs2_handle_response_message(struct o2net_msg *msg, 889 889 u32 len, 890 - void *data) 890 + void *data, void **ret_data) 891 891 { 892 892 unsigned int response_id, node_num; 893 893 int response_status; ··· 943 943 944 944 static int ocfs2_handle_vote_message(struct o2net_msg *msg, 945 945 u32 len, 946 - void *data) 946 + void *data, void **ret_data) 947 947 { 948 948 int status; 949 949 struct ocfs2_super *osb = data; ··· 1007 1007 osb->net_key, 1008 1008 sizeof(struct ocfs2_response_msg), 1009 1009 ocfs2_handle_response_message, 1010 - osb, &osb->osb_net_handlers); 1010 + osb, NULL, &osb->osb_net_handlers); 1011 1011 if (status) { 1012 1012 mlog_errno(status); 1013 1013 goto bail; ··· 1017 1017 osb->net_key, 1018 1018 sizeof(struct ocfs2_vote_msg), 1019 1019 ocfs2_handle_vote_message, 1020 - osb, &osb->osb_net_handlers); 1020 + osb, NULL, &osb->osb_net_handlers); 1021 1021 if (status) { 1022 1022 mlog_errno(status); 1023 1023 goto bail;