Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  ceph: fix get_ticket_handler() error handling
  ceph: don't BUG on ENOMEM during mds reconnect
  ceph: ceph_mdsc_build_path() returns an ERR_PTR
  ceph: Fix warnings
  ceph: ceph_get_inode() returns an ERR_PTR
  ceph: initialize fields on new dentry_infos
  ceph: maintain i_head_snapc when any caps are dirty, not just for data
  ceph: fix osd request lru adjustment when sending request
  ceph: don't improperly set dir complete when holding EXCL cap
  mm: exporting account_page_dirtied
  ceph: direct requests in snapped namespace based on nonsnap parent
  ceph: queue cap snap writeback for realm children on snap update
  ceph: include dirty xattrs state in snapped caps
  ceph: fix xattr cap writeback
  ceph: fix multiple mds session shutdown

+185 -107
fs/ceph/addr.c | +3 -9
···
 
 	/* dirty the head */
 	spin_lock(&inode->i_lock);
-	if (ci->i_wrbuffer_ref_head == 0)
+	if (ci->i_head_snapc == NULL)
 		ci->i_head_snapc = ceph_get_snap_context(snapc);
 	++ci->i_wrbuffer_ref_head;
 	if (ci->i_wrbuffer_ref == 0)
···
 		spin_lock_irq(&mapping->tree_lock);
 		if (page->mapping) {	/* Race with truncate? */
 			WARN_ON_ONCE(!PageUptodate(page));
-
-			if (mapping_cap_account_dirty(mapping)) {
-				__inc_zone_page_state(page, NR_FILE_DIRTY);
-				__inc_bdi_stat(mapping->backing_dev_info,
-						BDI_RECLAIMABLE);
-				task_io_account_write(PAGE_CACHE_SIZE);
-			}
+			account_page_dirtied(page, page->mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 
···
 			break;
 		}
 	}
-	if (!snapc && ci->i_head_snapc) {
+	if (!snapc && ci->i_wrbuffer_ref_head) {
 		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
fs/ceph/auth_x.c | +9 -6
···
 
 	th = get_ticket_handler(ac, service);
 
-	if (!th) {
+	if (IS_ERR(th)) {
 		*pneed |= service;
 		continue;
 	}
···
 	int ret;
 	struct ceph_x_ticket_handler *th =
 		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	if (IS_ERR(th))
+		return PTR_ERR(th);
 
 	ceph_x_validate_tickets(ac, &need);
···
 		return -ERANGE;
 	head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
 
-	BUG_ON(!th);
 	ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
 	if (ret)
 		return ret;
···
 
 	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
 		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-		BUG_ON(!th);
+		if (IS_ERR(th))
+			return PTR_ERR(th);
 		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
 					       buf + sizeof(*head), end);
 		break;
···
 	void *end = p + sizeof(au->reply_buf);
 
 	th = get_ticket_handler(ac, au->service);
-	if (!th)
-		return -EIO;  /* hrm! */
+	if (IS_ERR(th))
+		return PTR_ERR(th);
 	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
 	if (ret < 0)
 		return ret;
···
 	struct ceph_x_ticket_handler *th;
 
 	th = get_ticket_handler(ac, peer_type);
-	if (th && !IS_ERR(th))
+	if (!IS_ERR(th))
 		remove_ticket_handler(ac, th);
 }
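The recurring change above is the kernel's ERR_PTR idiom: once get_ticket_handler() encodes an errno into its return value, callers must test with IS_ERR() and recover the error with PTR_ERR(); plain NULL checks no longer work. A minimal sketch of the idiom, with hypothetical names (th_lookup/th_use are illustrative, not the ceph functions):

	#include <linux/err.h>
	#include <linux/errno.h>

	struct ticket { int service; };

	/* return a valid pointer, or an errno encoded via ERR_PTR() */
	static struct ticket *th_lookup(int service)
	{
		if (service < 0)
			return ERR_PTR(-EINVAL);	/* never NULL */
		/* ... find or allocate the ticket ... */
		return ERR_PTR(-ENOMEM);		/* e.g. allocation failed */
	}

	static int th_use(int service)
	{
		struct ticket *th = th_lookup(service);

		if (IS_ERR(th))
			return PTR_ERR(th);	/* propagate the errno */
		/* ... use th ... */
		return 0;
	}

Note that IS_ERR(NULL) is false, which is why the final hunk can drop the `th &&` half of the old check: the function now never returns NULL, only a valid pointer or an ERR_PTR value.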
fs/ceph/caps.c | +23 -9
···
 	gid_t gid;
 	struct ceph_mds_session *session;
 	u64 xattr_version = 0;
+	struct ceph_buffer *xattr_blob = NULL;
 	int delayed = 0;
 	u64 flush_tid = 0;
 	int i;
···
 		for (i = 0; i < CEPH_CAP_BITS; i++)
 			if (flushing & (1 << i))
 				ci->i_cap_flush_tid[i] = flush_tid;
+
+		follows = ci->i_head_snapc->seq;
+	} else {
+		follows = 0;
 	}
···
 	mtime = inode->i_mtime;
 	atime = inode->i_atime;
 	time_warp_seq = ci->i_time_warp_seq;
-	follows = ci->i_snap_realm->cached_context->seq;
 	uid = inode->i_uid;
 	gid = inode->i_gid;
 	mode = inode->i_mode;
 
-	if (dropping & CEPH_CAP_XATTR_EXCL) {
+	if (flushing & CEPH_CAP_XATTR_EXCL) {
 		__ceph_build_xattrs_blob(ci);
-		xattr_version = ci->i_xattrs.version + 1;
+		xattr_blob = ci->i_xattrs.blob;
+		xattr_version = ci->i_xattrs.version;
 	}
 
 	spin_unlock(&inode->i_lock);
···
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
 		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
-		uid, gid, mode,
-		xattr_version,
-		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		uid, gid, mode, xattr_version, xattr_blob,
 		follows);
 	if (ret < 0) {
 		dout("error sending cap msg, must requeue %p\n", inode);
···
 			     &capsnap->mtime, &capsnap->atime,
 			     capsnap->time_warp_seq,
 			     capsnap->uid, capsnap->gid, capsnap->mode,
-			     0, NULL,
+			     capsnap->xattr_version, capsnap->xattr_blob,
 			     capsnap->follows);
 
 		next_follows = capsnap->follows + 1;
···
 		     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		if (!ci->i_head_snapc)
+			ci->i_head_snapc = ceph_get_snap_context(
+				ci->i_snap_realm->cached_context);
+		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+		     ci->i_head_snapc);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
···
 
 	if (ci->i_head_snapc == snapc) {
 		ci->i_wrbuffer_ref_head -= nr;
-		if (!ci->i_wrbuffer_ref_head) {
+		if (ci->i_wrbuffer_ref_head == 0 &&
+		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+			BUG_ON(!ci->i_head_snapc);
 			ceph_put_snap_context(ci->i_head_snapc);
 			ci->i_head_snapc = NULL;
 		}
···
 			dout(" inode %p now clean\n", inode);
 			BUG_ON(!list_empty(&ci->i_dirty_item));
 			drop = 1;
+			if (ci->i_wrbuffer_ref_head == 0) {
+				BUG_ON(!ci->i_head_snapc);
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
 		} else {
 			BUG_ON(list_empty(&ci->i_dirty_item));
 		}
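The caps.c hunks all serve one rule change: i_head_snapc is now pinned whenever any head state references it (dirty caps, flushing caps, or dirty pages) and released only when all three are gone. A hedged, self-contained sketch of that refcount discipline (all names here are illustrative stand-ins, not the ceph structures):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct ctx { atomic_t nref; };	/* stands in for ceph_snap_context */

	static struct ctx *ctx_get(struct ctx *c)
	{
		atomic_inc(&c->nref);
		return c;
	}

	static void ctx_put(struct ctx *c)
	{
		if (atomic_dec_and_test(&c->nref))
			kfree(c);
	}

	struct istate {			/* stands in for ceph_inode_info */
		struct ctx *head_ctx;	/* i_head_snapc analogue */
		int dirty, flushing, wrbuffer_head;
	};

	/* take the reference the first time any head state appears... */
	static void mark_dirty(struct istate *s, struct ctx *cur)
	{
		if (!s->head_ctx)
			s->head_ctx = ctx_get(cur);
		s->dirty++;
	}

	/* ...and drop it only once dirty, flushing, and page counts all hit 0 */
	static void maybe_release_head(struct istate *s)
	{
		if (s->head_ctx && !s->dirty && !s->flushing &&
		    !s->wrbuffer_head) {
			ctx_put(s->head_ctx);
			s->head_ctx = NULL;
		}
	}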
fs/ceph/debugfs.c | +4
···
 	} else if (req->r_dentry) {
 		path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 					    &pathbase, 0);
+		if (IS_ERR(path))
+			path = NULL;
 		spin_lock(&req->r_dentry->d_lock);
 		seq_printf(s, " #%llx/%.*s (%s)",
 			   ceph_ino(req->r_dentry->d_parent->d_inode),
···
 	if (req->r_old_dentry) {
 		path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
 					    &pathbase, 0);
+		if (IS_ERR(path))
+			path = NULL;
 		spin_lock(&req->r_old_dentry->d_lock);
 		seq_printf(s, " #%llx/%.*s (%s)",
 			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
fs/ceph/dir.c | +1 -1
···
 	else
 		dentry->d_op = &ceph_snap_dentry_ops;
 
-	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
 
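The one-liner above is the whole "initialize fields on new dentry_infos" fix: kmem_cache_alloc() hands back uninitialized memory, and OR-ing __GFP_ZERO into the gfp mask zeroes the object before it is returned. A sketch under assumed names (my_cache/my_obj are illustrative):

	#include <linux/slab.h>

	struct my_obj { long a; void *b; };	/* illustrative */

	static struct kmem_cache *my_cache;	/* from kmem_cache_create() */

	static struct my_obj *my_obj_alloc(void)
	{
		/* __GFP_ZERO zeroes the object: every field starts 0/NULL.
		 * Equivalent to kmem_cache_zalloc(my_cache, GFP_NOFS). */
		return kmem_cache_alloc(my_cache, GFP_NOFS | __GFP_ZERO);
	}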
fs/ceph/inode.c | +3 -2
···
 	if (ci->i_files == 0 && ci->i_subdirs == 0 &&
 	    ceph_snap(inode) == CEPH_NOSNAP &&
 	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 	    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 		dout(" marking %p complete (empty)\n", inode);
 		ci->i_ceph_flags |= CEPH_I_COMPLETE;
···
 		in = dn->d_inode;
 	} else {
 		in = ceph_get_inode(parent->d_sb, vino);
-		if (in == NULL) {
+		if (IS_ERR(in)) {
 			dout("new_inode badness\n");
 			d_delete(dn);
 			dput(dn);
-			err = -ENOMEM;
+			err = PTR_ERR(in);
 			goto out;
 		}
 		dn = splice_dentry(dn, in, NULL);
fs/ceph/locks.c | +9 -5
···
 	length = fl->fl_end - fl->fl_start + 1;
 
 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-				(u64)fl->fl_pid, (u64)fl->fl_nspid,
+				(u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
 				lock_cmd, fl->fl_start,
 				length, wait);
 	if (!err) {
···
 		/* undo! This should only happen if the kernel detects
 		 * local deadlock. */
 		ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-				  (u64)fl->fl_pid, (u64)fl->fl_nspid,
+				  (u64)fl->fl_pid,
+				  (u64)(unsigned long)fl->fl_nspid,
 				  CEPH_LOCK_UNLOCK, fl->fl_start,
 				  length, 0);
 		dout("got %d on posix_lock_file, undid lock", err);
···
 	length = fl->fl_end - fl->fl_start + 1;
 
 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-				file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+				file, (u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
 				lock_cmd, fl->fl_start,
 				length, wait);
 	if (!err) {
···
 		ceph_lock_message(CEPH_LOCK_FLOCK,
 				  CEPH_MDS_OP_SETFILELOCK,
 				  file, (u64)fl->fl_pid,
-				  (u64)fl->fl_nspid,
+				  (u64)(unsigned long)fl->fl_nspid,
 				  CEPH_LOCK_UNLOCK, fl->fl_start,
 				  length, 0);
 		dout("got %d on flock_lock_file_wait, undid lock", err);
···
 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 	cephlock->client = cpu_to_le64(0);
 	cephlock->pid = cpu_to_le64(lock->fl_pid);
-	cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+	cephlock->pid_namespace =
+		cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
 
 	switch (lock->fl_type) {
 	case F_RDLCK:
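Every locks.c hunk applies the same warning fix ("ceph: Fix warnings"): fl_nspid is a pointer, and on 32-bit builds a direct (u64) cast of a pointer trips gcc's cast-to-integer-of-different-size warning. Going through unsigned long, which is always pointer-sized in the kernel, widens cleanly. A tiny sketch (ptr_to_wire is a hypothetical helper):

	#include <linux/types.h>

	static u64 ptr_to_wire(void *p)
	{
		/* (u64)p warns on 32-bit: sizeof(void *) != sizeof(u64);
		 * unsigned long matches the pointer size on all arches */
		return (u64)(unsigned long)p;
	}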
fs/ceph/mds_client.c | +63 -38
···
  *
  * Called under mdsc->mutex.
  */
+struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+	while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		dentry = dentry->d_parent;
+	return dentry;
+}
+
 static int __choose_mds(struct ceph_mds_client *mdsc,
 			struct ceph_mds_request *req)
 {
···
 	if (req->r_inode) {
 		inode = req->r_inode;
 	} else if (req->r_dentry) {
-		if (req->r_dentry->d_inode) {
+		struct inode *dir = req->r_dentry->d_parent->d_inode;
+
+		if (dir->i_sb != mdsc->client->sb) {
+			/* not this fs! */
+			inode = req->r_dentry->d_inode;
+		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
+			/* direct snapped/virtual snapdir requests
+			 * based on parent dir inode */
+			struct dentry *dn =
+				get_nonsnap_parent(req->r_dentry->d_parent);
+			inode = dn->d_inode;
+			dout("__choose_mds using nonsnap parent %p\n", inode);
+		} else if (req->r_dentry->d_inode) {
+			/* dentry target */
 			inode = req->r_dentry->d_inode;
 		} else {
-			inode = req->r_dentry->d_parent->d_inode;
+			/* dir + name */
+			inode = dir;
 			hash = req->r_dentry->d_name.hash;
 			is_hash = true;
 		}
 	}
+
 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
 	     (int)hash, mode);
 	if (!inode)
···
 		pr_info("mds%d reconnect denied\n", session->s_mds);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
-		complete_all(&mdsc->session_close_waiters);
+		wake_up_all(&mdsc->session_close_wq);
 		kick_requests(mdsc, mds);
 		break;
···
 		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
-			BUG_ON(err);
+			goto out_dput;
 		}
 	} else {
 		path = NULL;
 		pathlen = 0;
 	}
 	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
 	if (err)
-		goto out;
+		goto out_free;
 
 	spin_lock(&inode->i_lock);
 	cap->seq = 0;        /* reset cap seq */
···
 		unlock_kernel();
 	}
 
-out:
+out_free:
 	kfree(path);
+out_dput:
 	dput(dentry);
 	return err;
 }
···
 		return -ENOMEM;
 
 	init_completion(&mdsc->safe_umount_waiters);
-	init_completion(&mdsc->session_close_waiters);
+	init_waitqueue_head(&mdsc->session_close_wq);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
 	mdsc->sessions = NULL;
 	mdsc->max_sessions = 0;
···
 	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
 
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+	int i, n = 0;
+
+	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return true;
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++)
+		if (mdsc->sessions[i])
+			n++;
+	mutex_unlock(&mdsc->mutex);
+	return n == 0;
+}
 
 /*
  * called after sb is ro.
···
 {
 	struct ceph_mds_session *session;
 	int i;
-	int n;
 	struct ceph_client *client = mdsc->client;
-	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
-	mutex_lock(&mdsc->mutex);
-
 	/* close sessions */
-	started = jiffies;
-	while (time_before(jiffies, started + timeout)) {
-		dout("closing sessions\n");
-		n = 0;
-		for (i = 0; i < mdsc->max_sessions; i++) {
-			session = __ceph_lookup_mds_session(mdsc, i);
-			if (!session)
-				continue;
-			mutex_unlock(&mdsc->mutex);
-			mutex_lock(&session->s_mutex);
-			__close_session(mdsc, session);
-			mutex_unlock(&session->s_mutex);
-			ceph_put_mds_session(session);
-			mutex_lock(&mdsc->mutex);
-			n++;
-		}
-		if (n == 0)
-			break;
-
-		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
-			break;
-
-		dout("waiting for sessions to close\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
 		mutex_unlock(&mdsc->mutex);
-		wait_for_completion_timeout(&mdsc->session_close_waiters,
-					    timeout);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
 		mutex_lock(&mdsc->mutex);
 	}
+	mutex_unlock(&mdsc->mutex);
+
+	dout("waiting for sessions to close\n");
+	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+			   timeout);
 
 	/* tear down remaining sessions */
+	mutex_lock(&mdsc->mutex);
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		if (mdsc->sessions[i]) {
 			session = get_session(mdsc->sessions[i]);
···
 			mutex_lock(&mdsc->mutex);
 		}
 	}
-
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
-
 	mutex_unlock(&mdsc->mutex);
 
 	ceph_cleanup_empty_realms(mdsc);
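The shutdown rework replaces a struct completion with a wait queue because the two primitives answer different questions: complete()/wait_for_completion() counts discrete events, while wait_event_timeout() re-evaluates an arbitrary predicate on every wakeup, which matches "sleep until no sessions remain (or we forced unmount)". A minimal sketch of the pattern with hypothetical names:

	#include <linux/wait.h>
	#include <linux/atomic.h>

	static DECLARE_WAIT_QUEUE_HEAD(close_wq);	/* like session_close_wq */
	static atomic_t nr_open = ATOMIC_INIT(0);	/* hypothetical counter */

	static bool all_closed(void)
	{
		return atomic_read(&nr_open) == 0;	/* rechecked per wakeup */
	}

	static void one_session_closed(void)
	{
		atomic_dec(&nr_open);
		wake_up_all(&close_wq);		/* nudge waiters to recheck */
	}

	static void wait_for_sessions(unsigned long timeout)
	{
		/* returns 0 on timeout, nonzero once all_closed() was true */
		wait_event_timeout(close_wq, all_closed(), timeout);
	}

This is also why handle_session() switches from complete_all() to wake_up_all(): with a condition-based wait there is no event count to leak if a wakeup races with the check.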
fs/ceph/mds_client.h | +2 -1
···
 	struct mutex            mutex;         /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
-	struct completion       safe_umount_waiters, session_close_waiters;
+	struct completion       safe_umount_waiters;
+	wait_queue_head_t       session_close_wq;
 	struct list_head        waiting_for_map;
 
 	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
fs/ceph/osd_client.c | +1 -1
···
 	reqhead->reassert_version = req->r_reassert_version;
 
 	req->r_stamp = jiffies;
-	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
 	ceph_con_send(&req->r_osd->o_con, req->r_request);
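The osd_client fix is a pure argument-order bug: list_move_tail(list, head) deletes its first argument from whatever list it is on and adds it before head, so the entry must come first and the LRU head second; the old code had them swapped and was splicing the LRU head onto the request. Toy usage, with illustrative types:

	#include <linux/list.h>

	struct req {
		struct list_head lru_item;	/* like r_req_lru_item */
	};

	static LIST_HEAD(req_lru);		/* like osdc->req_lru */

	static void touch_req(struct req *r)
	{
		/* entry first, list head second: r moves to the LRU tail */
		list_move_tail(&r->lru_item, &req_lru);
	}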
fs/ceph/snap.c | +58 -31
···
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
-	int used;
+	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
 	if (!capsnap) {
···
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
 	if (__ceph_have_pending_cap_snap(ci)) {
 		/* there is no point in queuing multiple "pending" cap_snaps,
 		   as no new writes are allowed to start when pending, so any
···
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
+		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+		     capsnap, snapc);
 		igrab(inode);
-
+
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
 		INIT_LIST_HEAD(&capsnap->ci_item);
···
 		capsnap->follows = snapc->seq - 1;
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
-		capsnap->dirty = __ceph_caps_dirty(ci);
+		capsnap->dirty = dirty;
 
 		capsnap->mode = inode->i_mode;
 		capsnap->uid = inode->i_uid;
 		capsnap->gid = inode->i_gid;
 
-		/* fixme? */
-		capsnap->xattr_blob = NULL;
-		capsnap->xattr_len = 0;
+		if (dirty & CEPH_CAP_XATTR_EXCL) {
+			__ceph_build_xattrs_blob(ci);
+			capsnap->xattr_blob =
+				ceph_buffer_get(ci->i_xattrs.blob);
+			capsnap->xattr_version = ci->i_xattrs.version;
+		} else {
+			capsnap->xattr_blob = NULL;
+			capsnap->xattr_version = 0;
+		}
 
 		/* dirty page count moved from _head to this cap_snap;
 		   all subsequent writes page dirties occur _after_ this
···
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
 		capsnap->context = snapc;
-		ci->i_head_snapc = NULL;
+		ci->i_head_snapc =
+			ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" new snapc is %p\n", ci->i_head_snapc);
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
 		if (used & CEPH_CAP_FILE_WR) {
···
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
 
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci;
+	struct inode *lastinode = NULL;
+	struct ceph_snap_realm *child;
+
+	dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_for_each_entry(ci, &realm->inodes_with_caps,
+			    i_snap_realm_item) {
+		struct inode *inode = igrab(&ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		if (lastinode)
+			iput(lastinode);
+		lastinode = inode;
+		ceph_queue_cap_snap(ci);
+		spin_lock(&realm->inodes_with_caps_lock);
+	}
+	spin_unlock(&realm->inodes_with_caps_lock);
+	if (lastinode)
+		iput(lastinode);
+
+	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+	list_for_each_entry(child, &realm->children, child_item)
+		queue_realm_cap_snaps(child);
+
+	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 
 /*
  * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
···
 	 *
 	 * ...unless it's a snap deletion!
 	 */
-	if (!deletion) {
-		struct ceph_inode_info *ci;
-		struct inode *lastinode = NULL;
-
-		spin_lock(&realm->inodes_with_caps_lock);
-		list_for_each_entry(ci, &realm->inodes_with_caps,
-				    i_snap_realm_item) {
-			struct inode *inode = igrab(&ci->vfs_inode);
-			if (!inode)
-				continue;
-			spin_unlock(&realm->inodes_with_caps_lock);
-			if (lastinode)
-				iput(lastinode);
-			lastinode = inode;
-			ceph_queue_cap_snap(ci);
-			spin_lock(&realm->inodes_with_caps_lock);
-		}
-		spin_unlock(&realm->inodes_with_caps_lock);
-		if (lastinode)
-			iput(lastinode);
-		dout("update_snap_trace cap_snaps queued\n");
-	}
-
+	if (!deletion)
+		queue_realm_cap_snaps(realm);
 	} else {
 		dout("update_snap_trace %llx %p seq %lld unchanged\n",
 		     realm->ino, realm, realm->seq);
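queue_realm_cap_snaps() above uses the standard idiom for doing sleepable work on every inode of a spinlock-protected list: pin the current inode with igrab() so it cannot vanish, drop the lock for the work, and defer the iput() of the previous inode until the lock is not held, since iput() may sleep. A condensed, self-contained sketch with hypothetical names:

	#include <linux/fs.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct node {				/* illustrative entry type */
		struct inode vfs_inode;
		struct list_head item;
	};

	static void do_work(struct node *n)	/* hypothetical, may sleep */
	{
	}

	static void walk_pinned(spinlock_t *lock, struct list_head *list)
	{
		struct inode *last = NULL;
		struct node *n;

		spin_lock(lock);
		list_for_each_entry(n, list, item) {
			struct inode *inode = igrab(&n->vfs_inode);
			if (!inode)
				continue;	/* inode being torn down */
			spin_unlock(lock);	/* safe: we hold a reference */
			if (last)
				iput(last);	/* never iput() under the lock */
			last = inode;
			do_work(n);
			spin_lock(lock);	/* retake before advancing */
		}
		spin_unlock(lock);
		if (last)
			iput(last);
	}

Pinning the current inode keeps the iteration cursor valid across the unlocked section; the previous inode's reference is only ever dropped while the lock is not held.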
fs/ceph/super.h | +7 -4
···
 	uid_t uid;
 	gid_t gid;
 
-	void *xattr_blob;
-	int xattr_len;
+	struct ceph_buffer *xattr_blob;
 	u64 xattr_version;
 
 	u64 size;
···
 
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 {
-	if (atomic_dec_and_test(&capsnap->nref))
+	if (atomic_dec_and_test(&capsnap->nref)) {
+		if (capsnap->xattr_blob)
+			ceph_buffer_put(capsnap->xattr_blob);
 		kfree(capsnap);
+	}
 }
 
 /*
···
 	unsigned i_cap_exporting_issued;
 	struct ceph_cap_reservation i_cap_migration_resv;
 	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
-	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
 
 	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
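ceph_put_cap_snap() now follows the usual atomic_dec_and_test() teardown shape: whoever drops the last reference frees the object together with everything it pinned, here the xattr blob taken with ceph_buffer_get() in the snap.c hunk. A generic sketch of that nesting, with hypothetical types:

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct blob { atomic_t nref; };	/* like ceph_buffer */

	static void blob_put(struct blob *b)
	{
		if (atomic_dec_and_test(&b->nref))
			kfree(b);
	}

	struct snap {			/* like ceph_cap_snap */
		atomic_t nref;
		struct blob *payload;	/* may be NULL */
	};

	static void snap_put(struct snap *s)
	{
		/* the last ref releases the nested reference too */
		if (atomic_dec_and_test(&s->nref)) {
			if (s->payload)
				blob_put(s->payload);
			kfree(s);
		}
	}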
fs/ceph/xattr.c | +1
···
 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = NULL;
 		ci->i_xattrs.dirty = false;
+		ci->i_xattrs.version++;
 	}
 }
 
mm/page-writeback.c | +1
···
 		task_io_account_write(PAGE_CACHE_SIZE);
 	}
 }
+EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
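The EXPORT_SYMBOL() is what lets fs/ceph built as a module link against account_page_dirtied(); without it the symbol resolves for built-in callers only, and ceph.ko would fail to load with an unresolved symbol. Shape of the export, shown with a hypothetical helper:

	#include <linux/module.h>

	void my_account_hook(void)	/* hypothetical built-in helper */
	{
		/* ... update accounting counters ... */
	}
	EXPORT_SYMBOL(my_account_hook);	/* now visible to loadable modules */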