Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-4.15-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"We have a set of file locking improvements from Zheng, rbd rw/ro state
handling code cleanup from myself and some assorted CephFS fixes from
Jeff.

rbd now defaults to single-major=Y, lifting the limit of ~240 rbd
images per host for everyone"

* tag 'ceph-for-4.15-rc1' of git://github.com/ceph/ceph-client:
rbd: default to single-major device number scheme
libceph: don't WARN() if user tries to add invalid key
rbd: set discard_alignment to zero
ceph: silence sparse endianness warning in encode_caps_cb
ceph: remove the bump of i_version
ceph: present consistent fsid, regardless of arch endianness
ceph: clean up spinlocking and list handling around cleanup_cap_releases()
rbd: get rid of rbd_mapping::read_only
rbd: fix and simplify rbd_ioctl_set_ro()
ceph: remove unused and redundant variable dropping
ceph: mark expected switch fall-throughs
ceph: -EINVAL on decoding failure in ceph_mdsc_handle_fsmap()
ceph: disable cached readdir after dropping positive dentry
ceph: fix bool initialization/comparison
ceph: handle 'session get evicted while there are file locks'
ceph: optimize flock encoding during reconnect
ceph: make lock_to_ceph_filelock() static
ceph: keep auth cap when inode has flocks or posix locks

+238 -151
+13 -52
drivers/block/rbd.c
··· 348 348 struct rbd_mapping { 349 349 u64 size; 350 350 u64 features; 351 - bool read_only; 352 351 }; 353 352 354 353 /* ··· 449 450 static struct workqueue_struct *rbd_wq; 450 451 451 452 /* 452 - * Default to false for now, as single-major requires >= 0.75 version of 453 - * userspace rbd utility. 453 + * single-major requires >= 0.75 version of userspace rbd utility. 454 454 */ 455 - static bool single_major = false; 455 + static bool single_major = true; 456 456 module_param(single_major, bool, S_IRUGO); 457 - MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 457 + MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 458 458 459 459 static int rbd_img_request_submit(struct rbd_img_request *img_request); 460 460 ··· 606 608 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 607 609 bool removing = false; 608 610 609 - if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 610 - return -EROFS; 611 - 612 611 spin_lock_irq(&rbd_dev->lock); 613 612 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 614 613 removing = true; ··· 635 640 636 641 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 637 642 { 638 - int ret = 0; 639 - int val; 640 - bool ro; 641 - bool ro_changed = false; 643 + int ro; 642 644 643 - /* get_user() may sleep, so call it before taking rbd_dev->lock */ 644 - if (get_user(val, (int __user *)(arg))) 645 + if (get_user(ro, (int __user *)arg)) 645 646 return -EFAULT; 646 647 647 - ro = val ? 
true : false; 648 - /* Snapshot doesn't allow to write*/ 648 + /* Snapshots can't be marked read-write */ 649 649 if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 650 650 return -EROFS; 651 651 652 - spin_lock_irq(&rbd_dev->lock); 653 - /* prevent others open this device */ 654 - if (rbd_dev->open_count > 1) { 655 - ret = -EBUSY; 656 - goto out; 657 - } 658 - 659 - if (rbd_dev->mapping.read_only != ro) { 660 - rbd_dev->mapping.read_only = ro; 661 - ro_changed = true; 662 - } 663 - 664 - out: 665 - spin_unlock_irq(&rbd_dev->lock); 666 - /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 667 - if (ret == 0 && ro_changed) 668 - set_disk_ro(rbd_dev->disk, ro ? 1 : 0); 669 - 670 - return ret; 652 + /* Let blkdev_roset() handle it */ 653 + return -ENOTTY; 671 654 } 672 655 673 656 static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 674 657 unsigned int cmd, unsigned long arg) 675 658 { 676 659 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 677 - int ret = 0; 660 + int ret; 678 661 679 662 switch (cmd) { 680 663 case BLKROSET: ··· 4023 4050 goto err_rq; 4024 4051 } 4025 4052 4026 - /* Only reads are allowed to a read-only device */ 4027 - 4028 - if (op_type != OBJ_OP_READ) { 4029 - if (rbd_dev->mapping.read_only) { 4030 - result = -EROFS; 4031 - goto err_rq; 4032 - } 4033 - rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4034 - } 4053 + rbd_assert(op_type == OBJ_OP_READ || 4054 + rbd_dev->spec->snap_id == CEPH_NOSNAP); 4035 4055 4036 4056 /* 4037 4057 * Quit early if the mapped snapshot no longer exists. 
It's ··· 4389 4423 /* enable the discard support */ 4390 4424 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 4391 4425 q->limits.discard_granularity = segment_size; 4392 - q->limits.discard_alignment = segment_size; 4393 4426 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4394 4427 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE); 4395 4428 ··· 5959 5994 goto err_out_disk; 5960 5995 5961 5996 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 5962 - set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5997 + set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 5963 5998 5964 5999 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5965 6000 if (ret) ··· 6110 6145 struct rbd_options *rbd_opts = NULL; 6111 6146 struct rbd_spec *spec = NULL; 6112 6147 struct rbd_client *rbdc; 6113 - bool read_only; 6114 6148 int rc; 6115 6149 6116 6150 if (!try_module_get(THIS_MODULE)) ··· 6158 6194 } 6159 6195 6160 6196 /* If we are mapping a snapshot it must be marked read-only */ 6161 - 6162 - read_only = rbd_dev->opts->read_only; 6163 6197 if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 6164 - read_only = true; 6165 - rbd_dev->mapping.read_only = read_only; 6198 + rbd_dev->opts->read_only = true; 6166 6199 6167 6200 rc = rbd_dev_device_setup(rbd_dev); 6168 6201 if (rc)
+4 -5
fs/ceph/caps.c
··· 1160 1160 struct ceph_inode_info *ci = cap->ci; 1161 1161 struct inode *inode = &ci->vfs_inode; 1162 1162 struct cap_msg_args arg; 1163 - int held, revoking, dropping; 1163 + int held, revoking; 1164 1164 int wake = 0; 1165 1165 int delayed = 0; 1166 1166 int ret; ··· 1168 1168 held = cap->issued | cap->implemented; 1169 1169 revoking = cap->implemented & ~cap->issued; 1170 1170 retain &= ~revoking; 1171 - dropping = cap->issued & ~retain; 1172 1171 1173 1172 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", 1174 1173 inode, cap, cap->session, ··· 1711 1712 1712 1713 /* if we are unmounting, flush any unused caps immediately. */ 1713 1714 if (mdsc->stopping) 1714 - is_delayed = 1; 1715 + is_delayed = true; 1715 1716 1716 1717 spin_lock(&ci->i_ceph_lock); 1717 1718 ··· 3188 3189 int dirty = le32_to_cpu(m->dirty); 3189 3190 int cleaned = 0; 3190 3191 bool drop = false; 3191 - bool wake_ci = 0; 3192 - bool wake_mdsc = 0; 3192 + bool wake_ci = false; 3193 + bool wake_mdsc = false; 3193 3194 3194 3195 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { 3195 3196 if (cf->tid == flush_tid)
+7 -2
fs/ceph/inode.c
··· 493 493 ci->i_wb_ref = 0; 494 494 ci->i_wrbuffer_ref = 0; 495 495 ci->i_wrbuffer_ref_head = 0; 496 + atomic_set(&ci->i_filelock_ref, 0); 496 497 ci->i_shared_gen = 0; 497 498 ci->i_rdcache_gen = 0; 498 499 ci->i_rdcache_revoking = 0; ··· 787 786 788 787 /* update inode */ 789 788 ci->i_version = le64_to_cpu(info->version); 790 - inode->i_version++; 791 789 inode->i_rdev = le32_to_cpu(info->rdev); 792 790 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 793 791 ··· 1185 1185 ceph_snap(d_inode(dn)) != tvino.snap)) { 1186 1186 dout(" dn %p points to wrong inode %p\n", 1187 1187 dn, d_inode(dn)); 1188 + ceph_dir_clear_ordered(dir); 1188 1189 d_delete(dn); 1189 1190 dput(dn); 1190 1191 goto retry_lookup; ··· 1323 1322 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1324 1323 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1325 1324 ceph_vinop(in)); 1325 + ceph_dir_clear_ordered(dir); 1326 1326 d_invalidate(dn); 1327 1327 have_lease = false; 1328 1328 } ··· 1575 1573 ceph_snap(d_inode(dn)) != tvino.snap)) { 1576 1574 dout(" dn %p points to wrong inode %p\n", 1577 1575 dn, d_inode(dn)); 1576 + __ceph_dir_clear_ordered(ci); 1578 1577 d_delete(dn); 1579 1578 dput(dn); 1580 1579 goto retry_lookup; ··· 1600 1597 &req->r_caps_reservation); 1601 1598 if (ret < 0) { 1602 1599 pr_err("fill_inode badness on %p\n", in); 1603 - if (d_really_is_negative(dn)) 1600 + if (d_really_is_positive(dn)) 1601 + __ceph_dir_clear_ordered(ci); 1602 + else 1604 1603 iput(in); 1605 1604 d_drop(dn); 1606 1605 err = ret;
+126 -51
fs/ceph/locks.c
··· 30 30 get_random_bytes(&lock_secret, sizeof(lock_secret)); 31 31 } 32 32 33 + static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 34 + { 35 + struct inode *inode = file_inode(src->fl_file); 36 + atomic_inc(&ceph_inode(inode)->i_filelock_ref); 37 + } 38 + 39 + static void ceph_fl_release_lock(struct file_lock *fl) 40 + { 41 + struct inode *inode = file_inode(fl->fl_file); 42 + struct ceph_inode_info *ci = ceph_inode(inode); 43 + if (atomic_dec_and_test(&ci->i_filelock_ref)) { 44 + /* clear error when all locks are released */ 45 + spin_lock(&ci->i_ceph_lock); 46 + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; 47 + spin_unlock(&ci->i_ceph_lock); 48 + } 49 + } 50 + 51 + static const struct file_lock_operations ceph_fl_lock_ops = { 52 + .fl_copy_lock = ceph_fl_copy_lock, 53 + .fl_release_private = ceph_fl_release_lock, 54 + }; 55 + 33 56 /** 34 57 * Implement fcntl and flock locking functions. 35 58 */ 36 - static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 59 + static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, 37 60 int cmd, u8 wait, struct file_lock *fl) 38 61 { 39 - struct inode *inode = file_inode(file); 40 62 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 41 63 struct ceph_mds_request *req; 42 64 int err; 43 65 u64 length = 0; 44 66 u64 owner; 67 + 68 + if (operation == CEPH_MDS_OP_SETFILELOCK) { 69 + /* 70 + * increasing i_filelock_ref closes race window between 71 + * handling request reply and adding file_lock struct to 72 + * inode. Otherwise, auth caps may get trimmed in the 73 + * window. Caller function will decrease the counter. 
74 + */ 75 + fl->fl_ops = &ceph_fl_lock_ops; 76 + atomic_inc(&ceph_inode(inode)->i_filelock_ref); 77 + } 45 78 46 79 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) 47 80 wait = 0; ··· 213 180 */ 214 181 int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 215 182 { 216 - u8 lock_cmd; 217 - int err; 218 - u8 wait = 0; 183 + struct inode *inode = file_inode(file); 184 + struct ceph_inode_info *ci = ceph_inode(inode); 185 + int err = 0; 219 186 u16 op = CEPH_MDS_OP_SETFILELOCK; 187 + u8 wait = 0; 188 + u8 lock_cmd; 220 189 221 190 if (!(fl->fl_flags & FL_POSIX)) 222 191 return -ENOLCK; ··· 234 199 else if (IS_SETLKW(cmd)) 235 200 wait = 1; 236 201 202 + spin_lock(&ci->i_ceph_lock); 203 + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { 204 + err = -EIO; 205 + } else if (op == CEPH_MDS_OP_SETFILELOCK) { 206 + /* 207 + * increasing i_filelock_ref closes race window between 208 + * handling request reply and adding file_lock struct to 209 + * inode. Otherwise, i_auth_cap may get trimmed in the 210 + * window. Caller function will decrease the counter. 
211 + */ 212 + fl->fl_ops = &ceph_fl_lock_ops; 213 + atomic_inc(&ci->i_filelock_ref); 214 + } 215 + spin_unlock(&ci->i_ceph_lock); 216 + if (err < 0) { 217 + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) 218 + posix_lock_file(file, fl, NULL); 219 + return err; 220 + } 221 + 237 222 if (F_RDLCK == fl->fl_type) 238 223 lock_cmd = CEPH_LOCK_SHARED; 239 224 else if (F_WRLCK == fl->fl_type) ··· 261 206 else 262 207 lock_cmd = CEPH_LOCK_UNLOCK; 263 208 264 - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); 209 + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); 265 210 if (!err) { 266 - if (op != CEPH_MDS_OP_GETFILELOCK) { 211 + if (op == CEPH_MDS_OP_SETFILELOCK) { 267 212 dout("mds locked, locking locally"); 268 213 err = posix_lock_file(file, fl, NULL); 269 - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 214 + if (err) { 270 215 /* undo! This should only happen if 271 216 * the kernel detects local 272 217 * deadlock. */ 273 - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 218 + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, 274 219 CEPH_LOCK_UNLOCK, 0, fl); 275 220 dout("got %d on posix_lock_file, undid lock", 276 221 err); ··· 282 227 283 228 int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 284 229 { 285 - u8 lock_cmd; 286 - int err; 230 + struct inode *inode = file_inode(file); 231 + struct ceph_inode_info *ci = ceph_inode(inode); 232 + int err = 0; 287 233 u8 wait = 0; 234 + u8 lock_cmd; 288 235 289 236 if (!(fl->fl_flags & FL_FLOCK)) 290 237 return -ENOLCK; ··· 295 238 return -EOPNOTSUPP; 296 239 297 240 dout("ceph_flock, fl_file: %p", fl->fl_file); 241 + 242 + spin_lock(&ci->i_ceph_lock); 243 + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { 244 + err = -EIO; 245 + } else { 246 + /* see comment in ceph_lock */ 247 + fl->fl_ops = &ceph_fl_lock_ops; 248 + atomic_inc(&ci->i_filelock_ref); 249 + } 250 + spin_unlock(&ci->i_ceph_lock); 251 + if (err < 0) { 252 + if (F_UNLCK == fl->fl_type) 253 + 
locks_lock_file_wait(file, fl); 254 + return err; 255 + } 298 256 299 257 if (IS_SETLKW(cmd)) 300 258 wait = 1; ··· 322 250 lock_cmd = CEPH_LOCK_UNLOCK; 323 251 324 252 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 325 - file, lock_cmd, wait, fl); 253 + inode, lock_cmd, wait, fl); 326 254 if (!err) { 327 255 err = locks_lock_file_wait(file, fl); 328 256 if (err) { 329 257 ceph_lock_message(CEPH_LOCK_FLOCK, 330 258 CEPH_MDS_OP_SETFILELOCK, 331 - file, CEPH_LOCK_UNLOCK, 0, fl); 259 + inode, CEPH_LOCK_UNLOCK, 0, fl); 332 260 dout("got %d on locks_lock_file_wait, undid lock", err); 333 261 } 334 262 } ··· 358 286 } 359 287 dout("counted %d flock locks and %d fcntl locks", 360 288 *flock_count, *fcntl_count); 289 + } 290 + 291 + /* 292 + * Given a pointer to a lock, convert it to a ceph filelock 293 + */ 294 + static int lock_to_ceph_filelock(struct file_lock *lock, 295 + struct ceph_filelock *cephlock) 296 + { 297 + int err = 0; 298 + cephlock->start = cpu_to_le64(lock->fl_start); 299 + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 300 + cephlock->client = cpu_to_le64(0); 301 + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); 302 + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); 303 + 304 + switch (lock->fl_type) { 305 + case F_RDLCK: 306 + cephlock->type = CEPH_LOCK_SHARED; 307 + break; 308 + case F_WRLCK: 309 + cephlock->type = CEPH_LOCK_EXCL; 310 + break; 311 + case F_UNLCK: 312 + cephlock->type = CEPH_LOCK_UNLOCK; 313 + break; 314 + default: 315 + dout("Have unknown lock type %d", lock->fl_type); 316 + err = -EINVAL; 317 + } 318 + 319 + return err; 361 320 } 362 321 363 322 /** ··· 459 356 if (err) 460 357 goto out_fail; 461 358 462 - err = ceph_pagelist_append(pagelist, flocks, 463 - num_fcntl_locks * sizeof(*flocks)); 464 - if (err) 465 - goto out_fail; 359 + if (num_fcntl_locks > 0) { 360 + err = ceph_pagelist_append(pagelist, flocks, 361 + num_fcntl_locks * sizeof(*flocks)); 362 + if (err) 363 + goto out_fail; 
364 + } 466 365 467 366 nlocks = cpu_to_le32(num_flock_locks); 468 367 err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); 469 368 if (err) 470 369 goto out_fail; 471 370 472 - err = ceph_pagelist_append(pagelist, 473 - &flocks[num_fcntl_locks], 474 - num_flock_locks * sizeof(*flocks)); 475 - out_fail: 476 - return err; 477 - } 478 - 479 - /* 480 - * Given a pointer to a lock, convert it to a ceph filelock 481 - */ 482 - int lock_to_ceph_filelock(struct file_lock *lock, 483 - struct ceph_filelock *cephlock) 484 - { 485 - int err = 0; 486 - cephlock->start = cpu_to_le64(lock->fl_start); 487 - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 488 - cephlock->client = cpu_to_le64(0); 489 - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); 490 - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); 491 - 492 - switch (lock->fl_type) { 493 - case F_RDLCK: 494 - cephlock->type = CEPH_LOCK_SHARED; 495 - break; 496 - case F_WRLCK: 497 - cephlock->type = CEPH_LOCK_EXCL; 498 - break; 499 - case F_UNLCK: 500 - cephlock->type = CEPH_LOCK_UNLOCK; 501 - break; 502 - default: 503 - dout("Have unknown lock type %d", lock->fl_type); 504 - err = -EINVAL; 371 + if (num_flock_locks > 0) { 372 + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], 373 + num_flock_locks * sizeof(*flocks)); 505 374 } 506 - 375 + out_fail: 507 376 return err; 508 377 }
+64 -34
fs/ceph/mds_client.c
··· 1039 1039 * session caps 1040 1040 */ 1041 1041 1042 - /* caller holds s_cap_lock, we drop it */ 1043 - static void cleanup_cap_releases(struct ceph_mds_client *mdsc, 1044 - struct ceph_mds_session *session) 1045 - __releases(session->s_cap_lock) 1042 + static void detach_cap_releases(struct ceph_mds_session *session, 1043 + struct list_head *target) 1046 1044 { 1047 - LIST_HEAD(tmp_list); 1048 - list_splice_init(&session->s_cap_releases, &tmp_list); 1049 - session->s_num_cap_releases = 0; 1050 - spin_unlock(&session->s_cap_lock); 1045 + lockdep_assert_held(&session->s_cap_lock); 1051 1046 1052 - dout("cleanup_cap_releases mds%d\n", session->s_mds); 1053 - while (!list_empty(&tmp_list)) { 1047 + list_splice_init(&session->s_cap_releases, target); 1048 + session->s_num_cap_releases = 0; 1049 + dout("dispose_cap_releases mds%d\n", session->s_mds); 1050 + } 1051 + 1052 + static void dispose_cap_releases(struct ceph_mds_client *mdsc, 1053 + struct list_head *dispose) 1054 + { 1055 + while (!list_empty(dispose)) { 1054 1056 struct ceph_cap *cap; 1055 1057 /* zero out the in-progress message */ 1056 - cap = list_first_entry(&tmp_list, 1057 - struct ceph_cap, session_caps); 1058 + cap = list_first_entry(dispose, struct ceph_cap, session_caps); 1058 1059 list_del(&cap->session_caps); 1059 1060 ceph_put_cap(mdsc, cap); 1060 1061 } ··· 1216 1215 } 1217 1216 spin_unlock(&mdsc->cap_dirty_lock); 1218 1217 1218 + if (atomic_read(&ci->i_filelock_ref) > 0) { 1219 + /* make further file lock syscall return -EIO */ 1220 + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; 1221 + pr_warn_ratelimited(" dropping file locks for %p %lld\n", 1222 + inode, ceph_ino(inode)); 1223 + } 1224 + 1219 1225 if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { 1220 1226 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); 1221 1227 ci->i_prealloc_cap_flush = NULL; ··· 1252 1244 { 1253 1245 struct ceph_fs_client *fsc = session->s_mdsc->fsc; 1254 1246 struct super_block *sb = fsc->sb; 1247 + 
LIST_HEAD(dispose); 1248 + 1255 1249 dout("remove_session_caps on %p\n", session); 1256 1250 iterate_session_caps(session, remove_session_caps_cb, fsc); 1257 1251 ··· 1288 1278 } 1289 1279 1290 1280 // drop cap expires and unlock s_cap_lock 1291 - cleanup_cap_releases(session->s_mdsc, session); 1281 + detach_cap_releases(session, &dispose); 1292 1282 1293 1283 BUG_ON(session->s_nr_caps > 0); 1294 1284 BUG_ON(!list_empty(&session->s_cap_flushing)); 1285 + spin_unlock(&session->s_cap_lock); 1286 + dispose_cap_releases(session->s_mdsc, &dispose); 1295 1287 } 1296 1288 1297 1289 /* ··· 1473 1461 !list_empty(&ci->i_cap_snaps)) 1474 1462 goto out; 1475 1463 if ((used | wanted) & CEPH_CAP_ANY_WR) 1464 + goto out; 1465 + /* Note: it's possible that i_filelock_ref becomes non-zero 1466 + * after dropping auth caps. It doesn't hurt because reply 1467 + * of lock mds request will re-add auth caps. */ 1468 + if (atomic_read(&ci->i_filelock_ref) > 0) 1476 1469 goto out; 1477 1470 } 1478 1471 /* The inode has cached pages, but it's no longer used. ··· 2844 2827 struct ceph_mds_cap_reconnect v2; 2845 2828 struct ceph_mds_cap_reconnect_v1 v1; 2846 2829 } rec; 2847 - struct ceph_inode_info *ci; 2830 + struct ceph_inode_info *ci = cap->ci; 2848 2831 struct ceph_reconnect_state *recon_state = arg; 2849 2832 struct ceph_pagelist *pagelist = recon_state->pagelist; 2850 2833 char *path; ··· 2852 2835 u64 pathbase; 2853 2836 u64 snap_follows; 2854 2837 struct dentry *dentry; 2855 - 2856 - ci = cap->ci; 2857 2838 2858 2839 dout(" adding %p ino %llx.%llx cap %p %lld %s\n", 2859 2840 inode, ceph_vinop(inode), cap, cap->cap_id, ··· 2885 2870 rec.v2.issued = cpu_to_le32(cap->issued); 2886 2871 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2887 2872 rec.v2.pathbase = cpu_to_le64(pathbase); 2888 - rec.v2.flock_len = 0; 2873 + rec.v2.flock_len = (__force __le32) 2874 + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 
0 : 1); 2889 2875 } else { 2890 2876 rec.v1.cap_id = cpu_to_le64(cap->cap_id); 2891 2877 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); ··· 2910 2894 2911 2895 if (recon_state->msg_version >= 2) { 2912 2896 int num_fcntl_locks, num_flock_locks; 2913 - struct ceph_filelock *flocks; 2897 + struct ceph_filelock *flocks = NULL; 2914 2898 size_t struct_len, total_len = 0; 2915 2899 u8 struct_v = 0; 2916 2900 2917 2901 encode_again: 2918 - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2919 - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2920 - sizeof(struct ceph_filelock), GFP_NOFS); 2921 - if (!flocks) { 2922 - err = -ENOMEM; 2923 - goto out_free; 2902 + if (rec.v2.flock_len) { 2903 + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2904 + } else { 2905 + num_fcntl_locks = 0; 2906 + num_flock_locks = 0; 2924 2907 } 2925 - err = ceph_encode_locks_to_buffer(inode, flocks, 2926 - num_fcntl_locks, 2927 - num_flock_locks); 2928 - if (err) { 2908 + if (num_fcntl_locks + num_flock_locks > 0) { 2909 + flocks = kmalloc((num_fcntl_locks + num_flock_locks) * 2910 + sizeof(struct ceph_filelock), GFP_NOFS); 2911 + if (!flocks) { 2912 + err = -ENOMEM; 2913 + goto out_free; 2914 + } 2915 + err = ceph_encode_locks_to_buffer(inode, flocks, 2916 + num_fcntl_locks, 2917 + num_flock_locks); 2918 + if (err) { 2919 + kfree(flocks); 2920 + flocks = NULL; 2921 + if (err == -ENOSPC) 2922 + goto encode_again; 2923 + goto out_free; 2924 + } 2925 + } else { 2929 2926 kfree(flocks); 2930 - if (err == -ENOSPC) 2931 - goto encode_again; 2932 - goto out_free; 2927 + flocks = NULL; 2933 2928 } 2934 2929 2935 2930 if (recon_state->msg_version >= 3) { ··· 3020 2993 int s_nr_caps; 3021 2994 struct ceph_pagelist *pagelist; 3022 2995 struct ceph_reconnect_state recon_state; 2996 + LIST_HEAD(dispose); 3023 2997 3024 2998 pr_info("mds%d reconnect start\n", mds); 3025 2999 ··· 3054 3026 */ 3055 3027 session->s_cap_reconnect = 1; 3056 3028 /* drop old cap expires; we're 
about to reestablish that state */ 3057 - cleanup_cap_releases(mdsc, session); 3029 + detach_cap_releases(session, &dispose); 3030 + spin_unlock(&session->s_cap_lock); 3031 + dispose_cap_releases(mdsc, &dispose); 3058 3032 3059 3033 /* trim unused caps to reduce MDS's cache rejoin time */ 3060 3034 if (mdsc->fsc->sb->s_root) ··· 3887 3857 goto err_out; 3888 3858 } 3889 3859 return; 3860 + 3890 3861 bad: 3891 3862 pr_err("error decoding fsmap\n"); 3892 3863 err_out: 3893 3864 mutex_lock(&mdsc->mutex); 3894 - mdsc->mdsmap_err = -ENOENT; 3865 + mdsc->mdsmap_err = err; 3895 3866 __wake_requests(mdsc, &mdsc->waiting_for_map); 3896 3867 mutex_unlock(&mdsc->mutex); 3897 - return; 3898 3868 } 3899 3869 3900 3870 /*
+3 -2
fs/ceph/super.c
··· 84 84 buf->f_ffree = -1; 85 85 buf->f_namelen = NAME_MAX; 86 86 87 - /* leave fsid little-endian, regardless of host endianness */ 88 - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); 87 + /* Must convert the fsid, for consistent values across arches */ 88 + fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ 89 + le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); 89 90 buf->f_fsid.val[0] = fsid & 0xffffffff; 90 91 buf->f_fsid.val[1] = fsid >> 32; 91 92
+3 -1
fs/ceph/super.h
··· 352 352 int i_pin_ref; 353 353 int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; 354 354 int i_wrbuffer_ref, i_wrbuffer_ref_head; 355 + atomic_t i_filelock_ref; 355 356 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 356 357 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ 357 358 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ ··· 488 487 #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ 489 488 #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snaps */ 490 489 #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ 490 + #define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ 491 + 491 492 492 493 /* 493 494 * We set the ERROR_WRITE bit when we start seeing write errors on an inode ··· 1014 1011 extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, 1015 1012 struct ceph_pagelist *pagelist, 1016 1013 int num_fcntl_locks, int num_flock_locks); 1017 - extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); 1018 1014 1019 1015 /* debugfs.c */ 1020 1016 extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+11 -1
net/ceph/ceph_hash.c
··· 47 47 48 48 /* handle the last 11 bytes */ 49 49 c = c + length; 50 - switch (len) { /* all the case statements fall through */ 50 + switch (len) { 51 51 case 11: 52 52 c = c + ((__u32)k[10] << 24); 53 + /* fall through */ 53 54 case 10: 54 55 c = c + ((__u32)k[9] << 16); 56 + /* fall through */ 55 57 case 9: 56 58 c = c + ((__u32)k[8] << 8); 57 59 /* the first byte of c is reserved for the length */ 60 + /* fall through */ 58 61 case 8: 59 62 b = b + ((__u32)k[7] << 24); 63 + /* fall through */ 60 64 case 7: 61 65 b = b + ((__u32)k[6] << 16); 66 + /* fall through */ 62 67 case 6: 63 68 b = b + ((__u32)k[5] << 8); 69 + /* fall through */ 64 70 case 5: 65 71 b = b + k[4]; 72 + /* fall through */ 66 73 case 4: 67 74 a = a + ((__u32)k[3] << 24); 75 + /* fall through */ 68 76 case 3: 69 77 a = a + ((__u32)k[2] << 16); 78 + /* fall through */ 70 79 case 2: 71 80 a = a + ((__u32)k[1] << 8); 81 + /* fall through */ 72 82 case 1: 73 83 a = a + k[0]; 74 84 /* case 0: nothing left to add */
+3 -1
net/ceph/crypto.c
··· 37 37 return -ENOTSUPP; 38 38 } 39 39 40 - WARN_ON(!key->len); 40 + if (!key->len) 41 + return -EINVAL; 42 + 41 43 key->key = kmemdup(buf, key->len, GFP_NOIO); 42 44 if (!key->key) { 43 45 ret = -ENOMEM;
+1
net/ceph/messenger.c
··· 430 430 switch (sk->sk_state) { 431 431 case TCP_CLOSE: 432 432 dout("%s TCP_CLOSE\n", __func__); 433 + /* fall through */ 433 434 case TCP_CLOSE_WAIT: 434 435 dout("%s TCP_CLOSE_WAIT\n", __func__); 435 436 con_sock_state_closing(con);
+3 -2
net/ceph/mon_client.c
··· 1279 1279 1280 1280 /* 1281 1281 * Older OSDs don't set reply tid even if the original 1282 - * request had a non-zero tid. Workaround this weirdness 1283 - * by falling through to the allocate case. 1282 + * request had a non-zero tid. Work around this weirdness 1283 + * by allocating a new message. 1284 1284 */ 1285 + /* fall through */ 1285 1286 case CEPH_MSG_MON_MAP: 1286 1287 case CEPH_MSG_MDS_MAP: 1287 1288 case CEPH_MSG_OSD_MAP: