Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-6.11-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"A small patchset to address bogus I/O errors and ultimately an
assertion failure in the face of watch errors with -o exclusive
mappings in RBD (marked for stable), and some assorted CephFS fixes"

* tag 'ceph-for-6.11-rc1' of https://github.com/ceph/ceph-client:
rbd: don't assume rbd_is_lock_owner() for exclusive mappings
rbd: don't assume RBD_LOCK_STATE_LOCKED for exclusive mappings
rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait
ceph: fix incorrect kmalloc size of pagevec mempool
ceph: periodically flush the cap releases
ceph: convert comma to semicolon in __ceph_dentry_dir_lease_touch()
ceph: use cap_wait_list only if debugfs is enabled

+34 -22
+15 -20
drivers/block/rbd.c
··· 362 362 enum rbd_lock_state { 363 363 RBD_LOCK_STATE_UNLOCKED, 364 364 RBD_LOCK_STATE_LOCKED, 365 - RBD_LOCK_STATE_RELEASING, 365 + RBD_LOCK_STATE_QUIESCING, 366 366 }; 367 367 368 368 /* WatchNotify::ClientId */ ··· 422 422 struct list_head running_list; 423 423 struct completion acquire_wait; 424 424 int acquire_err; 425 - struct completion releasing_wait; 425 + struct completion quiescing_wait; 426 426 427 427 spinlock_t object_map_lock; 428 428 u8 *object_map; ··· 525 525 lockdep_assert_held(&rbd_dev->lock_rwsem); 526 526 527 527 return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 528 - rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 528 + rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING; 529 529 } 530 530 531 531 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) ··· 3457 3457 lockdep_assert_held(&rbd_dev->lock_rwsem); 3458 3458 spin_lock(&rbd_dev->lock_lists_lock); 3459 3459 if (!list_empty(&img_req->lock_item)) { 3460 + rbd_assert(!list_empty(&rbd_dev->running_list)); 3460 3461 list_del_init(&img_req->lock_item); 3461 - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && 3462 + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING && 3462 3463 list_empty(&rbd_dev->running_list)); 3463 3464 } 3464 3465 spin_unlock(&rbd_dev->lock_lists_lock); 3465 3466 if (need_wakeup) 3466 - complete(&rbd_dev->releasing_wait); 3467 + complete(&rbd_dev->quiescing_wait); 3467 3468 } 3468 3469 3469 3470 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) ··· 3476 3475 3477 3476 if (rbd_lock_add_request(img_req)) 3478 3477 return 1; 3479 - 3480 - if (rbd_dev->opts->exclusive) { 3481 - WARN_ON(1); /* lock got released? */ 3482 - return -EROFS; 3483 - } 3484 3478 3485 3479 /* 3486 3480 * Note the use of mod_delayed_work() in rbd_acquire_lock() ··· 4177 4181 /* 4178 4182 * Ensure that all in-flight IO is flushed. 
4179 4183 */ 4180 - rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 4181 - rbd_assert(!completion_done(&rbd_dev->releasing_wait)); 4184 + rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING; 4185 + rbd_assert(!completion_done(&rbd_dev->quiescing_wait)); 4182 4186 if (list_empty(&rbd_dev->running_list)) 4183 4187 return true; 4184 4188 4185 4189 up_write(&rbd_dev->lock_rwsem); 4186 - wait_for_completion(&rbd_dev->releasing_wait); 4190 + wait_for_completion(&rbd_dev->quiescing_wait); 4187 4191 4188 4192 down_write(&rbd_dev->lock_rwsem); 4189 - if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 4193 + if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING) 4190 4194 return false; 4191 4195 4192 4196 rbd_assert(list_empty(&rbd_dev->running_list)); ··· 4596 4600 if (ret != -EOPNOTSUPP) 4597 4601 rbd_warn(rbd_dev, "failed to update lock cookie: %d", 4598 4602 ret); 4603 + 4604 + if (rbd_dev->opts->exclusive) 4605 + rbd_warn(rbd_dev, 4606 + "temporarily releasing lock on exclusive mapping"); 4599 4607 4600 4608 /* 4601 4609 * Lock cookie cannot be updated on older OSDs, so do ··· 5376 5376 INIT_LIST_HEAD(&rbd_dev->acquiring_list); 5377 5377 INIT_LIST_HEAD(&rbd_dev->running_list); 5378 5378 init_completion(&rbd_dev->acquire_wait); 5379 - init_completion(&rbd_dev->releasing_wait); 5379 + init_completion(&rbd_dev->quiescing_wait); 5380 5380 5381 5381 spin_lock_init(&rbd_dev->object_map_lock); 5382 5382 ··· 6582 6582 if (ret) 6583 6583 return ret; 6584 6584 6585 - /* 6586 - * The lock may have been released by now, unless automatic lock 6587 - * transitions are disabled. 6588 - */ 6589 - rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); 6590 6585 return 0; 6591 6586 } 6592 6587
+6
fs/ceph/caps.c
··· 3067 3067 flags, &_got); 3068 3068 WARN_ON_ONCE(ret == -EAGAIN); 3069 3069 if (!ret) { 3070 + #ifdef CONFIG_DEBUG_FS 3070 3071 struct ceph_mds_client *mdsc = fsc->mdsc; 3071 3072 struct cap_wait cw; 3073 + #endif 3072 3074 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3073 3075 3076 + #ifdef CONFIG_DEBUG_FS 3074 3077 cw.ino = ceph_ino(inode); 3075 3078 cw.tgid = current->tgid; 3076 3079 cw.need = need; ··· 3082 3079 spin_lock(&mdsc->caps_list_lock); 3083 3080 list_add(&cw.list, &mdsc->cap_wait_list); 3084 3081 spin_unlock(&mdsc->caps_list_lock); 3082 + #endif 3085 3083 3086 3084 /* make sure used fmode not timeout */ 3087 3085 ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); ··· 3101 3097 remove_wait_queue(&ci->i_cap_wq, &wait); 3102 3098 ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); 3103 3099 3100 + #ifdef CONFIG_DEBUG_FS 3104 3101 spin_lock(&mdsc->caps_list_lock); 3105 3102 list_del(&cw.list); 3106 3103 spin_unlock(&mdsc->caps_list_lock); 3104 + #endif 3107 3105 3108 3106 if (ret == -EAGAIN) 3109 3107 continue;
+1 -1
fs/ceph/dir.c
··· 1589 1589 } 1590 1590 1591 1591 spin_lock(&mdsc->dentry_list_lock); 1592 - __dentry_dir_lease_touch(mdsc, di), 1592 + __dentry_dir_lease_touch(mdsc, di); 1593 1593 spin_unlock(&mdsc->dentry_list_lock); 1594 1594 } 1595 1595
+4
fs/ceph/mds_client.c
··· 5446 5446 } 5447 5447 mutex_unlock(&mdsc->mutex); 5448 5448 5449 + ceph_flush_cap_releases(mdsc, s); 5450 + 5449 5451 mutex_lock(&s->s_mutex); 5450 5452 if (renew_caps) 5451 5453 send_renew_caps(mdsc, s); ··· 5507 5505 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); 5508 5506 mdsc->last_renew_caps = jiffies; 5509 5507 INIT_LIST_HEAD(&mdsc->cap_delay_list); 5508 + #ifdef CONFIG_DEBUG_FS 5510 5509 INIT_LIST_HEAD(&mdsc->cap_wait_list); 5510 + #endif 5511 5511 spin_lock_init(&mdsc->cap_delay_lock); 5512 5512 INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); 5513 5513 INIT_LIST_HEAD(&mdsc->snap_flush_list);
+6
fs/ceph/mds_client.h
··· 416 416 struct inode *inode; 417 417 }; 418 418 419 + #ifdef CONFIG_DEBUG_FS 420 + 419 421 struct cap_wait { 420 422 struct list_head list; 421 423 u64 ino; ··· 425 423 int need; 426 424 int want; 427 425 }; 426 + 427 + #endif 428 428 429 429 enum { 430 430 CEPH_MDSC_STOPPING_BEGIN = 1, ··· 516 512 spinlock_t caps_list_lock; 517 513 struct list_head caps_list; /* unused (reserved or 518 514 unreserved) */ 515 + #ifdef CONFIG_DEBUG_FS 519 516 struct list_head cap_wait_list; 517 + #endif 520 518 int caps_total_count; /* total caps allocated */ 521 519 int caps_use_count; /* in use */ 522 520 int caps_use_max; /* max used caps */
+2 -1
fs/ceph/super.c
··· 961 961 if (!ceph_mds_request_cachep) 962 962 goto bad_mds_req; 963 963 964 - ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); 964 + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, 965 + (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *)); 965 966 if (!ceph_wb_pagevec_pool) 966 967 goto bad_pagevec_pool; 967 968