Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ublk: add feature UBLK_F_QUIESCE

Add feature UBLK_F_QUIESCE, which adds the control command
`UBLK_U_CMD_QUIESCE_DEV` for quiescing a device. With the ublk server's
cooperation, the device state then finally becomes `UBLK_S_DEV_QUIESCED`
or `UBLK_S_DEV_FAIL_IO` from ublk_ch_release().

This feature helps support upgrading the ublk server application: the
ublk server can be shut down gracefully while the ublk block device stays
persistent during the upgrade period.

The feature is only available when UBLK_F_USER_RECOVERY is enabled.

Suggested-by: Yoav Cohen <yoav@nvidia.com>
Link: https://lore.kernel.org/linux-block/DM4PR12MB632807AB7CDCE77D1E5AB7D0A9B92@DM4PR12MB6328.namprd12.prod.outlook.com/
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250522163523.406289-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Ming Lei and committed by
Jens Axboe
b465ae7b f40b1f26

+142 -1
+123 -1
drivers/block/ublk_drv.c
··· 51 51 /* private ioctl command mirror */ 52 52 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) 53 53 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) 54 + #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) 54 55 55 56 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) 56 57 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) ··· 68 67 | UBLK_F_ZONED \ 69 68 | UBLK_F_USER_RECOVERY_FAIL_IO \ 70 69 | UBLK_F_UPDATE_SIZE \ 71 - | UBLK_F_AUTO_BUF_REG) 70 + | UBLK_F_AUTO_BUF_REG \ 71 + | UBLK_F_QUIESCE) 72 72 73 73 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ 74 74 | UBLK_F_USER_RECOVERY_REISSUE \ ··· 2843 2841 return -EINVAL; 2844 2842 } 2845 2843 2844 + if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { 2845 + pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); 2846 + return -EINVAL; 2847 + } 2848 + 2846 2849 /* 2847 2850 * unprivileged device can't be trusted, but RECOVERY and 2848 2851 * RECOVERY_REISSUE still may hang error handling, so can't ··· 3240 3233 set_capacity_and_notify(ub->ub_disk, p->dev_sectors); 3241 3234 mutex_unlock(&ub->mutex); 3242 3235 } 3236 + 3237 + struct count_busy { 3238 + const struct ublk_queue *ubq; 3239 + unsigned int nr_busy; 3240 + }; 3241 + 3242 + static bool ublk_count_busy_req(struct request *rq, void *data) 3243 + { 3244 + struct count_busy *idle = data; 3245 + 3246 + if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) 3247 + idle->nr_busy += 1; 3248 + return true; 3249 + } 3250 + 3251 + /* uring_cmd is guaranteed to be active if the associated request is idle */ 3252 + static bool ubq_has_idle_io(const struct ublk_queue *ubq) 3253 + { 3254 + struct count_busy data = { 3255 + .ubq = ubq, 3256 + }; 3257 + 3258 + blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); 3259 + return data.nr_busy < ubq->q_depth; 3260 + } 3261 + 3262 + /* Wait until each hw queue has at least 
one idle IO */ 3263 + static int ublk_wait_for_idle_io(struct ublk_device *ub, 3264 + unsigned int timeout_ms) 3265 + { 3266 + unsigned int elapsed = 0; 3267 + int ret; 3268 + 3269 + while (elapsed < timeout_ms && !signal_pending(current)) { 3270 + unsigned int queues_cancelable = 0; 3271 + int i; 3272 + 3273 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 3274 + struct ublk_queue *ubq = ublk_get_queue(ub, i); 3275 + 3276 + queues_cancelable += !!ubq_has_idle_io(ubq); 3277 + } 3278 + 3279 + /* 3280 + * Each queue needs at least one active command for 3281 + * notifying ublk server 3282 + */ 3283 + if (queues_cancelable == ub->dev_info.nr_hw_queues) 3284 + break; 3285 + 3286 + msleep(UBLK_REQUEUE_DELAY_MS); 3287 + elapsed += UBLK_REQUEUE_DELAY_MS; 3288 + } 3289 + 3290 + if (signal_pending(current)) 3291 + ret = -EINTR; 3292 + else if (elapsed >= timeout_ms) 3293 + ret = -EBUSY; 3294 + else 3295 + ret = 0; 3296 + 3297 + return ret; 3298 + } 3299 + 3300 + static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, 3301 + const struct ublksrv_ctrl_cmd *header) 3302 + { 3303 + /* zero means wait forever */ 3304 + u64 timeout_ms = header->data[0]; 3305 + struct gendisk *disk; 3306 + int i, ret = -ENODEV; 3307 + 3308 + if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) 3309 + return -EOPNOTSUPP; 3310 + 3311 + mutex_lock(&ub->mutex); 3312 + disk = ublk_get_disk(ub); 3313 + if (!disk) 3314 + goto unlock; 3315 + if (ub->dev_info.state == UBLK_S_DEV_DEAD) 3316 + goto put_disk; 3317 + 3318 + ret = 0; 3319 + /* already in expected state */ 3320 + if (ub->dev_info.state != UBLK_S_DEV_LIVE) 3321 + goto put_disk; 3322 + 3323 + /* Mark all queues as canceling */ 3324 + blk_mq_quiesce_queue(disk->queue); 3325 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 3326 + struct ublk_queue *ubq = ublk_get_queue(ub, i); 3327 + 3328 + ubq->canceling = true; 3329 + } 3330 + blk_mq_unquiesce_queue(disk->queue); 3331 + 3332 + if (!timeout_ms) 3333 + timeout_ms = UINT_MAX; 3334 + ret = 
ublk_wait_for_idle_io(ub, timeout_ms); 3335 + 3336 + put_disk: 3337 + ublk_put_disk(disk); 3338 + unlock: 3339 + mutex_unlock(&ub->mutex); 3340 + 3341 + /* Cancel pending uring_cmd */ 3342 + if (!ret) 3343 + ublk_cancel_dev(ub); 3344 + return ret; 3345 + } 3346 + 3243 3347 /* 3244 3348 * All control commands are sent via /dev/ublk-control, so we have to check 3245 3349 * the destination device's permission ··· 3437 3319 case UBLK_CMD_START_USER_RECOVERY: 3438 3320 case UBLK_CMD_END_USER_RECOVERY: 3439 3321 case UBLK_CMD_UPDATE_SIZE: 3322 + case UBLK_CMD_QUIESCE_DEV: 3440 3323 mask = MAY_READ | MAY_WRITE; 3441 3324 break; 3442 3325 default: ··· 3532 3413 case UBLK_CMD_UPDATE_SIZE: 3533 3414 ublk_ctrl_set_size(ub, header); 3534 3415 ret = 0; 3416 + break; 3417 + case UBLK_CMD_QUIESCE_DEV: 3418 + ret = ublk_ctrl_quiesce_dev(ub, header); 3535 3419 break; 3536 3420 default: 3537 3421 ret = -EOPNOTSUPP;
+19
include/uapi/linux/ublk_cmd.h
··· 53 53 _IOR('u', 0x14, struct ublksrv_ctrl_cmd) 54 54 #define UBLK_U_CMD_UPDATE_SIZE \ 55 55 _IOWR('u', 0x15, struct ublksrv_ctrl_cmd) 56 + #define UBLK_U_CMD_QUIESCE_DEV \ 57 + _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) 56 58 57 59 /* 58 60 * 64bits are enough now, and it should be easy to extend in case of ··· 254 252 * successfully 255 253 */ 256 254 #define UBLK_F_AUTO_BUF_REG (1ULL << 11) 255 + 256 + /* 257 + * Control command `UBLK_U_CMD_QUIESCE_DEV` is added for quiescing device, 258 + * which state can be transitioned to `UBLK_S_DEV_QUIESCED` or 259 + * `UBLK_S_DEV_FAIL_IO` finally, and it needs ublk server cooperation for 260 + * handling `UBLK_IO_RES_ABORT` correctly. 261 + * 262 + * Typical use case is for supporting to upgrade ublk server application, 263 + * meantime keep ublk block device persistent during the period. 264 + * 265 + * This feature is only available when UBLK_F_USER_RECOVERY is enabled. 266 + * 267 + * Note, this command returns -EBUSY in case that all IO commands are being 268 + * handled by ublk server and not completed in specified time period which 269 + * is passed from the control command parameter. 270 + */ 271 + #define UBLK_F_QUIESCE (1ULL << 12) 257 272 258 273 /* device state */ 259 274 #define UBLK_S_DEV_DEAD 0