Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm log userspace: allow mark requests to piggyback on flush requests

In the cluster environment, cluster write has poor performance because
userspace_flush() has to contact a userspace program (cmirrord) for
clear/mark/flush requests. But both mark and flush requests require
cmirrord to communicate the message to all the cluster nodes for each
flush call. This behaviour is really slow.

To address this we now merge mark and flush requests together to reduce
the kernel-userspace-kernel time. We allow a new directive,
"integrated_flush" that can be used to instruct the kernel log code to
combine flush and mark requests when directed by userspace. If not
directed by userspace (due to an older version of the userspace code
perhaps), the kernel will function as it did previously - preserving
backwards compatibility. Additionally, flush requests are performed
lazily when only clear requests exist.

Signed-off-by: Dongmao Zhang <dmzhang@suse.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

authored by

Dongmao Zhang and committed by
Mike Snitzer
5066a4df fca02843

+178 -52
+161 -49
drivers/md/dm-log-userspace-base.c
··· 10 10 #include <linux/device-mapper.h> 11 11 #include <linux/dm-log-userspace.h> 12 12 #include <linux/module.h> 13 + #include <linux/workqueue.h> 13 14 14 15 #include "dm-log-userspace-transfer.h" 15 16 16 - #define DM_LOG_USERSPACE_VSN "1.1.0" 17 + #define DM_LOG_USERSPACE_VSN "1.3.0" 17 18 18 19 struct flush_entry { 19 20 int type; ··· 59 58 spinlock_t flush_lock; 60 59 struct list_head mark_list; 61 60 struct list_head clear_list; 61 + 62 + /* 63 + * Workqueue for flush of clear region requests. 64 + */ 65 + struct workqueue_struct *dmlog_wq; 66 + struct delayed_work flush_log_work; 67 + atomic_t sched_flush; 68 + 69 + /* 70 + * Combine userspace flush and mark requests for efficiency. 71 + */ 72 + uint32_t integrated_flush; 62 73 }; 63 74 64 75 static mempool_t *flush_entry_pool; ··· 135 122 136 123 *ctr_str = NULL; 137 124 125 + /* 126 + * Determine overall size of the string. 127 + */ 138 128 for (i = 0, str_size = 0; i < argc; i++) 139 129 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 140 130 ··· 157 141 return str_size; 158 142 } 159 143 144 + static void do_flush(struct work_struct *work) 145 + { 146 + int r; 147 + struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); 148 + 149 + atomic_set(&lc->sched_flush, 0); 150 + 151 + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); 152 + 153 + if (r) 154 + dm_table_event(lc->ti->table); 155 + } 156 + 160 157 /* 161 158 * userspace_ctr 162 159 * 163 160 * argv contains: 164 - * <UUID> <other args> 165 - * Where 'other args' is the userspace implementation specific log 166 - * arguments. An example might be: 167 - * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 161 + * <UUID> [integrated_flush] <other args> 162 + * Where 'other args' are the userspace implementation-specific log 163 + * arguments. 
168 164 * 169 - * So, this module will strip off the <UUID> for identification purposes 170 - * when communicating with userspace about a log; but will pass on everything 171 - * else. 165 + * Example: 166 + * <UUID> [integrated_flush] clustered-disk <arg count> <log dev> 167 + * <region_size> [[no]sync] 168 + * 169 + * This module strips off the <UUID> and uses it for identification 170 + * purposes when communicating with userspace about a log. 171 + * 172 + * If integrated_flush is defined, the kernel combines flush 173 + * and mark requests. 174 + * 175 + * The rest of the line, beginning with 'clustered-disk', is passed 176 + * to the userspace ctr function. 172 177 */ 173 178 static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 174 179 unsigned argc, char **argv) ··· 225 188 return -EINVAL; 226 189 } 227 190 191 + lc->usr_argc = argc; 192 + 228 193 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 194 + argc--; 195 + argv++; 229 196 spin_lock_init(&lc->flush_lock); 230 197 INIT_LIST_HEAD(&lc->mark_list); 231 198 INIT_LIST_HEAD(&lc->clear_list); 232 199 233 - str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 200 + if (!strcasecmp(argv[0], "integrated_flush")) { 201 + lc->integrated_flush = 1; 202 + argc--; 203 + argv++; 204 + } 205 + 206 + str_size = build_constructor_string(ti, argc, argv, &ctr_str); 234 207 if (str_size < 0) { 235 208 kfree(lc); 236 209 return str_size; ··· 293 246 DMERR("Failed to register %s with device-mapper", 294 247 devices_rdata); 295 248 } 249 + 250 + if (lc->integrated_flush) { 251 + lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); 252 + if (!lc->dmlog_wq) { 253 + DMERR("couldn't start dmlogd"); 254 + r = -ENOMEM; 255 + goto out; 256 + } 257 + 258 + INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); 259 + atomic_set(&lc->sched_flush, 0); 260 + } 261 + 296 262 out: 297 263 kfree(devices_rdata); 298 264 if (r) { ··· 313 253 kfree(ctr_str); 314 254 } else { 315 255 lc->usr_argv_str = ctr_str; 
316 - lc->usr_argc = argc; 317 256 log->context = lc; 318 257 } 319 258 ··· 323 264 { 324 265 struct log_c *lc = log->context; 325 266 267 + if (lc->integrated_flush) { 268 + /* flush workqueue */ 269 + if (atomic_read(&lc->sched_flush)) 270 + flush_delayed_work(&lc->flush_log_work); 271 + 272 + destroy_workqueue(lc->dmlog_wq); 273 + } 274 + 326 275 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 327 - NULL, 0, 328 - NULL, NULL); 276 + NULL, 0, NULL, NULL); 329 277 330 278 if (lc->log_dev) 331 279 dm_put_device(lc->ti, lc->log_dev); ··· 349 283 struct log_c *lc = log->context; 350 284 351 285 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 352 - NULL, 0, 353 - NULL, NULL); 286 + NULL, 0, NULL, NULL); 354 287 355 288 return r; 356 289 } ··· 359 294 int r; 360 295 struct log_c *lc = log->context; 361 296 297 + /* 298 + * Run planned flush earlier. 299 + */ 300 + if (lc->integrated_flush && atomic_read(&lc->sched_flush)) 301 + flush_delayed_work(&lc->flush_log_work); 302 + 362 303 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 363 - NULL, 0, 364 - NULL, NULL); 304 + NULL, 0, NULL, NULL); 365 305 366 306 return r; 367 307 } ··· 378 308 379 309 lc->in_sync_hint = 0; 380 310 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 381 - NULL, 0, 382 - NULL, NULL); 311 + NULL, 0, NULL, NULL); 383 312 384 313 return r; 385 314 } ··· 474 405 return r; 475 406 } 476 407 477 - static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 408 + static int flush_by_group(struct log_c *lc, struct list_head *flush_list, 409 + int flush_with_payload) 478 410 { 479 411 int r = 0; 480 412 int count; ··· 501 431 break; 502 432 } 503 433 504 - r = userspace_do_request(lc, lc->uuid, type, 505 - (char *)(group), 506 - count * sizeof(uint64_t), 507 - NULL, NULL); 508 - if (r) { 509 - /* Group send failed. Attempt one-by-one. 
*/ 510 - list_splice_init(&tmp_list, flush_list); 511 - r = flush_one_by_one(lc, flush_list); 512 - break; 434 + if (flush_with_payload) { 435 + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 436 + (char *)(group), 437 + count * sizeof(uint64_t), 438 + NULL, NULL); 439 + /* 440 + * Integrated flush failed. 441 + */ 442 + if (r) 443 + break; 444 + } else { 445 + r = userspace_do_request(lc, lc->uuid, type, 446 + (char *)(group), 447 + count * sizeof(uint64_t), 448 + NULL, NULL); 449 + if (r) { 450 + /* 451 + * Group send failed. Attempt one-by-one. 452 + */ 453 + list_splice_init(&tmp_list, flush_list); 454 + r = flush_one_by_one(lc, flush_list); 455 + break; 456 + } 513 457 } 514 458 } 515 459 ··· 560 476 struct log_c *lc = log->context; 561 477 LIST_HEAD(mark_list); 562 478 LIST_HEAD(clear_list); 479 + int mark_list_is_empty; 480 + int clear_list_is_empty; 563 481 struct flush_entry *fe, *tmp_fe; 564 482 565 483 spin_lock_irqsave(&lc->flush_lock, flags); ··· 569 483 list_splice_init(&lc->clear_list, &clear_list); 570 484 spin_unlock_irqrestore(&lc->flush_lock, flags); 571 485 572 - if (list_empty(&mark_list) && list_empty(&clear_list)) 486 + mark_list_is_empty = list_empty(&mark_list); 487 + clear_list_is_empty = list_empty(&clear_list); 488 + 489 + if (mark_list_is_empty && clear_list_is_empty) 573 490 return 0; 574 491 575 - r = flush_by_group(lc, &mark_list); 492 + r = flush_by_group(lc, &clear_list, 0); 576 493 if (r) 577 - goto fail; 494 + goto out; 578 495 579 - r = flush_by_group(lc, &clear_list); 580 - if (r) 581 - goto fail; 496 + if (!lc->integrated_flush) { 497 + r = flush_by_group(lc, &mark_list, 0); 498 + if (r) 499 + goto out; 500 + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 501 + NULL, 0, NULL, NULL); 502 + goto out; 503 + } 582 504 583 - r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 584 - NULL, 0, NULL, NULL); 585 - 586 - fail: 587 505 /* 588 - * We can safely remove these entries, even if failure. 
506 + * Send integrated flush request with mark_list as payload. 507 + */ 508 + r = flush_by_group(lc, &mark_list, 1); 509 + if (r) 510 + goto out; 511 + 512 + if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) { 513 + /* 514 + * When there are only clear region requests, 515 + * we schedule a flush in the future. 516 + */ 517 + queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ); 518 + atomic_set(&lc->sched_flush, 1); 519 + } else { 520 + /* 521 + * Cancel pending flush because we 522 + * have already flushed in mark_region. 523 + */ 524 + cancel_delayed_work(&lc->flush_log_work); 525 + atomic_set(&lc->sched_flush, 0); 526 + } 527 + 528 + out: 529 + /* 530 + * We can safely remove these entries, even after failure. 589 531 * Calling code will receive an error and will know that 590 532 * the log facility has failed. 591 533 */ ··· 717 603 718 604 rdata_size = sizeof(pkg); 719 605 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 720 - NULL, 0, 721 - (char *)&pkg, &rdata_size); 606 + NULL, 0, (char *)&pkg, &rdata_size); 722 607 723 608 *region = pkg.r; 724 609 return (r) ? r : (int)pkg.i; ··· 743 630 pkg.i = (int64_t)in_sync; 744 631 745 632 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 746 - (char *)&pkg, sizeof(pkg), 747 - NULL, NULL); 633 + (char *)&pkg, sizeof(pkg), NULL, NULL); 748 634 749 635 /* 750 636 * It would be nice to be able to report failures. 
··· 769 657 770 658 rdata_size = sizeof(sync_count); 771 659 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 772 - NULL, 0, 773 - (char *)&sync_count, &rdata_size); 660 + NULL, 0, (char *)&sync_count, &rdata_size); 774 661 775 662 if (r) 776 663 return 0; ··· 796 685 switch (status_type) { 797 686 case STATUSTYPE_INFO: 798 687 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 799 - NULL, 0, 800 - result, &sz); 688 + NULL, 0, result, &sz); 801 689 802 690 if (r) { 803 691 sz = 0; ··· 809 699 BUG_ON(!table_args); /* There will always be a ' ' */ 810 700 table_args++; 811 701 812 - DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 813 - lc->uuid, table_args); 702 + DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid); 703 + if (lc->integrated_flush) 704 + DMEMIT("integrated_flush "); 705 + DMEMIT("%s ", table_args); 814 706 break; 815 707 } 816 708 return (r) ? 0 : (int)sz;
+17 -3
include/uapi/linux/dm-log-userspace.h
··· 201 201 * int (*flush)(struct dm_dirty_log *log); 202 202 * 203 203 * Payload-to-userspace: 204 - * None. 204 + * If the 'integrated_flush' directive is present in the constructor 205 + * table, the payload is as same as DM_ULOG_MARK_REGION: 206 + * uint64_t [] - region(s) to mark 207 + * else 208 + * None 205 209 * Payload-to-kernel: 206 210 * None. 207 211 * 208 - * No incoming or outgoing payload. Simply flush log state to disk. 212 + * If the 'integrated_flush' option was used during the creation of the 213 + * log, mark region requests are carried as payload in the flush request. 214 + * Piggybacking the mark requests in this way allows for fewer communications 215 + * between kernel and userspace. 209 216 * 210 217 * When the request has been processed, user-space must return the 211 218 * dm_ulog_request to the kernel - setting the 'error' field and clearing ··· 392 385 * version 2: DM_ULOG_CTR allowed to return a string containing a 393 386 * device name that is to be registered with DM via 394 387 * 'dm_get_device'. 388 + * version 3: DM_ULOG_FLUSH is capable of carrying payload for marking 389 + * regions. This "integrated flush" reduces the number of 390 + * requests between the kernel and userspace by effectively 391 + * merging 'mark' and 'flush' requests. A constructor table 392 + * argument ('integrated_flush') is required to turn this 393 + * feature on, so it is backwards compatible with older 394 + * userspace versions. 395 395 */ 396 - #define DM_ULOG_REQUEST_VERSION 2 396 + #define DM_ULOG_REQUEST_VERSION 3 397 397 398 398 struct dm_ulog_request { 399 399 /*