Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md: Runtime support for multiple ppls

Increase the PPL area to 1MB and use it as a circular buffer to store PPLs. The
entry with the highest generation number is the latest one. If the PPL to be
written is larger than the space left in the buffer, rewind the buffer to the
start (don't wrap it).

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>

authored by

Pawel Baldysiak and committed by
Shaohua Li
ddc08823 8a8e6f84

+62 -9
+13 -3
drivers/md/md.c
··· 1536 1536 } else if (sb->bblog_offset != 0) 1537 1537 rdev->badblocks.shift = 0; 1538 1538 1539 - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { 1539 + if ((le32_to_cpu(sb->feature_map) & 1540 + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1540 1541 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1541 1542 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1542 1543 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; ··· 1656 1655 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1657 1656 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1658 1657 1659 - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { 1658 + if (le32_to_cpu(sb->feature_map) & 1659 + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1660 1660 if (le32_to_cpu(sb->feature_map) & 1661 1661 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1662 + return -EINVAL; 1663 + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1664 + (le32_to_cpu(sb->feature_map) & 1665 + MD_FEATURE_MULTIPLE_PPLS)) 1662 1666 return -EINVAL; 1663 1667 set_bit(MD_HAS_PPL, &mddev->flags); 1664 1668 } ··· 1881 1875 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 1882 1876 1883 1877 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 1884 - sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 1878 + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 1879 + sb->feature_map |= 1880 + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 1881 + else 1882 + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 1885 1883 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 1886 1884 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 1887 1885 }
+1
drivers/md/md.h
··· 236 236 * never cause the array to become failed. 237 237 */ 238 238 MD_HAS_PPL, /* The raid array has PPL feature set */ 239 + MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ 239 240 }; 240 241 241 242 enum mddev_sb_flags {
+2 -1
drivers/md/raid0.c
··· 30 30 ((1L << MD_HAS_JOURNAL) | \ 31 31 (1L << MD_JOURNAL_CLEAN) | \ 32 32 (1L << MD_FAILFAST_SUPPORTED) |\ 33 - (1L << MD_HAS_PPL)) 33 + (1L << MD_HAS_PPL) | \ 34 + (1L << MD_HAS_MULTIPLE_PPLS)) 34 35 35 36 static int raid0_congested(struct mddev *mddev, int bits) 36 37 {
+2 -1
drivers/md/raid1.c
··· 48 48 #define UNSUPPORTED_MDDEV_FLAGS \ 49 49 ((1L << MD_HAS_JOURNAL) | \ 50 50 (1L << MD_JOURNAL_CLEAN) | \ 51 - (1L << MD_HAS_PPL)) 51 + (1L << MD_HAS_PPL) | \ 52 + (1L << MD_HAS_MULTIPLE_PPLS)) 52 53 53 54 /* 54 55 * Number of guaranteed r1bios in case of extreme VM load:
+40 -3
drivers/md/raid5-ppl.c
··· 87 87 * The current io_unit accepting new stripes is always at the end of the list. 88 88 */ 89 89 90 + #define PPL_SPACE_SIZE (128 * 1024) 91 + 90 92 struct ppl_conf { 91 93 struct mddev *mddev; 92 94 ··· 124 122 * always at the end of io_list */ 125 123 spinlock_t io_list_lock; 126 124 struct list_head io_list; /* all io_units of this log */ 125 + 126 + sector_t next_io_sector; 127 + unsigned int entry_space; 128 + bool use_multippl; 127 129 }; 128 130 129 131 #define PPL_IO_INLINE_BVECS 32 ··· 270 264 int i; 271 265 sector_t data_sector = 0; 272 266 int data_disks = 0; 273 - unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; 274 267 struct r5conf *conf = sh->raid_conf; 275 268 276 269 pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); 277 270 278 271 /* check if current io_unit is full */ 279 - if (io && (io->pp_size == entry_space || 272 + if (io && (io->pp_size == log->entry_space || 280 273 io->entries_count == PPL_HDR_MAX_ENTRIES)) { 281 274 pr_debug("%s: add io_unit blocked by seq: %llu\n", 282 275 __func__, io->seq); ··· 456 451 pplhdr->entries_count = cpu_to_le32(io->entries_count); 457 452 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); 458 453 454 + /* Rewind the buffer if current PPL is larger then remaining space */ 455 + if (log->use_multippl && 456 + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < 457 + (PPL_HEADER_SIZE + io->pp_size) >> 9) 458 + log->next_io_sector = log->rdev->ppl.sector; 459 + 460 + 459 461 bio->bi_end_io = ppl_log_endio; 460 462 bio->bi_opf = REQ_OP_WRITE | REQ_FUA; 461 463 bio->bi_bdev = log->rdev->bdev; 462 - bio->bi_iter.bi_sector = log->rdev->ppl.sector; 464 + bio->bi_iter.bi_sector = log->next_io_sector; 463 465 bio_add_page(bio, io->header_page, PAGE_SIZE, 0); 466 + 467 + pr_debug("%s: log->current_io_sector: %llu\n", __func__, 468 + (unsigned long long)log->next_io_sector); 469 + 470 + if (log->use_multippl) 471 + 
log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; 464 472 465 473 list_for_each_entry(sh, &io->stripe_list, log_list) { 466 474 /* entries for full stripe writes have no partial parity */ ··· 1049 1031 static void __ppl_exit_log(struct ppl_conf *ppl_conf) 1050 1032 { 1051 1033 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); 1034 + clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); 1052 1035 1053 1036 kfree(ppl_conf->child_logs); 1054 1037 ··· 1116 1097 rdev->ppl.size = ppl_size_new; 1117 1098 1118 1099 return 0; 1100 + } 1101 + 1102 + static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) 1103 + { 1104 + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + 1105 + PPL_HEADER_SIZE) * 2) { 1106 + log->use_multippl = true; 1107 + set_bit(MD_HAS_MULTIPLE_PPLS, 1108 + &log->ppl_conf->mddev->flags); 1109 + log->entry_space = PPL_SPACE_SIZE; 1110 + } else { 1111 + log->use_multippl = false; 1112 + log->entry_space = (log->rdev->ppl.size << 9) - 1113 + PPL_HEADER_SIZE; 1114 + } 1115 + log->next_io_sector = rdev->ppl.sector; 1119 1116 } 1120 1117 1121 1118 int ppl_init_log(struct r5conf *conf) ··· 1231 1196 q = bdev_get_queue(rdev->bdev); 1232 1197 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 1233 1198 need_cache_flush = true; 1199 + ppl_init_child_log(log, rdev); 1234 1200 } 1235 1201 } 1236 1202 ··· 1297 1261 if (!ret) { 1298 1262 log->rdev = rdev; 1299 1263 ret = ppl_write_empty_header(log); 1264 + ppl_init_child_log(log, rdev); 1300 1265 } 1301 1266 } else { 1302 1267 log->rdev = NULL;
+1
drivers/md/raid5.c
··· 7236 7236 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7237 7237 mdname(mddev)); 7238 7238 clear_bit(MD_HAS_PPL, &mddev->flags); 7239 + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7239 7240 } 7240 7241 7241 7242 if (mddev->private == NULL)
+3 -1
include/uapi/linux/raid/md_p.h
··· 324 324 #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening 325 325 * is guided by bitmap. 326 326 */ 327 - #define MD_FEATURE_CLUSTERED 256 /* clustered MD */ 327 + #define MD_FEATURE_CLUSTERED 256 /* clustered MD */ 328 328 #define MD_FEATURE_JOURNAL 512 /* support write cache */ 329 329 #define MD_FEATURE_PPL 1024 /* support PPL */ 330 + #define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ 330 331 #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ 331 332 |MD_FEATURE_RECOVERY_OFFSET \ 332 333 |MD_FEATURE_RESHAPE_ACTIVE \ ··· 339 338 |MD_FEATURE_CLUSTERED \ 340 339 |MD_FEATURE_JOURNAL \ 341 340 |MD_FEATURE_PPL \ 341 + |MD_FEATURE_MULTIPLE_PPLS \ 342 342 ) 343 343 344 344 struct r5l_payload_header {