Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md/md-llbitmap: introduce new lockless bitmap

Redundant data is used to enhance data fault tolerance, and the storage
methods for redundant data vary depending on the RAID level. It is
important to maintain the consistency of redundant data.

Bitmap is used to record which data blocks have been synchronized and which
ones need to be resynchronized or recovered. Each bit in the bitmap
represents a segment of data in the array. When a bit is set, it indicates
that the multiple redundant copies of that data segment may not be
consistent. Data synchronization can be performed based on the bitmap after
power failure or readding a disk. If there is no bitmap, a full disk
synchronization is required.

Due to known performance issues with md-bitmap and the unreasonable
implementations:

- self-managed IO submitting like filemap_write_page();
- global spin_lock

I have decided not to continue optimizing the current bitmap
implementation. Instead, this new bitmap is designed so that the IO fast
path takes no locks, which makes it usable with fast disks.

For designs and details, see the comments in drivers/md/md-llbitmap.c.

Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-12-yukuai1@huaweicloud.com
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Li Nan <linan122@huawei.com>

Yu Kuai 5ab829f1 66be318e

+1696 -12
+20
Documentation/admin-guide/md.rst
··· 387 387 No bitmap 388 388 bitmap 389 389 The default internal bitmap 390 + llbitmap 391 + The lockless internal bitmap 390 392 391 393 If bitmap_type is not none, then additional bitmap attributes bitmap/xxx or 392 394 llbitmap/xxx will be created after md device KOBJ_CHANGE event. ··· 448 446 When metadata is managed externally, it should be set to true 449 447 once the array becomes non-degraded, and this fact has been 450 448 recorded in the metadata. 449 + 450 + If bitmap_type is llbitmap, then the md device will also contain: 451 + 452 + llbitmap/bits 453 + This is read-only, show status of bitmap bits, the number of each 454 + value. 455 + 456 + llbitmap/metadata 457 + This is read-only, show bitmap metadata, include chunksize, chunkshift, 458 + chunks, offset and daemon_sleep. 459 + 460 + llbitmap/daemon_sleep 461 + This is read-write, time in seconds that daemon function will be 462 + triggered to clear dirty bits. 463 + 464 + llbitmap/barrier_idle 465 + This is read-write, time in seconds that page barrier will be idled, 466 + means dirty bits in the page will be cleared. 451 467 452 468 As component devices are added to an md array, they appear in the ``md`` 453 469 directory as new directories named::
+11
drivers/md/Kconfig
··· 52 52 53 53 If unsure, say Y. 54 54 55 + config MD_LLBITMAP 56 + bool "MD RAID lockless bitmap support" 57 + depends on BLK_DEV_MD 58 + help 59 + If you say Y here, support for the lockless write intent bitmap will 60 + be enabled. 61 + 62 + Note, this is an experimental feature. 63 + 64 + If unsure, say N. 65 + 55 66 config MD_AUTODETECT 56 67 bool "Autodetect RAID arrays during kernel boot" 57 68 depends on BLK_DEV_MD=y
+1
drivers/md/Makefile
··· 29 29 30 30 md-mod-y += md.o 31 31 md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o 32 + md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o 32 33 raid456-y += raid5.o raid5-cache.o raid5-ppl.o 33 34 linear-y += md-linear.o 34 35
-9
drivers/md/md-bitmap.c
··· 34 34 #include "md-bitmap.h" 35 35 #include "md-cluster.h" 36 36 37 - #define BITMAP_MAJOR_LO 3 38 - /* version 4 insists the bitmap is in little-endian order 39 - * with version 3, it is host-endian which is non-portable 40 - * Version 5 is currently set only for clustered devices 41 - */ 42 - #define BITMAP_MAJOR_HI 4 43 - #define BITMAP_MAJOR_CLUSTERED 5 44 - #define BITMAP_MAJOR_HOSTENDIAN 3 45 - 46 37 /* 47 38 * in-memory bitmap: 48 39 *
+30 -1
drivers/md/md-bitmap.h
··· 9 9 10 10 #define BITMAP_MAGIC 0x6d746962 11 11 12 + /* 13 + * version 3 is host-endian order, this is deprecated and not used for new 14 + * array 15 + */ 16 + #define BITMAP_MAJOR_LO 3 17 + #define BITMAP_MAJOR_HOSTENDIAN 3 18 + /* version 4 is little-endian order, the default value */ 19 + #define BITMAP_MAJOR_HI 4 20 + /* version 5 is only used for cluster */ 21 + #define BITMAP_MAJOR_CLUSTERED 5 22 + /* version 6 is only used for lockless bitmap */ 23 + #define BITMAP_MAJOR_LOCKLESS 6 24 + 12 25 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ 13 26 enum bitmap_state { 14 - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 27 + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 15 28 BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ 29 + BITMAP_FIRST_USE = 3, /* llbitmap is just created */ 30 + BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ 31 + BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */ 16 32 BITMAP_HOSTENDIAN =15, 17 33 }; 18 34 ··· 178 162 return 0; 179 163 } 180 164 static inline void md_bitmap_exit(void) 165 + { 166 + } 167 + #endif 168 + 169 + #ifdef CONFIG_MD_LLBITMAP 170 + int md_llbitmap_init(void); 171 + void md_llbitmap_exit(void); 172 + #else 173 + static inline int md_llbitmap_init(void) 174 + { 175 + return 0; 176 + } 177 + static inline void md_llbitmap_exit(void) 181 178 { 182 179 } 183 180 #endif
+1626
drivers/md/md-llbitmap.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + #include <linux/blkdev.h> 4 + #include <linux/module.h> 5 + #include <linux/errno.h> 6 + #include <linux/slab.h> 7 + #include <linux/init.h> 8 + #include <linux/timer.h> 9 + #include <linux/sched.h> 10 + #include <linux/list.h> 11 + #include <linux/file.h> 12 + #include <linux/seq_file.h> 13 + #include <trace/events/block.h> 14 + 15 + #include "md.h" 16 + #include "md-bitmap.h" 17 + 18 + /* 19 + * #### Background 20 + * 21 + * Redundant data is used to enhance data fault tolerance, and the storage 22 + * methods for redundant data vary depending on the RAID levels. And it's 23 + * important to maintain the consistency of redundant data. 24 + * 25 + * Bitmap is used to record which data blocks have been synchronized and which 26 + * ones need to be resynchronized or recovered. Each bit in the bitmap 27 + * represents a segment of data in the array. When a bit is set, it indicates 28 + * that the multiple redundant copies of that data segment may not be 29 + * consistent. Data synchronization can be performed based on the bitmap after 30 + * power failure or readding a disk. If there is no bitmap, a full disk 31 + * synchronization is required. 32 + * 33 + * #### Key Features 34 + * 35 + * - IO fastpath is lockless, if user issues lots of write IO to the same 36 + * bitmap bit in a short time, only the first write has additional overhead 37 + * to update bitmap bit, no additional overhead for the following writes; 38 + * - support only resync or recover written data, means in the case creating 39 + * new array or replacing with a new disk, there is no need to do a full disk 40 + * resync/recovery; 41 + * 42 + * #### Key Concept 43 + * 44 + * ##### State Machine 45 + * 46 + * Each bit is one byte, contain 6 different states, see llbitmap_state. 
And 47 + * there are total 8 different actions, see llbitmap_action, can change state: 48 + * 49 + * llbitmap state machine: transitions between states 50 + * 51 + * | | Startwrite | Startsync | Endsync | Abortsync| 52 + * | --------- | ---------- | --------- | ------- | ------- | 53 + * | Unwritten | Dirty | x | x | x | 54 + * | Clean | Dirty | x | x | x | 55 + * | Dirty | x | x | x | x | 56 + * | NeedSync | x | Syncing | x | x | 57 + * | Syncing | x | Syncing | Dirty | NeedSync | 58 + * 59 + * | | Reload | Daemon | Discard | Stale | 60 + * | --------- | -------- | ------ | --------- | --------- | 61 + * | Unwritten | x | x | x | x | 62 + * | Clean | x | x | Unwritten | NeedSync | 63 + * | Dirty | NeedSync | Clean | Unwritten | NeedSync | 64 + * | NeedSync | x | x | Unwritten | x | 65 + * | Syncing | NeedSync | x | Unwritten | NeedSync | 66 + * 67 + * Typical scenarios: 68 + * 69 + * 1) Create new array 70 + * All bits will be set to Unwritten by default, if --assume-clean is set, 71 + * all bits will be set to Clean instead. 
72 + * 73 + * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and 74 + * rely on xor data 75 + * 76 + * 2.1) write new data to raid1/raid10: 77 + * Unwritten --StartWrite--> Dirty 78 + * 79 + * 2.2) write new data to raid456: 80 + * Unwritten --StartWrite--> NeedSync 81 + * 82 + * Because the initial recover for raid456 is skipped, the xor data is not built 83 + * yet, the bit must be set to NeedSync first and after lazy initial recover is 84 + * finished, the bit will finally set to Dirty(see 5.1 and 5.4); 85 + * 86 + * 2.3) cover write 87 + * Clean --StartWrite--> Dirty 88 + * 89 + * 3) daemon, if the array is not degraded: 90 + * Dirty --Daemon--> Clean 91 + * 92 + * 4) discard 93 + * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten 94 + * 95 + * 5) resync and recover 96 + * 97 + * 5.1) common process 98 + * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean 99 + * 100 + * 5.2) resync after power failure 101 + * Dirty --Reload--> NeedSync 102 + * 103 + * 5.3) recover while replacing with a new disk 104 + * By default, the old bitmap framework will recover all data, and llbitmap 105 + * implements this by a new helper, see llbitmap_skip_sync_blocks: 106 + * 107 + * skip recover for bits other than dirty or clean; 108 + * 109 + * 5.4) lazy initial recover for raid5: 110 + * By default, the old bitmap framework will only allow new recover when there 111 + * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added 112 + * to perform raid456 lazy recover for set bits(from 2.2). 113 + * 114 + * 6. 
special handling for degraded array: 115 + * 116 + * - Dirty bits will never be cleared, daemon will just do nothing, so that if 117 + * a disk is readded, Clean bits can be skipped with recovery; 118 + * - Dirty bits will convert to Syncing from start write, to do data recovery 119 + * for new added disks; 120 + * - New write will convert bits to NeedSync directly; 121 + * 122 + * ##### Bitmap IO 123 + * 124 + * ##### Chunksize 125 + * 126 + * The default bitmap size is 128k, including 1k bitmap super block, and 127 + * the default size of segment of data in the array each bit(chunksize) is 64k, 128 + * and chunksize will adjust to twice the old size each time if the total number 129 + * bits is not less than 127k.(see llbitmap_init) 130 + * 131 + * ##### READ 132 + * 133 + * While creating bitmap, all pages will be allocated and read for llbitmap, 134 + * there won't be read afterwards 135 + * 136 + * ##### WRITE 137 + * 138 + * WRITE IO is divided into logical_block_size of the array, the dirty state 139 + * of each block is tracked independently, for example: 140 + * 141 + * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit; 142 + * 143 + * | page0 | page1 | ... | page 31 | 144 + * | | 145 + * | \-----------------------\ 146 + * | | 147 + * | block0 | block1 | ... | block 7| 148 + * | | 149 + * | \-----------------\ 150 + * | | 151 + * | bit0 | bit1 | ... | bit511 | 152 + * 153 + * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding 154 + * subpage will be marked dirty, such block must write first before the IO is 155 + * issued. This behaviour will affect IO performance, to reduce the impact, if 156 + * multiple bits are changed in the same block in a short time, all bits in this 157 + * block will be changed to Dirty/NeedSync, so that there won't be any overhead 158 + * until daemon clears dirty bits. 
159 + * 160 + * ##### Dirty Bits synchronization 161 + * 162 + * IO fast path will set bits to dirty, and those dirty bits will be cleared 163 + * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between 164 + * IO path and daemon; 165 + * 166 + * IO path: 167 + * 1) try to grab a reference, if succeed, set expire time after 5s and return; 168 + * 2) if failed to grab a reference, wait for daemon to finish clearing dirty 169 + * bits; 170 + * 171 + * Daemon (Daemon will be woken up every daemon_sleep seconds): 172 + * For each page: 173 + * 1) check if page expired, if not skip this page; for expired page: 174 + * 2) suspend the page and wait for inflight write IO to be done; 175 + * 3) change dirty page to clean; 176 + * 4) resume the page; 177 + */ 178 + 179 + #define BITMAP_DATA_OFFSET 1024 180 + 181 + /* 64k is the max IO size of sync IO for raid1/raid10 */ 182 + #define MIN_CHUNK_SIZE (64 * 2) 183 + 184 + /* By default, daemon will be woken up every 30s */ 185 + #define DEFAULT_DAEMON_SLEEP 30 186 + 187 + /* 188 + * Dirtied bits that have not been accessed for more than 5s will be cleared 189 + * by daemon. 
190 + */ 191 + #define DEFAULT_BARRIER_IDLE 5 192 + 193 + enum llbitmap_state { 194 + /* No valid data, init state after assemble the array */ 195 + BitUnwritten = 0, 196 + /* data is consistent */ 197 + BitClean, 198 + /* data will be consistent after IO is done, set directly for writes */ 199 + BitDirty, 200 + /* 201 + * data need to be resynchronized: 202 + * 1) set directly for writes if array is degraded, prevent full disk 203 + * synchronization after readding a disk; 204 + * 2) reassemble the array after power failure, and dirty bits are 205 + * found after reloading the bitmap; 206 + * 3) set for first write for raid5, to build initial xor data lazily 207 + */ 208 + BitNeedSync, 209 + /* data is synchronizing */ 210 + BitSyncing, 211 + BitStateCount, 212 + BitNone = 0xff, 213 + }; 214 + 215 + enum llbitmap_action { 216 + /* User write new data, this is the only action from IO fast path */ 217 + BitmapActionStartwrite = 0, 218 + /* Start recovery */ 219 + BitmapActionStartsync, 220 + /* Finish recovery */ 221 + BitmapActionEndsync, 222 + /* Failed recovery */ 223 + BitmapActionAbortsync, 224 + /* Reassemble the array */ 225 + BitmapActionReload, 226 + /* Daemon thread is trying to clear dirty bits */ 227 + BitmapActionDaemon, 228 + /* Data is deleted */ 229 + BitmapActionDiscard, 230 + /* 231 + * Bitmap is stale, mark all bits in addition to BitUnwritten to 232 + * BitNeedSync. 
233 + */ 234 + BitmapActionStale, 235 + BitmapActionCount, 236 + /* Init state is BitUnwritten */ 237 + BitmapActionInit, 238 + }; 239 + 240 + enum llbitmap_page_state { 241 + LLPageFlush = 0, 242 + LLPageDirty, 243 + }; 244 + 245 + struct llbitmap_page_ctl { 246 + char *state; 247 + struct page *page; 248 + unsigned long expire; 249 + unsigned long flags; 250 + wait_queue_head_t wait; 251 + struct percpu_ref active; 252 + /* Per block size dirty state, maximum 64k page / 1 sector = 128 */ 253 + unsigned long dirty[]; 254 + }; 255 + 256 + struct llbitmap { 257 + struct mddev *mddev; 258 + struct llbitmap_page_ctl **pctl; 259 + 260 + unsigned int nr_pages; 261 + unsigned int io_size; 262 + unsigned int blocks_per_page; 263 + 264 + /* shift of one chunk */ 265 + unsigned long chunkshift; 266 + /* size of one chunk in sector */ 267 + unsigned long chunksize; 268 + /* total number of chunks */ 269 + unsigned long chunks; 270 + unsigned long last_end_sync; 271 + /* 272 + * time in seconds that dirty bits will be cleared if the page is not 273 + * accessed. 
274 + */ 275 + unsigned long barrier_idle; 276 + /* fires on first BitDirty state */ 277 + struct timer_list pending_timer; 278 + struct work_struct daemon_work; 279 + 280 + unsigned long flags; 281 + __u64 events_cleared; 282 + 283 + /* for slow disks */ 284 + atomic_t behind_writes; 285 + wait_queue_head_t behind_wait; 286 + }; 287 + 288 + struct llbitmap_unplug_work { 289 + struct work_struct work; 290 + struct llbitmap *llbitmap; 291 + struct completion *done; 292 + }; 293 + 294 + static struct workqueue_struct *md_llbitmap_io_wq; 295 + static struct workqueue_struct *md_llbitmap_unplug_wq; 296 + 297 + static char state_machine[BitStateCount][BitmapActionCount] = { 298 + [BitUnwritten] = { 299 + [BitmapActionStartwrite] = BitDirty, 300 + [BitmapActionStartsync] = BitNone, 301 + [BitmapActionEndsync] = BitNone, 302 + [BitmapActionAbortsync] = BitNone, 303 + [BitmapActionReload] = BitNone, 304 + [BitmapActionDaemon] = BitNone, 305 + [BitmapActionDiscard] = BitNone, 306 + [BitmapActionStale] = BitNone, 307 + }, 308 + [BitClean] = { 309 + [BitmapActionStartwrite] = BitDirty, 310 + [BitmapActionStartsync] = BitNone, 311 + [BitmapActionEndsync] = BitNone, 312 + [BitmapActionAbortsync] = BitNone, 313 + [BitmapActionReload] = BitNone, 314 + [BitmapActionDaemon] = BitNone, 315 + [BitmapActionDiscard] = BitUnwritten, 316 + [BitmapActionStale] = BitNeedSync, 317 + }, 318 + [BitDirty] = { 319 + [BitmapActionStartwrite] = BitNone, 320 + [BitmapActionStartsync] = BitNone, 321 + [BitmapActionEndsync] = BitNone, 322 + [BitmapActionAbortsync] = BitNone, 323 + [BitmapActionReload] = BitNeedSync, 324 + [BitmapActionDaemon] = BitClean, 325 + [BitmapActionDiscard] = BitUnwritten, 326 + [BitmapActionStale] = BitNeedSync, 327 + }, 328 + [BitNeedSync] = { 329 + [BitmapActionStartwrite] = BitNone, 330 + [BitmapActionStartsync] = BitSyncing, 331 + [BitmapActionEndsync] = BitNone, 332 + [BitmapActionAbortsync] = BitNone, 333 + [BitmapActionReload] = BitNone, 334 + [BitmapActionDaemon] = 
BitNone, 335 + [BitmapActionDiscard] = BitUnwritten, 336 + [BitmapActionStale] = BitNone, 337 + }, 338 + [BitSyncing] = { 339 + [BitmapActionStartwrite] = BitNone, 340 + [BitmapActionStartsync] = BitSyncing, 341 + [BitmapActionEndsync] = BitDirty, 342 + [BitmapActionAbortsync] = BitNeedSync, 343 + [BitmapActionReload] = BitNeedSync, 344 + [BitmapActionDaemon] = BitNone, 345 + [BitmapActionDiscard] = BitUnwritten, 346 + [BitmapActionStale] = BitNeedSync, 347 + }, 348 + }; 349 + 350 + static void __llbitmap_flush(struct mddev *mddev); 351 + 352 + static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) 353 + { 354 + unsigned int idx; 355 + unsigned int offset; 356 + 357 + pos += BITMAP_DATA_OFFSET; 358 + idx = pos >> PAGE_SHIFT; 359 + offset = offset_in_page(pos); 360 + 361 + return llbitmap->pctl[idx]->state[offset]; 362 + } 363 + 364 + /* set all the bits in the subpage as dirty */ 365 + static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, 366 + struct llbitmap_page_ctl *pctl, 367 + unsigned int block) 368 + { 369 + bool level_456 = raid_is_456(llbitmap->mddev); 370 + unsigned int io_size = llbitmap->io_size; 371 + int pos; 372 + 373 + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { 374 + switch (pctl->state[pos]) { 375 + case BitUnwritten: 376 + pctl->state[pos] = level_456 ? 
BitNeedSync : BitDirty; 377 + break; 378 + case BitClean: 379 + pctl->state[pos] = BitDirty; 380 + break; 381 + }; 382 + } 383 + } 384 + 385 + static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, 386 + int offset) 387 + { 388 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; 389 + unsigned int io_size = llbitmap->io_size; 390 + int block = offset / io_size; 391 + int pos; 392 + 393 + if (!test_bit(LLPageDirty, &pctl->flags)) 394 + set_bit(LLPageDirty, &pctl->flags); 395 + 396 + /* 397 + * For degraded array, dirty bits will never be cleared, and we must 398 + * resync all the dirty bits, hence skip infect new dirty bits to 399 + * prevent resync unnecessary data. 400 + */ 401 + if (llbitmap->mddev->degraded) { 402 + set_bit(block, pctl->dirty); 403 + return; 404 + } 405 + 406 + /* 407 + * The subpage usually contains a total of 512 bits. If any single bit 408 + * within the subpage is marked as dirty, the entire sector will be 409 + * written. To avoid impacting write performance, when multiple bits 410 + * within the same sector are modified within llbitmap->barrier_idle, 411 + * all bits in the sector will be collectively marked as dirty at once. 
412 + */ 413 + if (test_and_set_bit(block, pctl->dirty)) { 414 + llbitmap_infect_dirty_bits(llbitmap, pctl, block); 415 + return; 416 + } 417 + 418 + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { 419 + if (pos == offset) 420 + continue; 421 + if (pctl->state[pos] == BitDirty || 422 + pctl->state[pos] == BitNeedSync) { 423 + llbitmap_infect_dirty_bits(llbitmap, pctl, block); 424 + return; 425 + } 426 + } 427 + } 428 + 429 + static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, 430 + loff_t pos) 431 + { 432 + unsigned int idx; 433 + unsigned int bit; 434 + 435 + pos += BITMAP_DATA_OFFSET; 436 + idx = pos >> PAGE_SHIFT; 437 + bit = offset_in_page(pos); 438 + 439 + llbitmap->pctl[idx]->state[bit] = state; 440 + if (state == BitDirty || state == BitNeedSync) 441 + llbitmap_set_page_dirty(llbitmap, idx, bit); 442 + } 443 + 444 + static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) 445 + { 446 + struct mddev *mddev = llbitmap->mddev; 447 + struct page *page = NULL; 448 + struct md_rdev *rdev; 449 + 450 + if (llbitmap->pctl && llbitmap->pctl[idx]) 451 + page = llbitmap->pctl[idx]->page; 452 + if (page) 453 + return page; 454 + 455 + page = alloc_page(GFP_KERNEL | __GFP_ZERO); 456 + if (!page) 457 + return ERR_PTR(-ENOMEM); 458 + 459 + rdev_for_each(rdev, mddev) { 460 + sector_t sector; 461 + 462 + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 463 + continue; 464 + 465 + sector = mddev->bitmap_info.offset + 466 + (idx << PAGE_SECTORS_SHIFT); 467 + 468 + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, 469 + true)) 470 + return page; 471 + 472 + md_error(mddev, rdev); 473 + } 474 + 475 + __free_page(page); 476 + return ERR_PTR(-EIO); 477 + } 478 + 479 + static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) 480 + { 481 + struct page *page = llbitmap->pctl[idx]->page; 482 + struct mddev *mddev = llbitmap->mddev; 483 + struct md_rdev *rdev; 484 + int block; 485 + 486 + 
for (block = 0; block < llbitmap->blocks_per_page; block++) { 487 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; 488 + 489 + if (!test_and_clear_bit(block, pctl->dirty)) 490 + continue; 491 + 492 + rdev_for_each(rdev, mddev) { 493 + sector_t sector; 494 + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; 495 + 496 + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 497 + continue; 498 + 499 + sector = mddev->bitmap_info.offset + rdev->sb_start + 500 + (idx << PAGE_SECTORS_SHIFT) + 501 + block * bit_sector; 502 + md_write_metadata(mddev, rdev, sector, 503 + llbitmap->io_size, page, 504 + block * llbitmap->io_size); 505 + } 506 + } 507 + } 508 + 509 + static void active_release(struct percpu_ref *ref) 510 + { 511 + struct llbitmap_page_ctl *pctl = 512 + container_of(ref, struct llbitmap_page_ctl, active); 513 + 514 + wake_up(&pctl->wait); 515 + } 516 + 517 + static void llbitmap_free_pages(struct llbitmap *llbitmap) 518 + { 519 + int i; 520 + 521 + if (!llbitmap->pctl) 522 + return; 523 + 524 + for (i = 0; i < llbitmap->nr_pages; i++) { 525 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 526 + 527 + if (!pctl || !pctl->page) 528 + break; 529 + 530 + __free_page(pctl->page); 531 + percpu_ref_exit(&pctl->active); 532 + } 533 + 534 + kfree(llbitmap->pctl[0]); 535 + kfree(llbitmap->pctl); 536 + llbitmap->pctl = NULL; 537 + } 538 + 539 + static int llbitmap_cache_pages(struct llbitmap *llbitmap) 540 + { 541 + struct llbitmap_page_ctl *pctl; 542 + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + 543 + BITMAP_DATA_OFFSET, PAGE_SIZE); 544 + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( 545 + llbitmap->blocks_per_page)); 546 + int i; 547 + 548 + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), 549 + GFP_KERNEL | __GFP_ZERO); 550 + if (!llbitmap->pctl) 551 + return -ENOMEM; 552 + 553 + size = round_up(size, cache_line_size()); 554 + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); 555 + if (!pctl) { 556 
+ kfree(llbitmap->pctl); 557 + return -ENOMEM; 558 + } 559 + 560 + llbitmap->nr_pages = nr_pages; 561 + 562 + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { 563 + struct page *page = llbitmap_read_page(llbitmap, i); 564 + 565 + llbitmap->pctl[i] = pctl; 566 + 567 + if (IS_ERR(page)) { 568 + llbitmap_free_pages(llbitmap); 569 + return PTR_ERR(page); 570 + } 571 + 572 + if (percpu_ref_init(&pctl->active, active_release, 573 + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 574 + __free_page(page); 575 + llbitmap_free_pages(llbitmap); 576 + return -ENOMEM; 577 + } 578 + 579 + pctl->page = page; 580 + pctl->state = page_address(page); 581 + init_waitqueue_head(&pctl->wait); 582 + } 583 + 584 + return 0; 585 + } 586 + 587 + static void llbitmap_init_state(struct llbitmap *llbitmap) 588 + { 589 + enum llbitmap_state state = BitUnwritten; 590 + unsigned long i; 591 + 592 + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) 593 + state = BitClean; 594 + 595 + for (i = 0; i < llbitmap->chunks; i++) 596 + llbitmap_write(llbitmap, state, i); 597 + } 598 + 599 + /* The return value is only used from resync, where @start == @end. 
*/ 600 + static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, 601 + unsigned long start, 602 + unsigned long end, 603 + enum llbitmap_action action) 604 + { 605 + struct mddev *mddev = llbitmap->mddev; 606 + enum llbitmap_state state = BitNone; 607 + bool level_456 = raid_is_456(llbitmap->mddev); 608 + bool need_resync = false; 609 + bool need_recovery = false; 610 + 611 + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) 612 + return BitNone; 613 + 614 + if (action == BitmapActionInit) { 615 + llbitmap_init_state(llbitmap); 616 + return BitNone; 617 + } 618 + 619 + while (start <= end) { 620 + enum llbitmap_state c = llbitmap_read(llbitmap, start); 621 + 622 + if (c < 0 || c >= BitStateCount) { 623 + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", 624 + __func__, start, c, action); 625 + state = BitNeedSync; 626 + goto write_bitmap; 627 + } 628 + 629 + if (c == BitNeedSync) 630 + need_resync = !mddev->degraded; 631 + 632 + state = state_machine[c][action]; 633 + 634 + write_bitmap: 635 + if (unlikely(mddev->degraded)) { 636 + /* For degraded array, mark new data as need sync. */ 637 + if (state == BitDirty && 638 + action == BitmapActionStartwrite) 639 + state = BitNeedSync; 640 + /* 641 + * For degraded array, resync dirty data as well, noted 642 + * if array is still degraded after resync is done, all 643 + * new data will still be dirty until array is clean. 644 + */ 645 + else if (c == BitDirty && 646 + action == BitmapActionStartsync) 647 + state = BitSyncing; 648 + } else if (c == BitUnwritten && state == BitDirty && 649 + action == BitmapActionStartwrite && level_456) { 650 + /* Delay raid456 initial recovery to first write. 
*/ 651 + state = BitNeedSync; 652 + } 653 + 654 + if (state == BitNone) { 655 + start++; 656 + continue; 657 + } 658 + 659 + llbitmap_write(llbitmap, state, start); 660 + 661 + if (state == BitNeedSync) 662 + need_resync = !mddev->degraded; 663 + else if (state == BitDirty && 664 + !timer_pending(&llbitmap->pending_timer)) 665 + mod_timer(&llbitmap->pending_timer, 666 + jiffies + mddev->bitmap_info.daemon_sleep * HZ); 667 + 668 + start++; 669 + } 670 + 671 + if (need_resync && level_456) 672 + need_recovery = true; 673 + 674 + if (need_recovery) { 675 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 676 + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 677 + md_wakeup_thread(mddev->thread); 678 + } else if (need_resync) { 679 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 680 + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 681 + md_wakeup_thread(mddev->thread); 682 + } 683 + 684 + return state; 685 + } 686 + 687 + static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) 688 + { 689 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; 690 + 691 + retry: 692 + if (likely(percpu_ref_tryget_live(&pctl->active))) { 693 + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); 694 + return; 695 + } 696 + 697 + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); 698 + goto retry; 699 + } 700 + 701 + static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) 702 + { 703 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; 704 + 705 + percpu_ref_put(&pctl->active); 706 + } 707 + 708 + static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) 709 + { 710 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; 711 + 712 + percpu_ref_kill(&pctl->active); 713 + 714 + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), 715 + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) 716 + return -ETIMEDOUT; 717 + 718 + return 0; 719 + } 720 + 721 + static void 
llbitmap_resume(struct llbitmap *llbitmap, int page_idx) 722 + { 723 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; 724 + 725 + pctl->expire = LONG_MAX; 726 + percpu_ref_resurrect(&pctl->active); 727 + wake_up(&pctl->wait); 728 + } 729 + 730 + static int llbitmap_check_support(struct mddev *mddev) 731 + { 732 + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 733 + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", 734 + mdname(mddev)); 735 + return -EBUSY; 736 + } 737 + 738 + if (mddev->bitmap_info.space == 0) { 739 + if (mddev->bitmap_info.default_space == 0) { 740 + pr_notice("md/llbitmap: %s: no space for bitmap\n", 741 + mdname(mddev)); 742 + return -ENOSPC; 743 + } 744 + } 745 + 746 + if (!mddev->persistent) { 747 + pr_notice("md/llbitmap: %s: array must be persistent\n", 748 + mdname(mddev)); 749 + return -EOPNOTSUPP; 750 + } 751 + 752 + if (mddev->bitmap_info.file) { 753 + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", 754 + mdname(mddev)); 755 + return -EOPNOTSUPP; 756 + } 757 + 758 + if (mddev->bitmap_info.external) { 759 + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", 760 + mdname(mddev)); 761 + return -EOPNOTSUPP; 762 + } 763 + 764 + if (mddev_is_dm(mddev)) { 765 + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", 766 + mdname(mddev)); 767 + return -EOPNOTSUPP; 768 + } 769 + 770 + return 0; 771 + } 772 + 773 + static int llbitmap_init(struct llbitmap *llbitmap) 774 + { 775 + struct mddev *mddev = llbitmap->mddev; 776 + sector_t blocks = mddev->resync_max_sectors; 777 + unsigned long chunksize = MIN_CHUNK_SIZE; 778 + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); 779 + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; 780 + int ret; 781 + 782 + while (chunks > space) { 783 + chunksize = chunksize << 1; 784 + chunks = DIV_ROUND_UP(blocks, chunksize); 785 + } 786 + 787 + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; 788 + llbitmap->chunkshift = ffz(~chunksize); 
789 + llbitmap->chunksize = chunksize; 790 + llbitmap->chunks = chunks; 791 + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; 792 + 793 + ret = llbitmap_cache_pages(llbitmap); 794 + if (ret) 795 + return ret; 796 + 797 + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 798 + BitmapActionInit); 799 + /* flush initial llbitmap to disk */ 800 + __llbitmap_flush(mddev); 801 + 802 + return 0; 803 + } 804 + 805 + static int llbitmap_read_sb(struct llbitmap *llbitmap) 806 + { 807 + struct mddev *mddev = llbitmap->mddev; 808 + unsigned long daemon_sleep; 809 + unsigned long chunksize; 810 + unsigned long events; 811 + struct page *sb_page; 812 + bitmap_super_t *sb; 813 + int ret = -EINVAL; 814 + 815 + if (!mddev->bitmap_info.offset) { 816 + pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); 817 + return -EINVAL; 818 + } 819 + 820 + sb_page = llbitmap_read_page(llbitmap, 0); 821 + if (IS_ERR(sb_page)) { 822 + pr_err("md/llbitmap: %s: read super block failed", 823 + mdname(mddev)); 824 + return -EIO; 825 + } 826 + 827 + sb = kmap_local_page(sb_page); 828 + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { 829 + pr_err("md/llbitmap: %s: invalid super block magic number", 830 + mdname(mddev)); 831 + goto out_put_page; 832 + } 833 + 834 + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { 835 + pr_err("md/llbitmap: %s: invalid super block version", 836 + mdname(mddev)); 837 + goto out_put_page; 838 + } 839 + 840 + if (memcmp(sb->uuid, mddev->uuid, 16)) { 841 + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", 842 + mdname(mddev)); 843 + goto out_put_page; 844 + } 845 + 846 + if (mddev->bitmap_info.space == 0) { 847 + int room = le32_to_cpu(sb->sectors_reserved); 848 + 849 + if (room) 850 + mddev->bitmap_info.space = room; 851 + else 852 + mddev->bitmap_info.space = mddev->bitmap_info.default_space; 853 + } 854 + llbitmap->flags = le32_to_cpu(sb->state); 855 + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { 856 + ret = 
llbitmap_init(llbitmap); 857 + goto out_put_page; 858 + } 859 + 860 + chunksize = le32_to_cpu(sb->chunksize); 861 + if (!is_power_of_2(chunksize)) { 862 + pr_err("md/llbitmap: %s: chunksize not a power of 2", 863 + mdname(mddev)); 864 + goto out_put_page; 865 + } 866 + 867 + if (chunksize < DIV_ROUND_UP(mddev->resync_max_sectors, 868 + mddev->bitmap_info.space << SECTOR_SHIFT)) { 869 + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", 870 + mdname(mddev), chunksize, mddev->resync_max_sectors, 871 + mddev->bitmap_info.space); 872 + goto out_put_page; 873 + } 874 + 875 + daemon_sleep = le32_to_cpu(sb->daemon_sleep); 876 + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { 877 + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", 878 + mdname(mddev), daemon_sleep); 879 + goto out_put_page; 880 + } 881 + 882 + events = le64_to_cpu(sb->events); 883 + if (events < mddev->events) { 884 + pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", 885 + mdname(mddev), events, mddev->events); 886 + set_bit(BITMAP_STALE, &llbitmap->flags); 887 + } 888 + 889 + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); 890 + mddev->bitmap_info.chunksize = chunksize; 891 + mddev->bitmap_info.daemon_sleep = daemon_sleep; 892 + 893 + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; 894 + llbitmap->chunksize = chunksize; 895 + llbitmap->chunks = DIV_ROUND_UP(mddev->resync_max_sectors, chunksize); 896 + llbitmap->chunkshift = ffz(~chunksize); 897 + ret = llbitmap_cache_pages(llbitmap); 898 + 899 + out_put_page: 900 + __free_page(sb_page); 901 + kunmap_local(sb); 902 + return ret; 903 + } 904 + 905 + static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) 906 + { 907 + struct llbitmap *llbitmap = 908 + container_of(pending_timer, struct llbitmap, pending_timer); 909 + 910 + if (work_busy(&llbitmap->daemon_work)) { 911 + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", 912 + 
mdname(llbitmap->mddev), 913 + llbitmap->mddev->bitmap_info.daemon_sleep); 914 + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); 915 + return; 916 + } 917 + 918 + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); 919 + } 920 + 921 + static void md_llbitmap_daemon_fn(struct work_struct *work) 922 + { 923 + struct llbitmap *llbitmap = 924 + container_of(work, struct llbitmap, daemon_work); 925 + unsigned long start; 926 + unsigned long end; 927 + bool restart; 928 + int idx; 929 + 930 + if (llbitmap->mddev->degraded) 931 + return; 932 + retry: 933 + start = 0; 934 + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; 935 + restart = false; 936 + 937 + for (idx = 0; idx < llbitmap->nr_pages; idx++) { 938 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; 939 + 940 + if (idx > 0) { 941 + start = end + 1; 942 + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); 943 + } 944 + 945 + if (!test_bit(LLPageFlush, &pctl->flags) && 946 + time_before(jiffies, pctl->expire)) { 947 + restart = true; 948 + continue; 949 + } 950 + 951 + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { 952 + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", 953 + mdname(llbitmap->mddev), __func__, idx); 954 + continue; 955 + } 956 + 957 + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); 958 + llbitmap_resume(llbitmap, idx); 959 + } 960 + 961 + /* 962 + * If the daemon took a long time to finish, retry to prevent missing 963 + * clearing dirty bits. 
964 + */ 965 + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) 966 + goto retry; 967 + 968 + /* If some page is dirty but not expired, setup timer again */ 969 + if (restart) 970 + mod_timer(&llbitmap->pending_timer, 971 + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); 972 + } 973 + 974 + static int llbitmap_create(struct mddev *mddev) 975 + { 976 + struct llbitmap *llbitmap; 977 + int ret; 978 + 979 + ret = llbitmap_check_support(mddev); 980 + if (ret) 981 + return ret; 982 + 983 + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); 984 + if (!llbitmap) 985 + return -ENOMEM; 986 + 987 + llbitmap->mddev = mddev; 988 + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); 989 + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; 990 + 991 + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); 992 + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); 993 + atomic_set(&llbitmap->behind_writes, 0); 994 + init_waitqueue_head(&llbitmap->behind_wait); 995 + 996 + mutex_lock(&mddev->bitmap_info.mutex); 997 + mddev->bitmap = llbitmap; 998 + ret = llbitmap_read_sb(llbitmap); 999 + mutex_unlock(&mddev->bitmap_info.mutex); 1000 + if (ret) { 1001 + kfree(llbitmap); 1002 + mddev->bitmap = NULL; 1003 + } 1004 + 1005 + return ret; 1006 + } 1007 + 1008 + static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) 1009 + { 1010 + struct llbitmap *llbitmap = mddev->bitmap; 1011 + unsigned long chunks; 1012 + 1013 + if (chunksize == 0) 1014 + chunksize = llbitmap->chunksize; 1015 + 1016 + /* If there is enough space, leave the chunksize unchanged. 
*/ 1017 + chunks = DIV_ROUND_UP(blocks, chunksize); 1018 + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { 1019 + chunksize = chunksize << 1; 1020 + chunks = DIV_ROUND_UP(blocks, chunksize); 1021 + } 1022 + 1023 + llbitmap->chunkshift = ffz(~chunksize); 1024 + llbitmap->chunksize = chunksize; 1025 + llbitmap->chunks = chunks; 1026 + 1027 + return 0; 1028 + } 1029 + 1030 + static int llbitmap_load(struct mddev *mddev) 1031 + { 1032 + enum llbitmap_action action = BitmapActionReload; 1033 + struct llbitmap *llbitmap = mddev->bitmap; 1034 + 1035 + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) 1036 + action = BitmapActionStale; 1037 + 1038 + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); 1039 + return 0; 1040 + } 1041 + 1042 + static void llbitmap_destroy(struct mddev *mddev) 1043 + { 1044 + struct llbitmap *llbitmap = mddev->bitmap; 1045 + 1046 + if (!llbitmap) 1047 + return; 1048 + 1049 + mutex_lock(&mddev->bitmap_info.mutex); 1050 + 1051 + timer_delete_sync(&llbitmap->pending_timer); 1052 + flush_workqueue(md_llbitmap_io_wq); 1053 + flush_workqueue(md_llbitmap_unplug_wq); 1054 + 1055 + mddev->bitmap = NULL; 1056 + llbitmap_free_pages(llbitmap); 1057 + kfree(llbitmap); 1058 + mutex_unlock(&mddev->bitmap_info.mutex); 1059 + } 1060 + 1061 + static void llbitmap_start_write(struct mddev *mddev, sector_t offset, 1062 + unsigned long sectors) 1063 + { 1064 + struct llbitmap *llbitmap = mddev->bitmap; 1065 + unsigned long start = offset >> llbitmap->chunkshift; 1066 + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1067 + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1068 + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1069 + 1070 + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); 1071 + 1072 + while (page_start <= page_end) { 1073 + llbitmap_raise_barrier(llbitmap, page_start); 1074 + page_start++; 1075 + } 1076 + } 1077 + 1078 + static void 
llbitmap_end_write(struct mddev *mddev, sector_t offset, 1079 + unsigned long sectors) 1080 + { 1081 + struct llbitmap *llbitmap = mddev->bitmap; 1082 + unsigned long start = offset >> llbitmap->chunkshift; 1083 + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1084 + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1085 + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1086 + 1087 + while (page_start <= page_end) { 1088 + llbitmap_release_barrier(llbitmap, page_start); 1089 + page_start++; 1090 + } 1091 + } 1092 + 1093 + static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, 1094 + unsigned long sectors) 1095 + { 1096 + struct llbitmap *llbitmap = mddev->bitmap; 1097 + unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); 1098 + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1099 + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1100 + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1101 + 1102 + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); 1103 + 1104 + while (page_start <= page_end) { 1105 + llbitmap_raise_barrier(llbitmap, page_start); 1106 + page_start++; 1107 + } 1108 + } 1109 + 1110 + static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, 1111 + unsigned long sectors) 1112 + { 1113 + struct llbitmap *llbitmap = mddev->bitmap; 1114 + unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); 1115 + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1116 + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1117 + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1118 + 1119 + while (page_start <= page_end) { 1120 + llbitmap_release_barrier(llbitmap, page_start); 1121 + page_start++; 1122 + } 1123 + } 1124 + 1125 + static void llbitmap_unplug_fn(struct work_struct *work) 1126 + { 1127 + struct llbitmap_unplug_work *unplug_work = 1128 + container_of(work, struct 
llbitmap_unplug_work, work); 1129 + struct llbitmap *llbitmap = unplug_work->llbitmap; 1130 + struct blk_plug plug; 1131 + int i; 1132 + 1133 + blk_start_plug(&plug); 1134 + 1135 + for (i = 0; i < llbitmap->nr_pages; i++) { 1136 + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || 1137 + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) 1138 + continue; 1139 + 1140 + llbitmap_write_page(llbitmap, i); 1141 + } 1142 + 1143 + blk_finish_plug(&plug); 1144 + md_super_wait(llbitmap->mddev); 1145 + complete(unplug_work->done); 1146 + } 1147 + 1148 + static bool llbitmap_dirty(struct llbitmap *llbitmap) 1149 + { 1150 + int i; 1151 + 1152 + for (i = 0; i < llbitmap->nr_pages; i++) 1153 + if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) 1154 + return true; 1155 + 1156 + return false; 1157 + } 1158 + 1159 + static void llbitmap_unplug(struct mddev *mddev, bool sync) 1160 + { 1161 + DECLARE_COMPLETION_ONSTACK(done); 1162 + struct llbitmap *llbitmap = mddev->bitmap; 1163 + struct llbitmap_unplug_work unplug_work = { 1164 + .llbitmap = llbitmap, 1165 + .done = &done, 1166 + }; 1167 + 1168 + if (!llbitmap_dirty(llbitmap)) 1169 + return; 1170 + 1171 + /* 1172 + * Issue new bitmap IO under submit_bio() context will deadlock: 1173 + * - the bio will wait for bitmap bio to be done, before it can be 1174 + * issued; 1175 + * - bitmap bio will be added to current->bio_list and wait for this 1176 + * bio to be issued; 1177 + */ 1178 + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); 1179 + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); 1180 + wait_for_completion(&done); 1181 + destroy_work_on_stack(&unplug_work.work); 1182 + } 1183 + 1184 + /* 1185 + * Force to write all bitmap pages to disk, called when stopping the array, or 1186 + * every daemon_sleep seconds when sync_thread is running. 
1187 + */ 1188 + static void __llbitmap_flush(struct mddev *mddev) 1189 + { 1190 + struct llbitmap *llbitmap = mddev->bitmap; 1191 + struct blk_plug plug; 1192 + int i; 1193 + 1194 + blk_start_plug(&plug); 1195 + for (i = 0; i < llbitmap->nr_pages; i++) { 1196 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1197 + 1198 + /* mark all blocks as dirty */ 1199 + set_bit(LLPageDirty, &pctl->flags); 1200 + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); 1201 + llbitmap_write_page(llbitmap, i); 1202 + } 1203 + blk_finish_plug(&plug); 1204 + md_super_wait(llbitmap->mddev); 1205 + } 1206 + 1207 + static void llbitmap_flush(struct mddev *mddev) 1208 + { 1209 + struct llbitmap *llbitmap = mddev->bitmap; 1210 + int i; 1211 + 1212 + for (i = 0; i < llbitmap->nr_pages; i++) 1213 + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); 1214 + 1215 + timer_delete_sync(&llbitmap->pending_timer); 1216 + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); 1217 + flush_work(&llbitmap->daemon_work); 1218 + 1219 + __llbitmap_flush(mddev); 1220 + } 1221 + 1222 + /* This is used for raid5 lazy initial recovery */ 1223 + static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) 1224 + { 1225 + struct llbitmap *llbitmap = mddev->bitmap; 1226 + unsigned long p = offset >> llbitmap->chunkshift; 1227 + enum llbitmap_state c = llbitmap_read(llbitmap, p); 1228 + 1229 + return c == BitClean || c == BitDirty; 1230 + } 1231 + 1232 + static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) 1233 + { 1234 + struct llbitmap *llbitmap = mddev->bitmap; 1235 + unsigned long p = offset >> llbitmap->chunkshift; 1236 + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1237 + enum llbitmap_state c = llbitmap_read(llbitmap, p); 1238 + 1239 + /* always skip unwritten blocks */ 1240 + if (c == BitUnwritten) 1241 + return blocks; 1242 + 1243 + /* For degraded array, don't skip */ 1244 + if (mddev->degraded) 1245 + return 0; 1246 + 1247 + /* For 
resync also skip clean/dirty blocks */ 1248 + if ((c == BitClean || c == BitDirty) && 1249 + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 1250 + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1251 + return blocks; 1252 + 1253 + return 0; 1254 + } 1255 + 1256 + static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, 1257 + sector_t *blocks, bool degraded) 1258 + { 1259 + struct llbitmap *llbitmap = mddev->bitmap; 1260 + unsigned long p = offset >> llbitmap->chunkshift; 1261 + 1262 + /* 1263 + * Handle one bit at a time, this is much simpler. And it doesn't matter 1264 + * if md_do_sync() loop more times. 1265 + */ 1266 + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1267 + return llbitmap_state_machine(llbitmap, p, p, 1268 + BitmapActionStartsync) == BitSyncing; 1269 + } 1270 + 1271 + /* Something is wrong, sync_thread stop at @offset */ 1272 + static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, 1273 + sector_t *blocks) 1274 + { 1275 + struct llbitmap *llbitmap = mddev->bitmap; 1276 + unsigned long p = offset >> llbitmap->chunkshift; 1277 + 1278 + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1279 + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, 1280 + BitmapActionAbortsync); 1281 + } 1282 + 1283 + /* A full sync_thread is finished */ 1284 + static void llbitmap_close_sync(struct mddev *mddev) 1285 + { 1286 + struct llbitmap *llbitmap = mddev->bitmap; 1287 + int i; 1288 + 1289 + for (i = 0; i < llbitmap->nr_pages; i++) { 1290 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1291 + 1292 + /* let daemon_fn clear dirty bits immediately */ 1293 + WRITE_ONCE(pctl->expire, jiffies); 1294 + } 1295 + 1296 + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 1297 + BitmapActionEndsync); 1298 + } 1299 + 1300 + /* 1301 + * sync_thread have reached @sector, update metadata every daemon_sleep seconds, 1302 + * just in case sync_thread have to restart after power failure. 
1303 + */ 1304 + static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, 1305 + bool force) 1306 + { 1307 + struct llbitmap *llbitmap = mddev->bitmap; 1308 + 1309 + if (sector == 0) { 1310 + llbitmap->last_end_sync = jiffies; 1311 + return; 1312 + } 1313 + 1314 + if (time_before(jiffies, llbitmap->last_end_sync + 1315 + HZ * mddev->bitmap_info.daemon_sleep)) 1316 + return; 1317 + 1318 + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 1319 + 1320 + mddev->curr_resync_completed = sector; 1321 + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 1322 + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, 1323 + BitmapActionEndsync); 1324 + __llbitmap_flush(mddev); 1325 + 1326 + llbitmap->last_end_sync = jiffies; 1327 + sysfs_notify_dirent_safe(mddev->sysfs_completed); 1328 + } 1329 + 1330 + static bool llbitmap_enabled(void *data, bool flush) 1331 + { 1332 + struct llbitmap *llbitmap = data; 1333 + 1334 + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); 1335 + } 1336 + 1337 + static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, 1338 + unsigned long e) 1339 + { 1340 + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); 1341 + } 1342 + 1343 + static void llbitmap_write_sb(struct llbitmap *llbitmap) 1344 + { 1345 + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); 1346 + 1347 + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); 1348 + llbitmap_write_page(llbitmap, 0); 1349 + md_super_wait(llbitmap->mddev); 1350 + } 1351 + 1352 + static void llbitmap_update_sb(void *data) 1353 + { 1354 + struct llbitmap *llbitmap = data; 1355 + struct mddev *mddev = llbitmap->mddev; 1356 + struct page *sb_page; 1357 + bitmap_super_t *sb; 1358 + 1359 + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) 1360 + return; 1361 + 1362 + sb_page = llbitmap_read_page(llbitmap, 0); 1363 + if (IS_ERR(sb_page)) { 1364 + pr_err("%s: %s: read super block failed", __func__, 1365 + 
mdname(mddev)); 1366 + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); 1367 + return; 1368 + } 1369 + 1370 + if (mddev->events < llbitmap->events_cleared) 1371 + llbitmap->events_cleared = mddev->events; 1372 + 1373 + sb = kmap_local_page(sb_page); 1374 + sb->events = cpu_to_le64(mddev->events); 1375 + sb->state = cpu_to_le32(llbitmap->flags); 1376 + sb->chunksize = cpu_to_le32(llbitmap->chunksize); 1377 + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); 1378 + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); 1379 + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); 1380 + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); 1381 + 1382 + kunmap_local(sb); 1383 + llbitmap_write_sb(llbitmap); 1384 + } 1385 + 1386 + static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) 1387 + { 1388 + struct llbitmap *llbitmap = data; 1389 + 1390 + memset(stats, 0, sizeof(*stats)); 1391 + 1392 + stats->missing_pages = 0; 1393 + stats->pages = llbitmap->nr_pages; 1394 + stats->file_pages = llbitmap->nr_pages; 1395 + 1396 + stats->behind_writes = atomic_read(&llbitmap->behind_writes); 1397 + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); 1398 + stats->events_cleared = llbitmap->events_cleared; 1399 + 1400 + return 0; 1401 + } 1402 + 1403 + /* just flag all pages as needing to be written */ 1404 + static void llbitmap_write_all(struct mddev *mddev) 1405 + { 1406 + int i; 1407 + struct llbitmap *llbitmap = mddev->bitmap; 1408 + 1409 + for (i = 0; i < llbitmap->nr_pages; i++) { 1410 + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1411 + 1412 + set_bit(LLPageDirty, &pctl->flags); 1413 + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); 1414 + } 1415 + } 1416 + 1417 + static void llbitmap_start_behind_write(struct mddev *mddev) 1418 + { 1419 + struct llbitmap *llbitmap = mddev->bitmap; 1420 + 1421 + atomic_inc(&llbitmap->behind_writes); 1422 + } 1423 + 1424 + static void llbitmap_end_behind_write(struct mddev 
*mddev) 1425 + { 1426 + struct llbitmap *llbitmap = mddev->bitmap; 1427 + 1428 + if (atomic_dec_and_test(&llbitmap->behind_writes)) 1429 + wake_up(&llbitmap->behind_wait); 1430 + } 1431 + 1432 + static void llbitmap_wait_behind_writes(struct mddev *mddev) 1433 + { 1434 + struct llbitmap *llbitmap = mddev->bitmap; 1435 + 1436 + if (!llbitmap) 1437 + return; 1438 + 1439 + wait_event(llbitmap->behind_wait, 1440 + atomic_read(&llbitmap->behind_writes) == 0); 1441 + 1442 + } 1443 + 1444 + static ssize_t bits_show(struct mddev *mddev, char *page) 1445 + { 1446 + struct llbitmap *llbitmap; 1447 + int bits[BitStateCount] = {0}; 1448 + loff_t start = 0; 1449 + 1450 + mutex_lock(&mddev->bitmap_info.mutex); 1451 + llbitmap = mddev->bitmap; 1452 + if (!llbitmap || !llbitmap->pctl) { 1453 + mutex_unlock(&mddev->bitmap_info.mutex); 1454 + return sprintf(page, "no bitmap\n"); 1455 + } 1456 + 1457 + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { 1458 + mutex_unlock(&mddev->bitmap_info.mutex); 1459 + return sprintf(page, "bitmap io error\n"); 1460 + } 1461 + 1462 + while (start < llbitmap->chunks) { 1463 + enum llbitmap_state c = llbitmap_read(llbitmap, start); 1464 + 1465 + if (c < 0 || c >= BitStateCount) 1466 + pr_err("%s: invalid bit %llu state %d\n", 1467 + __func__, start, c); 1468 + else 1469 + bits[c]++; 1470 + start++; 1471 + } 1472 + 1473 + mutex_unlock(&mddev->bitmap_info.mutex); 1474 + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", 1475 + bits[BitUnwritten], bits[BitClean], bits[BitDirty], 1476 + bits[BitNeedSync], bits[BitSyncing]); 1477 + } 1478 + 1479 + static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); 1480 + 1481 + static ssize_t metadata_show(struct mddev *mddev, char *page) 1482 + { 1483 + struct llbitmap *llbitmap; 1484 + ssize_t ret; 1485 + 1486 + mutex_lock(&mddev->bitmap_info.mutex); 1487 + llbitmap = mddev->bitmap; 1488 + if (!llbitmap) { 1489 + mutex_unlock(&mddev->bitmap_info.mutex); 1490 + return 
sprintf(page, "no bitmap\n"); 1491 + } 1492 + 1493 + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", 1494 + llbitmap->chunksize, llbitmap->chunkshift, 1495 + llbitmap->chunks, mddev->bitmap_info.offset, 1496 + llbitmap->mddev->bitmap_info.daemon_sleep); 1497 + mutex_unlock(&mddev->bitmap_info.mutex); 1498 + 1499 + return ret; 1500 + } 1501 + 1502 + static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); 1503 + 1504 + static ssize_t 1505 + daemon_sleep_show(struct mddev *mddev, char *page) 1506 + { 1507 + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); 1508 + } 1509 + 1510 + static ssize_t 1511 + daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) 1512 + { 1513 + unsigned long timeout; 1514 + int rv = kstrtoul(buf, 10, &timeout); 1515 + 1516 + if (rv) 1517 + return rv; 1518 + 1519 + mddev->bitmap_info.daemon_sleep = timeout; 1520 + return len; 1521 + } 1522 + 1523 + static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); 1524 + 1525 + static ssize_t 1526 + barrier_idle_show(struct mddev *mddev, char *page) 1527 + { 1528 + struct llbitmap *llbitmap = mddev->bitmap; 1529 + 1530 + return sprintf(page, "%lu\n", llbitmap->barrier_idle); 1531 + } 1532 + 1533 + static ssize_t 1534 + barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) 1535 + { 1536 + struct llbitmap *llbitmap = mddev->bitmap; 1537 + unsigned long timeout; 1538 + int rv = kstrtoul(buf, 10, &timeout); 1539 + 1540 + if (rv) 1541 + return rv; 1542 + 1543 + llbitmap->barrier_idle = timeout; 1544 + return len; 1545 + } 1546 + 1547 + static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); 1548 + 1549 + static struct attribute *md_llbitmap_attrs[] = { 1550 + &llbitmap_bits.attr, 1551 + &llbitmap_metadata.attr, 1552 + &llbitmap_daemon_sleep.attr, 1553 + &llbitmap_barrier_idle.attr, 1554 + NULL 1555 + }; 1556 + 1557 + static struct attribute_group 
md_llbitmap_group = { 1558 + .name = "llbitmap", 1559 + .attrs = md_llbitmap_attrs, 1560 + }; 1561 + 1562 + static struct bitmap_operations llbitmap_ops = { 1563 + .head = { 1564 + .type = MD_BITMAP, 1565 + .id = ID_LLBITMAP, 1566 + .name = "llbitmap", 1567 + }, 1568 + 1569 + .enabled = llbitmap_enabled, 1570 + .create = llbitmap_create, 1571 + .resize = llbitmap_resize, 1572 + .load = llbitmap_load, 1573 + .destroy = llbitmap_destroy, 1574 + 1575 + .start_write = llbitmap_start_write, 1576 + .end_write = llbitmap_end_write, 1577 + .start_discard = llbitmap_start_discard, 1578 + .end_discard = llbitmap_end_discard, 1579 + .unplug = llbitmap_unplug, 1580 + .flush = llbitmap_flush, 1581 + 1582 + .start_behind_write = llbitmap_start_behind_write, 1583 + .end_behind_write = llbitmap_end_behind_write, 1584 + .wait_behind_writes = llbitmap_wait_behind_writes, 1585 + 1586 + .blocks_synced = llbitmap_blocks_synced, 1587 + .skip_sync_blocks = llbitmap_skip_sync_blocks, 1588 + .start_sync = llbitmap_start_sync, 1589 + .end_sync = llbitmap_end_sync, 1590 + .close_sync = llbitmap_close_sync, 1591 + .cond_end_sync = llbitmap_cond_end_sync, 1592 + 1593 + .update_sb = llbitmap_update_sb, 1594 + .get_stats = llbitmap_get_stats, 1595 + .dirty_bits = llbitmap_dirty_bits, 1596 + .write_all = llbitmap_write_all, 1597 + 1598 + .group = &md_llbitmap_group, 1599 + }; 1600 + 1601 + int md_llbitmap_init(void) 1602 + { 1603 + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", 1604 + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 1605 + if (!md_llbitmap_io_wq) 1606 + return -ENOMEM; 1607 + 1608 + md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", 1609 + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 1610 + if (!md_llbitmap_unplug_wq) { 1611 + destroy_workqueue(md_llbitmap_io_wq); 1612 + md_llbitmap_io_wq = NULL; 1613 + return -ENOMEM; 1614 + } 1615 + 1616 + return register_md_submodule(&llbitmap_ops.head); 1617 + } 1618 + 1619 + void md_llbitmap_exit(void) 1620 + { 1621 + 
destroy_workqueue(md_llbitmap_io_wq); 1622 + md_llbitmap_io_wq = NULL; 1623 + destroy_workqueue(md_llbitmap_unplug_wq); 1624 + md_llbitmap_unplug_wq = NULL; 1625 + unregister_md_submodule(&llbitmap_ops.head); 1626 + }
+6
drivers/md/md.c
··· 10328 10328 if (ret) 10329 10329 return ret; 10330 10330 10331 + ret = md_llbitmap_init(); 10332 + if (ret) 10333 + goto err_bitmap; 10334 + 10331 10335 ret = -ENOMEM; 10332 10336 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 10333 10337 if (!md_wq) ··· 10363 10359 err_misc_wq: 10364 10360 destroy_workqueue(md_wq); 10365 10361 err_wq: 10362 + md_llbitmap_exit(); 10363 + err_bitmap: 10366 10364 md_bitmap_exit(); 10367 10365 return ret; 10368 10366 }
+2 -2
drivers/md/md.h
··· 26 26 enum md_submodule_type { 27 27 MD_PERSONALITY = 0, 28 28 MD_CLUSTER, 29 - MD_BITMAP, /* TODO */ 29 + MD_BITMAP, 30 30 }; 31 31 32 32 enum md_submodule_id { ··· 39 39 ID_RAID10 = 10, 40 40 ID_CLUSTER, 41 41 ID_BITMAP, 42 - ID_LLBITMAP, /* TODO */ 42 + ID_LLBITMAP, 43 43 ID_BITMAP_NONE, 44 44 }; 45 45