Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'md-3.5' of git://neil.brown.name/md

Pull md updates from NeilBrown:
"It's been a busy cycle for md - lots of fun stuff here.. if you like
this kind of thing :-)

Main features:
- RAID10 arrays can be reshaped - adding and removing devices and
changing chunks (not 'far' array though)
- allow RAID5 arrays to be reshaped with a backup file (not tested
yet, but the principle works fine for RAID10).
- arrays can be reshaped while a bitmap is present - you no longer
need to remove it first
- SSSE3 support for RAID6 syndrome calculations

and of course a number of minor fixes etc."

* tag 'md-3.5' of git://neil.brown.name/md: (56 commits)
md/bitmap: record the space available for the bitmap in the superblock.
md/raid10: Remove extras after reshape to smaller number of devices.
md/raid5: improve removal of extra devices after reshape.
md: check the return of mddev_find()
MD RAID1: Further conditionalize 'fullsync'
DM RAID: Use md_error() in place of simply setting Faulty bit
DM RAID: Record and handle missing devices
DM RAID: Set recovery flags on resume
md/raid5: Allow reshape while a bitmap is present.
md/raid10: resize bitmap when required during reshape.
md: allow array to be resized while bitmap is present.
md/bitmap: make sure reshape request are reflected in superblock.
md/bitmap: add bitmap_resize function to allow bitmap resizing.
md/bitmap: use DIV_ROUND_UP instead of open-code
md/bitmap: create a 'struct bitmap_counts' substructure of 'struct bitmap'
md/bitmap: make bitmap bitops atomic.
md/bitmap: make _page_attr bitops atomic.
md/bitmap: merge bitmap_file_unmap and bitmap_file_put.
md/bitmap: remove async freeing of bitmap file.
md/bitmap: convert some spin_lock_irqsave to spin_lock_irq
...

+3128 -872
+3 -2
arch/x86/Makefile
··· 115 115 116 116 # does binutils support specific instructions? 117 117 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) 118 + avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) 118 119 119 - KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) 120 - KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) 120 + KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) 121 + KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) 121 122 122 123 LDFLAGS := -m elf_$(UTS_MACHINE) 123 124
+5 -1
arch/x86/include/asm/xor_32.h
··· 861 861 .do_5 = xor_sse_5, 862 862 }; 863 863 864 + /* Also try the AVX routines */ 865 + #include "xor_avx.h" 866 + 864 867 /* Also try the generic routines. */ 865 868 #include <asm-generic/xor.h> 866 869 ··· 874 871 xor_speed(&xor_block_8regs_p); \ 875 872 xor_speed(&xor_block_32regs); \ 876 873 xor_speed(&xor_block_32regs_p); \ 874 + AVX_XOR_SPEED; \ 877 875 if (cpu_has_xmm) \ 878 876 xor_speed(&xor_block_pIII_sse); \ 879 877 if (cpu_has_mmx) { \ ··· 887 883 We may also be able to load into the L1 only depending on how the cpu 888 884 deals with a load to a line that is being prefetched. */ 889 885 #define XOR_SELECT_TEMPLATE(FASTEST) \ 890 - (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) 886 + AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) 891 887 892 888 #endif /* _ASM_X86_XOR_32_H */
+7 -1
arch/x86/include/asm/xor_64.h
··· 347 347 .do_5 = xor_sse_5, 348 348 }; 349 349 350 + 351 + /* Also try the AVX routines */ 352 + #include "xor_avx.h" 353 + 350 354 #undef XOR_TRY_TEMPLATES 351 355 #define XOR_TRY_TEMPLATES \ 352 356 do { \ 357 + AVX_XOR_SPEED; \ 353 358 xor_speed(&xor_block_sse); \ 354 359 } while (0) 355 360 356 361 /* We force the use of the SSE xor block because it can write around L2. 357 362 We may also be able to load into the L1 only depending on how the cpu 358 363 deals with a load to a line that is being prefetched. */ 359 - #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) 364 + #define XOR_SELECT_TEMPLATE(FASTEST) \ 365 + AVX_SELECT(&xor_block_sse) 360 366 361 367 #endif /* _ASM_X86_XOR_64_H */
+214
arch/x86/include/asm/xor_avx.h
··· 1 + #ifndef _ASM_X86_XOR_AVX_H 2 + #define _ASM_X86_XOR_AVX_H 3 + 4 + /* 5 + * Optimized RAID-5 checksumming functions for AVX 6 + * 7 + * Copyright (C) 2012 Intel Corporation 8 + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> 9 + * 10 + * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines 11 + * 12 + * This program is free software; you can redistribute it and/or 13 + * modify it under the terms of the GNU General Public License 14 + * as published by the Free Software Foundation; version 2 15 + * of the License. 16 + */ 17 + 18 + #ifdef CONFIG_AS_AVX 19 + 20 + #include <linux/compiler.h> 21 + #include <asm/i387.h> 22 + 23 + #define ALIGN32 __aligned(32) 24 + 25 + #define YMM_SAVED_REGS 4 26 + 27 + #define YMMS_SAVE \ 28 + do { \ 29 + preempt_disable(); \ 30 + cr0 = read_cr0(); \ 31 + clts(); \ 32 + asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ 33 + asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ 34 + asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ 35 + asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ 36 + } while (0); 37 + 38 + #define YMMS_RESTORE \ 39 + do { \ 40 + asm volatile("sfence" : : : "memory"); \ 41 + asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ 42 + asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ 43 + asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ 44 + asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ 45 + write_cr0(cr0); \ 46 + preempt_enable(); \ 47 + } while (0); 48 + 49 + #define BLOCK4(i) \ 50 + BLOCK(32 * i, 0) \ 51 + BLOCK(32 * (i + 1), 1) \ 52 + BLOCK(32 * (i + 2), 2) \ 53 + BLOCK(32 * (i + 3), 3) 54 + 55 + #define BLOCK16() \ 56 + BLOCK4(0) \ 57 + BLOCK4(4) \ 58 + BLOCK4(8) \ 59 + BLOCK4(12) 60 + 61 + static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) 62 + { 63 + unsigned long cr0, lines = bytes >> 9; 64 + char ymm_save[32 * 
YMM_SAVED_REGS] ALIGN32; 65 + 66 + YMMS_SAVE 67 + 68 + while (lines--) { 69 + #undef BLOCK 70 + #define BLOCK(i, reg) \ 71 + do { \ 72 + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ 73 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 74 + "m" (p0[i / sizeof(*p0)])); \ 75 + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 76 + "=m" (p0[i / sizeof(*p0)])); \ 77 + } while (0); 78 + 79 + BLOCK16() 80 + 81 + p0 = (unsigned long *)((uintptr_t)p0 + 512); 82 + p1 = (unsigned long *)((uintptr_t)p1 + 512); 83 + } 84 + 85 + YMMS_RESTORE 86 + } 87 + 88 + static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, 89 + unsigned long *p2) 90 + { 91 + unsigned long cr0, lines = bytes >> 9; 92 + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; 93 + 94 + YMMS_SAVE 95 + 96 + while (lines--) { 97 + #undef BLOCK 98 + #define BLOCK(i, reg) \ 99 + do { \ 100 + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ 101 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 102 + "m" (p1[i / sizeof(*p1)])); \ 103 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 104 + "m" (p0[i / sizeof(*p0)])); \ 105 + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 106 + "=m" (p0[i / sizeof(*p0)])); \ 107 + } while (0); 108 + 109 + BLOCK16() 110 + 111 + p0 = (unsigned long *)((uintptr_t)p0 + 512); 112 + p1 = (unsigned long *)((uintptr_t)p1 + 512); 113 + p2 = (unsigned long *)((uintptr_t)p2 + 512); 114 + } 115 + 116 + YMMS_RESTORE 117 + } 118 + 119 + static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, 120 + unsigned long *p2, unsigned long *p3) 121 + { 122 + unsigned long cr0, lines = bytes >> 9; 123 + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; 124 + 125 + YMMS_SAVE 126 + 127 + while (lines--) { 128 + #undef BLOCK 129 + #define BLOCK(i, reg) \ 130 + do { \ 131 + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ 132 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 
133 + "m" (p2[i / sizeof(*p2)])); \ 134 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 135 + "m" (p1[i / sizeof(*p1)])); \ 136 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 137 + "m" (p0[i / sizeof(*p0)])); \ 138 + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 139 + "=m" (p0[i / sizeof(*p0)])); \ 140 + } while (0); 141 + 142 + BLOCK16(); 143 + 144 + p0 = (unsigned long *)((uintptr_t)p0 + 512); 145 + p1 = (unsigned long *)((uintptr_t)p1 + 512); 146 + p2 = (unsigned long *)((uintptr_t)p2 + 512); 147 + p3 = (unsigned long *)((uintptr_t)p3 + 512); 148 + } 149 + 150 + YMMS_RESTORE 151 + } 152 + 153 + static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, 154 + unsigned long *p2, unsigned long *p3, unsigned long *p4) 155 + { 156 + unsigned long cr0, lines = bytes >> 9; 157 + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; 158 + 159 + YMMS_SAVE 160 + 161 + while (lines--) { 162 + #undef BLOCK 163 + #define BLOCK(i, reg) \ 164 + do { \ 165 + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ 166 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 167 + "m" (p3[i / sizeof(*p3)])); \ 168 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 169 + "m" (p2[i / sizeof(*p2)])); \ 170 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 171 + "m" (p1[i / sizeof(*p1)])); \ 172 + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ 173 + "m" (p0[i / sizeof(*p0)])); \ 174 + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ 175 + "=m" (p0[i / sizeof(*p0)])); \ 176 + } while (0); 177 + 178 + BLOCK16() 179 + 180 + p0 = (unsigned long *)((uintptr_t)p0 + 512); 181 + p1 = (unsigned long *)((uintptr_t)p1 + 512); 182 + p2 = (unsigned long *)((uintptr_t)p2 + 512); 183 + p3 = (unsigned long *)((uintptr_t)p3 + 512); 184 + p4 = (unsigned long *)((uintptr_t)p4 + 512); 185 + } 186 + 187 + YMMS_RESTORE 188 + } 189 + 190 + static struct xor_block_template xor_block_avx = { 191 + .name = "avx", 192 + .do_2 = 
xor_avx_2, 193 + .do_3 = xor_avx_3, 194 + .do_4 = xor_avx_4, 195 + .do_5 = xor_avx_5, 196 + }; 197 + 198 + #define AVX_XOR_SPEED \ 199 + do { \ 200 + if (cpu_has_avx) \ 201 + xor_speed(&xor_block_avx); \ 202 + } while (0) 203 + 204 + #define AVX_SELECT(FASTEST) \ 205 + (cpu_has_avx ? &xor_block_avx : FASTEST) 206 + 207 + #else 208 + 209 + #define AVX_XOR_SPEED {} 210 + 211 + #define AVX_SELECT(FASTEST) (FASTEST) 212 + 213 + #endif 214 + #endif
+10 -3
crypto/xor.c
··· 21 21 #include <linux/gfp.h> 22 22 #include <linux/raid/xor.h> 23 23 #include <linux/jiffies.h> 24 + #include <linux/preempt.h> 24 25 #include <asm/xor.h> 25 26 26 27 /* The xor routines to use. */ ··· 64 63 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) 65 64 { 66 65 int speed; 67 - unsigned long now; 66 + unsigned long now, j; 68 67 int i, count, max; 69 68 70 69 tmpl->next = template_list; 71 70 template_list = tmpl; 71 + 72 + preempt_disable(); 72 73 73 74 /* 74 75 * Count the number of XORs done during a whole jiffy, and use ··· 79 76 */ 80 77 max = 0; 81 78 for (i = 0; i < 5; i++) { 82 - now = jiffies; 79 + j = jiffies; 83 80 count = 0; 84 - while (jiffies == now) { 81 + while ((now = jiffies) == j) 82 + cpu_relax(); 83 + while (time_before(jiffies, now + 1)) { 85 84 mb(); /* prevent loop optimzation */ 86 85 tmpl->do_2(BENCH_SIZE, b1, b2); 87 86 mb(); ··· 93 88 if (count > max) 94 89 max = count; 95 90 } 91 + 92 + preempt_enable(); 96 93 97 94 speed = max * (HZ * BENCH_SIZE / 1024); 98 95 tmpl->speed = speed;
+636 -472
drivers/md/bitmap.c
··· 45 45 * if we find our page, we increment the page's refcount so that it stays 46 46 * allocated while we're using it 47 47 */ 48 - static int bitmap_checkpage(struct bitmap *bitmap, 48 + static int bitmap_checkpage(struct bitmap_counts *bitmap, 49 49 unsigned long page, int create) 50 50 __releases(bitmap->lock) 51 51 __acquires(bitmap->lock) ··· 76 76 spin_lock_irq(&bitmap->lock); 77 77 78 78 if (mappage == NULL) { 79 - pr_debug("%s: bitmap map page allocation failed, hijacking\n", 80 - bmname(bitmap)); 79 + pr_debug("md/bitmap: map page allocation failed, hijacking\n"); 81 80 /* failed - set the hijacked flag so that we can use the 82 81 * pointer as a counter */ 83 82 if (!bitmap->bp[page].map) ··· 99 100 /* if page is completely empty, put it back on the free list, or dealloc it */ 100 101 /* if page was hijacked, unmark the flag so it might get alloced next time */ 101 102 /* Note: lock should be held when calling this */ 102 - static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) 103 + static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) 103 104 { 104 105 char *ptr; 105 106 ··· 129 130 */ 130 131 131 132 /* IO operations when bitmap is stored near all superblocks */ 132 - static struct page *read_sb_page(struct mddev *mddev, loff_t offset, 133 - struct page *page, 134 - unsigned long index, int size) 133 + static int read_sb_page(struct mddev *mddev, loff_t offset, 134 + struct page *page, 135 + unsigned long index, int size) 135 136 { 136 137 /* choose a good rdev and read the page from there */ 137 138 138 139 struct md_rdev *rdev; 139 140 sector_t target; 140 - int did_alloc = 0; 141 - 142 - if (!page) { 143 - page = alloc_page(GFP_KERNEL); 144 - if (!page) 145 - return ERR_PTR(-ENOMEM); 146 - did_alloc = 1; 147 - } 148 141 149 142 rdev_for_each(rdev, mddev) { 150 143 if (! 
test_bit(In_sync, &rdev->flags) ··· 149 158 roundup(size, bdev_logical_block_size(rdev->bdev)), 150 159 page, READ, true)) { 151 160 page->index = index; 152 - attach_page_buffers(page, NULL); /* so that free_buffer will 153 - * quietly no-op */ 154 - return page; 161 + return 0; 155 162 } 156 163 } 157 - if (did_alloc) 158 - put_page(page); 159 - return ERR_PTR(-EIO); 160 - 164 + return -EIO; 161 165 } 162 166 163 167 static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) ··· 194 208 struct md_rdev *rdev = NULL; 195 209 struct block_device *bdev; 196 210 struct mddev *mddev = bitmap->mddev; 211 + struct bitmap_storage *store = &bitmap->storage; 197 212 198 213 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 199 214 int size = PAGE_SIZE; ··· 202 215 203 216 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; 204 217 205 - if (page->index == bitmap->file_pages-1) 206 - size = roundup(bitmap->last_page_size, 218 + if (page->index == store->file_pages-1) { 219 + int last_page_size = store->bytes & (PAGE_SIZE-1); 220 + if (last_page_size == 0) 221 + last_page_size = PAGE_SIZE; 222 + size = roundup(last_page_size, 207 223 bdev_logical_block_size(bdev)); 224 + } 208 225 /* Just make sure we aren't corrupting data or 209 226 * metadata 210 227 */ ··· 267 276 { 268 277 struct buffer_head *bh; 269 278 270 - if (bitmap->file == NULL) { 279 + if (bitmap->storage.file == NULL) { 271 280 switch (write_sb_page(bitmap, page, wait)) { 272 281 case -EINVAL: 273 - bitmap->flags |= BITMAP_WRITE_ERROR; 282 + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); 274 283 } 275 284 } else { 276 285 ··· 288 297 wait_event(bitmap->write_wait, 289 298 atomic_read(&bitmap->pending_writes)==0); 290 299 } 291 - if (bitmap->flags & BITMAP_WRITE_ERROR) 300 + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 292 301 bitmap_file_kick(bitmap); 293 302 } 294 303 295 304 static void end_bitmap_write(struct buffer_head *bh, int uptodate) 296 305 { 297 306 struct bitmap 
*bitmap = bh->b_private; 298 - unsigned long flags; 299 307 300 - if (!uptodate) { 301 - spin_lock_irqsave(&bitmap->lock, flags); 302 - bitmap->flags |= BITMAP_WRITE_ERROR; 303 - spin_unlock_irqrestore(&bitmap->lock, flags); 304 - } 308 + if (!uptodate) 309 + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); 305 310 if (atomic_dec_and_test(&bitmap->pending_writes)) 306 311 wake_up(&bitmap->write_wait); 307 312 } ··· 312 325 } 313 326 static void free_buffers(struct page *page) 314 327 { 315 - struct buffer_head *bh = page_buffers(page); 328 + struct buffer_head *bh; 316 329 330 + if (!PagePrivate(page)) 331 + return; 332 + 333 + bh = page_buffers(page); 317 334 while (bh) { 318 335 struct buffer_head *next = bh->b_this_page; 319 336 free_buffer_head(bh); ··· 334 343 * This usage is similar to how swap files are handled, and allows us 335 344 * to write to a file with no concerns of memory allocation failing. 336 345 */ 337 - static struct page *read_page(struct file *file, unsigned long index, 338 - struct bitmap *bitmap, 339 - unsigned long count) 346 + static int read_page(struct file *file, unsigned long index, 347 + struct bitmap *bitmap, 348 + unsigned long count, 349 + struct page *page) 340 350 { 341 - struct page *page = NULL; 351 + int ret = 0; 342 352 struct inode *inode = file->f_path.dentry->d_inode; 343 353 struct buffer_head *bh; 344 354 sector_t block; ··· 347 355 pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, 348 356 (unsigned long long)index << PAGE_SHIFT); 349 357 350 - page = alloc_page(GFP_KERNEL); 351 - if (!page) 352 - page = ERR_PTR(-ENOMEM); 353 - if (IS_ERR(page)) 354 - goto out; 355 - 356 358 bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); 357 359 if (!bh) { 358 - put_page(page); 359 - page = ERR_PTR(-ENOMEM); 360 + ret = -ENOMEM; 360 361 goto out; 361 362 } 362 363 attach_page_buffers(page, bh); ··· 361 376 bh->b_blocknr = bmap(inode, block); 362 377 if (bh->b_blocknr == 0) { 363 378 /* Cannot use this file! 
*/ 364 - free_buffers(page); 365 - page = ERR_PTR(-EINVAL); 379 + ret = -EINVAL; 366 380 goto out; 367 381 } 368 382 bh->b_bdev = inode->i_sb->s_bdev; ··· 384 400 385 401 wait_event(bitmap->write_wait, 386 402 atomic_read(&bitmap->pending_writes)==0); 387 - if (bitmap->flags & BITMAP_WRITE_ERROR) { 388 - free_buffers(page); 389 - page = ERR_PTR(-EIO); 390 - } 403 + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 404 + ret = -EIO; 391 405 out: 392 - if (IS_ERR(page)) 393 - printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", 406 + if (ret) 407 + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", 394 408 (int)PAGE_SIZE, 395 409 (unsigned long long)index << PAGE_SHIFT, 396 - PTR_ERR(page)); 397 - return page; 410 + ret); 411 + return ret; 398 412 } 399 413 400 414 /* ··· 408 426 return; 409 427 if (bitmap->mddev->bitmap_info.external) 410 428 return; 411 - if (!bitmap->sb_page) /* no superblock */ 429 + if (!bitmap->storage.sb_page) /* no superblock */ 412 430 return; 413 - sb = kmap_atomic(bitmap->sb_page); 431 + sb = kmap_atomic(bitmap->storage.sb_page); 414 432 sb->events = cpu_to_le64(bitmap->mddev->events); 415 433 if (bitmap->mddev->events < bitmap->events_cleared) 416 434 /* rocking back to read-only */ ··· 420 438 /* Just in case these have been changed via sysfs: */ 421 439 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); 422 440 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); 441 + /* This might have been changed by a reshape */ 442 + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); 443 + sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); 444 + sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> 445 + bitmap_info.space); 423 446 kunmap_atomic(sb); 424 - write_page(bitmap, bitmap->sb_page, 1); 447 + write_page(bitmap, bitmap->storage.sb_page, 1); 425 448 } 426 449 427 450 /* print out the bitmap file superblock */ ··· 434 447 { 435 448 bitmap_super_t 
*sb; 436 449 437 - if (!bitmap || !bitmap->sb_page) 450 + if (!bitmap || !bitmap->storage.sb_page) 438 451 return; 439 - sb = kmap_atomic(bitmap->sb_page); 452 + sb = kmap_atomic(bitmap->storage.sb_page); 440 453 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 441 454 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 442 455 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); ··· 475 488 unsigned long chunksize, daemon_sleep, write_behind; 476 489 int err = -EINVAL; 477 490 478 - bitmap->sb_page = alloc_page(GFP_KERNEL); 479 - if (IS_ERR(bitmap->sb_page)) { 480 - err = PTR_ERR(bitmap->sb_page); 481 - bitmap->sb_page = NULL; 491 + bitmap->storage.sb_page = alloc_page(GFP_KERNEL); 492 + if (IS_ERR(bitmap->storage.sb_page)) { 493 + err = PTR_ERR(bitmap->storage.sb_page); 494 + bitmap->storage.sb_page = NULL; 482 495 return err; 483 496 } 484 - bitmap->sb_page->index = 0; 497 + bitmap->storage.sb_page->index = 0; 485 498 486 - sb = kmap_atomic(bitmap->sb_page); 499 + sb = kmap_atomic(bitmap->storage.sb_page); 487 500 488 501 sb->magic = cpu_to_le32(BITMAP_MAGIC); 489 502 sb->version = cpu_to_le32(BITMAP_MAJOR_HI); ··· 521 534 522 535 memcpy(sb->uuid, bitmap->mddev->uuid, 16); 523 536 524 - bitmap->flags |= BITMAP_STALE; 525 - sb->state |= cpu_to_le32(BITMAP_STALE); 537 + set_bit(BITMAP_STALE, &bitmap->flags); 538 + sb->state = cpu_to_le32(bitmap->flags); 526 539 bitmap->events_cleared = bitmap->mddev->events; 527 540 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 528 541 ··· 538 551 bitmap_super_t *sb; 539 552 unsigned long chunksize, daemon_sleep, write_behind; 540 553 unsigned long long events; 554 + unsigned long sectors_reserved = 0; 541 555 int err = -EINVAL; 556 + struct page *sb_page; 542 557 558 + if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { 559 + chunksize = 128 * 1024 * 1024; 560 + daemon_sleep = 5 * HZ; 561 + write_behind = 0; 562 + set_bit(BITMAP_STALE, &bitmap->flags); 563 + err = 0; 
564 + goto out_no_sb; 565 + } 543 566 /* page 0 is the superblock, read it... */ 544 - if (bitmap->file) { 545 - loff_t isize = i_size_read(bitmap->file->f_mapping->host); 567 + sb_page = alloc_page(GFP_KERNEL); 568 + if (!sb_page) 569 + return -ENOMEM; 570 + bitmap->storage.sb_page = sb_page; 571 + 572 + if (bitmap->storage.file) { 573 + loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); 546 574 int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; 547 575 548 - bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); 576 + err = read_page(bitmap->storage.file, 0, 577 + bitmap, bytes, sb_page); 549 578 } else { 550 - bitmap->sb_page = read_sb_page(bitmap->mddev, 551 - bitmap->mddev->bitmap_info.offset, 552 - NULL, 553 - 0, sizeof(bitmap_super_t)); 579 + err = read_sb_page(bitmap->mddev, 580 + bitmap->mddev->bitmap_info.offset, 581 + sb_page, 582 + 0, sizeof(bitmap_super_t)); 554 583 } 555 - if (IS_ERR(bitmap->sb_page)) { 556 - err = PTR_ERR(bitmap->sb_page); 557 - bitmap->sb_page = NULL; 584 + if (err) 558 585 return err; 559 - } 560 586 561 - sb = kmap_atomic(bitmap->sb_page); 587 + sb = kmap_atomic(sb_page); 562 588 563 589 chunksize = le32_to_cpu(sb->chunksize); 564 590 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 565 591 write_behind = le32_to_cpu(sb->write_behind); 592 + sectors_reserved = le32_to_cpu(sb->sectors_reserved); 566 593 567 594 /* verify that the bitmap-specific fields are valid */ 568 595 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) ··· 619 618 "-- forcing full recovery\n", 620 619 bmname(bitmap), events, 621 620 (unsigned long long) bitmap->mddev->events); 622 - sb->state |= cpu_to_le32(BITMAP_STALE); 621 + set_bit(BITMAP_STALE, &bitmap->flags); 623 622 } 624 623 } 625 624 626 625 /* assign fields using values from superblock */ 627 - bitmap->mddev->bitmap_info.chunksize = chunksize; 628 - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; 629 - bitmap->mddev->bitmap_info.max_write_behind = write_behind; 630 626 
bitmap->flags |= le32_to_cpu(sb->state); 631 627 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 632 - bitmap->flags |= BITMAP_HOSTENDIAN; 628 + set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); 633 629 bitmap->events_cleared = le64_to_cpu(sb->events_cleared); 634 - if (bitmap->flags & BITMAP_STALE) 635 - bitmap->events_cleared = bitmap->mddev->events; 636 630 err = 0; 637 631 out: 638 632 kunmap_atomic(sb); 633 + out_no_sb: 634 + if (test_bit(BITMAP_STALE, &bitmap->flags)) 635 + bitmap->events_cleared = bitmap->mddev->events; 636 + bitmap->mddev->bitmap_info.chunksize = chunksize; 637 + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; 638 + bitmap->mddev->bitmap_info.max_write_behind = write_behind; 639 + if (bitmap->mddev->bitmap_info.space == 0 || 640 + bitmap->mddev->bitmap_info.space > sectors_reserved) 641 + bitmap->mddev->bitmap_info.space = sectors_reserved; 639 642 if (err) 640 643 bitmap_print_sb(bitmap); 641 644 return err; 642 - } 643 - 644 - enum bitmap_mask_op { 645 - MASK_SET, 646 - MASK_UNSET 647 - }; 648 - 649 - /* record the state of the bitmap in the superblock. Return the old value */ 650 - static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, 651 - enum bitmap_mask_op op) 652 - { 653 - bitmap_super_t *sb; 654 - int old; 655 - 656 - if (!bitmap->sb_page) /* can't set the state */ 657 - return 0; 658 - sb = kmap_atomic(bitmap->sb_page); 659 - old = le32_to_cpu(sb->state) & bits; 660 - switch (op) { 661 - case MASK_SET: 662 - sb->state |= cpu_to_le32(bits); 663 - bitmap->flags |= bits; 664 - break; 665 - case MASK_UNSET: 666 - sb->state &= cpu_to_le32(~bits); 667 - bitmap->flags &= ~bits; 668 - break; 669 - default: 670 - BUG(); 671 - } 672 - kunmap_atomic(sb); 673 - return old; 674 645 } 675 646 676 647 /* ··· 656 683 * file a page at a time. There's a superblock at the start of the file. 
657 684 */ 658 685 /* calculate the index of the page that contains this bit */ 659 - static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) 686 + static inline unsigned long file_page_index(struct bitmap_storage *store, 687 + unsigned long chunk) 660 688 { 661 - if (!bitmap->mddev->bitmap_info.external) 689 + if (store->sb_page) 662 690 chunk += sizeof(bitmap_super_t) << 3; 663 691 return chunk >> PAGE_BIT_SHIFT; 664 692 } 665 693 666 694 /* calculate the (bit) offset of this bit within a page */ 667 - static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) 695 + static inline unsigned long file_page_offset(struct bitmap_storage *store, 696 + unsigned long chunk) 668 697 { 669 - if (!bitmap->mddev->bitmap_info.external) 698 + if (store->sb_page) 670 699 chunk += sizeof(bitmap_super_t) << 3; 671 700 return chunk & (PAGE_BITS - 1); 672 701 } ··· 680 705 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page 681 706 * 0 or page 1 682 707 */ 683 - static inline struct page *filemap_get_page(struct bitmap *bitmap, 708 + static inline struct page *filemap_get_page(struct bitmap_storage *store, 684 709 unsigned long chunk) 685 710 { 686 - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) 711 + if (file_page_index(store, chunk) >= store->file_pages) 687 712 return NULL; 688 - return bitmap->filemap[file_page_index(bitmap, chunk) 689 - - file_page_index(bitmap, 0)]; 713 + return store->filemap[file_page_index(store, chunk) 714 + - file_page_index(store, 0)]; 690 715 } 691 716 692 - static void bitmap_file_unmap(struct bitmap *bitmap) 717 + static int bitmap_storage_alloc(struct bitmap_storage *store, 718 + unsigned long chunks, int with_super) 719 + { 720 + int pnum; 721 + unsigned long num_pages; 722 + unsigned long bytes; 723 + 724 + bytes = DIV_ROUND_UP(chunks, 8); 725 + if (with_super) 726 + bytes += sizeof(bitmap_super_t); 727 + 728 + num_pages = DIV_ROUND_UP(bytes, 
PAGE_SIZE); 729 + 730 + store->filemap = kmalloc(sizeof(struct page *) 731 + * num_pages, GFP_KERNEL); 732 + if (!store->filemap) 733 + return -ENOMEM; 734 + 735 + if (with_super && !store->sb_page) { 736 + store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); 737 + if (store->sb_page == NULL) 738 + return -ENOMEM; 739 + store->sb_page->index = 0; 740 + } 741 + pnum = 0; 742 + if (store->sb_page) { 743 + store->filemap[0] = store->sb_page; 744 + pnum = 1; 745 + } 746 + for ( ; pnum < num_pages; pnum++) { 747 + store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); 748 + if (!store->filemap[pnum]) { 749 + store->file_pages = pnum; 750 + return -ENOMEM; 751 + } 752 + store->filemap[pnum]->index = pnum; 753 + } 754 + store->file_pages = pnum; 755 + 756 + /* We need 4 bits per page, rounded up to a multiple 757 + * of sizeof(unsigned long) */ 758 + store->filemap_attr = kzalloc( 759 + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), 760 + GFP_KERNEL); 761 + if (!store->filemap_attr) 762 + return -ENOMEM; 763 + 764 + store->bytes = bytes; 765 + 766 + return 0; 767 + } 768 + 769 + static void bitmap_file_unmap(struct bitmap_storage *store) 693 770 { 694 771 struct page **map, *sb_page; 695 - unsigned long *attr; 696 772 int pages; 697 - unsigned long flags; 773 + struct file *file; 698 774 699 - spin_lock_irqsave(&bitmap->lock, flags); 700 - map = bitmap->filemap; 701 - bitmap->filemap = NULL; 702 - attr = bitmap->filemap_attr; 703 - bitmap->filemap_attr = NULL; 704 - pages = bitmap->file_pages; 705 - bitmap->file_pages = 0; 706 - sb_page = bitmap->sb_page; 707 - bitmap->sb_page = NULL; 708 - spin_unlock_irqrestore(&bitmap->lock, flags); 775 + file = store->file; 776 + map = store->filemap; 777 + pages = store->file_pages; 778 + sb_page = store->sb_page; 709 779 710 780 while (pages--) 711 781 if (map[pages] != sb_page) /* 0 is sb_page, release it below */ 712 782 free_buffers(map[pages]); 713 783 kfree(map); 714 - kfree(attr); 784 + 
kfree(store->filemap_attr); 715 785 716 786 if (sb_page) 717 787 free_buffers(sb_page); 718 - } 719 - 720 - static void bitmap_file_put(struct bitmap *bitmap) 721 - { 722 - struct file *file; 723 - unsigned long flags; 724 - 725 - spin_lock_irqsave(&bitmap->lock, flags); 726 - file = bitmap->file; 727 - bitmap->file = NULL; 728 - spin_unlock_irqrestore(&bitmap->lock, flags); 729 - 730 - if (file) 731 - wait_event(bitmap->write_wait, 732 - atomic_read(&bitmap->pending_writes)==0); 733 - bitmap_file_unmap(bitmap); 734 788 735 789 if (file) { 736 790 struct inode *inode = file->f_path.dentry->d_inode; ··· 777 773 { 778 774 char *path, *ptr = NULL; 779 775 780 - if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { 776 + if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { 781 777 bitmap_update_sb(bitmap); 782 778 783 - if (bitmap->file) { 779 + if (bitmap->storage.file) { 784 780 path = kmalloc(PAGE_SIZE, GFP_KERNEL); 785 781 if (path) 786 - ptr = d_path(&bitmap->file->f_path, path, 787 - PAGE_SIZE); 782 + ptr = d_path(&bitmap->storage.file->f_path, 783 + path, PAGE_SIZE); 788 784 789 785 printk(KERN_ALERT 790 786 "%s: kicking failed bitmap file %s from array!\n", ··· 796 792 "%s: disabling internal bitmap due to errors\n", 797 793 bmname(bitmap)); 798 794 } 799 - 800 - bitmap_file_put(bitmap); 801 - 802 - return; 803 795 } 804 796 805 797 enum bitmap_page_attr { ··· 805 805 BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ 806 806 }; 807 807 808 - static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 809 - enum bitmap_page_attr attr) 808 + static inline void set_page_attr(struct bitmap *bitmap, int pnum, 809 + enum bitmap_page_attr attr) 810 810 { 811 - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); 811 + set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); 812 812 } 813 813 814 - static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 815 - enum bitmap_page_attr attr) 814 
+ static inline void clear_page_attr(struct bitmap *bitmap, int pnum, 815 + enum bitmap_page_attr attr) 816 816 { 817 - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); 817 + clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); 818 818 } 819 819 820 - static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 820 + static inline int test_page_attr(struct bitmap *bitmap, int pnum, 821 + enum bitmap_page_attr attr) 822 + { 823 + return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); 824 + } 825 + 826 + static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, 821 827 enum bitmap_page_attr attr) 822 828 { 823 - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); 829 + return test_and_clear_bit((pnum<<2) + attr, 830 + bitmap->storage.filemap_attr); 824 831 } 825 - 826 832 /* 827 833 * bitmap_file_set_bit -- called before performing a write to the md device 828 834 * to set (and eventually sync) a particular bit in the bitmap file ··· 841 835 unsigned long bit; 842 836 struct page *page; 843 837 void *kaddr; 844 - unsigned long chunk = block >> bitmap->chunkshift; 838 + unsigned long chunk = block >> bitmap->counts.chunkshift; 845 839 846 - if (!bitmap->filemap) 847 - return; 848 - 849 - page = filemap_get_page(bitmap, chunk); 840 + page = filemap_get_page(&bitmap->storage, chunk); 850 841 if (!page) 851 842 return; 852 - bit = file_page_offset(bitmap, chunk); 843 + bit = file_page_offset(&bitmap->storage, chunk); 853 844 854 845 /* set the bit */ 855 846 kaddr = kmap_atomic(page); 856 - if (bitmap->flags & BITMAP_HOSTENDIAN) 847 + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 857 848 set_bit(bit, kaddr); 858 849 else 859 - __set_bit_le(bit, kaddr); 850 + test_and_set_bit_le(bit, kaddr); 860 851 kunmap_atomic(kaddr); 861 852 pr_debug("set file bit %lu page %lu\n", bit, page->index); 862 853 /* record page number so it gets flushed to disk when unplug occurs */ 863 - 
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 854 + set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); 855 + } 856 + 857 + static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) 858 + { 859 + unsigned long bit; 860 + struct page *page; 861 + void *paddr; 862 + unsigned long chunk = block >> bitmap->counts.chunkshift; 863 + 864 + page = filemap_get_page(&bitmap->storage, chunk); 865 + if (!page) 866 + return; 867 + bit = file_page_offset(&bitmap->storage, chunk); 868 + paddr = kmap_atomic(page); 869 + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 870 + clear_bit(bit, paddr); 871 + else 872 + test_and_clear_bit_le(bit, paddr); 873 + kunmap_atomic(paddr); 874 + if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { 875 + set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); 876 + bitmap->allclean = 0; 877 + } 864 878 } 865 879 866 880 /* this gets called when the md device is ready to unplug its underlying ··· 888 862 * sync the dirty pages of the bitmap file to disk */ 889 863 void bitmap_unplug(struct bitmap *bitmap) 890 864 { 891 - unsigned long i, flags; 865 + unsigned long i; 892 866 int dirty, need_write; 893 - struct page *page; 894 867 int wait = 0; 895 868 896 - if (!bitmap) 869 + if (!bitmap || !bitmap->storage.filemap || 870 + test_bit(BITMAP_STALE, &bitmap->flags)) 897 871 return; 898 872 899 873 /* look at each page to see if there are any set bits that need to be 900 874 * flushed out to disk */ 901 - for (i = 0; i < bitmap->file_pages; i++) { 902 - spin_lock_irqsave(&bitmap->lock, flags); 903 - if (!bitmap->filemap) { 904 - spin_unlock_irqrestore(&bitmap->lock, flags); 875 + for (i = 0; i < bitmap->storage.file_pages; i++) { 876 + if (!bitmap->storage.filemap) 905 877 return; 878 + dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); 879 + need_write = test_and_clear_page_attr(bitmap, i, 880 + BITMAP_PAGE_NEEDWRITE); 881 + if (dirty || need_write) { 882 + clear_page_attr(bitmap, i, 
BITMAP_PAGE_PENDING); 883 + write_page(bitmap, bitmap->storage.filemap[i], 0); 906 884 } 907 - page = bitmap->filemap[i]; 908 - dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 909 - need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); 910 - clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 911 - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); 912 885 if (dirty) 913 886 wait = 1; 914 - spin_unlock_irqrestore(&bitmap->lock, flags); 915 - 916 - if (dirty || need_write) 917 - write_page(bitmap, page, 0); 918 887 } 919 888 if (wait) { /* if any writes were performed, we need to wait on them */ 920 - if (bitmap->file) 889 + if (bitmap->storage.file) 921 890 wait_event(bitmap->write_wait, 922 891 atomic_read(&bitmap->pending_writes)==0); 923 892 else 924 893 md_super_wait(bitmap->mddev); 925 894 } 926 - if (bitmap->flags & BITMAP_WRITE_ERROR) 895 + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 927 896 bitmap_file_kick(bitmap); 928 897 } 929 898 EXPORT_SYMBOL(bitmap_unplug); ··· 938 917 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) 939 918 { 940 919 unsigned long i, chunks, index, oldindex, bit; 941 - struct page *page = NULL, *oldpage = NULL; 942 - unsigned long num_pages, bit_cnt = 0; 920 + struct page *page = NULL; 921 + unsigned long bit_cnt = 0; 943 922 struct file *file; 944 - unsigned long bytes, offset; 923 + unsigned long offset; 945 924 int outofdate; 946 925 int ret = -ENOSPC; 947 926 void *paddr; 927 + struct bitmap_storage *store = &bitmap->storage; 948 928 949 - chunks = bitmap->chunks; 950 - file = bitmap->file; 929 + chunks = bitmap->counts.chunks; 930 + file = store->file; 951 931 952 - BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); 932 + if (!file && !bitmap->mddev->bitmap_info.offset) { 933 + /* No permanent bitmap - fill with '1s'. 
*/ 934 + store->filemap = NULL; 935 + store->file_pages = 0; 936 + for (i = 0; i < chunks ; i++) { 937 + /* if the disk bit is set, set the memory bit */ 938 + int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) 939 + >= start); 940 + bitmap_set_memory_bits(bitmap, 941 + (sector_t)i << bitmap->counts.chunkshift, 942 + needed); 943 + } 944 + return 0; 945 + } 953 946 954 - outofdate = bitmap->flags & BITMAP_STALE; 947 + outofdate = test_bit(BITMAP_STALE, &bitmap->flags); 955 948 if (outofdate) 956 949 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 957 950 "recovery\n", bmname(bitmap)); 958 951 959 - bytes = DIV_ROUND_UP(bitmap->chunks, 8); 960 - if (!bitmap->mddev->bitmap_info.external) 961 - bytes += sizeof(bitmap_super_t); 962 - 963 - num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); 964 - 965 - if (file && i_size_read(file->f_mapping->host) < bytes) { 952 + if (file && i_size_read(file->f_mapping->host) < store->bytes) { 966 953 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 967 - bmname(bitmap), 968 - (unsigned long) i_size_read(file->f_mapping->host), 969 - bytes); 954 + bmname(bitmap), 955 + (unsigned long) i_size_read(file->f_mapping->host), 956 + store->bytes); 970 957 goto err; 971 958 } 972 959 973 - ret = -ENOMEM; 974 - 975 - bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); 976 - if (!bitmap->filemap) 977 - goto err; 978 - 979 - /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ 980 - bitmap->filemap_attr = kzalloc( 981 - roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), 982 - GFP_KERNEL); 983 - if (!bitmap->filemap_attr) 984 - goto err; 985 - 986 960 oldindex = ~0L; 961 + offset = 0; 962 + if (!bitmap->mddev->bitmap_info.external) 963 + offset = sizeof(bitmap_super_t); 987 964 988 965 for (i = 0; i < chunks; i++) { 989 966 int b; 990 - index = file_page_index(bitmap, i); 991 - bit = file_page_offset(bitmap, i); 967 + index = file_page_index(&bitmap->storage, 
i); 968 + bit = file_page_offset(&bitmap->storage, i); 992 969 if (index != oldindex) { /* this is a new page, read it in */ 993 970 int count; 994 971 /* unmap the old page, we're done with it */ 995 - if (index == num_pages-1) 996 - count = bytes - index * PAGE_SIZE; 972 + if (index == store->file_pages-1) 973 + count = store->bytes - index * PAGE_SIZE; 997 974 else 998 975 count = PAGE_SIZE; 999 - if (index == 0 && bitmap->sb_page) { 1000 - /* 1001 - * if we're here then the superblock page 1002 - * contains some bits (PAGE_SIZE != sizeof sb) 1003 - * we've already read it in, so just use it 1004 - */ 1005 - page = bitmap->sb_page; 1006 - offset = sizeof(bitmap_super_t); 1007 - if (!file) 1008 - page = read_sb_page( 1009 - bitmap->mddev, 1010 - bitmap->mddev->bitmap_info.offset, 1011 - page, 1012 - index, count); 1013 - } else if (file) { 1014 - page = read_page(file, index, bitmap, count); 1015 - offset = 0; 1016 - } else { 1017 - page = read_sb_page(bitmap->mddev, 1018 - bitmap->mddev->bitmap_info.offset, 1019 - NULL, 1020 - index, count); 1021 - offset = 0; 1022 - } 1023 - if (IS_ERR(page)) { /* read error */ 1024 - ret = PTR_ERR(page); 976 + page = store->filemap[index]; 977 + if (file) 978 + ret = read_page(file, index, bitmap, 979 + count, page); 980 + else 981 + ret = read_sb_page( 982 + bitmap->mddev, 983 + bitmap->mddev->bitmap_info.offset, 984 + page, 985 + index, count); 986 + 987 + if (ret) 1025 988 goto err; 1026 - } 1027 989 1028 990 oldindex = index; 1029 - oldpage = page; 1030 - 1031 - bitmap->filemap[bitmap->file_pages++] = page; 1032 - bitmap->last_page_size = count; 1033 991 1034 992 if (outofdate) { 1035 993 /* ··· 1022 1022 write_page(bitmap, page, 1); 1023 1023 1024 1024 ret = -EIO; 1025 - if (bitmap->flags & BITMAP_WRITE_ERROR) 1025 + if (test_bit(BITMAP_WRITE_ERROR, 1026 + &bitmap->flags)) 1026 1027 goto err; 1027 1028 } 1028 1029 } 1029 1030 paddr = kmap_atomic(page); 1030 - if (bitmap->flags & BITMAP_HOSTENDIAN) 1031 + if 
(test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) 1031 1032 b = test_bit(bit, paddr); 1032 1033 else 1033 1034 b = test_bit_le(bit, paddr); 1034 1035 kunmap_atomic(paddr); 1035 1036 if (b) { 1036 1037 /* if the disk bit is set, set the memory bit */ 1037 - int needed = ((sector_t)(i+1) << bitmap->chunkshift 1038 + int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift 1038 1039 >= start); 1039 1040 bitmap_set_memory_bits(bitmap, 1040 - (sector_t)i << bitmap->chunkshift, 1041 + (sector_t)i << bitmap->counts.chunkshift, 1041 1042 needed); 1042 1043 bit_cnt++; 1043 1044 } 1044 - } 1045 - 1046 - /* everything went OK */ 1047 - ret = 0; 1048 - bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); 1049 - 1050 - if (bit_cnt) { /* Kick recovery if any bits were set */ 1051 - set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); 1052 - md_wakeup_thread(bitmap->mddev->thread); 1045 + offset = 0; 1053 1046 } 1054 1047 1055 1048 printk(KERN_INFO "%s: bitmap initialized from disk: " 1056 - "read %lu/%lu pages, set %lu of %lu bits\n", 1057 - bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); 1049 + "read %lu pages, set %lu of %lu bits\n", 1050 + bmname(bitmap), store->file_pages, 1051 + bit_cnt, chunks); 1058 1052 1059 1053 return 0; 1060 1054 ··· 1065 1071 */ 1066 1072 int i; 1067 1073 1068 - spin_lock_irq(&bitmap->lock); 1069 - for (i = 0; i < bitmap->file_pages; i++) 1070 - set_page_attr(bitmap, bitmap->filemap[i], 1074 + if (!bitmap || !bitmap->storage.filemap) 1075 + return; 1076 + if (bitmap->storage.file) 1077 + /* Only one copy, so nothing needed */ 1078 + return; 1079 + 1080 + for (i = 0; i < bitmap->storage.file_pages; i++) 1081 + set_page_attr(bitmap, i, 1071 1082 BITMAP_PAGE_NEEDWRITE); 1072 1083 bitmap->allclean = 0; 1073 - spin_unlock_irq(&bitmap->lock); 1074 1084 } 1075 1085 1076 - static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1086 + static void bitmap_count_page(struct bitmap_counts *bitmap, 1087 + sector_t offset, 
int inc) 1077 1088 { 1078 1089 sector_t chunk = offset >> bitmap->chunkshift; 1079 1090 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1080 1091 bitmap->bp[page].count += inc; 1081 1092 bitmap_checkfree(bitmap, page); 1082 1093 } 1083 - static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1094 + 1095 + static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) 1096 + { 1097 + sector_t chunk = offset >> bitmap->chunkshift; 1098 + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1099 + struct bitmap_page *bp = &bitmap->bp[page]; 1100 + 1101 + if (!bp->pending) 1102 + bp->pending = 1; 1103 + } 1104 + 1105 + static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, 1084 1106 sector_t offset, sector_t *blocks, 1085 1107 int create); 1086 1108 ··· 1109 1099 { 1110 1100 struct bitmap *bitmap; 1111 1101 unsigned long j; 1112 - unsigned long flags; 1113 - struct page *page = NULL, *lastpage = NULL; 1102 + unsigned long nextpage; 1114 1103 sector_t blocks; 1115 - void *paddr; 1104 + struct bitmap_counts *counts; 1116 1105 1117 1106 /* Use a mutex to guard daemon_work against 1118 1107 * bitmap_destroy. ··· 1133 1124 } 1134 1125 bitmap->allclean = 1; 1135 1126 1136 - spin_lock_irqsave(&bitmap->lock, flags); 1137 - for (j = 0; j < bitmap->chunks; j++) { 1138 - bitmap_counter_t *bmc; 1139 - if (!bitmap->filemap) 1140 - /* error or shutdown */ 1141 - break; 1127 + /* Any file-page which is PENDING now needs to be written. 1128 + * So set NEEDWRITE now, then after we make any last-minute changes 1129 + * we will write it. 
1130 + */ 1131 + for (j = 0; j < bitmap->storage.file_pages; j++) 1132 + if (test_and_clear_page_attr(bitmap, j, 1133 + BITMAP_PAGE_PENDING)) 1134 + set_page_attr(bitmap, j, 1135 + BITMAP_PAGE_NEEDWRITE); 1142 1136 1143 - page = filemap_get_page(bitmap, j); 1144 - 1145 - if (page != lastpage) { 1146 - /* skip this page unless it's marked as needing cleaning */ 1147 - if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { 1148 - int need_write = test_page_attr(bitmap, page, 1149 - BITMAP_PAGE_NEEDWRITE); 1150 - if (need_write) 1151 - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); 1152 - 1153 - spin_unlock_irqrestore(&bitmap->lock, flags); 1154 - if (need_write) 1155 - write_page(bitmap, page, 0); 1156 - spin_lock_irqsave(&bitmap->lock, flags); 1157 - j |= (PAGE_BITS - 1); 1158 - continue; 1159 - } 1160 - 1161 - /* grab the new page, sync and release the old */ 1162 - if (lastpage != NULL) { 1163 - if (test_page_attr(bitmap, lastpage, 1164 - BITMAP_PAGE_NEEDWRITE)) { 1165 - clear_page_attr(bitmap, lastpage, 1166 - BITMAP_PAGE_NEEDWRITE); 1167 - spin_unlock_irqrestore(&bitmap->lock, flags); 1168 - write_page(bitmap, lastpage, 0); 1169 - } else { 1170 - set_page_attr(bitmap, lastpage, 1171 - BITMAP_PAGE_NEEDWRITE); 1172 - bitmap->allclean = 0; 1173 - spin_unlock_irqrestore(&bitmap->lock, flags); 1174 - } 1175 - } else 1176 - spin_unlock_irqrestore(&bitmap->lock, flags); 1177 - lastpage = page; 1178 - 1179 - /* We are possibly going to clear some bits, so make 1180 - * sure that events_cleared is up-to-date. 
1181 - */ 1182 - if (bitmap->need_sync && 1183 - mddev->bitmap_info.external == 0) { 1184 - bitmap_super_t *sb; 1185 - bitmap->need_sync = 0; 1186 - sb = kmap_atomic(bitmap->sb_page); 1187 - sb->events_cleared = 1188 - cpu_to_le64(bitmap->events_cleared); 1189 - kunmap_atomic(sb); 1190 - write_page(bitmap, bitmap->sb_page, 1); 1191 - } 1192 - spin_lock_irqsave(&bitmap->lock, flags); 1193 - if (!bitmap->need_sync) 1194 - clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1195 - else 1196 - bitmap->allclean = 0; 1197 - } 1198 - bmc = bitmap_get_counter(bitmap, 1199 - (sector_t)j << bitmap->chunkshift, 1200 - &blocks, 0); 1201 - if (!bmc) 1202 - j |= PAGE_COUNTER_MASK; 1203 - else if (*bmc) { 1204 - if (*bmc == 1 && !bitmap->need_sync) { 1205 - /* we can clear the bit */ 1206 - *bmc = 0; 1207 - bitmap_count_page(bitmap, 1208 - (sector_t)j << bitmap->chunkshift, 1209 - -1); 1210 - 1211 - /* clear the bit */ 1212 - paddr = kmap_atomic(page); 1213 - if (bitmap->flags & BITMAP_HOSTENDIAN) 1214 - clear_bit(file_page_offset(bitmap, j), 1215 - paddr); 1216 - else 1217 - __clear_bit_le( 1218 - file_page_offset(bitmap, 1219 - j), 1220 - paddr); 1221 - kunmap_atomic(paddr); 1222 - } else if (*bmc <= 2) { 1223 - *bmc = 1; /* maybe clear the bit next time */ 1224 - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1225 - bitmap->allclean = 0; 1226 - } 1137 + if (bitmap->need_sync && 1138 + mddev->bitmap_info.external == 0) { 1139 + /* Arrange for superblock update as well as 1140 + * other changes */ 1141 + bitmap_super_t *sb; 1142 + bitmap->need_sync = 0; 1143 + if (bitmap->storage.filemap) { 1144 + sb = kmap_atomic(bitmap->storage.sb_page); 1145 + sb->events_cleared = 1146 + cpu_to_le64(bitmap->events_cleared); 1147 + kunmap_atomic(sb); 1148 + set_page_attr(bitmap, 0, 1149 + BITMAP_PAGE_NEEDWRITE); 1227 1150 } 1228 1151 } 1229 - spin_unlock_irqrestore(&bitmap->lock, flags); 1152 + /* Now look at the bitmap counters and if any are '2' or '1', 1153 + * decrement and handle 
accordingly. 1154 + */ 1155 + counts = &bitmap->counts; 1156 + spin_lock_irq(&counts->lock); 1157 + nextpage = 0; 1158 + for (j = 0; j < counts->chunks; j++) { 1159 + bitmap_counter_t *bmc; 1160 + sector_t block = (sector_t)j << counts->chunkshift; 1230 1161 1231 - /* now sync the final page */ 1232 - if (lastpage != NULL) { 1233 - spin_lock_irqsave(&bitmap->lock, flags); 1234 - if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1235 - clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1236 - spin_unlock_irqrestore(&bitmap->lock, flags); 1237 - write_page(bitmap, lastpage, 0); 1238 - } else { 1239 - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1162 + if (j == nextpage) { 1163 + nextpage += PAGE_COUNTER_RATIO; 1164 + if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { 1165 + j |= PAGE_COUNTER_MASK; 1166 + continue; 1167 + } 1168 + counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; 1169 + } 1170 + bmc = bitmap_get_counter(counts, 1171 + block, 1172 + &blocks, 0); 1173 + 1174 + if (!bmc) { 1175 + j |= PAGE_COUNTER_MASK; 1176 + continue; 1177 + } 1178 + if (*bmc == 1 && !bitmap->need_sync) { 1179 + /* We can clear the bit */ 1180 + *bmc = 0; 1181 + bitmap_count_page(counts, block, -1); 1182 + bitmap_file_clear_bit(bitmap, block); 1183 + } else if (*bmc && *bmc <= 2) { 1184 + *bmc = 1; 1185 + bitmap_set_pending(counts, block); 1240 1186 bitmap->allclean = 0; 1241 - spin_unlock_irqrestore(&bitmap->lock, flags); 1187 + } 1188 + } 1189 + spin_unlock_irq(&counts->lock); 1190 + 1191 + /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. 1192 + * DIRTY pages need to be written by bitmap_unplug so it can wait 1193 + * for them. 1194 + * If we find any DIRTY page we stop there and let bitmap_unplug 1195 + * handle all the rest. This is important in the case where 1196 + * the first blocking holds the superblock and it has been updated. 1197 + * We mustn't write any other blocks before the superblock. 
1198 + */ 1199 + for (j = 0; 1200 + j < bitmap->storage.file_pages 1201 + && !test_bit(BITMAP_STALE, &bitmap->flags); 1202 + j++) { 1203 + 1204 + if (test_page_attr(bitmap, j, 1205 + BITMAP_PAGE_DIRTY)) 1206 + /* bitmap_unplug will handle the rest */ 1207 + break; 1208 + if (test_and_clear_page_attr(bitmap, j, 1209 + BITMAP_PAGE_NEEDWRITE)) { 1210 + write_page(bitmap, bitmap->storage.filemap[j], 0); 1242 1211 } 1243 1212 } 1244 1213 ··· 1227 1240 mutex_unlock(&mddev->bitmap_info.mutex); 1228 1241 } 1229 1242 1230 - static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1243 + static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, 1231 1244 sector_t offset, sector_t *blocks, 1232 1245 int create) 1233 1246 __releases(bitmap->lock) ··· 1289 1302 sector_t blocks; 1290 1303 bitmap_counter_t *bmc; 1291 1304 1292 - spin_lock_irq(&bitmap->lock); 1293 - bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); 1305 + spin_lock_irq(&bitmap->counts.lock); 1306 + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); 1294 1307 if (!bmc) { 1295 - spin_unlock_irq(&bitmap->lock); 1308 + spin_unlock_irq(&bitmap->counts.lock); 1296 1309 return 0; 1297 1310 } 1298 1311 ··· 1304 1317 */ 1305 1318 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1306 1319 TASK_UNINTERRUPTIBLE); 1307 - spin_unlock_irq(&bitmap->lock); 1320 + spin_unlock_irq(&bitmap->counts.lock); 1308 1321 io_schedule(); 1309 1322 finish_wait(&bitmap->overflow_wait, &__wait); 1310 1323 continue; ··· 1313 1326 switch (*bmc) { 1314 1327 case 0: 1315 1328 bitmap_file_set_bit(bitmap, offset); 1316 - bitmap_count_page(bitmap, offset, 1); 1329 + bitmap_count_page(&bitmap->counts, offset, 1); 1317 1330 /* fall through */ 1318 1331 case 1: 1319 1332 *bmc = 2; ··· 1321 1334 1322 1335 (*bmc)++; 1323 1336 1324 - spin_unlock_irq(&bitmap->lock); 1337 + spin_unlock_irq(&bitmap->counts.lock); 1325 1338 1326 1339 offset += blocks; 1327 1340 if (sectors > blocks) ··· 1351 1364 unsigned long flags; 
1352 1365 bitmap_counter_t *bmc; 1353 1366 1354 - spin_lock_irqsave(&bitmap->lock, flags); 1355 - bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); 1367 + spin_lock_irqsave(&bitmap->counts.lock, flags); 1368 + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); 1356 1369 if (!bmc) { 1357 - spin_unlock_irqrestore(&bitmap->lock, flags); 1370 + spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1358 1371 return; 1359 1372 } 1360 1373 ··· 1373 1386 1374 1387 (*bmc)--; 1375 1388 if (*bmc <= 2) { 1376 - set_page_attr(bitmap, 1377 - filemap_get_page( 1378 - bitmap, 1379 - offset >> bitmap->chunkshift), 1380 - BITMAP_PAGE_PENDING); 1389 + bitmap_set_pending(&bitmap->counts, offset); 1381 1390 bitmap->allclean = 0; 1382 1391 } 1383 - spin_unlock_irqrestore(&bitmap->lock, flags); 1392 + spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1384 1393 offset += blocks; 1385 1394 if (sectors > blocks) 1386 1395 sectors -= blocks; ··· 1395 1412 *blocks = 1024; 1396 1413 return 1; /* always resync if no bitmap */ 1397 1414 } 1398 - spin_lock_irq(&bitmap->lock); 1399 - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); 1415 + spin_lock_irq(&bitmap->counts.lock); 1416 + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1400 1417 rv = 0; 1401 1418 if (bmc) { 1402 1419 /* locked */ ··· 1410 1427 } 1411 1428 } 1412 1429 } 1413 - spin_unlock_irq(&bitmap->lock); 1430 + spin_unlock_irq(&bitmap->counts.lock); 1414 1431 return rv; 1415 1432 } 1416 1433 ··· 1447 1464 *blocks = 1024; 1448 1465 return; 1449 1466 } 1450 - spin_lock_irqsave(&bitmap->lock, flags); 1451 - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); 1467 + spin_lock_irqsave(&bitmap->counts.lock, flags); 1468 + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1452 1469 if (bmc == NULL) 1453 1470 goto unlock; 1454 1471 /* locked */ ··· 1459 1476 *bmc |= NEEDED_MASK; 1460 1477 else { 1461 1478 if (*bmc <= 2) { 1462 - set_page_attr(bitmap, 1463 - filemap_get_page(bitmap, offset >> 
bitmap->chunkshift), 1464 - BITMAP_PAGE_PENDING); 1479 + bitmap_set_pending(&bitmap->counts, offset); 1465 1480 bitmap->allclean = 0; 1466 1481 } 1467 1482 } 1468 1483 } 1469 1484 unlock: 1470 - spin_unlock_irqrestore(&bitmap->lock, flags); 1485 + spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1471 1486 } 1472 1487 EXPORT_SYMBOL(bitmap_end_sync); 1473 1488 ··· 1505 1524 1506 1525 bitmap->mddev->curr_resync_completed = sector; 1507 1526 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1508 - sector &= ~((1ULL << bitmap->chunkshift) - 1); 1527 + sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); 1509 1528 s = 0; 1510 1529 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1511 1530 bitmap_end_sync(bitmap, s, &blocks, 0); ··· 1519 1538 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1520 1539 { 1521 1540 /* For each chunk covered by any of these sectors, set the 1522 - * counter to 1 and set resync_needed. They should all 1541 + * counter to 2 and possibly set resync_needed. They should all 1523 1542 * be 0 at this point 1524 1543 */ 1525 1544 1526 1545 sector_t secs; 1527 1546 bitmap_counter_t *bmc; 1528 - spin_lock_irq(&bitmap->lock); 1529 - bmc = bitmap_get_counter(bitmap, offset, &secs, 1); 1547 + spin_lock_irq(&bitmap->counts.lock); 1548 + bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); 1530 1549 if (!bmc) { 1531 - spin_unlock_irq(&bitmap->lock); 1550 + spin_unlock_irq(&bitmap->counts.lock); 1532 1551 return; 1533 1552 } 1534 1553 if (!*bmc) { 1535 - struct page *page; 1536 1554 *bmc = 2 | (needed ? 
NEEDED_MASK : 0); 1537 - bitmap_count_page(bitmap, offset, 1); 1538 - page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); 1539 - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1555 + bitmap_count_page(&bitmap->counts, offset, 1); 1556 + bitmap_set_pending(&bitmap->counts, offset); 1540 1557 bitmap->allclean = 0; 1541 1558 } 1542 - spin_unlock_irq(&bitmap->lock); 1559 + spin_unlock_irq(&bitmap->counts.lock); 1543 1560 } 1544 1561 1545 1562 /* dirty the memory and file bits for bitmap chunks "s" to "e" */ ··· 1546 1567 unsigned long chunk; 1547 1568 1548 1569 for (chunk = s; chunk <= e; chunk++) { 1549 - sector_t sec = (sector_t)chunk << bitmap->chunkshift; 1570 + sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; 1550 1571 bitmap_set_memory_bits(bitmap, sec, 1); 1551 - spin_lock_irq(&bitmap->lock); 1552 1572 bitmap_file_set_bit(bitmap, sec); 1553 - spin_unlock_irq(&bitmap->lock); 1554 1573 if (sec < bitmap->mddev->recovery_cp) 1555 1574 /* We are asserting that the array is dirty, 1556 1575 * so move the recovery_cp address back so ··· 1593 1616 if (!bitmap) /* there was no bitmap */ 1594 1617 return; 1595 1618 1596 - /* release the bitmap file and kill the daemon */ 1597 - bitmap_file_put(bitmap); 1619 + /* Shouldn't be needed - but just in case.... 
*/ 1620 + wait_event(bitmap->write_wait, 1621 + atomic_read(&bitmap->pending_writes) == 0); 1598 1622 1599 - bp = bitmap->bp; 1600 - pages = bitmap->pages; 1623 + /* release the bitmap file */ 1624 + bitmap_file_unmap(&bitmap->storage); 1625 + 1626 + bp = bitmap->counts.bp; 1627 + pages = bitmap->counts.pages; 1601 1628 1602 1629 /* free all allocated memory */ 1603 1630 ··· 1640 1659 { 1641 1660 struct bitmap *bitmap; 1642 1661 sector_t blocks = mddev->resync_max_sectors; 1643 - unsigned long chunks; 1644 - unsigned long pages; 1645 1662 struct file *file = mddev->bitmap_info.file; 1646 1663 int err; 1647 1664 struct sysfs_dirent *bm = NULL; 1648 1665 1649 1666 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1650 - 1651 - if (!file 1652 - && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ 1653 - return 0; 1654 1667 1655 1668 BUG_ON(file && mddev->bitmap_info.offset); 1656 1669 ··· 1652 1677 if (!bitmap) 1653 1678 return -ENOMEM; 1654 1679 1655 - spin_lock_init(&bitmap->lock); 1680 + spin_lock_init(&bitmap->counts.lock); 1656 1681 atomic_set(&bitmap->pending_writes, 0); 1657 1682 init_waitqueue_head(&bitmap->write_wait); 1658 1683 init_waitqueue_head(&bitmap->overflow_wait); ··· 1668 1693 } else 1669 1694 bitmap->sysfs_can_clear = NULL; 1670 1695 1671 - bitmap->file = file; 1696 + bitmap->storage.file = file; 1672 1697 if (file) { 1673 1698 get_file(file); 1674 1699 /* As future accesses to this file will use bmap, ··· 1699 1724 goto error; 1700 1725 1701 1726 bitmap->daemon_lastrun = jiffies; 1702 - bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) 1703 - - BITMAP_BLOCK_SHIFT); 1704 - 1705 - chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> 1706 - bitmap->chunkshift; 1707 - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1708 - 1709 - BUG_ON(!pages); 1710 - 1711 - bitmap->chunks = chunks; 1712 - bitmap->pages = pages; 1713 - bitmap->missing_pages = pages; 1714 - 1715 - bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), 
GFP_KERNEL); 1716 - 1717 - err = -ENOMEM; 1718 - if (!bitmap->bp) 1727 + err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); 1728 + if (err) 1719 1729 goto error; 1720 1730 1721 1731 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1722 - pages, bmname(bitmap)); 1732 + bitmap->counts.pages, bmname(bitmap)); 1723 1733 1724 1734 mddev->bitmap = bitmap; 1725 - 1726 - 1727 - return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; 1735 + return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; 1728 1736 1729 1737 error: 1730 1738 bitmap_free(bitmap); ··· 1748 1790 1749 1791 if (err) 1750 1792 goto out; 1793 + clear_bit(BITMAP_STALE, &bitmap->flags); 1794 + 1795 + /* Kick recovery in case any bits were set */ 1796 + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); 1751 1797 1752 1798 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1753 1799 md_wakeup_thread(mddev->thread); 1754 1800 1755 1801 bitmap_update_sb(bitmap); 1756 1802 1757 - if (bitmap->flags & BITMAP_WRITE_ERROR) 1803 + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 1758 1804 err = -EIO; 1759 1805 out: 1760 1806 return err; ··· 1768 1806 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) 1769 1807 { 1770 1808 unsigned long chunk_kb; 1771 - unsigned long flags; 1809 + struct bitmap_counts *counts; 1772 1810 1773 1811 if (!bitmap) 1774 1812 return; 1775 1813 1776 - spin_lock_irqsave(&bitmap->lock, flags); 1814 + counts = &bitmap->counts; 1815 + 1777 1816 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; 1778 1817 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 1779 1818 "%lu%s chunk", 1780 - bitmap->pages - bitmap->missing_pages, 1781 - bitmap->pages, 1782 - (bitmap->pages - bitmap->missing_pages) 1819 + counts->pages - counts->missing_pages, 1820 + counts->pages, 1821 + (counts->pages - counts->missing_pages) 1783 1822 << (PAGE_SHIFT - 10), 1784 1823 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, 1785 1824 chunk_kb ? 
"KB" : "B"); 1786 - if (bitmap->file) { 1825 + if (bitmap->storage.file) { 1787 1826 seq_printf(seq, ", file: "); 1788 - seq_path(seq, &bitmap->file->f_path, " \t\n"); 1827 + seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); 1789 1828 } 1790 1829 1791 1830 seq_printf(seq, "\n"); 1792 - spin_unlock_irqrestore(&bitmap->lock, flags); 1793 1831 } 1832 + 1833 + int bitmap_resize(struct bitmap *bitmap, sector_t blocks, 1834 + int chunksize, int init) 1835 + { 1836 + /* If chunk_size is 0, choose an appropriate chunk size. 1837 + * Then possibly allocate new storage space. 1838 + * Then quiesce, copy bits, replace bitmap, and re-start 1839 + * 1840 + * This function is called both to set up the initial bitmap 1841 + * and to resize the bitmap while the array is active. 1842 + * If this happens as a result of the array being resized, 1843 + * chunksize will be zero, and we need to choose a suitable 1844 + * chunksize, otherwise we use what we are given. 1845 + */ 1846 + struct bitmap_storage store; 1847 + struct bitmap_counts old_counts; 1848 + unsigned long chunks; 1849 + sector_t block; 1850 + sector_t old_blocks, new_blocks; 1851 + int chunkshift; 1852 + int ret = 0; 1853 + long pages; 1854 + struct bitmap_page *new_bp; 1855 + 1856 + if (chunksize == 0) { 1857 + /* If there is enough space, leave the chunk size unchanged, 1858 + * else increase by factor of two until there is enough space. 1859 + */ 1860 + long bytes; 1861 + long space = bitmap->mddev->bitmap_info.space; 1862 + 1863 + if (space == 0) { 1864 + /* We don't know how much space there is, so limit 1865 + * to current size - in sectors. 
1866 + */ 1867 + bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); 1868 + if (!bitmap->mddev->bitmap_info.external) 1869 + bytes += sizeof(bitmap_super_t); 1870 + space = DIV_ROUND_UP(bytes, 512); 1871 + bitmap->mddev->bitmap_info.space = space; 1872 + } 1873 + chunkshift = bitmap->counts.chunkshift; 1874 + chunkshift--; 1875 + do { 1876 + /* 'chunkshift' is shift from block size to chunk size */ 1877 + chunkshift++; 1878 + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); 1879 + bytes = DIV_ROUND_UP(chunks, 8); 1880 + if (!bitmap->mddev->bitmap_info.external) 1881 + bytes += sizeof(bitmap_super_t); 1882 + } while (bytes > (space << 9)); 1883 + } else 1884 + chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; 1885 + 1886 + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); 1887 + memset(&store, 0, sizeof(store)); 1888 + if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) 1889 + ret = bitmap_storage_alloc(&store, chunks, 1890 + !bitmap->mddev->bitmap_info.external); 1891 + if (ret) 1892 + goto err; 1893 + 1894 + pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); 1895 + 1896 + new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); 1897 + ret = -ENOMEM; 1898 + if (!new_bp) { 1899 + bitmap_file_unmap(&store); 1900 + goto err; 1901 + } 1902 + 1903 + if (!init) 1904 + bitmap->mddev->pers->quiesce(bitmap->mddev, 1); 1905 + 1906 + store.file = bitmap->storage.file; 1907 + bitmap->storage.file = NULL; 1908 + 1909 + if (store.sb_page && bitmap->storage.sb_page) 1910 + memcpy(page_address(store.sb_page), 1911 + page_address(bitmap->storage.sb_page), 1912 + sizeof(bitmap_super_t)); 1913 + bitmap_file_unmap(&bitmap->storage); 1914 + bitmap->storage = store; 1915 + 1916 + old_counts = bitmap->counts; 1917 + bitmap->counts.bp = new_bp; 1918 + bitmap->counts.pages = pages; 1919 + bitmap->counts.missing_pages = pages; 1920 + bitmap->counts.chunkshift = chunkshift; 1921 + bitmap->counts.chunks = chunks; 1922 + bitmap->mddev->bitmap_info.chunksize = 1 
<< (chunkshift + 1923 + BITMAP_BLOCK_SHIFT); 1924 + 1925 + blocks = min(old_counts.chunks << old_counts.chunkshift, 1926 + chunks << chunkshift); 1927 + 1928 + spin_lock_irq(&bitmap->counts.lock); 1929 + for (block = 0; block < blocks; ) { 1930 + bitmap_counter_t *bmc_old, *bmc_new; 1931 + int set; 1932 + 1933 + bmc_old = bitmap_get_counter(&old_counts, block, 1934 + &old_blocks, 0); 1935 + set = bmc_old && NEEDED(*bmc_old); 1936 + 1937 + if (set) { 1938 + bmc_new = bitmap_get_counter(&bitmap->counts, block, 1939 + &new_blocks, 1); 1940 + if (*bmc_new == 0) { 1941 + /* need to set on-disk bits too. */ 1942 + sector_t end = block + new_blocks; 1943 + sector_t start = block >> chunkshift; 1944 + start <<= chunkshift; 1945 + while (start < end) { 1946 + bitmap_file_set_bit(bitmap, block); 1947 + start += 1 << chunkshift; 1948 + } 1949 + *bmc_new = 2; 1950 + bitmap_count_page(&bitmap->counts, 1951 + block, 1); 1952 + bitmap_set_pending(&bitmap->counts, 1953 + block); 1954 + } 1955 + *bmc_new |= NEEDED_MASK; 1956 + if (new_blocks < old_blocks) 1957 + old_blocks = new_blocks; 1958 + } 1959 + block += old_blocks; 1960 + } 1961 + 1962 + if (!init) { 1963 + int i; 1964 + while (block < (chunks << chunkshift)) { 1965 + bitmap_counter_t *bmc; 1966 + bmc = bitmap_get_counter(&bitmap->counts, block, 1967 + &new_blocks, 1); 1968 + if (bmc) { 1969 + /* new space. It needs to be resynced, so 1970 + * we set NEEDED_MASK. 
1971 + */ 1972 + if (*bmc == 0) { 1973 + *bmc = NEEDED_MASK | 2; 1974 + bitmap_count_page(&bitmap->counts, 1975 + block, 1); 1976 + bitmap_set_pending(&bitmap->counts, 1977 + block); 1978 + } 1979 + } 1980 + block += new_blocks; 1981 + } 1982 + for (i = 0; i < bitmap->storage.file_pages; i++) 1983 + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); 1984 + } 1985 + spin_unlock_irq(&bitmap->counts.lock); 1986 + 1987 + if (!init) { 1988 + bitmap_unplug(bitmap); 1989 + bitmap->mddev->pers->quiesce(bitmap->mddev, 0); 1990 + } 1991 + ret = 0; 1992 + err: 1993 + return ret; 1994 + } 1995 + EXPORT_SYMBOL_GPL(bitmap_resize); 1794 1996 1795 1997 static ssize_t 1796 1998 location_show(struct mddev *mddev, char *page) ··· 2048 1922 2049 1923 static struct md_sysfs_entry bitmap_location = 2050 1924 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); 1925 + 1926 + /* 'bitmap/space' is the space available at 'location' for the 1927 + * bitmap. This allows the kernel to know when it is safe to 1928 + * resize the bitmap to match a resized array. 1929 + */ 1930 + static ssize_t 1931 + space_show(struct mddev *mddev, char *page) 1932 + { 1933 + return sprintf(page, "%lu\n", mddev->bitmap_info.space); 1934 + } 1935 + 1936 + static ssize_t 1937 + space_store(struct mddev *mddev, const char *buf, size_t len) 1938 + { 1939 + unsigned long sectors; 1940 + int rv; 1941 + 1942 + rv = kstrtoul(buf, 10, &sectors); 1943 + if (rv) 1944 + return rv; 1945 + 1946 + if (sectors == 0) 1947 + return -EINVAL; 1948 + 1949 + if (mddev->bitmap && 1950 + sectors < (mddev->bitmap->storage.bytes + 511) >> 9) 1951 + return -EFBIG; /* Bitmap is too big for this small space */ 1952 + 1953 + /* could make sure it isn't too big, but that isn't really 1954 + * needed - user-space should be careful. 
1955 + */ 1956 + mddev->bitmap_info.space = sectors; 1957 + return len; 1958 + } 1959 + 1960 + static struct md_sysfs_entry bitmap_space = 1961 + __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); 2051 1962 2052 1963 static ssize_t 2053 1964 timeout_show(struct mddev *mddev, char *page) ··· 2261 2098 2262 2099 static struct attribute *md_bitmap_attrs[] = { 2263 2100 &bitmap_location.attr, 2101 + &bitmap_space.attr, 2264 2102 &bitmap_timeout.attr, 2265 2103 &bitmap_backlog.attr, 2266 2104 &bitmap_chunksize.attr,
+39 -21
drivers/md/bitmap.h
··· 111 111 112 112 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ 113 113 enum bitmap_state { 114 - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ 115 - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ 116 - BITMAP_HOSTENDIAN = 0x8000, 114 + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 115 + BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ 116 + BITMAP_HOSTENDIAN =15, 117 117 }; 118 118 119 119 /* the superblock at the front of the bitmap file -- little endian */ ··· 128 128 __le32 chunksize; /* 52 the bitmap chunk size in bytes */ 129 129 __le32 daemon_sleep; /* 56 seconds between disk flushes */ 130 130 __le32 write_behind; /* 60 number of outstanding write-behind writes */ 131 + __le32 sectors_reserved; /* 64 number of 512-byte sectors that are 132 + * reserved for the bitmap. */ 131 133 132 - __u8 pad[256 - 64]; /* set to zero */ 134 + __u8 pad[256 - 68]; /* set to zero */ 133 135 } bitmap_super_t; 134 136 135 137 /* notes: ··· 162 160 */ 163 161 unsigned int hijacked:1; 164 162 /* 163 + * If any counter in this page is '1' or '2' - and so could be 164 + * cleared then that page is marked as 'pending' 165 + */ 166 + unsigned int pending:1; 167 + /* 165 168 * count of dirty bits on the page 166 169 */ 167 - unsigned int count:31; 170 + unsigned int count:30; 168 171 }; 169 172 170 173 /* the main bitmap structure - one per mddev */ 171 174 struct bitmap { 172 - struct bitmap_page *bp; 173 - unsigned long pages; /* total number of pages in the bitmap */ 174 - unsigned long missing_pages; /* number of pages not yet allocated */ 175 + 176 + struct bitmap_counts { 177 + spinlock_t lock; 178 + struct bitmap_page *bp; 179 + unsigned long pages; /* total number of pages 180 + * in the bitmap */ 181 + unsigned long missing_pages; /* number of pages 182 + * not yet allocated */ 183 + unsigned long chunkshift; /* chunksize = 2^chunkshift 184 + * (for bitops) */ 185 + unsigned long 
chunks; /* Total number of data 186 + * chunks for the array */ 187 + } counts; 175 188 176 189 struct mddev *mddev; /* the md device that the bitmap is for */ 177 - 178 - /* bitmap chunksize -- how much data does each bit represent? */ 179 - unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ 180 - unsigned long chunks; /* total number of data chunks for the array */ 181 190 182 191 __u64 events_cleared; 183 192 int need_sync; 184 193 185 - /* bitmap spinlock */ 186 - spinlock_t lock; 187 - 188 - struct file *file; /* backing disk file */ 189 - struct page *sb_page; /* cached copy of the bitmap file superblock */ 190 - struct page **filemap; /* list of cache pages for the file */ 191 - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ 192 - unsigned long file_pages; /* number of pages in the file */ 193 - int last_page_size; /* bytes in the last page */ 194 + struct bitmap_storage { 195 + struct file *file; /* backing disk file */ 196 + struct page *sb_page; /* cached copy of the bitmap 197 + * file superblock */ 198 + struct page **filemap; /* list of cache pages for 199 + * the file */ 200 + unsigned long *filemap_attr; /* attributes associated 201 + * w/ filemap pages */ 202 + unsigned long file_pages; /* number of pages in the file*/ 203 + unsigned long bytes; /* total bytes in the bitmap */ 204 + } storage; 194 205 195 206 unsigned long flags; 196 207 ··· 257 242 258 243 void bitmap_unplug(struct bitmap *bitmap); 259 244 void bitmap_daemon_work(struct mddev *mddev); 245 + 246 + int bitmap_resize(struct bitmap *bitmap, sector_t blocks, 247 + int chunksize, int init); 260 248 #endif 261 249 262 250 #endif
+11 -11
drivers/md/dm-raid.c
··· 155 155 for (i = 0; i < rs->md.raid_disks; i++) { 156 156 if (rs->dev[i].meta_dev) 157 157 dm_put_device(rs->ti, rs->dev[i].meta_dev); 158 - if (rs->dev[i].rdev.sb_page) 159 - put_page(rs->dev[i].rdev.sb_page); 160 - rs->dev[i].rdev.sb_page = NULL; 161 - rs->dev[i].rdev.sb_loaded = 0; 158 + md_rdev_clear(&rs->dev[i].rdev); 162 159 if (rs->dev[i].data_dev) 163 160 dm_put_device(rs->ti, rs->dev[i].data_dev); 164 161 } ··· 603 606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 604 607 DMERR("Failed to read superblock of device at position %d", 605 608 rdev->raid_disk); 606 - set_bit(Faulty, &rdev->flags); 609 + md_error(rdev->mddev, rdev); 607 610 return -EINVAL; 608 611 } 609 612 ··· 614 617 615 618 static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 616 619 { 617 - struct md_rdev *r; 620 + int i; 618 621 uint64_t failed_devices; 619 622 struct dm_raid_superblock *sb; 623 + struct raid_set *rs = container_of(mddev, struct raid_set, md); 620 624 621 625 sb = page_address(rdev->sb_page); 622 626 failed_devices = le64_to_cpu(sb->failed_devices); 623 627 624 - rdev_for_each(r, mddev) 625 - if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 626 - failed_devices |= (1ULL << r->raid_disk); 628 + for (i = 0; i < mddev->raid_disks; i++) 629 + if (!rs->dev[i].data_dev || 630 + test_bit(Faulty, &(rs->dev[i].rdev.flags))) 631 + failed_devices |= (1ULL << i); 627 632 628 633 memset(sb, 0, sizeof(*sb)); 629 634 ··· 1251 1252 { 1252 1253 struct raid_set *rs = ti->private; 1253 1254 1255 + set_bit(MD_CHANGE_DEVS, &rs->md.flags); 1254 1256 if (!rs->bitmap_loaded) { 1255 1257 bitmap_load(&rs->md); 1256 1258 rs->bitmap_loaded = 1; 1257 - } else 1258 - md_wakeup_thread(rs->md.thread); 1259 + } 1259 1260 1261 + clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); 1260 1262 mddev_resume(&rs->md); 1261 1263 } 1262 1264
+321 -49
drivers/md/md.c
··· 402 402 wake_up(&mddev->sb_wait); 403 403 mddev->pers->quiesce(mddev, 0); 404 404 405 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 405 406 md_wakeup_thread(mddev->thread); 406 407 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 407 408 } ··· 453 452 atomic_inc(&rdev->nr_pending); 454 453 atomic_inc(&rdev->nr_pending); 455 454 rcu_read_unlock(); 456 - bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); 455 + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 457 456 bi->bi_end_io = md_end_flush; 458 457 bi->bi_private = rdev; 459 458 bi->bi_bdev = rdev->bdev; ··· 608 607 init_waitqueue_head(&mddev->sb_wait); 609 608 init_waitqueue_head(&mddev->recovery_wait); 610 609 mddev->reshape_position = MaxSector; 610 + mddev->reshape_backwards = 0; 611 611 mddev->resync_min = 0; 612 612 mddev->resync_max = MaxSector; 613 613 mddev->level = LEVEL_NONE; ··· 804 802 return 0; 805 803 } 806 804 807 - static void free_disk_sb(struct md_rdev * rdev) 805 + void md_rdev_clear(struct md_rdev *rdev) 808 806 { 809 807 if (rdev->sb_page) { 810 808 put_page(rdev->sb_page); ··· 817 815 put_page(rdev->bb_page); 818 816 rdev->bb_page = NULL; 819 817 } 818 + kfree(rdev->badblocks.page); 819 + rdev->badblocks.page = NULL; 820 820 } 821 - 821 + EXPORT_SYMBOL_GPL(md_rdev_clear); 822 822 823 823 static void super_written(struct bio *bio, int error) 824 824 { ··· 891 887 rdev->meta_bdev : rdev->bdev; 892 888 if (metadata_op) 893 889 bio->bi_sector = sector + rdev->sb_start; 890 + else if (rdev->mddev->reshape_position != MaxSector && 891 + (rdev->mddev->reshape_backwards == 892 + (sector >= rdev->mddev->reshape_position))) 893 + bio->bi_sector = sector + rdev->new_data_offset; 894 894 else 895 895 bio->bi_sector = sector + rdev->data_offset; 896 896 bio_add_page(bio, page, size, 0); ··· 1042 1034 struct super_type { 1043 1035 char *name; 1044 1036 struct module *owner; 1045 - int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, 1037 + int (*load_super)(struct md_rdev 
*rdev, 1038 + struct md_rdev *refdev, 1046 1039 int minor_version); 1047 - int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); 1048 - void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 1040 + int (*validate_super)(struct mddev *mddev, 1041 + struct md_rdev *rdev); 1042 + void (*sync_super)(struct mddev *mddev, 1043 + struct md_rdev *rdev); 1049 1044 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1050 1045 sector_t num_sectors); 1046 + int (*allow_new_offset)(struct md_rdev *rdev, 1047 + unsigned long long new_offset); 1051 1048 }; 1052 1049 1053 1050 /* ··· 1124 1111 1125 1112 rdev->preferred_minor = sb->md_minor; 1126 1113 rdev->data_offset = 0; 1114 + rdev->new_data_offset = 0; 1127 1115 rdev->sb_size = MD_SB_BYTES; 1128 1116 rdev->badblocks.shift = -1; 1129 1117 ··· 1198 1184 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1199 1185 mddev->events = ev1; 1200 1186 mddev->bitmap_info.offset = 0; 1187 + mddev->bitmap_info.space = 0; 1188 + /* bitmap can use 60 K after the 4K superblocks */ 1201 1189 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1190 + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1191 + mddev->reshape_backwards = 0; 1202 1192 1203 1193 if (mddev->minor_version >= 91) { 1204 1194 mddev->reshape_position = sb->reshape_position; ··· 1210 1192 mddev->new_level = sb->new_level; 1211 1193 mddev->new_layout = sb->new_layout; 1212 1194 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1195 + if (mddev->delta_disks < 0) 1196 + mddev->reshape_backwards = 1; 1213 1197 } else { 1214 1198 mddev->reshape_position = MaxSector; 1215 1199 mddev->delta_disks = 0; ··· 1238 1218 mddev->max_disks = MD_SB_DISKS; 1239 1219 1240 1220 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1241 - mddev->bitmap_info.file == NULL) 1221 + mddev->bitmap_info.file == NULL) { 1242 1222 mddev->bitmap_info.offset = 1243 1223 mddev->bitmap_info.default_offset; 1224 + mddev->bitmap_info.space = 1225 + mddev->bitmap_info.space; 1226 + 
} 1244 1227 1245 1228 } else if (mddev->pers == NULL) { 1246 1229 /* Insist on good event counter while assembling, except ··· 1457 1434 return num_sectors; 1458 1435 } 1459 1436 1437 + static int 1438 + super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1439 + { 1440 + /* non-zero offset changes not possible with v0.90 */ 1441 + return new_offset == 0; 1442 + } 1460 1443 1461 1444 /* 1462 1445 * version 1 superblock ··· 1498 1469 struct mdp_superblock_1 *sb; 1499 1470 int ret; 1500 1471 sector_t sb_start; 1472 + sector_t sectors; 1501 1473 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1502 1474 int bmask; 1503 1475 ··· 1553 1523 bdevname(rdev->bdev,b)); 1554 1524 return -EINVAL; 1555 1525 } 1526 + if (sb->pad0 || 1527 + sb->pad3[0] || 1528 + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1529 + /* Some padding is non-zero, might be a new feature */ 1530 + return -EINVAL; 1556 1531 1557 1532 rdev->preferred_minor = 0xffff; 1558 1533 rdev->data_offset = le64_to_cpu(sb->data_offset); 1534 + rdev->new_data_offset = rdev->data_offset; 1535 + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1536 + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1537 + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1559 1538 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1560 1539 1561 1540 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; ··· 1574 1535 1575 1536 if (minor_version 1576 1537 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1538 + return -EINVAL; 1539 + if (minor_version 1540 + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1577 1541 return -EINVAL; 1578 1542 1579 1543 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) ··· 1649 1607 else 1650 1608 ret = 0; 1651 1609 } 1652 - if (minor_version) 1653 - rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 1654 - le64_to_cpu(sb->data_offset); 1655 - else 1656 - rdev->sectors = rdev->sb_start; 
1657 - if (rdev->sectors < le64_to_cpu(sb->data_size)) 1610 + if (minor_version) { 1611 + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1612 + sectors -= rdev->data_offset; 1613 + } else 1614 + sectors = rdev->sb_start; 1615 + if (sectors < le64_to_cpu(sb->data_size)) 1658 1616 return -EINVAL; 1659 1617 rdev->sectors = le64_to_cpu(sb->data_size); 1660 - if (le64_to_cpu(sb->size) > rdev->sectors) 1661 - return -EINVAL; 1662 1618 return ret; 1663 1619 } 1664 1620 ··· 1684 1644 mddev->dev_sectors = le64_to_cpu(sb->size); 1685 1645 mddev->events = ev1; 1686 1646 mddev->bitmap_info.offset = 0; 1647 + mddev->bitmap_info.space = 0; 1648 + /* Default location for bitmap is 1K after superblock 1649 + * using 3K - total of 4K 1650 + */ 1687 1651 mddev->bitmap_info.default_offset = 1024 >> 9; 1688 - 1652 + mddev->bitmap_info.default_space = (4096-1024) >> 9; 1653 + mddev->reshape_backwards = 0; 1654 + 1689 1655 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1690 1656 memcpy(mddev->uuid, sb->set_uuid, 16); 1691 1657 1692 1658 mddev->max_disks = (4096-256)/2; 1693 1659 1694 1660 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1695 - mddev->bitmap_info.file == NULL ) 1661 + mddev->bitmap_info.file == NULL) { 1696 1662 mddev->bitmap_info.offset = 1697 1663 (__s32)le32_to_cpu(sb->bitmap_offset); 1664 + /* Metadata doesn't record how much space is available. 1665 + * For 1.0, we assume we can use up to the superblock 1666 + * if before, else to 4K beyond superblock. 1667 + * For others, assume no change is possible. 
1668 + */ 1669 + if (mddev->minor_version > 0) 1670 + mddev->bitmap_info.space = 0; 1671 + else if (mddev->bitmap_info.offset > 0) 1672 + mddev->bitmap_info.space = 1673 + 8 - mddev->bitmap_info.offset; 1674 + else 1675 + mddev->bitmap_info.space = 1676 + -mddev->bitmap_info.offset; 1677 + } 1698 1678 1699 1679 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1700 1680 mddev->reshape_position = le64_to_cpu(sb->reshape_position); ··· 1722 1662 mddev->new_level = le32_to_cpu(sb->new_level); 1723 1663 mddev->new_layout = le32_to_cpu(sb->new_layout); 1724 1664 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1665 + if (mddev->delta_disks < 0 || 1666 + (mddev->delta_disks == 0 && 1667 + (le32_to_cpu(sb->feature_map) 1668 + & MD_FEATURE_RESHAPE_BACKWARDS))) 1669 + mddev->reshape_backwards = 1; 1725 1670 } else { 1726 1671 mddev->reshape_position = MaxSector; 1727 1672 mddev->delta_disks = 0; ··· 1800 1735 sb->feature_map = 0; 1801 1736 sb->pad0 = 0; 1802 1737 sb->recovery_offset = cpu_to_le64(0); 1803 - memset(sb->pad1, 0, sizeof(sb->pad1)); 1804 1738 memset(sb->pad3, 0, sizeof(sb->pad3)); 1805 1739 1806 1740 sb->utime = cpu_to_le64((__u64)mddev->utime); ··· 1821 1757 sb->devflags |= WriteMostly1; 1822 1758 else 1823 1759 sb->devflags &= ~WriteMostly1; 1760 + sb->data_offset = cpu_to_le64(rdev->data_offset); 1761 + sb->data_size = cpu_to_le64(rdev->sectors); 1824 1762 1825 1763 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1826 1764 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); ··· 1847 1781 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1848 1782 sb->new_level = cpu_to_le32(mddev->new_level); 1849 1783 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1784 + if (mddev->delta_disks == 0 && 1785 + mddev->reshape_backwards) 1786 + sb->feature_map 1787 + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 1788 + if (rdev->new_data_offset != rdev->data_offset) { 1789 + sb->feature_map 1790 + |= 
cpu_to_le32(MD_FEATURE_NEW_OFFSET); 1791 + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 1792 + - rdev->data_offset)); 1793 + } 1850 1794 } 1851 1795 1852 1796 if (rdev->badblocks.count == 0) ··· 1933 1857 sector_t max_sectors; 1934 1858 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1935 1859 return 0; /* component must fit device */ 1860 + if (rdev->data_offset != rdev->new_data_offset) 1861 + return 0; /* too confusing */ 1936 1862 if (rdev->sb_start < rdev->data_offset) { 1937 1863 /* minor versions 1 and 2; superblock before data */ 1938 1864 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; ··· 1962 1884 rdev->sb_page); 1963 1885 md_super_wait(rdev->mddev); 1964 1886 return num_sectors; 1887 + 1888 + } 1889 + 1890 + static int 1891 + super_1_allow_new_offset(struct md_rdev *rdev, 1892 + unsigned long long new_offset) 1893 + { 1894 + /* All necessary checks on new >= old have been done */ 1895 + struct bitmap *bitmap; 1896 + if (new_offset >= rdev->data_offset) 1897 + return 1; 1898 + 1899 + /* with 1.0 metadata, there is no metadata to tread on 1900 + * so we can always move back */ 1901 + if (rdev->mddev->minor_version == 0) 1902 + return 1; 1903 + 1904 + /* otherwise we must be sure not to step on 1905 + * any metadata, so stay: 1906 + * 36K beyond start of superblock 1907 + * beyond end of badblocks 1908 + * beyond write-intent bitmap 1909 + */ 1910 + if (rdev->sb_start + (32+4)*2 > new_offset) 1911 + return 0; 1912 + bitmap = rdev->mddev->bitmap; 1913 + if (bitmap && !rdev->mddev->bitmap_info.file && 1914 + rdev->sb_start + rdev->mddev->bitmap_info.offset + 1915 + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1916 + return 0; 1917 + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1918 + return 0; 1919 + 1920 + return 1; 1965 1921 } 1966 1922 1967 1923 static struct super_type super_types[] = { ··· 2006 1894 .validate_super = super_90_validate, 2007 1895 .sync_super = super_90_sync, 2008 1896 
.rdev_size_change = super_90_rdev_size_change, 1897 + .allow_new_offset = super_90_allow_new_offset, 2009 1898 }, 2010 1899 [1] = { 2011 1900 .name = "md-1", ··· 2015 1902 .validate_super = super_1_validate, 2016 1903 .sync_super = super_1_sync, 2017 1904 .rdev_size_change = super_1_rdev_size_change, 1905 + .allow_new_offset = super_1_allow_new_offset, 2018 1906 }, 2019 1907 }; 2020 1908 ··· 2219 2105 sysfs_remove_link(&rdev->kobj, "block"); 2220 2106 sysfs_put(rdev->sysfs_state); 2221 2107 rdev->sysfs_state = NULL; 2222 - kfree(rdev->badblocks.page); 2223 2108 rdev->badblocks.count = 0; 2224 - rdev->badblocks.page = NULL; 2225 2109 /* We need to delay this, otherwise we can deadlock when 2226 2110 * writing to 'remove' to "dev/state". We also need 2227 2111 * to delay it due to rcu usage. ··· 2270 2158 bdevname(rdev->bdev,b)); 2271 2159 if (rdev->mddev) 2272 2160 MD_BUG(); 2273 - free_disk_sb(rdev); 2161 + md_rdev_clear(rdev); 2274 2162 #ifndef MODULE 2275 2163 if (test_bit(AutoDetected, &rdev->flags)) 2276 2164 md_autodetect_dev(rdev->bdev->bd_dev); ··· 2921 2809 static ssize_t 2922 2810 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2923 2811 { 2924 - char *e; 2925 - unsigned long long offset = simple_strtoull(buf, &e, 10); 2926 - if (e==buf || (*e && *e != '\n')) 2812 + unsigned long long offset; 2813 + if (strict_strtoull(buf, 10, &offset) < 0) 2927 2814 return -EINVAL; 2928 2815 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2929 2816 return -EBUSY; ··· 2936 2825 2937 2826 static struct rdev_sysfs_entry rdev_offset = 2938 2827 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2828 + 2829 + static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2830 + { 2831 + return sprintf(page, "%llu\n", 2832 + (unsigned long long)rdev->new_data_offset); 2833 + } 2834 + 2835 + static ssize_t new_offset_store(struct md_rdev *rdev, 2836 + const char *buf, size_t len) 2837 + { 2838 + unsigned long long new_offset; 2839 + struct mddev *mddev 
= rdev->mddev; 2840 + 2841 + if (strict_strtoull(buf, 10, &new_offset) < 0) 2842 + return -EINVAL; 2843 + 2844 + if (mddev->sync_thread) 2845 + return -EBUSY; 2846 + if (new_offset == rdev->data_offset) 2847 + /* reset is always permitted */ 2848 + ; 2849 + else if (new_offset > rdev->data_offset) { 2850 + /* must not push array size beyond rdev_sectors */ 2851 + if (new_offset - rdev->data_offset 2852 + + mddev->dev_sectors > rdev->sectors) 2853 + return -E2BIG; 2854 + } 2855 + /* Metadata worries about other space details. */ 2856 + 2857 + /* decreasing the offset is inconsistent with a backwards 2858 + * reshape. 2859 + */ 2860 + if (new_offset < rdev->data_offset && 2861 + mddev->reshape_backwards) 2862 + return -EINVAL; 2863 + /* Increasing offset is inconsistent with forwards 2864 + * reshape. reshape_direction should be set to 2865 + * 'backwards' first. 2866 + */ 2867 + if (new_offset > rdev->data_offset && 2868 + !mddev->reshape_backwards) 2869 + return -EINVAL; 2870 + 2871 + if (mddev->pers && mddev->persistent && 2872 + !super_types[mddev->major_version] 2873 + .allow_new_offset(rdev, new_offset)) 2874 + return -E2BIG; 2875 + rdev->new_data_offset = new_offset; 2876 + if (new_offset > rdev->data_offset) 2877 + mddev->reshape_backwards = 1; 2878 + else if (new_offset < rdev->data_offset) 2879 + mddev->reshape_backwards = 0; 2880 + 2881 + return len; 2882 + } 2883 + static struct rdev_sysfs_entry rdev_new_offset = 2884 + __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 2939 2885 2940 2886 static ssize_t 2941 2887 rdev_size_show(struct md_rdev *rdev, char *page) ··· 3038 2870 3039 2871 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3040 2872 return -EINVAL; 2873 + if (rdev->data_offset != rdev->new_data_offset) 2874 + return -EINVAL; /* too confusing */ 3041 2875 if (my_mddev->pers && rdev->raid_disk >= 0) { 3042 2876 if (my_mddev->persistent) { 3043 2877 sectors = super_types[my_mddev->major_version]. 
··· 3176 3006 &rdev_errors.attr, 3177 3007 &rdev_slot.attr, 3178 3008 &rdev_offset.attr, 3009 + &rdev_new_offset.attr, 3179 3010 &rdev_size.attr, 3180 3011 &rdev_recovery_start.attr, 3181 3012 &rdev_bad_blocks.attr, ··· 3251 3080 rdev->raid_disk = -1; 3252 3081 rdev->flags = 0; 3253 3082 rdev->data_offset = 0; 3083 + rdev->new_data_offset = 0; 3254 3084 rdev->sb_events = 0; 3255 3085 rdev->last_read_error.tv_sec = 0; 3256 3086 rdev->last_read_error.tv_nsec = 0; ··· 3350 3178 abort_free: 3351 3179 if (rdev->bdev) 3352 3180 unlock_rdev(rdev); 3353 - free_disk_sb(rdev); 3354 - kfree(rdev->badblocks.page); 3181 + md_rdev_clear(rdev); 3355 3182 kfree(rdev); 3356 3183 return ERR_PTR(err); 3357 3184 } ··· 3590 3419 mddev->new_chunk_sectors = mddev->chunk_sectors; 3591 3420 mddev->raid_disks -= mddev->delta_disks; 3592 3421 mddev->delta_disks = 0; 3422 + mddev->reshape_backwards = 0; 3593 3423 module_put(pers->owner); 3594 3424 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3595 3425 mdname(mddev), clevel); ··· 3664 3492 mddev->layout = mddev->new_layout; 3665 3493 mddev->chunk_sectors = mddev->new_chunk_sectors; 3666 3494 mddev->delta_disks = 0; 3495 + mddev->reshape_backwards = 0; 3667 3496 mddev->degraded = 0; 3668 3497 if (mddev->pers->sync_request == NULL) { 3669 3498 /* this is now an array without redundancy, so ··· 3674 3501 del_timer_sync(&mddev->safemode_timer); 3675 3502 } 3676 3503 pers->run(mddev); 3677 - mddev_resume(mddev); 3678 3504 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3679 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3680 - md_wakeup_thread(mddev->thread); 3505 + mddev_resume(mddev); 3681 3506 sysfs_notify(&mddev->kobj, NULL, "level"); 3682 3507 md_new_event(mddev); 3683 3508 return rv; ··· 3753 3582 if (mddev->pers) 3754 3583 rv = update_raid_disks(mddev, n); 3755 3584 else if (mddev->reshape_position != MaxSector) { 3585 + struct md_rdev *rdev; 3756 3586 int olddisks = mddev->raid_disks - mddev->delta_disks; 3587 + 3588 + 
rdev_for_each(rdev, mddev) { 3589 + if (olddisks < n && 3590 + rdev->data_offset < rdev->new_data_offset) 3591 + return -EINVAL; 3592 + if (olddisks > n && 3593 + rdev->data_offset > rdev->new_data_offset) 3594 + return -EINVAL; 3595 + } 3757 3596 mddev->delta_disks = n - olddisks; 3758 3597 mddev->raid_disks = n; 3598 + mddev->reshape_backwards = (mddev->delta_disks < 0); 3759 3599 } else 3760 3600 mddev->raid_disks = n; 3761 3601 return rv ? rv : len; ··· 4448 4266 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4449 4267 return sprintf(page, "none\n"); 4450 4268 4451 - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4269 + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4270 + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4452 4271 max_sectors = mddev->resync_max_sectors; 4453 4272 else 4454 4273 max_sectors = mddev->dev_sectors; ··· 4611 4428 static ssize_t 4612 4429 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4613 4430 { 4431 + struct md_rdev *rdev; 4614 4432 char *e; 4615 4433 unsigned long long new = simple_strtoull(buf, &e, 10); 4616 4434 if (mddev->pers) ··· 4620 4436 return -EINVAL; 4621 4437 mddev->reshape_position = new; 4622 4438 mddev->delta_disks = 0; 4439 + mddev->reshape_backwards = 0; 4623 4440 mddev->new_level = mddev->level; 4624 4441 mddev->new_layout = mddev->layout; 4625 4442 mddev->new_chunk_sectors = mddev->chunk_sectors; 4443 + rdev_for_each(rdev, mddev) 4444 + rdev->new_data_offset = rdev->data_offset; 4626 4445 return len; 4627 4446 } 4628 4447 4629 4448 static struct md_sysfs_entry md_reshape_position = 4630 4449 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4631 4450 reshape_position_store); 4451 + 4452 + static ssize_t 4453 + reshape_direction_show(struct mddev *mddev, char *page) 4454 + { 4455 + return sprintf(page, "%s\n", 4456 + mddev->reshape_backwards ? 
"backwards" : "forwards"); 4457 + } 4458 + 4459 + static ssize_t 4460 + reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4461 + { 4462 + int backwards = 0; 4463 + if (cmd_match(buf, "forwards")) 4464 + backwards = 0; 4465 + else if (cmd_match(buf, "backwards")) 4466 + backwards = 1; 4467 + else 4468 + return -EINVAL; 4469 + if (mddev->reshape_backwards == backwards) 4470 + return len; 4471 + 4472 + /* check if we are allowed to change */ 4473 + if (mddev->delta_disks) 4474 + return -EBUSY; 4475 + 4476 + if (mddev->persistent && 4477 + mddev->major_version == 0) 4478 + return -EINVAL; 4479 + 4480 + mddev->reshape_backwards = backwards; 4481 + return len; 4482 + } 4483 + 4484 + static struct md_sysfs_entry md_reshape_direction = 4485 + __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4486 + reshape_direction_store); 4632 4487 4633 4488 static ssize_t 4634 4489 array_size_show(struct mddev *mddev, char *page) ··· 4724 4501 &md_safe_delay.attr, 4725 4502 &md_array_state.attr, 4726 4503 &md_reshape_position.attr, 4504 + &md_reshape_direction.attr, 4727 4505 &md_array_size.attr, 4728 4506 &max_corr_read_errors.attr, 4729 4507 NULL, ··· 5138 4914 err = -EINVAL; 5139 4915 mddev->pers->stop(mddev); 5140 4916 } 5141 - if (err == 0 && mddev->pers->sync_request) { 4917 + if (err == 0 && mddev->pers->sync_request && 4918 + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5142 4919 err = bitmap_create(mddev); 5143 4920 if (err) { 5144 4921 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", ··· 5289 5064 mddev->events = 0; 5290 5065 mddev->can_decrease_events = 0; 5291 5066 mddev->delta_disks = 0; 5067 + mddev->reshape_backwards = 0; 5292 5068 mddev->new_level = LEVEL_NONE; 5293 5069 mddev->new_layout = 0; 5294 5070 mddev->new_chunk_sectors = 0; ··· 5305 5079 mddev->merge_check_needed = 0; 5306 5080 mddev->bitmap_info.offset = 0; 5307 5081 mddev->bitmap_info.default_offset = 0; 5082 + mddev->bitmap_info.default_space = 0; 
5308 5083 mddev->bitmap_info.chunksize = 0; 5309 5084 mddev->bitmap_info.daemon_sleep = 0; 5310 5085 mddev->bitmap_info.max_write_behind = 0; ··· 5648 5421 goto out; 5649 5422 5650 5423 /* bitmap disabled, zero the first byte and copy out */ 5651 - if (!mddev->bitmap || !mddev->bitmap->file) { 5424 + if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5652 5425 file->pathname[0] = '\0'; 5653 5426 goto copy_out; 5654 5427 } ··· 5657 5430 if (!buf) 5658 5431 goto out; 5659 5432 5660 - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 5433 + ptr = d_path(&mddev->bitmap->storage.file->f_path, 5434 + buf, sizeof(file->pathname)); 5661 5435 if (IS_ERR(ptr)) 5662 5436 goto out; 5663 5437 ··· 6103 5875 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6104 5876 6105 5877 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 5878 + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6106 5879 mddev->bitmap_info.offset = 0; 6107 5880 6108 5881 mddev->reshape_position = MaxSector; ··· 6117 5888 mddev->new_chunk_sectors = mddev->chunk_sectors; 6118 5889 mddev->new_layout = mddev->layout; 6119 5890 mddev->delta_disks = 0; 5891 + mddev->reshape_backwards = 0; 6120 5892 6121 5893 return 0; 6122 5894 } ··· 6152 5922 */ 6153 5923 if (mddev->sync_thread) 6154 5924 return -EBUSY; 6155 - if (mddev->bitmap) 6156 - /* Sorry, cannot grow a bitmap yet, just remove it, 6157 - * grow, and re-add. 
6158 - */ 6159 - return -EBUSY; 5925 + 6160 5926 rdev_for_each(rdev, mddev) { 6161 5927 sector_t avail = rdev->sectors; 6162 5928 ··· 6170 5944 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6171 5945 { 6172 5946 int rv; 5947 + struct md_rdev *rdev; 6173 5948 /* change the number of raid disks */ 6174 5949 if (mddev->pers->check_reshape == NULL) 6175 5950 return -EINVAL; ··· 6179 5952 return -EINVAL; 6180 5953 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 6181 5954 return -EBUSY; 5955 + 5956 + rdev_for_each(rdev, mddev) { 5957 + if (mddev->raid_disks < raid_disks && 5958 + rdev->data_offset < rdev->new_data_offset) 5959 + return -EINVAL; 5960 + if (mddev->raid_disks > raid_disks && 5961 + rdev->data_offset > rdev->new_data_offset) 5962 + return -EINVAL; 5963 + } 5964 + 6182 5965 mddev->delta_disks = raid_disks - mddev->raid_disks; 5966 + if (mddev->delta_disks < 0) 5967 + mddev->reshape_backwards = 1; 5968 + else if (mddev->delta_disks > 0) 5969 + mddev->reshape_backwards = 0; 6183 5970 6184 5971 rv = mddev->pers->check_reshape(mddev); 6185 - if (rv < 0) 5972 + if (rv < 0) { 6186 5973 mddev->delta_disks = 0; 5974 + mddev->reshape_backwards = 0; 5975 + } 6187 5976 return rv; 6188 5977 } 6189 5978 ··· 6282 6039 return -EINVAL; 6283 6040 mddev->bitmap_info.offset = 6284 6041 mddev->bitmap_info.default_offset; 6042 + mddev->bitmap_info.space = 6043 + mddev->bitmap_info.default_space; 6285 6044 mddev->pers->quiesce(mddev, 1); 6286 6045 rv = bitmap_create(mddev); 6287 6046 if (!rv) ··· 6295 6050 /* remove the bitmap */ 6296 6051 if (!mddev->bitmap) 6297 6052 return -ENOENT; 6298 - if (mddev->bitmap->file) 6053 + if (mddev->bitmap->storage.file) 6299 6054 return -EINVAL; 6300 6055 mddev->pers->quiesce(mddev, 1); 6301 6056 bitmap_destroy(mddev); ··· 6618 6373 struct mddev *mddev = mddev_find(bdev->bd_dev); 6619 6374 int err; 6620 6375 6376 + if (!mddev) 6377 + return -ENODEV; 6378 + 6621 6379 if (mddev->gendisk != bdev->bd_disk) { 6622 
6380 /* we are racing with mddev_put which is discarding this 6623 6381 * bd_disk. ··· 6832 6584 6833 6585 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 6834 6586 6835 - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6587 + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6588 + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6836 6589 max_sectors = mddev->resync_max_sectors; 6837 6590 else 6838 6591 max_sectors = mddev->dev_sectors; ··· 7396 7147 j = mddev->recovery_cp; 7397 7148 7398 7149 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7399 - max_sectors = mddev->dev_sectors; 7150 + max_sectors = mddev->resync_max_sectors; 7400 7151 else { 7401 7152 /* recovery follows the physical size of devices */ 7402 7153 max_sectors = mddev->dev_sectors; ··· 7847 7598 goto unlock; 7848 7599 7849 7600 if (mddev->pers->sync_request) { 7850 - if (spares && mddev->bitmap && ! mddev->bitmap->file) { 7601 + if (spares) { 7851 7602 /* We are adding a device or devices to an array 7852 7603 * which has the bitmap stored on all devices. 7853 7604 * So make sure all bitmap pages get written ··· 7895 7646 } 7896 7647 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7897 7648 7649 + void md_finish_reshape(struct mddev *mddev) 7650 + { 7651 + /* called be personality module when reshape completes. */ 7652 + struct md_rdev *rdev; 7653 + 7654 + rdev_for_each(rdev, mddev) { 7655 + if (rdev->data_offset > rdev->new_data_offset) 7656 + rdev->sectors += rdev->data_offset - rdev->new_data_offset; 7657 + else 7658 + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 7659 + rdev->data_offset = rdev->new_data_offset; 7660 + } 7661 + } 7662 + EXPORT_SYMBOL(md_finish_reshape); 7898 7663 7899 7664 /* Bad block management. 
7900 7665 * We can record which blocks on each device are 'bad' and so just ··· 8157 7894 } 8158 7895 8159 7896 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8160 - int acknowledged) 7897 + int is_new) 8161 7898 { 8162 - int rv = md_set_badblocks(&rdev->badblocks, 8163 - s + rdev->data_offset, sectors, acknowledged); 7899 + int rv; 7900 + if (is_new) 7901 + s += rdev->new_data_offset; 7902 + else 7903 + s += rdev->data_offset; 7904 + rv = md_set_badblocks(&rdev->badblocks, 7905 + s, sectors, 0); 8164 7906 if (rv) { 8165 7907 /* Make sure they get written out promptly */ 8166 7908 sysfs_notify_dirent_safe(rdev->sysfs_state); ··· 8271 8003 return rv; 8272 8004 } 8273 8005 8274 - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) 8006 + int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8007 + int is_new) 8275 8008 { 8009 + if (is_new) 8010 + s += rdev->new_data_offset; 8011 + else 8012 + s += rdev->data_offset; 8276 8013 return md_clear_badblocks(&rdev->badblocks, 8277 - s + rdev->data_offset, 8278 - sectors); 8014 + s, sectors); 8279 8015 } 8280 8016 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8281 8017
+10 -2
drivers/md/md.h
··· 55 55 int sb_loaded; 56 56 __u64 sb_events; 57 57 sector_t data_offset; /* start of data in array */ 58 + sector_t new_data_offset;/* only relevant while reshaping */ 58 59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */ 59 60 int sb_size; /* bytes in the superblock */ 60 61 int preferred_minor; /* autorun support */ ··· 194 193 return 0; 195 194 } 196 195 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 197 - int acknowledged); 198 - extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); 196 + int is_new); 197 + extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 198 + int is_new); 199 199 extern void md_ack_all_badblocks(struct badblocks *bb); 200 200 201 201 struct mddev { ··· 264 262 sector_t reshape_position; 265 263 int delta_disks, new_level, new_layout; 266 264 int new_chunk_sectors; 265 + int reshape_backwards; 267 266 268 267 atomic_t plug_cnt; /* If device is expecting 269 268 * more bios soon. ··· 393 390 * For external metadata, offset 394 391 * from start of device. 395 392 */ 393 + unsigned long space; /* space available at this offset */ 396 394 loff_t default_offset; /* this is the offset to use when 397 395 * hot-adding a bitmap. It should 398 396 * eventually be settable by sysfs. 399 397 */ 398 + unsigned long default_space; /* space available at 399 + * default offset */ 400 400 struct mutex mutex; 401 401 unsigned long chunksize; 402 402 unsigned long daemon_sleep; /* how many jiffies between updates? 
*/ ··· 597 591 extern void md_write_end(struct mddev *mddev); 598 592 extern void md_done_sync(struct mddev *mddev, int blocks, int ok); 599 593 extern void md_error(struct mddev *mddev, struct md_rdev *rdev); 594 + extern void md_finish_reshape(struct mddev *mddev); 600 595 601 596 extern int mddev_congested(struct mddev *mddev, int bits); 602 597 extern void md_flush_request(struct mddev *mddev, struct bio *bio); ··· 622 615 extern void md_stop(struct mddev *mddev); 623 616 extern void md_stop_writes(struct mddev *mddev); 624 617 extern int md_rdev_init(struct md_rdev *rdev); 618 + extern void md_rdev_clear(struct md_rdev *rdev); 625 619 626 620 extern void mddev_suspend(struct mddev *mddev); 627 621 extern void mddev_resume(struct mddev *mddev);
+16 -6
drivers/md/raid1.c
··· 1859 1859 1860 1860 rdev = conf->mirrors[d].rdev; 1861 1861 if (rdev && 1862 - test_bit(In_sync, &rdev->flags) && 1862 + (test_bit(In_sync, &rdev->flags) || 1863 + (!test_bit(Faulty, &rdev->flags) && 1864 + rdev->recovery_offset >= sect + s)) && 1863 1865 is_badblock(rdev, sect, s, 1864 1866 &first_bad, &bad_sectors) == 0 && 1865 1867 sync_page_io(rdev, sect, s<<9, ··· 2026 2024 continue; 2027 2025 if (test_bit(BIO_UPTODATE, &bio->bi_flags) && 2028 2026 test_bit(R1BIO_MadeGood, &r1_bio->state)) { 2029 - rdev_clear_badblocks(rdev, r1_bio->sector, s); 2027 + rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); 2030 2028 } 2031 2029 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 2032 2030 test_bit(R1BIO_WriteError, &r1_bio->state)) { ··· 2046 2044 struct md_rdev *rdev = conf->mirrors[m].rdev; 2047 2045 rdev_clear_badblocks(rdev, 2048 2046 r1_bio->sector, 2049 - r1_bio->sectors); 2047 + r1_bio->sectors, 0); 2050 2048 rdev_dec_pending(rdev, conf->mddev); 2051 2049 } else if (r1_bio->bios[m] != NULL) { 2052 2050 /* This drive got a write error. We need to ··· 2600 2598 if (!disk->rdev || 2601 2599 !test_bit(In_sync, &disk->rdev->flags)) { 2602 2600 disk->head_position = 0; 2603 - if (disk->rdev) 2601 + if (disk->rdev && 2602 + (disk->rdev->saved_raid_disk < 0)) 2604 2603 conf->fullsync = 1; 2605 2604 } else if (conf->last_used < 0) 2606 2605 /* ··· 2753 2750 * any io in the removed space completes, but it hardly seems 2754 2751 * worth it. 
2755 2752 */ 2756 - md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); 2757 - if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2753 + sector_t newsize = raid1_size(mddev, sectors, 0); 2754 + if (mddev->external_size && 2755 + mddev->array_sectors > newsize) 2758 2756 return -EINVAL; 2757 + if (mddev->bitmap) { 2758 + int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); 2759 + if (ret) 2760 + return ret; 2761 + } 2762 + md_set_array_sectors(mddev, newsize); 2759 2763 set_capacity(mddev->gendisk, mddev->array_sectors); 2760 2764 revalidate_disk(mddev->gendisk); 2761 2765 if (sectors > mddev->dev_sectors &&
+1128 -153
drivers/md/raid10.c
··· 24 24 #include <linux/module.h> 25 25 #include <linux/seq_file.h> 26 26 #include <linux/ratelimit.h> 27 + #include <linux/kthread.h> 27 28 #include "md.h" 28 29 #include "raid10.h" 29 30 #include "raid0.h" ··· 69 68 static void allow_barrier(struct r10conf *conf); 70 69 static void lower_barrier(struct r10conf *conf); 71 70 static int enough(struct r10conf *conf, int ignore); 71 + static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 72 + int *skipped); 73 + static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 74 + static void end_reshape_write(struct bio *bio, int error); 75 + static void end_reshape(struct r10conf *conf); 72 76 73 77 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 74 78 { ··· 118 112 if (!r10_bio) 119 113 return NULL; 120 114 121 - if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 115 + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 116 + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 122 117 nalloc = conf->copies; /* resync */ 123 118 else 124 119 nalloc = 2; /* recovery */ ··· 147 140 struct bio *rbio = r10_bio->devs[j].repl_bio; 148 141 bio = r10_bio->devs[j].bio; 149 142 for (i = 0; i < RESYNC_PAGES; i++) { 150 - if (j == 1 && !test_bit(MD_RECOVERY_SYNC, 151 - &conf->mddev->recovery)) { 152 - /* we can share bv_page's during recovery */ 143 + if (j > 0 && !test_bit(MD_RECOVERY_SYNC, 144 + &conf->mddev->recovery)) { 145 + /* we can share bv_page's during recovery 146 + * and reshape */ 153 147 struct bio *rbio = r10_bio->devs[0].bio; 154 148 page = rbio->bi_io_vec[i].bv_page; 155 149 get_page(page); ··· 173 165 while (j--) 174 166 for (i = 0; i < RESYNC_PAGES ; i++) 175 167 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 176 - j = -1; 168 + j = 0; 177 169 out_free_bio: 178 - while (++j < nalloc) { 179 - bio_put(r10_bio->devs[j].bio); 170 + for ( ; j < nalloc; j++) { 171 + if (r10_bio->devs[j].bio) 172 + bio_put(r10_bio->devs[j].bio); 180 173 if 
(r10_bio->devs[j].repl_bio) 181 174 bio_put(r10_bio->devs[j].repl_bio); 182 175 } ··· 513 504 * sector offset to a virtual address 514 505 */ 515 506 516 - static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 507 + static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 517 508 { 518 509 int n,f; 519 510 sector_t sector; 520 511 sector_t chunk; 521 512 sector_t stripe; 522 513 int dev; 523 - 524 514 int slot = 0; 525 515 526 516 /* now calculate first sector/dev */ 527 - chunk = r10bio->sector >> conf->chunk_shift; 528 - sector = r10bio->sector & conf->chunk_mask; 517 + chunk = r10bio->sector >> geo->chunk_shift; 518 + sector = r10bio->sector & geo->chunk_mask; 529 519 530 - chunk *= conf->near_copies; 520 + chunk *= geo->near_copies; 531 521 stripe = chunk; 532 - dev = sector_div(stripe, conf->raid_disks); 533 - if (conf->far_offset) 534 - stripe *= conf->far_copies; 522 + dev = sector_div(stripe, geo->raid_disks); 523 + if (geo->far_offset) 524 + stripe *= geo->far_copies; 535 525 536 - sector += stripe << conf->chunk_shift; 526 + sector += stripe << geo->chunk_shift; 537 527 538 528 /* and calculate all the others */ 539 - for (n=0; n < conf->near_copies; n++) { 529 + for (n = 0; n < geo->near_copies; n++) { 540 530 int d = dev; 541 531 sector_t s = sector; 542 532 r10bio->devs[slot].addr = sector; 543 533 r10bio->devs[slot].devnum = d; 544 534 slot++; 545 535 546 - for (f = 1; f < conf->far_copies; f++) { 547 - d += conf->near_copies; 548 - if (d >= conf->raid_disks) 549 - d -= conf->raid_disks; 550 - s += conf->stride; 536 + for (f = 1; f < geo->far_copies; f++) { 537 + d += geo->near_copies; 538 + if (d >= geo->raid_disks) 539 + d -= geo->raid_disks; 540 + s += geo->stride; 551 541 r10bio->devs[slot].devnum = d; 552 542 r10bio->devs[slot].addr = s; 553 543 slot++; 554 544 } 555 545 dev++; 556 - if (dev >= conf->raid_disks) { 546 + if (dev >= geo->raid_disks) { 557 547 dev = 0; 558 - sector += (conf->chunk_mask + 1); 548 + 
sector += (geo->chunk_mask + 1); 559 549 } 560 550 } 561 - BUG_ON(slot != conf->copies); 551 + } 552 + 553 + static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 554 + { 555 + struct geom *geo = &conf->geo; 556 + 557 + if (conf->reshape_progress != MaxSector && 558 + ((r10bio->sector >= conf->reshape_progress) != 559 + conf->mddev->reshape_backwards)) { 560 + set_bit(R10BIO_Previous, &r10bio->state); 561 + geo = &conf->prev; 562 + } else 563 + clear_bit(R10BIO_Previous, &r10bio->state); 564 + 565 + __raid10_find_phys(geo, r10bio); 562 566 } 563 567 564 568 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 565 569 { 566 570 sector_t offset, chunk, vchunk; 571 + /* Never use conf->prev as this is only called during resync 572 + * or recovery, so reshape isn't happening 573 + */ 574 + struct geom *geo = &conf->geo; 567 575 568 - offset = sector & conf->chunk_mask; 569 - if (conf->far_offset) { 576 + offset = sector & geo->chunk_mask; 577 + if (geo->far_offset) { 570 578 int fc; 571 - chunk = sector >> conf->chunk_shift; 572 - fc = sector_div(chunk, conf->far_copies); 573 - dev -= fc * conf->near_copies; 579 + chunk = sector >> geo->chunk_shift; 580 + fc = sector_div(chunk, geo->far_copies); 581 + dev -= fc * geo->near_copies; 574 582 if (dev < 0) 575 - dev += conf->raid_disks; 583 + dev += geo->raid_disks; 576 584 } else { 577 - while (sector >= conf->stride) { 578 - sector -= conf->stride; 579 - if (dev < conf->near_copies) 580 - dev += conf->raid_disks - conf->near_copies; 585 + while (sector >= geo->stride) { 586 + sector -= geo->stride; 587 + if (dev < geo->near_copies) 588 + dev += geo->raid_disks - geo->near_copies; 581 589 else 582 - dev -= conf->near_copies; 590 + dev -= geo->near_copies; 583 591 } 584 - chunk = sector >> conf->chunk_shift; 592 + chunk = sector >> geo->chunk_shift; 585 593 } 586 - vchunk = chunk * conf->raid_disks + dev; 587 - sector_div(vchunk, conf->near_copies); 588 - return (vchunk << 
conf->chunk_shift) + offset; 594 + vchunk = chunk * geo->raid_disks + dev; 595 + sector_div(vchunk, geo->near_copies); 596 + return (vchunk << geo->chunk_shift) + offset; 589 597 } 590 598 591 599 /** ··· 623 597 struct r10conf *conf = mddev->private; 624 598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 625 599 int max; 626 - unsigned int chunk_sectors = mddev->chunk_sectors; 600 + unsigned int chunk_sectors; 627 601 unsigned int bio_sectors = bvm->bi_size >> 9; 602 + struct geom *geo = &conf->geo; 628 603 629 - if (conf->near_copies < conf->raid_disks) { 604 + chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; 605 + if (conf->reshape_progress != MaxSector && 606 + ((sector >= conf->reshape_progress) != 607 + conf->mddev->reshape_backwards)) 608 + geo = &conf->prev; 609 + 610 + if (geo->near_copies < geo->raid_disks) { 630 611 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 631 612 + bio_sectors)) << 9; 632 613 if (max < 0) ··· 647 614 if (mddev->merge_check_needed) { 648 615 struct r10bio r10_bio; 649 616 int s; 617 + if (conf->reshape_progress != MaxSector) { 618 + /* Cannot give any guidance during reshape */ 619 + if (max <= biovec->bv_len && bio_sectors == 0) 620 + return biovec->bv_len; 621 + return 0; 622 + } 650 623 r10_bio.sector = sector; 651 624 raid10_find_phys(conf, &r10_bio); 652 625 rcu_read_lock(); ··· 720 681 struct md_rdev *rdev, *best_rdev; 721 682 int do_balance; 722 683 int best_slot; 684 + struct geom *geo = &conf->geo; 723 685 724 686 raid10_find_phys(conf, r10_bio); 725 687 rcu_read_lock(); ··· 801 761 * sequential read speed for 'far copies' arrays. So only 802 762 * keep it for 'near' arrays, and review those later. 
803 763 */ 804 - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 764 + if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 805 765 break; 806 766 807 767 /* for far > 1 always use the lowest address */ 808 - if (conf->far_copies > 1) 768 + if (geo->far_copies > 1) 809 769 new_distance = r10_bio->devs[slot].addr; 810 770 else 811 771 new_distance = abs(r10_bio->devs[slot].addr - ··· 852 812 if (mddev_congested(mddev, bits)) 853 813 return 1; 854 814 rcu_read_lock(); 855 - for (i = 0; i < conf->raid_disks && ret == 0; i++) { 815 + for (i = 0; 816 + (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 817 + && ret == 0; 818 + i++) { 856 819 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 857 820 if (rdev && !test_bit(Faulty, &rdev->flags)) { 858 821 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 1016 973 spin_unlock_irq(&conf->resync_lock); 1017 974 } 1018 975 976 + static sector_t choose_data_offset(struct r10bio *r10_bio, 977 + struct md_rdev *rdev) 978 + { 979 + if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || 980 + test_bit(R10BIO_Previous, &r10_bio->state)) 981 + return rdev->data_offset; 982 + else 983 + return rdev->new_data_offset; 984 + } 985 + 1019 986 static void make_request(struct mddev *mddev, struct bio * bio) 1020 987 { 1021 988 struct r10conf *conf = mddev->private; 1022 989 struct r10bio *r10_bio; 1023 990 struct bio *read_bio; 1024 991 int i; 1025 - int chunk_sects = conf->chunk_mask + 1; 992 + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 993 + int chunk_sects = chunk_mask + 1; 1026 994 const int rw = bio_data_dir(bio); 1027 995 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1028 996 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); ··· 1042 988 int plugged; 1043 989 int sectors_handled; 1044 990 int max_sectors; 991 + int sectors; 1045 992 1046 993 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1047 994 md_flush_request(mddev, bio); ··· 1052 997 /* If this 
request crosses a chunk boundary, we need to 1053 998 * split it. This will only happen for 1 PAGE (or less) requests. 1054 999 */ 1055 - if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) 1056 - > chunk_sects && 1057 - conf->near_copies < conf->raid_disks)) { 1000 + if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 1001 + > chunk_sects 1002 + && (conf->geo.near_copies < conf->geo.raid_disks 1003 + || conf->prev.near_copies < conf->prev.raid_disks))) { 1058 1004 struct bio_pair *bp; 1059 1005 /* Sanity check -- queue functions should prevent this happening */ 1060 1006 if (bio->bi_vcnt != 1 || ··· 1107 1051 */ 1108 1052 wait_barrier(conf); 1109 1053 1054 + sectors = bio->bi_size >> 9; 1055 + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1056 + bio->bi_sector < conf->reshape_progress && 1057 + bio->bi_sector + sectors > conf->reshape_progress) { 1058 + /* IO spans the reshape position. Need to wait for 1059 + * reshape to pass 1060 + */ 1061 + allow_barrier(conf); 1062 + wait_event(conf->wait_barrier, 1063 + conf->reshape_progress <= bio->bi_sector || 1064 + conf->reshape_progress >= bio->bi_sector + sectors); 1065 + wait_barrier(conf); 1066 + } 1067 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1068 + bio_data_dir(bio) == WRITE && 1069 + (mddev->reshape_backwards 1070 + ? 
(bio->bi_sector < conf->reshape_safe && 1071 + bio->bi_sector + sectors > conf->reshape_progress) 1072 + : (bio->bi_sector + sectors > conf->reshape_safe && 1073 + bio->bi_sector < conf->reshape_progress))) { 1074 + /* Need to update reshape_position in metadata */ 1075 + mddev->reshape_position = conf->reshape_progress; 1076 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 1077 + set_bit(MD_CHANGE_PENDING, &mddev->flags); 1078 + md_wakeup_thread(mddev->thread); 1079 + wait_event(mddev->sb_wait, 1080 + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 1081 + 1082 + conf->reshape_safe = mddev->reshape_position; 1083 + } 1084 + 1110 1085 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1111 1086 1112 1087 r10_bio->master_bio = bio; 1113 - r10_bio->sectors = bio->bi_size >> 9; 1088 + r10_bio->sectors = sectors; 1114 1089 1115 1090 r10_bio->mddev = mddev; 1116 1091 r10_bio->sector = bio->bi_sector; ··· 1180 1093 r10_bio->devs[slot].rdev = rdev; 1181 1094 1182 1095 read_bio->bi_sector = r10_bio->devs[slot].addr + 1183 - rdev->data_offset; 1096 + choose_data_offset(r10_bio, rdev); 1184 1097 read_bio->bi_bdev = rdev->bdev; 1185 1098 read_bio->bi_end_io = raid10_end_read_request; 1186 1099 read_bio->bi_rw = READ | do_sync; ··· 1384 1297 r10_bio->devs[i].bio = mbio; 1385 1298 1386 1299 mbio->bi_sector = (r10_bio->devs[i].addr+ 1387 - conf->mirrors[d].rdev->data_offset); 1300 + choose_data_offset(r10_bio, 1301 + conf->mirrors[d].rdev)); 1388 1302 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1389 1303 mbio->bi_end_io = raid10_end_write_request; 1390 1304 mbio->bi_rw = WRITE | do_sync | do_fua; ··· 1409 1321 * so it cannot disappear, so the replacement cannot 1410 1322 * become NULL here 1411 1323 */ 1412 - mbio->bi_sector = (r10_bio->devs[i].addr+ 1413 - conf->mirrors[d].replacement->data_offset); 1324 + mbio->bi_sector = (r10_bio->devs[i].addr + 1325 + choose_data_offset( 1326 + r10_bio, 1327 + conf->mirrors[d].replacement)); 1414 1328 mbio->bi_bdev = 
conf->mirrors[d].replacement->bdev; 1415 1329 mbio->bi_end_io = raid10_end_write_request; 1416 1330 mbio->bi_rw = WRITE | do_sync | do_fua; ··· 1458 1368 struct r10conf *conf = mddev->private; 1459 1369 int i; 1460 1370 1461 - if (conf->near_copies < conf->raid_disks) 1371 + if (conf->geo.near_copies < conf->geo.raid_disks) 1462 1372 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1463 - if (conf->near_copies > 1) 1464 - seq_printf(seq, " %d near-copies", conf->near_copies); 1465 - if (conf->far_copies > 1) { 1466 - if (conf->far_offset) 1467 - seq_printf(seq, " %d offset-copies", conf->far_copies); 1373 + if (conf->geo.near_copies > 1) 1374 + seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1375 + if (conf->geo.far_copies > 1) { 1376 + if (conf->geo.far_offset) 1377 + seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1468 1378 else 1469 - seq_printf(seq, " %d far-copies", conf->far_copies); 1379 + seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1470 1380 } 1471 - seq_printf(seq, " [%d/%d] [", conf->raid_disks, 1472 - conf->raid_disks - mddev->degraded); 1473 - for (i = 0; i < conf->raid_disks; i++) 1381 + seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1382 + conf->geo.raid_disks - mddev->degraded); 1383 + for (i = 0; i < conf->geo.raid_disks; i++) 1474 1384 seq_printf(seq, "%s", 1475 1385 conf->mirrors[i].rdev && 1476 1386 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); ··· 1482 1392 * Don't consider the device numbered 'ignore' 1483 1393 * as we might be about to remove it. 
1484 1394 */ 1485 - static int enough(struct r10conf *conf, int ignore) 1395 + static int _enough(struct r10conf *conf, struct geom *geo, int ignore) 1486 1396 { 1487 1397 int first = 0; 1488 1398 ··· 1493 1403 if (conf->mirrors[first].rdev && 1494 1404 first != ignore) 1495 1405 cnt++; 1496 - first = (first+1) % conf->raid_disks; 1406 + first = (first+1) % geo->raid_disks; 1497 1407 } 1498 1408 if (cnt == 0) 1499 1409 return 0; 1500 1410 } while (first != 0); 1501 1411 return 1; 1412 + } 1413 + 1414 + static int enough(struct r10conf *conf, int ignore) 1415 + { 1416 + return _enough(conf, &conf->geo, ignore) && 1417 + _enough(conf, &conf->prev, ignore); 1502 1418 } 1503 1419 1504 1420 static void error(struct mddev *mddev, struct md_rdev *rdev) ··· 1541 1445 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1542 1446 "md/raid10:%s: Operation continuing on %d devices.\n", 1543 1447 mdname(mddev), bdevname(rdev->bdev, b), 1544 - mdname(mddev), conf->raid_disks - mddev->degraded); 1448 + mdname(mddev), conf->geo.raid_disks - mddev->degraded); 1545 1449 } 1546 1450 1547 1451 static void print_conf(struct r10conf *conf) ··· 1554 1458 printk(KERN_DEBUG "(!conf)\n"); 1555 1459 return; 1556 1460 } 1557 - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1558 - conf->raid_disks); 1461 + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1462 + conf->geo.raid_disks); 1559 1463 1560 - for (i = 0; i < conf->raid_disks; i++) { 1464 + for (i = 0; i < conf->geo.raid_disks; i++) { 1561 1465 char b[BDEVNAME_SIZE]; 1562 1466 tmp = conf->mirrors + i; 1563 1467 if (tmp->rdev) ··· 1589 1493 * Find all non-in_sync disks within the RAID10 configuration 1590 1494 * and mark them in_sync 1591 1495 */ 1592 - for (i = 0; i < conf->raid_disks; i++) { 1496 + for (i = 0; i < conf->geo.raid_disks; i++) { 1593 1497 tmp = conf->mirrors + i; 1594 1498 if (tmp->replacement 1595 1499 && tmp->replacement->recovery_offset == 
MaxSector ··· 1631 1535 int err = -EEXIST; 1632 1536 int mirror; 1633 1537 int first = 0; 1634 - int last = conf->raid_disks - 1; 1538 + int last = conf->geo.raid_disks - 1; 1635 1539 struct request_queue *q = bdev_get_queue(rdev->bdev); 1636 1540 1637 1541 if (mddev->recovery_cp < MaxSector) ··· 1639 1543 * very different from resync 1640 1544 */ 1641 1545 return -EBUSY; 1642 - if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) 1546 + if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) 1643 1547 return -EINVAL; 1644 1548 1645 1549 if (rdev->raid_disk >= 0) ··· 1731 1635 if (!test_bit(Faulty, &rdev->flags) && 1732 1636 mddev->recovery_disabled != p->recovery_disabled && 1733 1637 (!p->replacement || p->replacement == rdev) && 1638 + number < conf->geo.raid_disks && 1734 1639 enough(conf, -1)) { 1735 1640 err = -EBUSY; 1736 1641 goto abort; ··· 1773 1676 struct r10conf *conf = r10_bio->mddev->private; 1774 1677 int d; 1775 1678 1776 - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1679 + if (bio == r10_bio->master_bio) { 1680 + /* this is a reshape read */ 1681 + d = r10_bio->read_slot; /* really the read dev */ 1682 + } else 1683 + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1777 1684 1778 1685 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1779 1686 set_bit(R10BIO_Uptodate, &r10_bio->state); ··· 2319 2218 " (%d sectors at %llu on %s)\n", 2320 2219 mdname(mddev), s, 2321 2220 (unsigned long long)( 2322 - sect + rdev->data_offset), 2221 + sect + 2222 + choose_data_offset(r10_bio, 2223 + rdev)), 2323 2224 bdevname(rdev->bdev, b)); 2324 2225 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2325 2226 "drive\n", ··· 2359 2256 " (%d sectors at %llu on %s)\n", 2360 2257 mdname(mddev), s, 2361 2258 (unsigned long long)( 2362 - sect + rdev->data_offset), 2259 + sect + 2260 + choose_data_offset(r10_bio, rdev)), 2363 2261 bdevname(rdev->bdev, b)); 2364 2262 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2365 2263 "drive\n", ··· 2373 2269 " (%d sectors 
at %llu on %s)\n", 2374 2270 mdname(mddev), s, 2375 2271 (unsigned long long)( 2376 - sect + rdev->data_offset), 2272 + sect + 2273 + choose_data_offset(r10_bio, rdev)), 2377 2274 bdevname(rdev->bdev, b)); 2378 2275 atomic_add(s, &rdev->corrected_errors); 2379 2276 } ··· 2448 2343 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2449 2344 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2450 2345 wbio->bi_sector = (r10_bio->devs[i].addr+ 2451 - rdev->data_offset+ 2346 + choose_data_offset(r10_bio, rdev) + 2452 2347 (sector - r10_bio->sector)); 2453 2348 wbio->bi_bdev = rdev->bdev; 2454 2349 if (submit_bio_wait(WRITE, wbio) == 0) ··· 2525 2420 r10_bio->devs[slot].bio = bio; 2526 2421 r10_bio->devs[slot].rdev = rdev; 2527 2422 bio->bi_sector = r10_bio->devs[slot].addr 2528 - + rdev->data_offset; 2423 + + choose_data_offset(r10_bio, rdev); 2529 2424 bio->bi_bdev = rdev->bdev; 2530 2425 bio->bi_rw = READ | do_sync; 2531 2426 bio->bi_private = r10_bio; ··· 2585 2480 rdev_clear_badblocks( 2586 2481 rdev, 2587 2482 r10_bio->devs[m].addr, 2588 - r10_bio->sectors); 2483 + r10_bio->sectors, 0); 2589 2484 } else { 2590 2485 if (!rdev_set_badblocks( 2591 2486 rdev, ··· 2601 2496 rdev_clear_badblocks( 2602 2497 rdev, 2603 2498 r10_bio->devs[m].addr, 2604 - r10_bio->sectors); 2499 + r10_bio->sectors, 0); 2605 2500 } else { 2606 2501 if (!rdev_set_badblocks( 2607 2502 rdev, ··· 2620 2515 rdev_clear_badblocks( 2621 2516 rdev, 2622 2517 r10_bio->devs[m].addr, 2623 - r10_bio->sectors); 2518 + r10_bio->sectors, 0); 2624 2519 rdev_dec_pending(rdev, conf->mddev); 2625 2520 } else if (bio != NULL && 2626 2521 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { ··· 2637 2532 rdev_clear_badblocks( 2638 2533 rdev, 2639 2534 r10_bio->devs[m].addr, 2640 - r10_bio->sectors); 2535 + r10_bio->sectors, 0); 2641 2536 rdev_dec_pending(rdev, conf->mddev); 2642 2537 } 2643 2538 } ··· 2678 2573 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2679 2574 test_bit(R10BIO_WriteError, &r10_bio->state)) 2680 2575 
handle_write_completed(conf, r10_bio); 2576 + else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 2577 + reshape_request_write(mddev, r10_bio); 2681 2578 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2682 2579 sync_request_write(mddev, r10_bio); 2683 2580 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) ··· 2710 2603 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2711 2604 BUG_ON(conf->r10buf_pool); 2712 2605 conf->have_replacement = 0; 2713 - for (i = 0; i < conf->raid_disks; i++) 2606 + for (i = 0; i < conf->geo.raid_disks; i++) 2714 2607 if (conf->mirrors[i].replacement) 2715 2608 conf->have_replacement = 1; 2716 2609 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); ··· 2764 2657 sector_t sync_blocks; 2765 2658 sector_t sectors_skipped = 0; 2766 2659 int chunks_skipped = 0; 2660 + sector_t chunk_mask = conf->geo.chunk_mask; 2767 2661 2768 2662 if (!conf->r10buf_pool) 2769 2663 if (init_resync(conf)) ··· 2772 2664 2773 2665 skipped: 2774 2666 max_sector = mddev->dev_sectors; 2775 - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2667 + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 2668 + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2776 2669 max_sector = mddev->resync_max_sectors; 2777 2670 if (sector_nr >= max_sector) { 2778 2671 /* If we aborted, we need to abort the ··· 2785 2676 * we need to convert that to several 2786 2677 * virtual addresses. 
2787 2678 */ 2679 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2680 + end_reshape(conf); 2681 + return 0; 2682 + } 2683 + 2788 2684 if (mddev->curr_resync < max_sector) { /* aborted */ 2789 2685 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2790 2686 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2791 2687 &sync_blocks, 1); 2792 - else for (i=0; i<conf->raid_disks; i++) { 2688 + else for (i = 0; i < conf->geo.raid_disks; i++) { 2793 2689 sector_t sect = 2794 2690 raid10_find_virt(conf, mddev->curr_resync, i); 2795 2691 bitmap_end_sync(mddev->bitmap, sect, ··· 2808 2694 /* Completed a full sync so the replacements 2809 2695 * are now fully recovered. 2810 2696 */ 2811 - for (i = 0; i < conf->raid_disks; i++) 2697 + for (i = 0; i < conf->geo.raid_disks; i++) 2812 2698 if (conf->mirrors[i].replacement) 2813 2699 conf->mirrors[i].replacement 2814 2700 ->recovery_offset ··· 2821 2707 *skipped = 1; 2822 2708 return sectors_skipped; 2823 2709 } 2824 - if (chunks_skipped >= conf->raid_disks) { 2710 + 2711 + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2712 + return reshape_request(mddev, sector_nr, skipped); 2713 + 2714 + if (chunks_skipped >= conf->geo.raid_disks) { 2825 2715 /* if there has been nothing to do on any drive, 2826 2716 * then there is nothing to do at all.. 2827 2717 */ ··· 2839 2721 /* make sure whole request will fit in a chunk - if chunks 2840 2722 * are meaningful 2841 2723 */ 2842 - if (conf->near_copies < conf->raid_disks && 2843 - max_sector > (sector_nr | conf->chunk_mask)) 2844 - max_sector = (sector_nr | conf->chunk_mask) + 1; 2724 + if (conf->geo.near_copies < conf->geo.raid_disks && 2725 + max_sector > (sector_nr | chunk_mask)) 2726 + max_sector = (sector_nr | chunk_mask) + 1; 2845 2727 /* 2846 2728 * If there is non-resync activity waiting for us then 2847 2729 * put in a delay to throttle resync. 
··· 2870 2752 int j; 2871 2753 r10_bio = NULL; 2872 2754 2873 - for (i=0 ; i<conf->raid_disks; i++) { 2755 + for (i = 0 ; i < conf->geo.raid_disks; i++) { 2874 2756 int still_degraded; 2875 2757 struct r10bio *rb2; 2876 2758 sector_t sect; ··· 2924 2806 /* Need to check if the array will still be 2925 2807 * degraded 2926 2808 */ 2927 - for (j=0; j<conf->raid_disks; j++) 2809 + for (j = 0; j < conf->geo.raid_disks; j++) 2928 2810 if (conf->mirrors[j].rdev == NULL || 2929 2811 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 2930 2812 still_degraded = 1; ··· 3102 2984 r10_bio->sector = sector_nr; 3103 2985 set_bit(R10BIO_IsSync, &r10_bio->state); 3104 2986 raid10_find_phys(conf, r10_bio); 3105 - r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; 2987 + r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3106 2988 3107 - for (i=0; i<conf->copies; i++) { 2989 + for (i = 0; i < conf->copies; i++) { 3108 2990 int d = r10_bio->devs[i].devnum; 3109 2991 sector_t first_bad, sector; 3110 2992 int bad_sectors; ··· 3270 3152 struct r10conf *conf = mddev->private; 3271 3153 3272 3154 if (!raid_disks) 3273 - raid_disks = conf->raid_disks; 3155 + raid_disks = min(conf->geo.raid_disks, 3156 + conf->prev.raid_disks); 3274 3157 if (!sectors) 3275 3158 sectors = conf->dev_sectors; 3276 3159 3277 - size = sectors >> conf->chunk_shift; 3278 - sector_div(size, conf->far_copies); 3160 + size = sectors >> conf->geo.chunk_shift; 3161 + sector_div(size, conf->geo.far_copies); 3279 3162 size = size * raid_disks; 3280 - sector_div(size, conf->near_copies); 3163 + sector_div(size, conf->geo.near_copies); 3281 3164 3282 - return size << conf->chunk_shift; 3165 + return size << conf->geo.chunk_shift; 3283 3166 } 3284 3167 3285 3168 static void calc_sectors(struct r10conf *conf, sector_t size) ··· 3290 3171 * conf->stride 3291 3172 */ 3292 3173 3293 - size = size >> conf->chunk_shift; 3294 - sector_div(size, conf->far_copies); 3295 - size = size * conf->raid_disks; 3296 
- sector_div(size, conf->near_copies); 3174 + size = size >> conf->geo.chunk_shift; 3175 + sector_div(size, conf->geo.far_copies); 3176 + size = size * conf->geo.raid_disks; 3177 + sector_div(size, conf->geo.near_copies); 3297 3178 /* 'size' is now the number of chunks in the array */ 3298 3179 /* calculate "used chunks per device" */ 3299 3180 size = size * conf->copies; ··· 3301 3182 /* We need to round up when dividing by raid_disks to 3302 3183 * get the stride size. 3303 3184 */ 3304 - size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); 3185 + size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3305 3186 3306 - conf->dev_sectors = size << conf->chunk_shift; 3187 + conf->dev_sectors = size << conf->geo.chunk_shift; 3307 3188 3308 - if (conf->far_offset) 3309 - conf->stride = 1 << conf->chunk_shift; 3189 + if (conf->geo.far_offset) 3190 + conf->geo.stride = 1 << conf->geo.chunk_shift; 3310 3191 else { 3311 - sector_div(size, conf->far_copies); 3312 - conf->stride = size << conf->chunk_shift; 3192 + sector_div(size, conf->geo.far_copies); 3193 + conf->geo.stride = size << conf->geo.chunk_shift; 3313 3194 } 3195 + } 3196 + 3197 + enum geo_type {geo_new, geo_old, geo_start}; 3198 + static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3199 + { 3200 + int nc, fc, fo; 3201 + int layout, chunk, disks; 3202 + switch (new) { 3203 + case geo_old: 3204 + layout = mddev->layout; 3205 + chunk = mddev->chunk_sectors; 3206 + disks = mddev->raid_disks - mddev->delta_disks; 3207 + break; 3208 + case geo_new: 3209 + layout = mddev->new_layout; 3210 + chunk = mddev->new_chunk_sectors; 3211 + disks = mddev->raid_disks; 3212 + break; 3213 + default: /* avoid 'may be unused' warnings */ 3214 + case geo_start: /* new when starting reshape - raid_disks not 3215 + * updated yet. 
*/ 3216 + layout = mddev->new_layout; 3217 + chunk = mddev->new_chunk_sectors; 3218 + disks = mddev->raid_disks + mddev->delta_disks; 3219 + break; 3220 + } 3221 + if (layout >> 17) 3222 + return -1; 3223 + if (chunk < (PAGE_SIZE >> 9) || 3224 + !is_power_of_2(chunk)) 3225 + return -2; 3226 + nc = layout & 255; 3227 + fc = (layout >> 8) & 255; 3228 + fo = layout & (1<<16); 3229 + geo->raid_disks = disks; 3230 + geo->near_copies = nc; 3231 + geo->far_copies = fc; 3232 + geo->far_offset = fo; 3233 + geo->chunk_mask = chunk - 1; 3234 + geo->chunk_shift = ffz(~chunk); 3235 + return nc*fc; 3314 3236 } 3315 3237 3316 3238 static struct r10conf *setup_conf(struct mddev *mddev) 3317 3239 { 3318 3240 struct r10conf *conf = NULL; 3319 - int nc, fc, fo; 3320 3241 int err = -EINVAL; 3242 + struct geom geo; 3243 + int copies; 3321 3244 3322 - if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || 3323 - !is_power_of_2(mddev->new_chunk_sectors)) { 3245 + copies = setup_geo(&geo, mddev, geo_new); 3246 + 3247 + if (copies == -2) { 3324 3248 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3325 3249 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3326 3250 mdname(mddev), PAGE_SIZE); 3327 3251 goto out; 3328 3252 } 3329 3253 3330 - nc = mddev->new_layout & 255; 3331 - fc = (mddev->new_layout >> 8) & 255; 3332 - fo = mddev->new_layout & (1<<16); 3333 - 3334 - if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 3335 - (mddev->new_layout >> 17)) { 3254 + if (copies < 2 || copies > mddev->raid_disks) { 3336 3255 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3337 3256 mdname(mddev), mddev->new_layout); 3338 3257 goto out; ··· 3381 3224 if (!conf) 3382 3225 goto out; 3383 3226 3384 - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 3227 + /* FIXME calc properly */ 3228 + conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + 3229 + max(0,mddev->delta_disks)), 3385 3230 GFP_KERNEL); 3386 3231 if (!conf->mirrors) 3387 3232 goto out; 
··· 3392 3233 if (!conf->tmppage) 3393 3234 goto out; 3394 3235 3395 - 3396 - conf->raid_disks = mddev->raid_disks; 3397 - conf->near_copies = nc; 3398 - conf->far_copies = fc; 3399 - conf->copies = nc*fc; 3400 - conf->far_offset = fo; 3401 - conf->chunk_mask = mddev->new_chunk_sectors - 1; 3402 - conf->chunk_shift = ffz(~mddev->new_chunk_sectors); 3403 - 3236 + conf->geo = geo; 3237 + conf->copies = copies; 3404 3238 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 3405 3239 r10bio_pool_free, conf); 3406 3240 if (!conf->r10bio_pool) 3407 3241 goto out; 3408 3242 3409 3243 calc_sectors(conf, mddev->dev_sectors); 3410 - 3244 + if (mddev->reshape_position == MaxSector) { 3245 + conf->prev = conf->geo; 3246 + conf->reshape_progress = MaxSector; 3247 + } else { 3248 + if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 3249 + err = -EINVAL; 3250 + goto out; 3251 + } 3252 + conf->reshape_progress = mddev->reshape_position; 3253 + if (conf->prev.far_offset) 3254 + conf->prev.stride = 1 << conf->prev.chunk_shift; 3255 + else 3256 + /* far_copies must be 1 */ 3257 + conf->prev.stride = conf->dev_sectors; 3258 + } 3411 3259 spin_lock_init(&conf->device_lock); 3412 3260 INIT_LIST_HEAD(&conf->retry_list); 3413 3261 ··· 3429 3263 return conf; 3430 3264 3431 3265 out: 3432 - printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3433 - mdname(mddev)); 3266 + if (err == -ENOMEM) 3267 + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3268 + mdname(mddev)); 3434 3269 if (conf) { 3435 3270 if (conf->r10bio_pool) 3436 3271 mempool_destroy(conf->r10bio_pool); ··· 3449 3282 struct mirror_info *disk; 3450 3283 struct md_rdev *rdev; 3451 3284 sector_t size; 3452 - 3453 - /* 3454 - * copy the already verified devices into our private RAID10 3455 - * bookkeeping area. 
[whatever we allocate in run(), 3456 - * should be freed in stop()] 3457 - */ 3285 + sector_t min_offset_diff = 0; 3286 + int first = 1; 3458 3287 3459 3288 if (mddev->private == NULL) { 3460 3289 conf = setup_conf(mddev); ··· 3467 3304 3468 3305 chunk_size = mddev->chunk_sectors << 9; 3469 3306 blk_queue_io_min(mddev->queue, chunk_size); 3470 - if (conf->raid_disks % conf->near_copies) 3471 - blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); 3307 + if (conf->geo.raid_disks % conf->geo.near_copies) 3308 + blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3472 3309 else 3473 3310 blk_queue_io_opt(mddev->queue, chunk_size * 3474 - (conf->raid_disks / conf->near_copies)); 3311 + (conf->geo.raid_disks / conf->geo.near_copies)); 3475 3312 3476 3313 rdev_for_each(rdev, mddev) { 3314 + long long diff; 3477 3315 3478 3316 disk_idx = rdev->raid_disk; 3479 - if (disk_idx >= conf->raid_disks 3480 - || disk_idx < 0) 3317 + if (disk_idx < 0) 3318 + continue; 3319 + if (disk_idx >= conf->geo.raid_disks && 3320 + disk_idx >= conf->prev.raid_disks) 3481 3321 continue; 3482 3322 disk = conf->mirrors + disk_idx; 3483 3323 ··· 3493 3327 goto out_free_conf; 3494 3328 disk->rdev = rdev; 3495 3329 } 3330 + diff = (rdev->new_data_offset - rdev->data_offset); 3331 + if (!mddev->reshape_backwards) 3332 + diff = -diff; 3333 + if (diff < 0) 3334 + diff = 0; 3335 + if (first || diff < min_offset_diff) 3336 + min_offset_diff = diff; 3496 3337 3497 3338 disk_stack_limits(mddev->gendisk, rdev->bdev, 3498 3339 rdev->data_offset << 9); 3499 3340 3500 3341 disk->head_position = 0; 3501 3342 } 3343 + 3502 3344 /* need to check that every block has at least one working mirror */ 3503 3345 if (!enough(conf, -1)) { 3504 3346 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", ··· 3514 3340 goto out_free_conf; 3515 3341 } 3516 3342 3343 + if (conf->reshape_progress != MaxSector) { 3344 + /* must ensure that shape change is supported */ 3345 + if 
(conf->geo.far_copies != 1 && 3346 + conf->geo.far_offset == 0) 3347 + goto out_free_conf; 3348 + if (conf->prev.far_copies != 1 && 3349 + conf->geo.far_offset == 0) 3350 + goto out_free_conf; 3351 + } 3352 + 3517 3353 mddev->degraded = 0; 3518 - for (i = 0; i < conf->raid_disks; i++) { 3354 + for (i = 0; 3355 + i < conf->geo.raid_disks 3356 + || i < conf->prev.raid_disks; 3357 + i++) { 3519 3358 3520 3359 disk = conf->mirrors + i; 3521 3360 ··· 3555 3368 mdname(mddev)); 3556 3369 printk(KERN_INFO 3557 3370 "md/raid10:%s: active with %d out of %d devices\n", 3558 - mdname(mddev), conf->raid_disks - mddev->degraded, 3559 - conf->raid_disks); 3371 + mdname(mddev), conf->geo.raid_disks - mddev->degraded, 3372 + conf->geo.raid_disks); 3560 3373 /* 3561 3374 * Ok, everything is just fine now 3562 3375 */ ··· 3573 3386 * maybe... 3574 3387 */ 3575 3388 { 3576 - int stripe = conf->raid_disks * 3389 + int stripe = conf->geo.raid_disks * 3577 3390 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3578 - stripe /= conf->near_copies; 3579 - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 3580 - mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3391 + stripe /= conf->geo.near_copies; 3392 + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3393 + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3581 3394 } 3582 3395 3583 3396 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3584 3397 3585 3398 if (md_integrity_register(mddev)) 3586 3399 goto out_free_conf; 3400 + 3401 + if (conf->reshape_progress != MaxSector) { 3402 + unsigned long before_length, after_length; 3403 + 3404 + before_length = ((1 << conf->prev.chunk_shift) * 3405 + conf->prev.far_copies); 3406 + after_length = ((1 << conf->geo.chunk_shift) * 3407 + conf->geo.far_copies); 3408 + 3409 + if (max(before_length, after_length) > min_offset_diff) { 3410 + /* This cannot work */ 3411 + printk("md/raid10: offset difference not enough to continue reshape\n"); 3412 + goto out_free_conf; 3413 + } 
3414 + conf->offset_diff = min_offset_diff; 3415 + 3416 + conf->reshape_safe = conf->reshape_progress; 3417 + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3418 + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3419 + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 3420 + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3421 + mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3422 + "reshape"); 3423 + } 3587 3424 3588 3425 return 0; 3589 3426 ··· 3671 3460 struct r10conf *conf = mddev->private; 3672 3461 sector_t oldsize, size; 3673 3462 3674 - if (conf->far_copies > 1 && !conf->far_offset) 3463 + if (mddev->reshape_position != MaxSector) 3464 + return -EBUSY; 3465 + 3466 + if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 3675 3467 return -EINVAL; 3676 3468 3677 3469 oldsize = raid10_size(mddev, 0, 0); 3678 3470 size = raid10_size(mddev, sectors, 0); 3679 - md_set_array_sectors(mddev, size); 3680 - if (mddev->array_sectors > size) 3471 + if (mddev->external_size && 3472 + mddev->array_sectors > size) 3681 3473 return -EINVAL; 3474 + if (mddev->bitmap) { 3475 + int ret = bitmap_resize(mddev->bitmap, size, 0, 0); 3476 + if (ret) 3477 + return ret; 3478 + } 3479 + md_set_array_sectors(mddev, size); 3682 3480 set_capacity(mddev->gendisk, mddev->array_sectors); 3683 3481 revalidate_disk(mddev->gendisk); 3684 3482 if (sectors > mddev->dev_sectors && ··· 3754 3534 return ERR_PTR(-EINVAL); 3755 3535 } 3756 3536 3537 + static int raid10_check_reshape(struct mddev *mddev) 3538 + { 3539 + /* Called when there is a request to change 3540 + * - layout (to ->new_layout) 3541 + * - chunk size (to ->new_chunk_sectors) 3542 + * - raid_disks (by delta_disks) 3543 + * or when trying to restart a reshape that was ongoing. 3544 + * 3545 + * We need to validate the request and possibly allocate 3546 + * space if that might be an issue later. 
3547 + * 3548 + * Currently we reject any reshape of a 'far' mode array, 3549 + * allow chunk size to change if new is generally acceptable, 3550 + * allow raid_disks to increase, and allow 3551 + * a switch between 'near' mode and 'offset' mode. 3552 + */ 3553 + struct r10conf *conf = mddev->private; 3554 + struct geom geo; 3555 + 3556 + if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 3557 + return -EINVAL; 3558 + 3559 + if (setup_geo(&geo, mddev, geo_start) != conf->copies) 3560 + /* mustn't change number of copies */ 3561 + return -EINVAL; 3562 + if (geo.far_copies > 1 && !geo.far_offset) 3563 + /* Cannot switch to 'far' mode */ 3564 + return -EINVAL; 3565 + 3566 + if (mddev->array_sectors & geo.chunk_mask) 3567 + /* not factor of array size */ 3568 + return -EINVAL; 3569 + 3570 + if (!enough(conf, -1)) 3571 + return -EINVAL; 3572 + 3573 + kfree(conf->mirrors_new); 3574 + conf->mirrors_new = NULL; 3575 + if (mddev->delta_disks > 0) { 3576 + /* allocate new 'mirrors' list */ 3577 + conf->mirrors_new = kzalloc( 3578 + sizeof(struct mirror_info) 3579 + *(mddev->raid_disks + 3580 + mddev->delta_disks), 3581 + GFP_KERNEL); 3582 + if (!conf->mirrors_new) 3583 + return -ENOMEM; 3584 + } 3585 + return 0; 3586 + } 3587 + 3588 + /* 3589 + * Need to check if array has failed when deciding whether to: 3590 + * - start an array 3591 + * - remove non-faulty devices 3592 + * - add a spare 3593 + * - allow a reshape 3594 + * This determination is simple when no reshape is happening. 3595 + * However if there is a reshape, we need to carefully check 3596 + * both the before and after sections. 3597 + * This is because some failed devices may only affect one 3598 + * of the two sections, and some non-in_sync devices may 3599 + * be insync in the section most affected by failed devices. 
3600 + */ 3601 + static int calc_degraded(struct r10conf *conf) 3602 + { 3603 + int degraded, degraded2; 3604 + int i; 3605 + 3606 + rcu_read_lock(); 3607 + degraded = 0; 3608 + /* 'prev' section first */ 3609 + for (i = 0; i < conf->prev.raid_disks; i++) { 3610 + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3611 + if (!rdev || test_bit(Faulty, &rdev->flags)) 3612 + degraded++; 3613 + else if (!test_bit(In_sync, &rdev->flags)) 3614 + /* When we can reduce the number of devices in 3615 + * an array, this might not contribute to 3616 + * 'degraded'. It does now. 3617 + */ 3618 + degraded++; 3619 + } 3620 + rcu_read_unlock(); 3621 + if (conf->geo.raid_disks == conf->prev.raid_disks) 3622 + return degraded; 3623 + rcu_read_lock(); 3624 + degraded2 = 0; 3625 + for (i = 0; i < conf->geo.raid_disks; i++) { 3626 + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3627 + if (!rdev || test_bit(Faulty, &rdev->flags)) 3628 + degraded2++; 3629 + else if (!test_bit(In_sync, &rdev->flags)) { 3630 + /* If reshape is increasing the number of devices, 3631 + * this section has already been recovered, so 3632 + * it doesn't contribute to degraded. 3633 + * else it does. 3634 + */ 3635 + if (conf->geo.raid_disks <= conf->prev.raid_disks) 3636 + degraded2++; 3637 + } 3638 + } 3639 + rcu_read_unlock(); 3640 + if (degraded2 > degraded) 3641 + return degraded2; 3642 + return degraded; 3643 + } 3644 + 3645 + static int raid10_start_reshape(struct mddev *mddev) 3646 + { 3647 + /* A 'reshape' has been requested. This commits 3648 + * the various 'new' fields and sets MD_RECOVER_RESHAPE 3649 + * This also checks if there are enough spares and adds them 3650 + * to the array. 3651 + * We currently require enough spares to make the final 3652 + * array non-degraded. We also require that the difference 3653 + * between old and new data_offset - on each device - is 3654 + * enough that we never risk over-writing. 
3655 + */ 3656 + 3657 + unsigned long before_length, after_length; 3658 + sector_t min_offset_diff = 0; 3659 + int first = 1; 3660 + struct geom new; 3661 + struct r10conf *conf = mddev->private; 3662 + struct md_rdev *rdev; 3663 + int spares = 0; 3664 + int ret; 3665 + 3666 + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3667 + return -EBUSY; 3668 + 3669 + if (setup_geo(&new, mddev, geo_start) != conf->copies) 3670 + return -EINVAL; 3671 + 3672 + before_length = ((1 << conf->prev.chunk_shift) * 3673 + conf->prev.far_copies); 3674 + after_length = ((1 << conf->geo.chunk_shift) * 3675 + conf->geo.far_copies); 3676 + 3677 + rdev_for_each(rdev, mddev) { 3678 + if (!test_bit(In_sync, &rdev->flags) 3679 + && !test_bit(Faulty, &rdev->flags)) 3680 + spares++; 3681 + if (rdev->raid_disk >= 0) { 3682 + long long diff = (rdev->new_data_offset 3683 + - rdev->data_offset); 3684 + if (!mddev->reshape_backwards) 3685 + diff = -diff; 3686 + if (diff < 0) 3687 + diff = 0; 3688 + if (first || diff < min_offset_diff) 3689 + min_offset_diff = diff; 3690 + } 3691 + } 3692 + 3693 + if (max(before_length, after_length) > min_offset_diff) 3694 + return -EINVAL; 3695 + 3696 + if (spares < mddev->delta_disks) 3697 + return -EINVAL; 3698 + 3699 + conf->offset_diff = min_offset_diff; 3700 + spin_lock_irq(&conf->device_lock); 3701 + if (conf->mirrors_new) { 3702 + memcpy(conf->mirrors_new, conf->mirrors, 3703 + sizeof(struct mirror_info)*conf->prev.raid_disks); 3704 + smp_mb(); 3705 + kfree(conf->mirrors_old); /* FIXME and elsewhere */ 3706 + conf->mirrors_old = conf->mirrors; 3707 + conf->mirrors = conf->mirrors_new; 3708 + conf->mirrors_new = NULL; 3709 + } 3710 + setup_geo(&conf->geo, mddev, geo_start); 3711 + smp_mb(); 3712 + if (mddev->reshape_backwards) { 3713 + sector_t size = raid10_size(mddev, 0, 0); 3714 + if (size < mddev->array_sectors) { 3715 + spin_unlock_irq(&conf->device_lock); 3716 + printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", 
3717 + mdname(mddev)); 3718 + return -EINVAL; 3719 + } 3720 + mddev->resync_max_sectors = size; 3721 + conf->reshape_progress = size; 3722 + } else 3723 + conf->reshape_progress = 0; 3724 + spin_unlock_irq(&conf->device_lock); 3725 + 3726 + if (mddev->delta_disks && mddev->bitmap) { 3727 + ret = bitmap_resize(mddev->bitmap, 3728 + raid10_size(mddev, 0, 3729 + conf->geo.raid_disks), 3730 + 0, 0); 3731 + if (ret) 3732 + goto abort; 3733 + } 3734 + if (mddev->delta_disks > 0) { 3735 + rdev_for_each(rdev, mddev) 3736 + if (rdev->raid_disk < 0 && 3737 + !test_bit(Faulty, &rdev->flags)) { 3738 + if (raid10_add_disk(mddev, rdev) == 0) { 3739 + if (rdev->raid_disk >= 3740 + conf->prev.raid_disks) 3741 + set_bit(In_sync, &rdev->flags); 3742 + else 3743 + rdev->recovery_offset = 0; 3744 + 3745 + if (sysfs_link_rdev(mddev, rdev)) 3746 + /* Failure here is OK */; 3747 + } 3748 + } else if (rdev->raid_disk >= conf->prev.raid_disks 3749 + && !test_bit(Faulty, &rdev->flags)) { 3750 + /* This is a spare that was manually added */ 3751 + set_bit(In_sync, &rdev->flags); 3752 + } 3753 + } 3754 + /* When a reshape changes the number of devices, 3755 + * ->degraded is measured against the larger of the 3756 + * pre and post numbers. 
3757 + */ 3758 + spin_lock_irq(&conf->device_lock); 3759 + mddev->degraded = calc_degraded(conf); 3760 + spin_unlock_irq(&conf->device_lock); 3761 + mddev->raid_disks = conf->geo.raid_disks; 3762 + mddev->reshape_position = conf->reshape_progress; 3763 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 3764 + 3765 + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3766 + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3767 + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 3768 + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3769 + 3770 + mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3771 + "reshape"); 3772 + if (!mddev->sync_thread) { 3773 + ret = -EAGAIN; 3774 + goto abort; 3775 + } 3776 + conf->reshape_checkpoint = jiffies; 3777 + md_wakeup_thread(mddev->sync_thread); 3778 + md_new_event(mddev); 3779 + return 0; 3780 + 3781 + abort: 3782 + mddev->recovery = 0; 3783 + spin_lock_irq(&conf->device_lock); 3784 + conf->geo = conf->prev; 3785 + mddev->raid_disks = conf->geo.raid_disks; 3786 + rdev_for_each(rdev, mddev) 3787 + rdev->new_data_offset = rdev->data_offset; 3788 + smp_wmb(); 3789 + conf->reshape_progress = MaxSector; 3790 + mddev->reshape_position = MaxSector; 3791 + spin_unlock_irq(&conf->device_lock); 3792 + return ret; 3793 + } 3794 + 3795 + /* Calculate the last device-address that could contain 3796 + * any block from the chunk that includes the array-address 's' 3797 + * and report the next address. 3798 + * i.e. the address returned will be chunk-aligned and after 3799 + * any data that is in the chunk containing 's'. 
3800 + */ 3801 + static sector_t last_dev_address(sector_t s, struct geom *geo) 3802 + { 3803 + s = (s | geo->chunk_mask) + 1; 3804 + s >>= geo->chunk_shift; 3805 + s *= geo->near_copies; 3806 + s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 3807 + s *= geo->far_copies; 3808 + s <<= geo->chunk_shift; 3809 + return s; 3810 + } 3811 + 3812 + /* Calculate the first device-address that could contain 3813 + * any block from the chunk that includes the array-address 's'. 3814 + * This too will be the start of a chunk 3815 + */ 3816 + static sector_t first_dev_address(sector_t s, struct geom *geo) 3817 + { 3818 + s >>= geo->chunk_shift; 3819 + s *= geo->near_copies; 3820 + sector_div(s, geo->raid_disks); 3821 + s *= geo->far_copies; 3822 + s <<= geo->chunk_shift; 3823 + return s; 3824 + } 3825 + 3826 + static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 3827 + int *skipped) 3828 + { 3829 + /* We simply copy at most one chunk (smallest of old and new) 3830 + * at a time, possibly less if that exceeds RESYNC_PAGES, 3831 + * or we hit a bad block or something. 3832 + * This might mean we pause for normal IO in the middle of 3833 + * a chunk, but that is not a problem was mddev->reshape_position 3834 + * can record any location. 3835 + * 3836 + * If we will want to write to a location that isn't 3837 + * yet recorded as 'safe' (i.e. in metadata on disk) then 3838 + * we need to flush all reshape requests and update the metadata. 3839 + * 3840 + * When reshaping forwards (e.g. to more devices), we interpret 3841 + * 'safe' as the earliest block which might not have been copied 3842 + * down yet. We divide this by previous stripe size and multiply 3843 + * by previous stripe length to get lowest device offset that we 3844 + * cannot write to yet. 3845 + * We interpret 'sector_nr' as an address that we want to write to. 3846 + * From this we use last_device_address() to find where we might 3847 + * write to, and first_device_address on the 'safe' position. 
3848 + * If this 'next' write position is after the 'safe' position, 3849 + * we must update the metadata to increase the 'safe' position. 3850 + * 3851 + * When reshaping backwards, we round in the opposite direction 3852 + * and perform the reverse test: next write position must not be 3853 + * less than current safe position. 3854 + * 3855 + * In all this the minimum difference in data offsets 3856 + * (conf->offset_diff - always positive) allows a bit of slack, 3857 + * so next can be after 'safe', but not by more than offset_disk 3858 + * 3859 + * We need to prepare all the bios here before we start any IO 3860 + * to ensure the size we choose is acceptable to all devices. 3861 + * The means one for each copy for write-out and an extra one for 3862 + * read-in. 3863 + * We store the read-in bio in ->master_bio and the others in 3864 + * ->devs[x].bio and ->devs[x].repl_bio. 3865 + */ 3866 + struct r10conf *conf = mddev->private; 3867 + struct r10bio *r10_bio; 3868 + sector_t next, safe, last; 3869 + int max_sectors; 3870 + int nr_sectors; 3871 + int s; 3872 + struct md_rdev *rdev; 3873 + int need_flush = 0; 3874 + struct bio *blist; 3875 + struct bio *bio, *read_bio; 3876 + int sectors_done = 0; 3877 + 3878 + if (sector_nr == 0) { 3879 + /* If restarting in the middle, skip the initial sectors */ 3880 + if (mddev->reshape_backwards && 3881 + conf->reshape_progress < raid10_size(mddev, 0, 0)) { 3882 + sector_nr = (raid10_size(mddev, 0, 0) 3883 + - conf->reshape_progress); 3884 + } else if (!mddev->reshape_backwards && 3885 + conf->reshape_progress > 0) 3886 + sector_nr = conf->reshape_progress; 3887 + if (sector_nr) { 3888 + mddev->curr_resync_completed = sector_nr; 3889 + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 3890 + *skipped = 1; 3891 + return sector_nr; 3892 + } 3893 + } 3894 + 3895 + /* We don't use sector_nr to track where we are up to 3896 + * as that doesn't work well for ->reshape_backwards. 3897 + * So just use ->reshape_progress. 
3898 + */ 3899 + if (mddev->reshape_backwards) { 3900 + /* 'next' is the earliest device address that we might 3901 + * write to for this chunk in the new layout 3902 + */ 3903 + next = first_dev_address(conf->reshape_progress - 1, 3904 + &conf->geo); 3905 + 3906 + /* 'safe' is the last device address that we might read from 3907 + * in the old layout after a restart 3908 + */ 3909 + safe = last_dev_address(conf->reshape_safe - 1, 3910 + &conf->prev); 3911 + 3912 + if (next + conf->offset_diff < safe) 3913 + need_flush = 1; 3914 + 3915 + last = conf->reshape_progress - 1; 3916 + sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 3917 + & conf->prev.chunk_mask); 3918 + if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) 3919 + sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; 3920 + } else { 3921 + /* 'next' is after the last device address that we 3922 + * might write to for this chunk in the new layout 3923 + */ 3924 + next = last_dev_address(conf->reshape_progress, &conf->geo); 3925 + 3926 + /* 'safe' is the earliest device address that we might 3927 + * read from in the old layout after a restart 3928 + */ 3929 + safe = first_dev_address(conf->reshape_safe, &conf->prev); 3930 + 3931 + /* Need to update metadata if 'next' might be beyond 'safe' 3932 + * as that would possibly corrupt data 3933 + */ 3934 + if (next > safe + conf->offset_diff) 3935 + need_flush = 1; 3936 + 3937 + sector_nr = conf->reshape_progress; 3938 + last = sector_nr | (conf->geo.chunk_mask 3939 + & conf->prev.chunk_mask); 3940 + 3941 + if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) 3942 + last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; 3943 + } 3944 + 3945 + if (need_flush || 3946 + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 3947 + /* Need to update reshape_position in metadata */ 3948 + wait_barrier(conf); 3949 + mddev->reshape_position = conf->reshape_progress; 3950 + if (mddev->reshape_backwards) 3951 + mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 3952 + - 
conf->reshape_progress; 3953 + else 3954 + mddev->curr_resync_completed = conf->reshape_progress; 3955 + conf->reshape_checkpoint = jiffies; 3956 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 3957 + md_wakeup_thread(mddev->thread); 3958 + wait_event(mddev->sb_wait, mddev->flags == 0 || 3959 + kthread_should_stop()); 3960 + conf->reshape_safe = mddev->reshape_position; 3961 + allow_barrier(conf); 3962 + } 3963 + 3964 + read_more: 3965 + /* Now schedule reads for blocks from sector_nr to last */ 3966 + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 3967 + raise_barrier(conf, sectors_done != 0); 3968 + atomic_set(&r10_bio->remaining, 0); 3969 + r10_bio->mddev = mddev; 3970 + r10_bio->sector = sector_nr; 3971 + set_bit(R10BIO_IsReshape, &r10_bio->state); 3972 + r10_bio->sectors = last - sector_nr + 1; 3973 + rdev = read_balance(conf, r10_bio, &max_sectors); 3974 + BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 3975 + 3976 + if (!rdev) { 3977 + /* Cannot read from here, so need to record bad blocks 3978 + * on all the target devices. 
3979 + */ 3980 + // FIXME 3981 + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3982 + return sectors_done; 3983 + } 3984 + 3985 + read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); 3986 + 3987 + read_bio->bi_bdev = rdev->bdev; 3988 + read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 3989 + + rdev->data_offset); 3990 + read_bio->bi_private = r10_bio; 3991 + read_bio->bi_end_io = end_sync_read; 3992 + read_bio->bi_rw = READ; 3993 + read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 3994 + read_bio->bi_flags |= 1 << BIO_UPTODATE; 3995 + read_bio->bi_vcnt = 0; 3996 + read_bio->bi_idx = 0; 3997 + read_bio->bi_size = 0; 3998 + r10_bio->master_bio = read_bio; 3999 + r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4000 + 4001 + /* Now find the locations in the new layout */ 4002 + __raid10_find_phys(&conf->geo, r10_bio); 4003 + 4004 + blist = read_bio; 4005 + read_bio->bi_next = NULL; 4006 + 4007 + for (s = 0; s < conf->copies*2; s++) { 4008 + struct bio *b; 4009 + int d = r10_bio->devs[s/2].devnum; 4010 + struct md_rdev *rdev2; 4011 + if (s&1) { 4012 + rdev2 = conf->mirrors[d].replacement; 4013 + b = r10_bio->devs[s/2].repl_bio; 4014 + } else { 4015 + rdev2 = conf->mirrors[d].rdev; 4016 + b = r10_bio->devs[s/2].bio; 4017 + } 4018 + if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4019 + continue; 4020 + b->bi_bdev = rdev2->bdev; 4021 + b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4022 + b->bi_private = r10_bio; 4023 + b->bi_end_io = end_reshape_write; 4024 + b->bi_rw = WRITE; 4025 + b->bi_flags &= ~(BIO_POOL_MASK - 1); 4026 + b->bi_flags |= 1 << BIO_UPTODATE; 4027 + b->bi_next = blist; 4028 + b->bi_vcnt = 0; 4029 + b->bi_idx = 0; 4030 + b->bi_size = 0; 4031 + blist = b; 4032 + } 4033 + 4034 + /* Now add as many pages as possible to all of these bios. 
*/ 4035 + 4036 + nr_sectors = 0; 4037 + for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4038 + struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; 4039 + int len = (max_sectors - s) << 9; 4040 + if (len > PAGE_SIZE) 4041 + len = PAGE_SIZE; 4042 + for (bio = blist; bio ; bio = bio->bi_next) { 4043 + struct bio *bio2; 4044 + if (bio_add_page(bio, page, len, 0)) 4045 + continue; 4046 + 4047 + /* Didn't fit, must stop */ 4048 + for (bio2 = blist; 4049 + bio2 && bio2 != bio; 4050 + bio2 = bio2->bi_next) { 4051 + /* Remove last page from this bio */ 4052 + bio2->bi_vcnt--; 4053 + bio2->bi_size -= len; 4054 + bio2->bi_flags &= ~(1<<BIO_SEG_VALID); 4055 + } 4056 + goto bio_full; 4057 + } 4058 + sector_nr += len >> 9; 4059 + nr_sectors += len >> 9; 4060 + } 4061 + bio_full: 4062 + r10_bio->sectors = nr_sectors; 4063 + 4064 + /* Now submit the read */ 4065 + md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); 4066 + atomic_inc(&r10_bio->remaining); 4067 + read_bio->bi_next = NULL; 4068 + generic_make_request(read_bio); 4069 + sector_nr += nr_sectors; 4070 + sectors_done += nr_sectors; 4071 + if (sector_nr <= last) 4072 + goto read_more; 4073 + 4074 + /* Now that we have done the whole section we can 4075 + * update reshape_progress 4076 + */ 4077 + if (mddev->reshape_backwards) 4078 + conf->reshape_progress -= sectors_done; 4079 + else 4080 + conf->reshape_progress += sectors_done; 4081 + 4082 + return sectors_done; 4083 + } 4084 + 4085 + static void end_reshape_request(struct r10bio *r10_bio); 4086 + static int handle_reshape_read_error(struct mddev *mddev, 4087 + struct r10bio *r10_bio); 4088 + static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4089 + { 4090 + /* Reshape read completed. Hopefully we have a block 4091 + * to write out. 4092 + * If we got a read error then we do sync 1-page reads from 4093 + * elsewhere until we find the data - or give up. 
4094 + */ 4095 + struct r10conf *conf = mddev->private; 4096 + int s; 4097 + 4098 + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4099 + if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4100 + /* Reshape has been aborted */ 4101 + md_done_sync(mddev, r10_bio->sectors, 0); 4102 + return; 4103 + } 4104 + 4105 + /* We definitely have the data in the pages, schedule the 4106 + * writes. 4107 + */ 4108 + atomic_set(&r10_bio->remaining, 1); 4109 + for (s = 0; s < conf->copies*2; s++) { 4110 + struct bio *b; 4111 + int d = r10_bio->devs[s/2].devnum; 4112 + struct md_rdev *rdev; 4113 + if (s&1) { 4114 + rdev = conf->mirrors[d].replacement; 4115 + b = r10_bio->devs[s/2].repl_bio; 4116 + } else { 4117 + rdev = conf->mirrors[d].rdev; 4118 + b = r10_bio->devs[s/2].bio; 4119 + } 4120 + if (!rdev || test_bit(Faulty, &rdev->flags)) 4121 + continue; 4122 + atomic_inc(&rdev->nr_pending); 4123 + md_sync_acct(b->bi_bdev, r10_bio->sectors); 4124 + atomic_inc(&r10_bio->remaining); 4125 + b->bi_next = NULL; 4126 + generic_make_request(b); 4127 + } 4128 + end_reshape_request(r10_bio); 4129 + } 4130 + 4131 + static void end_reshape(struct r10conf *conf) 4132 + { 4133 + if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 4134 + return; 4135 + 4136 + spin_lock_irq(&conf->device_lock); 4137 + conf->prev = conf->geo; 4138 + md_finish_reshape(conf->mddev); 4139 + smp_wmb(); 4140 + conf->reshape_progress = MaxSector; 4141 + spin_unlock_irq(&conf->device_lock); 4142 + 4143 + /* read-ahead size must cover two whole stripes, which is 4144 + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 4145 + */ 4146 + if (conf->mddev->queue) { 4147 + int stripe = conf->geo.raid_disks * 4148 + ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); 4149 + stripe /= conf->geo.near_copies; 4150 + if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4151 + conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4152 + } 4153 + conf->fullsync = 0; 4154 + } 4155 + 4156 + 4157 
+ static int handle_reshape_read_error(struct mddev *mddev, 4158 + struct r10bio *r10_bio) 4159 + { 4160 + /* Use sync reads to get the blocks from somewhere else */ 4161 + int sectors = r10_bio->sectors; 4162 + struct r10bio r10b; 4163 + struct r10conf *conf = mddev->private; 4164 + int slot = 0; 4165 + int idx = 0; 4166 + struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; 4167 + 4168 + r10b.sector = r10_bio->sector; 4169 + __raid10_find_phys(&conf->prev, &r10b); 4170 + 4171 + while (sectors) { 4172 + int s = sectors; 4173 + int success = 0; 4174 + int first_slot = slot; 4175 + 4176 + if (s > (PAGE_SIZE >> 9)) 4177 + s = PAGE_SIZE >> 9; 4178 + 4179 + while (!success) { 4180 + int d = r10b.devs[slot].devnum; 4181 + struct md_rdev *rdev = conf->mirrors[d].rdev; 4182 + sector_t addr; 4183 + if (rdev == NULL || 4184 + test_bit(Faulty, &rdev->flags) || 4185 + !test_bit(In_sync, &rdev->flags)) 4186 + goto failed; 4187 + 4188 + addr = r10b.devs[slot].addr + idx * PAGE_SIZE; 4189 + success = sync_page_io(rdev, 4190 + addr, 4191 + s << 9, 4192 + bvec[idx].bv_page, 4193 + READ, false); 4194 + if (success) 4195 + break; 4196 + failed: 4197 + slot++; 4198 + if (slot >= conf->copies) 4199 + slot = 0; 4200 + if (slot == first_slot) 4201 + break; 4202 + } 4203 + if (!success) { 4204 + /* couldn't read this block, must give up */ 4205 + set_bit(MD_RECOVERY_INTR, 4206 + &mddev->recovery); 4207 + return -EIO; 4208 + } 4209 + sectors -= s; 4210 + idx++; 4211 + } 4212 + return 0; 4213 + } 4214 + 4215 + static void end_reshape_write(struct bio *bio, int error) 4216 + { 4217 + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 4218 + struct r10bio *r10_bio = bio->bi_private; 4219 + struct mddev *mddev = r10_bio->mddev; 4220 + struct r10conf *conf = mddev->private; 4221 + int d; 4222 + int slot; 4223 + int repl; 4224 + struct md_rdev *rdev = NULL; 4225 + 4226 + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 4227 + if (repl) 4228 + rdev = conf->mirrors[d].replacement; 4229 
+ if (!rdev) { 4230 + smp_mb(); 4231 + rdev = conf->mirrors[d].rdev; 4232 + } 4233 + 4234 + if (!uptodate) { 4235 + /* FIXME should record badblock */ 4236 + md_error(mddev, rdev); 4237 + } 4238 + 4239 + rdev_dec_pending(rdev, mddev); 4240 + end_reshape_request(r10_bio); 4241 + } 4242 + 4243 + static void end_reshape_request(struct r10bio *r10_bio) 4244 + { 4245 + if (!atomic_dec_and_test(&r10_bio->remaining)) 4246 + return; 4247 + md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 4248 + bio_put(r10_bio->master_bio); 4249 + put_buf(r10_bio); 4250 + } 4251 + 4252 + static void raid10_finish_reshape(struct mddev *mddev) 4253 + { 4254 + struct r10conf *conf = mddev->private; 4255 + 4256 + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 4257 + return; 4258 + 4259 + if (mddev->delta_disks > 0) { 4260 + sector_t size = raid10_size(mddev, 0, 0); 4261 + md_set_array_sectors(mddev, size); 4262 + if (mddev->recovery_cp > mddev->resync_max_sectors) { 4263 + mddev->recovery_cp = mddev->resync_max_sectors; 4264 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4265 + } 4266 + mddev->resync_max_sectors = size; 4267 + set_capacity(mddev->gendisk, mddev->array_sectors); 4268 + revalidate_disk(mddev->gendisk); 4269 + } else { 4270 + int d; 4271 + for (d = conf->geo.raid_disks ; 4272 + d < conf->geo.raid_disks - mddev->delta_disks; 4273 + d++) { 4274 + struct md_rdev *rdev = conf->mirrors[d].rdev; 4275 + if (rdev) 4276 + clear_bit(In_sync, &rdev->flags); 4277 + rdev = conf->mirrors[d].replacement; 4278 + if (rdev) 4279 + clear_bit(In_sync, &rdev->flags); 4280 + } 4281 + } 4282 + mddev->layout = mddev->new_layout; 4283 + mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 4284 + mddev->reshape_position = MaxSector; 4285 + mddev->delta_disks = 0; 4286 + mddev->reshape_backwards = 0; 4287 + } 4288 + 3757 4289 static struct md_personality raid10_personality = 3758 4290 { 3759 4291 .name = "raid10", ··· 4524 3552 .size = raid10_size, 4525 3553 .resize = raid10_resize, 4526 3554 
.takeover = raid10_takeover, 3555 + .check_reshape = raid10_check_reshape, 3556 + .start_reshape = raid10_start_reshape, 3557 + .finish_reshape = raid10_finish_reshape, 4527 3558 }; 4528 3559 4529 3560 static int __init raid_init(void)
+23 -11
drivers/md/raid10.h
··· 14 14 struct r10conf { 15 15 struct mddev *mddev; 16 16 struct mirror_info *mirrors; 17 - int raid_disks; 17 + struct mirror_info *mirrors_new, *mirrors_old; 18 18 spinlock_t device_lock; 19 19 20 20 /* geometry */ 21 - int near_copies; /* number of copies laid out 21 + struct geom { 22 + int raid_disks; 23 + int near_copies; /* number of copies laid out 22 24 * raid0 style */ 23 - int far_copies; /* number of copies laid out 25 + int far_copies; /* number of copies laid out 24 26 * at large strides across drives 25 27 */ 26 - int far_offset; /* far_copies are offset by 1 28 + int far_offset; /* far_copies are offset by 1 27 29 * stripe instead of many 28 30 */ 29 - int copies; /* near_copies * far_copies. 30 - * must be <= raid_disks 31 - */ 32 - sector_t stride; /* distance between far copies. 31 + sector_t stride; /* distance between far copies. 33 32 * This is size / far_copies unless 34 33 * far_offset, in which case it is 35 34 * 1 stripe. 36 35 */ 36 + int chunk_shift; /* shift from chunks to sectors */ 37 + sector_t chunk_mask; 38 + } prev, geo; 39 + int copies; /* near_copies * far_copies. 40 + * must be <= raid_disks 41 + */ 37 42 38 43 sector_t dev_sectors; /* temp copy of 39 44 * mddev->dev_sectors */ 40 - 41 - int chunk_shift; /* shift from chunks to sectors */ 42 - sector_t chunk_mask; 45 + sector_t reshape_progress; 46 + sector_t reshape_safe; 47 + unsigned long reshape_checkpoint; 48 + sector_t offset_diff; 43 49 44 50 struct list_head retry_list; 45 51 /* queue pending writes and submit them on unplug */ ··· 142 136 R10BIO_Uptodate, 143 137 R10BIO_IsSync, 144 138 R10BIO_IsRecover, 139 + R10BIO_IsReshape, 145 140 R10BIO_Degraded, 146 141 /* Set ReadError on bios that experience a read error 147 142 * so that raid10d knows what to do with them. 
··· 153 146 */ 154 147 R10BIO_MadeGood, 155 148 R10BIO_WriteError, 149 + /* During a reshape we might be performing IO on the 150 + * 'previous' part of the array, in which case this 151 + * flag is set 152 + */ 153 + R10BIO_Previous, 156 154 }; 157 155 #endif
+178 -74
drivers/md/raid5.c
··· 488 488 return sh; 489 489 } 490 490 491 + /* Determine if 'data_offset' or 'new_data_offset' should be used 492 + * in this stripe_head. 493 + */ 494 + static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 495 + { 496 + sector_t progress = conf->reshape_progress; 497 + /* Need a memory barrier to make sure we see the value 498 + * of conf->generation, or ->data_offset that was set before 499 + * reshape_progress was updated. 500 + */ 501 + smp_rmb(); 502 + if (progress == MaxSector) 503 + return 0; 504 + if (sh->generation == conf->generation - 1) 505 + return 0; 506 + /* We are in a reshape, and this is a new-generation stripe, 507 + * so use new_data_offset. 508 + */ 509 + return 1; 510 + } 511 + 491 512 static void 492 513 raid5_end_read_request(struct bio *bi, int error); 493 514 static void ··· 539 518 replace_only = 1; 540 519 } else 541 520 continue; 521 + if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 522 + rw |= REQ_SYNC; 542 523 543 524 bi = &sh->dev[i].req; 544 525 rbi = &sh->dev[i].rreq; /* For writing to replacement */ ··· 626 603 __func__, (unsigned long long)sh->sector, 627 604 bi->bi_rw, i); 628 605 atomic_inc(&sh->count); 629 - bi->bi_sector = sh->sector + rdev->data_offset; 606 + if (use_new_offset(conf, sh)) 607 + bi->bi_sector = (sh->sector 608 + + rdev->new_data_offset); 609 + else 610 + bi->bi_sector = (sh->sector 611 + + rdev->data_offset); 630 612 bi->bi_flags = 1 << BIO_UPTODATE; 631 613 bi->bi_idx = 0; 632 614 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; ··· 655 627 __func__, (unsigned long long)sh->sector, 656 628 rbi->bi_rw, i); 657 629 atomic_inc(&sh->count); 658 - rbi->bi_sector = sh->sector + rrdev->data_offset; 630 + if (use_new_offset(conf, sh)) 631 + rbi->bi_sector = (sh->sector 632 + + rrdev->new_data_offset); 633 + else 634 + rbi->bi_sector = (sh->sector 635 + + rrdev->data_offset); 659 636 rbi->bi_flags = 1 << BIO_UPTODATE; 660 637 rbi->bi_idx = 0; 661 638 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; ··· 1147 
1114 dev->sector + STRIPE_SECTORS) { 1148 1115 if (wbi->bi_rw & REQ_FUA) 1149 1116 set_bit(R5_WantFUA, &dev->flags); 1117 + if (wbi->bi_rw & REQ_SYNC) 1118 + set_bit(R5_SyncIO, &dev->flags); 1150 1119 tx = async_copy_data(1, wbi, dev->page, 1151 1120 dev->sector, tx); 1152 1121 wbi = r5_next_bio(wbi, dev->sector); ··· 1166 1131 int pd_idx = sh->pd_idx; 1167 1132 int qd_idx = sh->qd_idx; 1168 1133 int i; 1169 - bool fua = false; 1134 + bool fua = false, sync = false; 1170 1135 1171 1136 pr_debug("%s: stripe %llu\n", __func__, 1172 1137 (unsigned long long)sh->sector); 1173 1138 1174 - for (i = disks; i--; ) 1139 + for (i = disks; i--; ) { 1175 1140 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1141 + sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1142 + } 1176 1143 1177 1144 for (i = disks; i--; ) { 1178 1145 struct r5dev *dev = &sh->dev[i]; ··· 1183 1146 set_bit(R5_UPTODATE, &dev->flags); 1184 1147 if (fua) 1185 1148 set_bit(R5_WantFUA, &dev->flags); 1149 + if (sync) 1150 + set_bit(R5_SyncIO, &dev->flags); 1186 1151 } 1187 1152 } 1188 1153 ··· 1687 1648 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1688 1649 char b[BDEVNAME_SIZE]; 1689 1650 struct md_rdev *rdev = NULL; 1690 - 1651 + sector_t s; 1691 1652 1692 1653 for (i=0 ; i<disks; i++) 1693 1654 if (bi == &sh->dev[i].req) ··· 1710 1671 if (!rdev) 1711 1672 rdev = conf->disks[i].rdev; 1712 1673 1674 + if (use_new_offset(conf, sh)) 1675 + s = sh->sector + rdev->new_data_offset; 1676 + else 1677 + s = sh->sector + rdev->data_offset; 1713 1678 if (uptodate) { 1714 1679 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1715 1680 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { ··· 1726 1683 "md/raid:%s: read error corrected" 1727 1684 " (%lu sectors at %llu on %s)\n", 1728 1685 mdname(conf->mddev), STRIPE_SECTORS, 1729 - (unsigned long long)(sh->sector 1730 - + rdev->data_offset), 1686 + (unsigned long long)s, 1731 1687 bdevname(rdev->bdev, b)); 1732 1688 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1733 
1689 clear_bit(R5_ReadError, &sh->dev[i].flags); ··· 1746 1704 "md/raid:%s: read error on replacement device " 1747 1705 "(sector %llu on %s).\n", 1748 1706 mdname(conf->mddev), 1749 - (unsigned long long)(sh->sector 1750 - + rdev->data_offset), 1707 + (unsigned long long)s, 1751 1708 bdn); 1752 1709 else if (conf->mddev->degraded >= conf->max_degraded) 1753 1710 printk_ratelimited( ··· 1754 1713 "md/raid:%s: read error not correctable " 1755 1714 "(sector %llu on %s).\n", 1756 1715 mdname(conf->mddev), 1757 - (unsigned long long)(sh->sector 1758 - + rdev->data_offset), 1716 + (unsigned long long)s, 1759 1717 bdn); 1760 1718 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1761 1719 /* Oh, no!!! */ ··· 1763 1723 "md/raid:%s: read error NOT corrected!! " 1764 1724 "(sector %llu on %s).\n", 1765 1725 mdname(conf->mddev), 1766 - (unsigned long long)(sh->sector 1767 - + rdev->data_offset), 1726 + (unsigned long long)s, 1768 1727 bdn); 1769 1728 else if (atomic_read(&rdev->read_errors) 1770 1729 > conf->max_nr_stripes) ··· 3600 3561 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3601 3562 rdev = conf->disks[i].rdev; 3602 3563 rdev_clear_badblocks(rdev, sh->sector, 3603 - STRIPE_SECTORS); 3564 + STRIPE_SECTORS, 0); 3604 3565 rdev_dec_pending(rdev, conf->mddev); 3605 3566 } 3606 3567 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { ··· 3609 3570 /* rdev have been moved down */ 3610 3571 rdev = conf->disks[i].rdev; 3611 3572 rdev_clear_badblocks(rdev, sh->sector, 3612 - STRIPE_SECTORS); 3573 + STRIPE_SECTORS, 0); 3613 3574 rdev_dec_pending(rdev, conf->mddev); 3614 3575 } 3615 3576 } ··· 3881 3842 raid_bio->bi_next = (void*)rdev; 3882 3843 align_bi->bi_bdev = rdev->bdev; 3883 3844 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3845 + /* No reshape active, so we can trust rdev->data_offset */ 3884 3846 align_bi->bi_sector += rdev->data_offset; 3885 3847 3886 3848 if (!bio_fits_rdev(align_bi) || ··· 3993 3953 plugged = mddev_check_plugged(mddev); 3994 3954 for 
(;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3995 3955 DEFINE_WAIT(w); 3996 - int disks, data_disks; 3997 3956 int previous; 3998 3957 3999 3958 retry: 4000 3959 previous = 0; 4001 - disks = conf->raid_disks; 4002 3960 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4003 3961 if (unlikely(conf->reshape_progress != MaxSector)) { 4004 3962 /* spinlock is needed as reshape_progress may be ··· 4008 3970 * to check again. 4009 3971 */ 4010 3972 spin_lock_irq(&conf->device_lock); 4011 - if (mddev->delta_disks < 0 3973 + if (mddev->reshape_backwards 4012 3974 ? logical_sector < conf->reshape_progress 4013 3975 : logical_sector >= conf->reshape_progress) { 4014 - disks = conf->previous_raid_disks; 4015 3976 previous = 1; 4016 3977 } else { 4017 - if (mddev->delta_disks < 0 3978 + if (mddev->reshape_backwards 4018 3979 ? logical_sector < conf->reshape_safe 4019 3980 : logical_sector >= conf->reshape_safe) { 4020 3981 spin_unlock_irq(&conf->device_lock); ··· 4023 3986 } 4024 3987 spin_unlock_irq(&conf->device_lock); 4025 3988 } 4026 - data_disks = disks - conf->max_degraded; 4027 3989 4028 3990 new_sector = raid5_compute_sector(conf, logical_sector, 4029 3991 previous, ··· 4045 4009 */ 4046 4010 int must_retry = 0; 4047 4011 spin_lock_irq(&conf->device_lock); 4048 - if (mddev->delta_disks < 0 4012 + if (mddev->reshape_backwards 4049 4013 ? 
logical_sector >= conf->reshape_progress 4050 4014 : logical_sector < conf->reshape_progress) 4051 4015 /* mismatch, need to try again */ ··· 4144 4108 4145 4109 if (sector_nr == 0) { 4146 4110 /* If restarting in the middle, skip the initial sectors */ 4147 - if (mddev->delta_disks < 0 && 4111 + if (mddev->reshape_backwards && 4148 4112 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4149 4113 sector_nr = raid5_size(mddev, 0, 0) 4150 4114 - conf->reshape_progress; 4151 - } else if (mddev->delta_disks >= 0 && 4115 + } else if (!mddev->reshape_backwards && 4152 4116 conf->reshape_progress > 0) 4153 4117 sector_nr = conf->reshape_progress; 4154 4118 sector_div(sector_nr, new_data_disks); ··· 4169 4133 else 4170 4134 reshape_sectors = mddev->chunk_sectors; 4171 4135 4172 - /* we update the metadata when there is more than 3Meg 4173 - * in the block range (that is rather arbitrary, should 4174 - * probably be time based) or when the data about to be 4175 - * copied would over-write the source of the data at 4176 - * the front of the range. 4177 - * i.e. one new_stripe along from reshape_progress new_maps 4178 - * to after where reshape_safe old_maps to 4136 + /* We update the metadata at least every 10 seconds, or when 4137 + * the data about to be copied would over-write the source of 4138 + * the data at the front of the range. i.e. 
one new_stripe 4139 + * along from reshape_progress new_maps to after where 4140 + * reshape_safe old_maps to 4179 4141 */ 4180 4142 writepos = conf->reshape_progress; 4181 4143 sector_div(writepos, new_data_disks); ··· 4181 4147 sector_div(readpos, data_disks); 4182 4148 safepos = conf->reshape_safe; 4183 4149 sector_div(safepos, data_disks); 4184 - if (mddev->delta_disks < 0) { 4150 + if (mddev->reshape_backwards) { 4185 4151 writepos -= min_t(sector_t, reshape_sectors, writepos); 4186 4152 readpos += reshape_sectors; 4187 4153 safepos += reshape_sectors; ··· 4191 4157 safepos -= min_t(sector_t, reshape_sectors, safepos); 4192 4158 } 4193 4159 4160 + /* Having calculated the 'writepos' possibly use it 4161 + * to set 'stripe_addr' which is where we will write to. 4162 + */ 4163 + if (mddev->reshape_backwards) { 4164 + BUG_ON(conf->reshape_progress == 0); 4165 + stripe_addr = writepos; 4166 + BUG_ON((mddev->dev_sectors & 4167 + ~((sector_t)reshape_sectors - 1)) 4168 + - reshape_sectors - stripe_addr 4169 + != sector_nr); 4170 + } else { 4171 + BUG_ON(writepos != sector_nr + reshape_sectors); 4172 + stripe_addr = sector_nr; 4173 + } 4174 + 4194 4175 /* 'writepos' is the most advanced device address we might write. 4195 4176 * 'readpos' is the least advanced device address we might read. 4196 4177 * 'safepos' is the least address recorded in the metadata as having 4197 4178 * been reshaped. 4198 - * If 'readpos' is behind 'writepos', then there is no way that we can 4179 + * If there is a min_offset_diff, these are adjusted either by 4180 + * increasing the safepos/readpos if diff is negative, or 4181 + * increasing writepos if diff is positive. 4182 + * If 'readpos' is then behind 'writepos', there is no way that we can 4199 4183 * ensure safety in the face of a crash - that must be done by userspace 4200 4184 * making a backup of the data. So in that case there is no particular 4201 4185 * rush to update metadata. 
··· 4226 4174 * Maybe that number should be configurable, but I'm not sure it is 4227 4175 * worth it.... maybe it could be a multiple of safemode_delay??? 4228 4176 */ 4229 - if ((mddev->delta_disks < 0 4177 + if (conf->min_offset_diff < 0) { 4178 + safepos += -conf->min_offset_diff; 4179 + readpos += -conf->min_offset_diff; 4180 + } else 4181 + writepos += conf->min_offset_diff; 4182 + 4183 + if ((mddev->reshape_backwards 4230 4184 ? (safepos > writepos && readpos < writepos) 4231 4185 : (safepos < writepos && readpos > writepos)) || 4232 4186 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { ··· 4253 4195 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4254 4196 } 4255 4197 4256 - if (mddev->delta_disks < 0) { 4257 - BUG_ON(conf->reshape_progress == 0); 4258 - stripe_addr = writepos; 4259 - BUG_ON((mddev->dev_sectors & 4260 - ~((sector_t)reshape_sectors - 1)) 4261 - - reshape_sectors - stripe_addr 4262 - != sector_nr); 4263 - } else { 4264 - BUG_ON(writepos != sector_nr + reshape_sectors); 4265 - stripe_addr = sector_nr; 4266 - } 4267 4198 INIT_LIST_HEAD(&stripes); 4268 4199 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4269 4200 int j; ··· 4286 4239 list_add(&sh->lru, &stripes); 4287 4240 } 4288 4241 spin_lock_irq(&conf->device_lock); 4289 - if (mddev->delta_disks < 0) 4242 + if (mddev->reshape_backwards) 4290 4243 conf->reshape_progress -= reshape_sectors * new_data_disks; 4291 4244 else 4292 4245 conf->reshape_progress += reshape_sectors * new_data_disks; ··· 4999 4952 struct md_rdev *rdev; 5000 4953 sector_t reshape_offset = 0; 5001 4954 int i; 4955 + long long min_offset_diff = 0; 4956 + int first = 1; 5002 4957 5003 4958 if (mddev->recovery_cp != MaxSector) 5004 4959 printk(KERN_NOTICE "md/raid:%s: not clean" 5005 4960 " -- starting background reconstruction\n", 5006 4961 mdname(mddev)); 4962 + 4963 + rdev_for_each(rdev, mddev) { 4964 + long long diff; 4965 + if (rdev->raid_disk < 0) 4966 + continue; 4967 + diff = 
(rdev->new_data_offset - rdev->data_offset); 4968 + if (first) { 4969 + min_offset_diff = diff; 4970 + first = 0; 4971 + } else if (mddev->reshape_backwards && 4972 + diff < min_offset_diff) 4973 + min_offset_diff = diff; 4974 + else if (!mddev->reshape_backwards && 4975 + diff > min_offset_diff) 4976 + min_offset_diff = diff; 4977 + } 4978 + 5007 4979 if (mddev->reshape_position != MaxSector) { 5008 4980 /* Check that we can continue the reshape. 5009 - * Currently only disks can change, it must 5010 - * increase, and we must be past the point where 5011 - * a stripe over-writes itself 4981 + * Difficulties arise if the stripe we would write to 4982 + * next is at or after the stripe we would read from next. 4983 + * For a reshape that changes the number of devices, this 4984 + * is only possible for a very short time, and mdadm makes 4985 + * sure that time appears to have past before assembling 4986 + * the array. So we fail if that time hasn't passed. 4987 + * For a reshape that keeps the number of devices the same 4988 + * mdadm must be monitoring the reshape can keeping the 4989 + * critical areas read-only and backed up. It will start 4990 + * the array in read-only mode, so we check for that. 5012 4991 */ 5013 4992 sector_t here_new, here_old; 5014 4993 int old_disks; ··· 5066 4993 /* here_old is the first stripe that we might need to read 5067 4994 * from */ 5068 4995 if (mddev->delta_disks == 0) { 4996 + if ((here_new * mddev->new_chunk_sectors != 4997 + here_old * mddev->chunk_sectors)) { 4998 + printk(KERN_ERR "md/raid:%s: reshape position is" 4999 + " confused - aborting\n", mdname(mddev)); 5000 + return -EINVAL; 5001 + } 5069 5002 /* We cannot be sure it is safe to start an in-place 5070 - * reshape. It is only safe if user-space if monitoring 5003 + * reshape. It is only safe if user-space is monitoring 5071 5004 * and taking constant backups. 
5072 5005 * mdadm always starts a situation like this in 5073 5006 * readonly mode so it can take control before 5074 5007 * allowing any writes. So just check for that. 5075 5008 */ 5076 - if ((here_new * mddev->new_chunk_sectors != 5077 - here_old * mddev->chunk_sectors) || 5078 - mddev->ro == 0) { 5079 - printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 5080 - " in read-only mode - aborting\n", 5009 + if (abs(min_offset_diff) >= mddev->chunk_sectors && 5010 + abs(min_offset_diff) >= mddev->new_chunk_sectors) 5011 + /* not really in-place - so OK */; 5012 + else if (mddev->ro == 0) { 5013 + printk(KERN_ERR "md/raid:%s: in-place reshape " 5014 + "must be started in read-only mode " 5015 + "- aborting\n", 5081 5016 mdname(mddev)); 5082 5017 return -EINVAL; 5083 5018 } 5084 - } else if (mddev->delta_disks < 0 5085 - ? (here_new * mddev->new_chunk_sectors <= 5019 + } else if (mddev->reshape_backwards 5020 + ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 5086 5021 here_old * mddev->chunk_sectors) 5087 5022 : (here_new * mddev->new_chunk_sectors >= 5088 - here_old * mddev->chunk_sectors)) { 5023 + here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5089 5024 /* Reading from the same stripe as writing to - bad */ 5090 5025 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5091 5026 "auto-recovery - aborting.\n", ··· 5118 5037 if (IS_ERR(conf)) 5119 5038 return PTR_ERR(conf); 5120 5039 5040 + conf->min_offset_diff = min_offset_diff; 5121 5041 mddev->thread = conf->thread; 5122 5042 conf->thread = NULL; 5123 5043 mddev->private = conf; ··· 5264 5182 blk_queue_io_opt(mddev->queue, chunk_size * 5265 5183 (conf->raid_disks - conf->max_degraded)); 5266 5184 5267 - rdev_for_each(rdev, mddev) 5185 + rdev_for_each(rdev, mddev) { 5268 5186 disk_stack_limits(mddev->gendisk, rdev->bdev, 5269 5187 rdev->data_offset << 9); 5188 + disk_stack_limits(mddev->gendisk, rdev->bdev, 5189 + rdev->new_data_offset << 9); 5190 + } 5270 5191 } 5271 5192 
5272 5193 return 0; ··· 5503 5418 * any io in the removed space completes, but it hardly seems 5504 5419 * worth it. 5505 5420 */ 5421 + sector_t newsize; 5506 5422 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5507 - md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5508 - mddev->raid_disks)); 5509 - if (mddev->array_sectors > 5510 - raid5_size(mddev, sectors, mddev->raid_disks)) 5423 + newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5424 + if (mddev->external_size && 5425 + mddev->array_sectors > newsize) 5511 5426 return -EINVAL; 5427 + if (mddev->bitmap) { 5428 + int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5429 + if (ret) 5430 + return ret; 5431 + } 5432 + md_set_array_sectors(mddev, newsize); 5512 5433 set_capacity(mddev->gendisk, mddev->array_sectors); 5513 5434 revalidate_disk(mddev->gendisk); 5514 5435 if (sectors > mddev->dev_sectors && ··· 5559 5468 mddev->new_layout == mddev->layout && 5560 5469 mddev->new_chunk_sectors == mddev->chunk_sectors) 5561 5470 return 0; /* nothing to do */ 5562 - if (mddev->bitmap) 5563 - /* Cannot grow a bitmap yet */ 5564 - return -EBUSY; 5565 5471 if (has_failed(conf)) 5566 5472 return -EINVAL; 5567 5473 if (mddev->delta_disks < 0) { ··· 5593 5505 if (!check_stripe_cache(mddev)) 5594 5506 return -ENOSPC; 5595 5507 5596 - rdev_for_each(rdev, mddev) 5508 + if (has_failed(conf)) 5509 + return -EINVAL; 5510 + 5511 + rdev_for_each(rdev, mddev) { 5597 5512 if (!test_bit(In_sync, &rdev->flags) 5598 5513 && !test_bit(Faulty, &rdev->flags)) 5599 5514 spares++; 5515 + } 5600 5516 5601 5517 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5602 5518 /* Not enough devices even to make a degraded array ··· 5627 5535 conf->chunk_sectors = mddev->new_chunk_sectors; 5628 5536 conf->prev_algo = conf->algorithm; 5629 5537 conf->algorithm = mddev->new_layout; 5630 - if (mddev->delta_disks < 0) 5538 + conf->generation++; 5539 + /* Code that selects data_offset needs to see the generation 
update 5540 + * if reshape_progress has been set - so a memory barrier needed. 5541 + */ 5542 + smp_mb(); 5543 + if (mddev->reshape_backwards) 5631 5544 conf->reshape_progress = raid5_size(mddev, 0, 0); 5632 5545 else 5633 5546 conf->reshape_progress = 0; 5634 5547 conf->reshape_safe = conf->reshape_progress; 5635 - conf->generation++; 5636 5548 spin_unlock_irq(&conf->device_lock); 5637 5549 5638 5550 /* Add some new drives, as many as will fit. ··· 5688 5592 mddev->recovery = 0; 5689 5593 spin_lock_irq(&conf->device_lock); 5690 5594 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5595 + rdev_for_each(rdev, mddev) 5596 + rdev->new_data_offset = rdev->data_offset; 5597 + smp_wmb(); 5691 5598 conf->reshape_progress = MaxSector; 5692 5599 mddev->reshape_position = MaxSector; 5693 5600 spin_unlock_irq(&conf->device_lock); ··· 5709 5610 { 5710 5611 5711 5612 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5613 + struct md_rdev *rdev; 5712 5614 5713 5615 spin_lock_irq(&conf->device_lock); 5714 5616 conf->previous_raid_disks = conf->raid_disks; 5617 + rdev_for_each(rdev, conf->mddev) 5618 + rdev->data_offset = rdev->new_data_offset; 5619 + smp_wmb(); 5715 5620 conf->reshape_progress = MaxSector; 5716 5621 spin_unlock_irq(&conf->device_lock); 5717 5622 wake_up(&conf->wait_for_overlap); ··· 5755 5652 d < conf->raid_disks - mddev->delta_disks; 5756 5653 d++) { 5757 5654 struct md_rdev *rdev = conf->disks[d].rdev; 5758 - if (rdev && 5759 - raid5_remove_disk(mddev, rdev) == 0) { 5760 - sysfs_unlink_rdev(mddev, rdev); 5761 - rdev->raid_disk = -1; 5762 - } 5655 + if (rdev) 5656 + clear_bit(In_sync, &rdev->flags); 5657 + rdev = conf->disks[d].replacement; 5658 + if (rdev) 5659 + clear_bit(In_sync, &rdev->flags); 5763 5660 } 5764 5661 } 5765 5662 mddev->layout = conf->algorithm; 5766 5663 mddev->chunk_sectors = conf->chunk_sectors; 5767 5664 mddev->reshape_position = MaxSector; 5768 5665 mddev->delta_disks = 0; 5666 + mddev->reshape_backwards = 0; 5769 
5667 } 5770 5668 } 5771 5669
+7
drivers/md/raid5.h
··· 285 285 */ 286 286 R5_Wantdrain, /* dev->towrite needs to be drained */ 287 287 R5_WantFUA, /* Write should be FUA */ 288 + R5_SyncIO, /* The IO is sync */ 288 289 R5_WriteError, /* got a write error - need to record it */ 289 290 R5_MadeGood, /* A bad block has been fixed by writing to it */ 290 291 R5_ReadRepl, /* Will/did read from replacement rather than orig */ ··· 386 385 short generation; /* increments with every reshape */ 387 386 unsigned long reshape_checkpoint; /* Time we last updated 388 387 * metadata */ 388 + long long min_offset_diff; /* minimum difference between 389 + * data_offset and 390 + * new_data_offset across all 391 + * devices. May be negative, 392 + * but is closest to zero. 393 + */ 389 394 390 395 struct list_head handle_list; /* stripes needing handling */ 391 396 struct list_head hold_list; /* preread ready stripes */
+13 -2
include/linux/raid/md_p.h
··· 233 233 __le32 delta_disks; /* change in number of raid_disks */ 234 234 __le32 new_layout; /* new layout */ 235 235 __le32 new_chunk; /* new chunk size (512byte sectors) */ 236 - __u8 pad1[128-124]; /* set to 0 when written */ 236 + __le32 new_offset; /* signed number to add to data_offset in new 237 + * layout. 0 == no-change. This can be 238 + * different on each device in the array. 239 + */ 237 240 238 241 /* constant this-device information - 64 bytes */ 239 242 __le64 data_offset; /* sector start of data, often 0 */ ··· 284 281 * active device with same 'role'. 285 282 * 'recovery_offset' is also set. 286 283 */ 284 + #define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number 285 + * of devices, but is going 286 + * backwards anyway. 287 + */ 288 + #define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ 287 289 #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ 288 290 |MD_FEATURE_RECOVERY_OFFSET \ 289 291 |MD_FEATURE_RESHAPE_ACTIVE \ 290 292 |MD_FEATURE_BAD_BLOCKS \ 291 - |MD_FEATURE_REPLACEMENT) 293 + |MD_FEATURE_REPLACEMENT \ 294 + |MD_FEATURE_RESHAPE_BACKWARDS \ 295 + |MD_FEATURE_NEW_OFFSET \ 296 + ) 292 297 293 298 #endif
+16 -2
include/linux/raid/pq.h
··· 99 99 extern const struct raid6_calls raid6_altivec4; 100 100 extern const struct raid6_calls raid6_altivec8; 101 101 102 + struct raid6_recov_calls { 103 + void (*data2)(int, size_t, int, int, void **); 104 + void (*datap)(int, size_t, int, void **); 105 + int (*valid)(void); 106 + const char *name; 107 + int priority; 108 + }; 109 + 110 + extern const struct raid6_recov_calls raid6_recov_intx1; 111 + extern const struct raid6_recov_calls raid6_recov_ssse3; 112 + 102 113 /* Algorithm list */ 103 114 extern const struct raid6_calls * const raid6_algos[]; 115 + extern const struct raid6_recov_calls *const raid6_recov_algos[]; 104 116 int raid6_select_algo(void); 105 117 106 118 /* Return values from chk_syndrome */ ··· 123 111 124 112 /* Galois field tables */ 125 113 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); 114 + extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256))); 126 115 extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); 127 116 extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); 128 117 extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); 129 118 130 119 /* Recovery routines */ 131 - void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 120 + extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb, 132 121 void **ptrs); 133 - void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); 122 + extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila, 123 + void **ptrs); 134 124 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, 135 125 void **ptrs); 136 126
+1 -1
lib/raid6/Makefile
··· 1 1 obj-$(CONFIG_RAID6_PQ) += raid6_pq.o 2 2 3 - raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ 3 + raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ 4 4 int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ 5 5 altivec8.o mmx.o sse1.o sse2.o 6 6 hostprogs-y += mktables
+87 -40
lib/raid6/algos.c
··· 17 17 */ 18 18 19 19 #include <linux/raid/pq.h> 20 - #include <linux/module.h> 21 20 #ifndef __KERNEL__ 22 21 #include <sys/mman.h> 23 22 #include <stdio.h> 24 23 #else 24 + #include <linux/module.h> 25 25 #include <linux/gfp.h> 26 26 #if !RAID6_USE_EMPTY_ZERO_PAGE 27 27 /* In .bss so it's zeroed */ ··· 34 34 EXPORT_SYMBOL_GPL(raid6_call); 35 35 36 36 const struct raid6_calls * const raid6_algos[] = { 37 - &raid6_intx1, 38 - &raid6_intx2, 39 - &raid6_intx4, 40 - &raid6_intx8, 41 37 #if defined(__ia64__) 42 38 &raid6_intx16, 43 39 &raid6_intx32, ··· 57 61 &raid6_altivec4, 58 62 &raid6_altivec8, 59 63 #endif 64 + &raid6_intx1, 65 + &raid6_intx2, 66 + &raid6_intx4, 67 + &raid6_intx8, 68 + NULL 69 + }; 70 + 71 + void (*raid6_2data_recov)(int, size_t, int, int, void **); 72 + EXPORT_SYMBOL_GPL(raid6_2data_recov); 73 + 74 + void (*raid6_datap_recov)(int, size_t, int, void **); 75 + EXPORT_SYMBOL_GPL(raid6_datap_recov); 76 + 77 + const struct raid6_recov_calls *const raid6_recov_algos[] = { 78 + #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) 79 + &raid6_recov_ssse3, 80 + #endif 81 + &raid6_recov_intx1, 60 82 NULL 61 83 }; 62 84 ··· 86 72 #define time_before(x, y) ((x) < (y)) 87 73 #endif 88 74 89 - /* Try to pick the best algorithm */ 90 - /* This code uses the gfmul table as convenient data set to abuse */ 91 - 92 - int __init raid6_select_algo(void) 75 + static inline const struct raid6_recov_calls *raid6_choose_recov(void) 93 76 { 94 - const struct raid6_calls * const * algo; 95 - const struct raid6_calls * best; 96 - char *syndromes; 97 - void *dptrs[(65536/PAGE_SIZE)+2]; 98 - int i, disks; 99 - unsigned long perf, bestperf; 100 - int bestprefer; 101 - unsigned long j0, j1; 77 + const struct raid6_recov_calls *const *algo; 78 + const struct raid6_recov_calls *best; 102 79 103 - disks = (65536/PAGE_SIZE)+2; 104 - for ( i = 0 ; i < disks-2 ; i++ ) { 105 - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; 106 - } 80 + for (best = NULL, algo = 
raid6_recov_algos; *algo; algo++) 81 + if (!best || (*algo)->priority > best->priority) 82 + if (!(*algo)->valid || (*algo)->valid()) 83 + best = *algo; 107 84 108 - /* Normal code - use a 2-page allocation to avoid D$ conflict */ 109 - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); 85 + if (best) { 86 + raid6_2data_recov = best->data2; 87 + raid6_datap_recov = best->datap; 110 88 111 - if ( !syndromes ) { 112 - printk("raid6: Yikes! No memory available.\n"); 113 - return -ENOMEM; 114 - } 89 + printk("raid6: using %s recovery algorithm\n", best->name); 90 + } else 91 + printk("raid6: Yikes! No recovery algorithm found!\n"); 115 92 116 - dptrs[disks-2] = syndromes; 117 - dptrs[disks-1] = syndromes + PAGE_SIZE; 93 + return best; 94 + } 118 95 119 - bestperf = 0; bestprefer = 0; best = NULL; 96 + static inline const struct raid6_calls *raid6_choose_gen( 97 + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) 98 + { 99 + unsigned long perf, bestperf, j0, j1; 100 + const struct raid6_calls *const *algo; 101 + const struct raid6_calls *best; 120 102 121 - for ( algo = raid6_algos ; *algo ; algo++ ) { 122 - if ( !(*algo)->valid || (*algo)->valid() ) { 103 + for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { 104 + if (!best || (*algo)->prefer >= best->prefer) { 105 + if ((*algo)->valid && !(*algo)->valid()) 106 + continue; 107 + 123 108 perf = 0; 124 109 125 110 preempt_disable(); 126 111 j0 = jiffies; 127 - while ( (j1 = jiffies) == j0 ) 112 + while ((j1 = jiffies) == j0) 128 113 cpu_relax(); 129 114 while (time_before(jiffies, 130 115 j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { 131 - (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); 116 + (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); 132 117 perf++; 133 118 } 134 119 preempt_enable(); 135 120 136 - if ( (*algo)->prefer > bestprefer || 137 - ((*algo)->prefer == bestprefer && 138 - perf > bestperf) ) { 139 - best = *algo; 140 - bestprefer = best->prefer; 121 + if (perf > bestperf) { 141 122 
bestperf = perf; 123 + best = *algo; 142 124 } 143 125 printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, 144 126 (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); ··· 149 139 } else 150 140 printk("raid6: Yikes! No algorithm found!\n"); 151 141 142 + return best; 143 + } 144 + 145 + 146 + /* Try to pick the best algorithm */ 147 + /* This code uses the gfmul table as convenient data set to abuse */ 148 + 149 + int __init raid6_select_algo(void) 150 + { 151 + const int disks = (65536/PAGE_SIZE)+2; 152 + 153 + const struct raid6_calls *gen_best; 154 + const struct raid6_recov_calls *rec_best; 155 + char *syndromes; 156 + void *dptrs[(65536/PAGE_SIZE)+2]; 157 + int i; 158 + 159 + for (i = 0; i < disks-2; i++) 160 + dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; 161 + 162 + /* Normal code - use a 2-page allocation to avoid D$ conflict */ 163 + syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); 164 + 165 + if (!syndromes) { 166 + printk("raid6: Yikes! No memory available.\n"); 167 + return -ENOMEM; 168 + } 169 + 170 + dptrs[disks-2] = syndromes; 171 + dptrs[disks-1] = syndromes + PAGE_SIZE; 172 + 173 + /* select raid gen_syndrome function */ 174 + gen_best = raid6_choose_gen(&dptrs, disks); 175 + 176 + /* select raid recover functions */ 177 + rec_best = raid6_choose_recov(); 178 + 152 179 free_pages((unsigned long)syndromes, 1); 153 180 154 - return best ? 0 : -EINVAL; 181 + return gen_best && rec_best ? 0 : -EINVAL; 155 182 } 156 183 157 184 static void raid6_exit(void)
+25
lib/raid6/mktables.c
··· 81 81 printf("EXPORT_SYMBOL(raid6_gfmul);\n"); 82 82 printf("#endif\n"); 83 83 84 + /* Compute vector multiplication table */ 85 + printf("\nconst u8 __attribute__((aligned(256)))\n" 86 + "raid6_vgfmul[256][32] =\n" 87 + "{\n"); 88 + for (i = 0; i < 256; i++) { 89 + printf("\t{\n"); 90 + for (j = 0; j < 16; j += 8) { 91 + printf("\t\t"); 92 + for (k = 0; k < 8; k++) 93 + printf("0x%02x,%c", gfmul(i, j + k), 94 + (k == 7) ? '\n' : ' '); 95 + } 96 + for (j = 0; j < 16; j += 8) { 97 + printf("\t\t"); 98 + for (k = 0; k < 8; k++) 99 + printf("0x%02x,%c", gfmul(i, (j + k) << 4), 100 + (k == 7) ? '\n' : ' '); 101 + } 102 + printf("\t},\n"); 103 + } 104 + printf("};\n"); 105 + printf("#ifdef __KERNEL__\n"); 106 + printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); 107 + printf("#endif\n"); 108 + 84 109 /* Compute power-of-2 table (exponent) */ 85 110 v = 1; 86 111 printf("\nconst u8 __attribute__((aligned(256)))\n"
+11 -4
lib/raid6/recov.c
··· 22 22 #include <linux/raid/pq.h> 23 23 24 24 /* Recover two failed data blocks. */ 25 - void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 25 + void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, 26 26 void **ptrs) 27 27 { 28 28 u8 *p, *q, *dp, *dq; ··· 64 64 p++; q++; 65 65 } 66 66 } 67 - EXPORT_SYMBOL_GPL(raid6_2data_recov); 68 67 69 68 /* Recover failure of one data block plus the P block */ 70 - void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) 69 + void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) 71 70 { 72 71 u8 *p, *q, *dq; 73 72 const u8 *qmul; /* Q multiplier table */ ··· 95 96 q++; dq++; 96 97 } 97 98 } 98 - EXPORT_SYMBOL_GPL(raid6_datap_recov); 99 + 100 + 101 + const struct raid6_recov_calls raid6_recov_intx1 = { 102 + .data2 = raid6_2data_recov_intx1, 103 + .datap = raid6_datap_recov_intx1, 104 + .valid = NULL, 105 + .name = "intx1", 106 + .priority = 0, 107 + }; 99 108 100 109 #ifndef __KERNEL__ 101 110 /* Testing only */
+335
lib/raid6/recov_ssse3.c
··· 1 + /* 2 + * Copyright (C) 2012 Intel Corporation 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; version 2 7 + * of the License. 8 + */ 9 + 10 + #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) 11 + 12 + #include <linux/raid/pq.h> 13 + #include "x86.h" 14 + 15 + static int raid6_has_ssse3(void) 16 + { 17 + return boot_cpu_has(X86_FEATURE_XMM) && 18 + boot_cpu_has(X86_FEATURE_XMM2) && 19 + boot_cpu_has(X86_FEATURE_SSSE3); 20 + } 21 + 22 + void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, 23 + void **ptrs) 24 + { 25 + u8 *p, *q, *dp, *dq; 26 + const u8 *pbmul; /* P multiplier table for B data */ 27 + const u8 *qmul; /* Q multiplier table (for both) */ 28 + static const u8 __aligned(16) x0f[16] = { 29 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 30 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; 31 + 32 + p = (u8 *)ptrs[disks-2]; 33 + q = (u8 *)ptrs[disks-1]; 34 + 35 + /* Compute syndrome with zero for the missing data pages 36 + Use the dead data pages as temporary storage for 37 + delta p and delta q */ 38 + dp = (u8 *)ptrs[faila]; 39 + ptrs[faila] = (void *)raid6_empty_zero_page; 40 + ptrs[disks-2] = dp; 41 + dq = (u8 *)ptrs[failb]; 42 + ptrs[failb] = (void *)raid6_empty_zero_page; 43 + ptrs[disks-1] = dq; 44 + 45 + raid6_call.gen_syndrome(disks, bytes, ptrs); 46 + 47 + /* Restore pointer table */ 48 + ptrs[faila] = dp; 49 + ptrs[failb] = dq; 50 + ptrs[disks-2] = p; 51 + ptrs[disks-1] = q; 52 + 53 + /* Now, pick the proper data tables */ 54 + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; 55 + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ 56 + raid6_gfexp[failb]]]; 57 + 58 + kernel_fpu_begin(); 59 + 60 + asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); 61 + 62 + #ifdef CONFIG_X86_64 63 + asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); 64 + asm 
volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); 65 + asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); 66 + #endif 67 + 68 + /* Now do it... */ 69 + while (bytes) { 70 + #ifdef CONFIG_X86_64 71 + /* xmm6, xmm14, xmm15 */ 72 + 73 + asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); 74 + asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); 75 + asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); 76 + asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); 77 + asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); 78 + asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); 79 + asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); 80 + asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); 81 + 82 + /* xmm0/8 = px */ 83 + 84 + asm volatile("movdqa %xmm6,%xmm4"); 85 + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); 86 + asm volatile("movdqa %xmm6,%xmm12"); 87 + asm volatile("movdqa %xmm5,%xmm13"); 88 + asm volatile("movdqa %xmm1,%xmm3"); 89 + asm volatile("movdqa %xmm9,%xmm11"); 90 + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ 91 + asm volatile("movdqa %xmm8,%xmm10"); 92 + asm volatile("psraw $4,%xmm1"); 93 + asm volatile("psraw $4,%xmm9"); 94 + asm volatile("pand %xmm7,%xmm3"); 95 + asm volatile("pand %xmm7,%xmm11"); 96 + asm volatile("pand %xmm7,%xmm1"); 97 + asm volatile("pand %xmm7,%xmm9"); 98 + asm volatile("pshufb %xmm3,%xmm4"); 99 + asm volatile("pshufb %xmm11,%xmm12"); 100 + asm volatile("pshufb %xmm1,%xmm5"); 101 + asm volatile("pshufb %xmm9,%xmm13"); 102 + asm volatile("pxor %xmm4,%xmm5"); 103 + asm volatile("pxor %xmm12,%xmm13"); 104 + 105 + /* xmm5/13 = qx */ 106 + 107 + asm volatile("movdqa %xmm14,%xmm4"); 108 + asm volatile("movdqa %xmm15,%xmm1"); 109 + asm volatile("movdqa %xmm14,%xmm12"); 110 + asm volatile("movdqa %xmm15,%xmm9"); 111 + asm volatile("movdqa %xmm2,%xmm3"); 112 + asm volatile("movdqa %xmm10,%xmm11"); 113 + asm volatile("psraw $4,%xmm2"); 114 + asm volatile("psraw $4,%xmm10"); 115 + asm volatile("pand %xmm7,%xmm3"); 116 + asm volatile("pand 
%xmm7,%xmm11"); 117 + asm volatile("pand %xmm7,%xmm2"); 118 + asm volatile("pand %xmm7,%xmm10"); 119 + asm volatile("pshufb %xmm3,%xmm4"); 120 + asm volatile("pshufb %xmm11,%xmm12"); 121 + asm volatile("pshufb %xmm2,%xmm1"); 122 + asm volatile("pshufb %xmm10,%xmm9"); 123 + asm volatile("pxor %xmm4,%xmm1"); 124 + asm volatile("pxor %xmm12,%xmm9"); 125 + 126 + /* xmm1/9 = pbmul[px] */ 127 + asm volatile("pxor %xmm5,%xmm1"); 128 + asm volatile("pxor %xmm13,%xmm9"); 129 + /* xmm1/9 = db = DQ */ 130 + asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); 131 + asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); 132 + 133 + asm volatile("pxor %xmm1,%xmm0"); 134 + asm volatile("pxor %xmm9,%xmm8"); 135 + asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); 136 + asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); 137 + 138 + bytes -= 32; 139 + p += 32; 140 + q += 32; 141 + dp += 32; 142 + dq += 32; 143 + #else 144 + asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); 145 + asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); 146 + asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); 147 + asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); 148 + 149 + /* 1 = dq ^ q 150 + * 0 = dp ^ p 151 + */ 152 + asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); 153 + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); 154 + 155 + asm volatile("movdqa %xmm1,%xmm3"); 156 + asm volatile("psraw $4,%xmm1"); 157 + asm volatile("pand %xmm7,%xmm3"); 158 + asm volatile("pand %xmm7,%xmm1"); 159 + asm volatile("pshufb %xmm3,%xmm4"); 160 + asm volatile("pshufb %xmm1,%xmm5"); 161 + asm volatile("pxor %xmm4,%xmm5"); 162 + 163 + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ 164 + 165 + /* xmm5 = qx */ 166 + 167 + asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); 168 + asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); 169 + asm volatile("movdqa %xmm2,%xmm3"); 170 + asm volatile("psraw $4,%xmm2"); 171 + asm volatile("pand %xmm7,%xmm3"); 172 + asm volatile("pand %xmm7,%xmm2"); 173 + asm volatile("pshufb %xmm3,%xmm4"); 174 + 
asm volatile("pshufb %xmm2,%xmm1"); 175 + asm volatile("pxor %xmm4,%xmm1"); 176 + 177 + /* xmm1 = pbmul[px] */ 178 + asm volatile("pxor %xmm5,%xmm1"); 179 + /* xmm1 = db = DQ */ 180 + asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); 181 + 182 + asm volatile("pxor %xmm1,%xmm0"); 183 + asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); 184 + 185 + bytes -= 16; 186 + p += 16; 187 + q += 16; 188 + dp += 16; 189 + dq += 16; 190 + #endif 191 + } 192 + 193 + kernel_fpu_end(); 194 + } 195 + 196 + 197 + void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) 198 + { 199 + u8 *p, *q, *dq; 200 + const u8 *qmul; /* Q multiplier table */ 201 + static const u8 __aligned(16) x0f[16] = { 202 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 203 + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; 204 + 205 + p = (u8 *)ptrs[disks-2]; 206 + q = (u8 *)ptrs[disks-1]; 207 + 208 + /* Compute syndrome with zero for the missing data page 209 + Use the dead data page as temporary storage for delta q */ 210 + dq = (u8 *)ptrs[faila]; 211 + ptrs[faila] = (void *)raid6_empty_zero_page; 212 + ptrs[disks-1] = dq; 213 + 214 + raid6_call.gen_syndrome(disks, bytes, ptrs); 215 + 216 + /* Restore pointer table */ 217 + ptrs[faila] = dq; 218 + ptrs[disks-1] = q; 219 + 220 + /* Now, pick the proper data tables */ 221 + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; 222 + 223 + kernel_fpu_begin(); 224 + 225 + asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); 226 + 227 + while (bytes) { 228 + #ifdef CONFIG_X86_64 229 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); 230 + asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); 231 + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); 232 + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); 233 + 234 + /* xmm3 = q[0] ^ dq[0] */ 235 + 236 + asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); 237 + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); 238 + 239 + /* xmm4 = q[16] ^ dq[16] */ 240 + 241 + asm volatile("movdqa %xmm3, %xmm6"); 
242 + asm volatile("movdqa %xmm4, %xmm8"); 243 + 244 + /* xmm4 = xmm8 = q[16] ^ dq[16] */ 245 + 246 + asm volatile("psraw $4, %xmm3"); 247 + asm volatile("pand %xmm7, %xmm6"); 248 + asm volatile("pand %xmm7, %xmm3"); 249 + asm volatile("pshufb %xmm6, %xmm0"); 250 + asm volatile("pshufb %xmm3, %xmm1"); 251 + asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); 252 + asm volatile("pxor %xmm0, %xmm1"); 253 + asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); 254 + 255 + /* xmm1 = qmul[q[0] ^ dq[0]] */ 256 + 257 + asm volatile("psraw $4, %xmm4"); 258 + asm volatile("pand %xmm7, %xmm8"); 259 + asm volatile("pand %xmm7, %xmm4"); 260 + asm volatile("pshufb %xmm8, %xmm10"); 261 + asm volatile("pshufb %xmm4, %xmm11"); 262 + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); 263 + asm volatile("pxor %xmm10, %xmm11"); 264 + asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); 265 + 266 + /* xmm11 = qmul[q[16] ^ dq[16]] */ 267 + 268 + asm volatile("pxor %xmm1, %xmm2"); 269 + 270 + /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ 271 + 272 + asm volatile("pxor %xmm11, %xmm12"); 273 + 274 + /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ 275 + 276 + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); 277 + asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); 278 + 279 + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); 280 + asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); 281 + 282 + bytes -= 32; 283 + p += 32; 284 + q += 32; 285 + dq += 32; 286 + 287 + #else 288 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); 289 + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); 290 + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); 291 + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); 292 + 293 + /* xmm3 = *q ^ *dq */ 294 + 295 + asm volatile("movdqa %xmm3, %xmm6"); 296 + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); 297 + asm volatile("psraw $4, %xmm3"); 298 + asm volatile("pand %xmm7, %xmm6"); 299 + asm volatile("pand %xmm7, %xmm3"); 300 + asm volatile("pshufb %xmm6, %xmm0"); 301 + asm 
volatile("pshufb %xmm3, %xmm1"); 302 + asm volatile("pxor %xmm0, %xmm1"); 303 + 304 + /* xmm1 = qmul[*q ^ *dq */ 305 + 306 + asm volatile("pxor %xmm1, %xmm2"); 307 + 308 + /* xmm2 = *p ^ qmul[*q ^ *dq] */ 309 + 310 + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); 311 + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); 312 + 313 + bytes -= 16; 314 + p += 16; 315 + q += 16; 316 + dq += 16; 317 + #endif 318 + } 319 + 320 + kernel_fpu_end(); 321 + } 322 + 323 + const struct raid6_recov_calls raid6_recov_ssse3 = { 324 + .data2 = raid6_2data_recov_ssse3, 325 + .datap = raid6_datap_recov_ssse3, 326 + .valid = raid6_has_ssse3, 327 + #ifdef CONFIG_X86_64 328 + .name = "ssse3x2", 329 + #else 330 + .name = "ssse3x1", 331 + #endif 332 + .priority = 1, 333 + }; 334 + 335 + #endif
+1 -1
lib/raid6/test/Makefile
··· 23 23 all: raid6.a raid6test 24 24 25 25 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ 26 - altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ 26 + altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ 27 27 tables.o 28 28 rm -f $@ 29 29 $(AR) cq $@ $^
+21 -11
lib/raid6/test/test.c
··· 90 90 int main(int argc, char *argv[]) 91 91 { 92 92 const struct raid6_calls *const *algo; 93 + const struct raid6_recov_calls *const *ra; 93 94 int i, j; 94 95 int err = 0; 95 96 96 97 makedata(); 97 98 98 - for (algo = raid6_algos; *algo; algo++) { 99 - if (!(*algo)->valid || (*algo)->valid()) { 100 - raid6_call = **algo; 99 + for (ra = raid6_recov_algos; *ra; ra++) { 100 + if ((*ra)->valid && !(*ra)->valid()) 101 + continue; 102 + raid6_2data_recov = (*ra)->data2; 103 + raid6_datap_recov = (*ra)->datap; 101 104 102 - /* Nuke syndromes */ 103 - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); 105 + printf("using recovery %s\n", (*ra)->name); 104 106 105 - /* Generate assumed good syndrome */ 106 - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, 107 - (void **)&dataptrs); 107 + for (algo = raid6_algos; *algo; algo++) { 108 + if (!(*algo)->valid || (*algo)->valid()) { 109 + raid6_call = **algo; 108 110 109 - for (i = 0; i < NDISKS-1; i++) 110 - for (j = i+1; j < NDISKS; j++) 111 - err += test_disks(i, j); 111 + /* Nuke syndromes */ 112 + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); 113 + 114 + /* Generate assumed good syndrome */ 115 + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, 116 + (void **)&dataptrs); 117 + 118 + for (i = 0; i < NDISKS-1; i++) 119 + for (j = i+1; j < NDISKS; j++) 120 + err += test_disks(i, j); 121 + } 112 122 } 113 123 printf("\n"); 114 124 }
+10 -5
lib/raid6/x86.h
··· 35 35 { 36 36 } 37 37 38 + #define __aligned(x) __attribute__((aligned(x))) 39 + 38 40 #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ 39 41 #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions 40 42 * (fast save and restore) */ 41 43 #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ 42 44 #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ 45 + #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 46 + #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ 47 + #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 43 48 #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ 44 49 45 50 /* Should work well enough on modern CPUs for testing */ 46 51 static inline int boot_cpu_has(int flag) 47 52 { 48 - u32 eax = (flag >> 5) ? 0x80000001 : 1; 49 - u32 edx; 53 + u32 eax = (flag & 0x20) ? 0x80000001 : 1; 54 + u32 ecx, edx; 50 55 51 56 asm volatile("cpuid" 52 - : "+a" (eax), "=d" (edx) 53 - : : "ecx", "ebx"); 57 + : "+a" (eax), "=d" (edx), "=c" (ecx) 58 + : : "ebx"); 54 59 55 - return (edx >> (flag & 31)) & 1; 60 + return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; 56 61 } 57 62 58 63 #endif /* ndef __KERNEL__ */