Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (24 commits)
md: clean up do_md_stop
md: fix another deadlock with removing sysfs attributes.
md: move revalidate_disk() back outside open_mutex
md/raid10: fix deadlock with unaligned read during resync
md/bitmap: separate out loading a bitmap from initialising the structures.
md/bitmap: prepare for storing write-intent-bitmap via dm-dirty-log.
md/bitmap: optimise scanning of empty bitmaps.
md/bitmap: clean up plugging calls.
md/bitmap: reduce dependence on sysfs.
md/bitmap: white space clean up and similar.
md/raid5: export raid5 unplugging interface.
md/plug: optionally use plugger to unplug an array during resync/recovery.
md/raid5: add simple plugging infrastructure.
md/raid5: export is_congested test
raid5: Don't set read-ahead when there is no queue
md: add support for raising dm events.
md: export various start/stop interfaces
md: split out md_rdev_init
md: be more careful setting MD_CHANGE_CLEAN
md/raid5: ensure we create a unique name for kmem_cache when mddev has no gendisk
...

Authored by Linus Torvalds, committed by Chris Metcalf (90a9ed95, 8cbd84f2)

748 insertions(+), 509 deletions(-)

crypto/async_tx/Kconfig (+14)
···
 	tristate
 	select ASYNC_CORE
 	select ASYNC_PQ
+	select ASYNC_XOR
+
+config ASYNC_RAID6_TEST
+	tristate "Self test for hardware accelerated raid6 recovery"
+	depends on ASYNC_RAID6_RECOV
+	select ASYNC_MEMCPY
+	---help---
+	  This is a one-shot self test that permutes through the
+	  recovery of all the possible two disk failure scenarios for a
+	  N-disk array. Recovery is performed with the asynchronous
+	  raid6 recovery routines, and will optionally use an offload
+	  engine if one is available.
+
+	  If unsure, say N.
 
 config ASYNC_TX_DISABLE_PQ_VAL_DMA
 	bool
drivers/md/Kconfig (+1 -17)
···
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
-	select MD_RAID6_PQ
+	select RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	select ASYNC_PQ
···
 	---help---
 	  Enable the raid456 module to dispatch per-stripe raid operations to a
 	  thread pool.
-
-	  If unsure, say N.
-
-config MD_RAID6_PQ
-	tristate
-
-config ASYNC_RAID6_TEST
-	tristate "Self test for hardware accelerated raid6 recovery"
-	depends on MD_RAID6_PQ
-	select ASYNC_RAID6_RECOV
-	---help---
-	  This is a one-shot self test that permutes through the
-	  recovery of all the possible two disk failure scenarios for a
-	  N-disk array. Recovery is performed with the asynchronous
-	  raid6 recovery routines, and will optionally use an offload
-	  engine if one is available.
 
 	  If unsure, say N.
 
drivers/md/Makefile (-77)
···
 		   += dm-log-userspace-base.o dm-log-userspace-transfer.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
-raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o	\
-		   raid6int1.o raid6int2.o raid6int4.o		\
-		   raid6int8.o raid6int16.o raid6int32.o	\
-		   raid6altivec1.o raid6altivec2.o raid6altivec4.o	\
-		   raid6altivec8.o				\
-		   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y	+= mktables
 
 # Note: link order is important. All raid personalities
 # and must come before md.o, as they each initialise
···
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
···
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
-quiet_cmd_unroll = UNROLL  $@
-      cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
-                   < $< > $@ || ( rm -f $@ && exit 1 )
-
-ifeq ($(CONFIG_ALTIVEC),y)
-altivec_flags := -maltivec -mabi=altivec
-endif
-
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
 endif
-
-targets += raid6int1.c
-$(obj)/raid6int1.c:   UNROLL := 1
-$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int2.c
-$(obj)/raid6int2.c:   UNROLL := 2
-$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int4.c
-$(obj)/raid6int4.c:   UNROLL := 4
-$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int8.c
-$(obj)/raid6int8.c:   UNROLL := 8
-$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int16.c
-$(obj)/raid6int16.c:  UNROLL := 16
-$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-targets += raid6int32.c
-$(obj)/raid6int32.c:  UNROLL := 32
-$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec1.o += $(altivec_flags)
-targets += raid6altivec1.c
-$(obj)/raid6altivec1.c:   UNROLL := 1
-$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec2.o += $(altivec_flags)
-targets += raid6altivec2.c
-$(obj)/raid6altivec2.c:   UNROLL := 2
-$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec4.o += $(altivec_flags)
-targets += raid6altivec4.c
-$(obj)/raid6altivec4.c:   UNROLL := 4
-$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-CFLAGS_raid6altivec8.o += $(altivec_flags)
-targets += raid6altivec8.c
-$(obj)/raid6altivec8.c:   UNROLL := 8
-$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
-	$(call if_changed,unroll)
-
-quiet_cmd_mktable = TABLE   $@
-      cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
-
-targets += raid6tables.c
-$(obj)/raid6tables.c: $(obj)/mktables FORCE
-	$(call if_changed,mktable)
drivers/md/bitmap.c (+281 -231)
··· 13 * Still to do: 14 * 15 * flush after percent set rather than just time based. (maybe both). 16 - * wait if count gets too high, wake when it drops to half. 17 */ 18 19 #include <linux/blkdev.h> ··· 29 #include "md.h" 30 #include "bitmap.h" 31 32 /* debug macros */ 33 34 #define DEBUG 0 ··· 51 #define INJECT_FATAL_FAULT_3 0 /* undef */ 52 #endif 53 54 - //#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */ 55 - #define DPRINTK(x...) do { } while(0) 56 - 57 #ifndef PRINTK 58 # if DEBUG > 0 59 # define PRINTK(x...) printk(KERN_DEBUG x) ··· 59 # endif 60 #endif 61 62 - static inline char * bmname(struct bitmap *bitmap) 63 { 64 return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; 65 } 66 - 67 68 /* 69 * just a placeholder - calls kmalloc for bitmap pages ··· 74 #ifdef INJECT_FAULTS_1 75 page = NULL; 76 #else 77 - page = kmalloc(PAGE_SIZE, GFP_NOIO); 78 #endif 79 if (!page) 80 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); ··· 103 * if we find our page, we increment the page's refcount so that it stays 104 * allocated while we're using it 105 */ 106 - static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) 107 __releases(bitmap->lock) 108 __acquires(bitmap->lock) 109 { ··· 118 return -EINVAL; 119 } 120 121 - 122 if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ 123 return 0; 124 ··· 127 if (!create) 128 return -ENOENT; 129 130 - spin_unlock_irq(&bitmap->lock); 131 - 132 /* this page has not been allocated yet */ 133 134 - if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { 135 PRINTK("%s: bitmap map page allocation failed, hijacking\n", 136 bmname(bitmap)); 137 /* failed - set the hijacked flag so that we can use the 138 * pointer as a counter */ 139 - spin_lock_irq(&bitmap->lock); 140 if (!bitmap->bp[page].map) 141 bitmap->bp[page].hijacked = 1; 142 - goto out; 143 - } 144 - 145 - /* got a page */ 146 - 147 - spin_lock_irq(&bitmap->lock); 148 - 149 - /* recheck the page */ 150 - 151 - if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { 152 /* somebody beat us to getting the page */ 153 bitmap_free_page(bitmap, mappage); 154 return 0; 155 } 156 - 157 - /* no page was in place and we have one, so install it */ 158 - 159 - memset(mappage, 0, PAGE_SIZE); 160 - bitmap->bp[page].map = mappage; 161 - bitmap->missing_pages--; 162 - out: 163 return 0; 164 } 165 - 166 167 /* if page is completely empty, put it back on the free list, or dealloc it */ 168 /* if page was hijacked, unmark the flag so it might get alloced next time */ ··· 170 if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ 171 bitmap->bp[page].hijacked = 0; 172 bitmap->bp[page].map = NULL; 173 - return; 174 } 175 - 176 - /* normal case, free the page */ 177 - 178 - #if 0 179 - /* actually ... let's not. We will probably need the page again exactly when 180 - * memory is tight and we are flusing to disk 181 - */ 182 - return; 183 - #else 184 - ptr = bitmap->bp[page].map; 185 - bitmap->bp[page].map = NULL; 186 - bitmap->missing_pages++; 187 - bitmap_free_page(bitmap, ptr); 188 - return; 189 - #endif 190 } 191 - 192 193 /* 194 * bitmap file handling - read and write the bitmap file and its superblock ··· 196 197 mdk_rdev_t *rdev; 198 sector_t target; 199 200 - if (!page) 201 page = alloc_page(GFP_KERNEL); 202 - if (!page) 203 - return ERR_PTR(-ENOMEM); 204 205 list_for_each_entry(rdev, &mddev->disks, same_set) { 206 if (! 
test_bit(In_sync, &rdev->flags) ··· 221 return page; 222 } 223 } 224 return ERR_PTR(-EIO); 225 226 } ··· 267 mddev_t *mddev = bitmap->mddev; 268 269 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 270 - int size = PAGE_SIZE; 271 - loff_t offset = mddev->bitmap_info.offset; 272 - if (page->index == bitmap->file_pages-1) 273 - size = roundup(bitmap->last_page_size, 274 - bdev_logical_block_size(rdev->bdev)); 275 - /* Just make sure we aren't corrupting data or 276 - * metadata 277 - */ 278 - if (mddev->external) { 279 - /* Bitmap could be anywhere. */ 280 - if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > 281 - rdev->data_offset && 282 - rdev->sb_start + offset < 283 - rdev->data_offset + mddev->dev_sectors + 284 - (PAGE_SIZE/512)) 285 - goto bad_alignment; 286 - } else if (offset < 0) { 287 - /* DATA BITMAP METADATA */ 288 - if (offset 289 - + (long)(page->index * (PAGE_SIZE/512)) 290 - + size/512 > 0) 291 - /* bitmap runs in to metadata */ 292 - goto bad_alignment; 293 - if (rdev->data_offset + mddev->dev_sectors 294 - > rdev->sb_start + offset) 295 - /* data runs in to bitmap */ 296 - goto bad_alignment; 297 - } else if (rdev->sb_start < rdev->data_offset) { 298 - /* METADATA BITMAP DATA */ 299 - if (rdev->sb_start 300 - + offset 301 - + page->index*(PAGE_SIZE/512) + size/512 302 - > rdev->data_offset) 303 - /* bitmap runs in to data */ 304 - goto bad_alignment; 305 - } else { 306 - /* DATA METADATA BITMAP - no problems */ 307 - } 308 - md_super_write(mddev, rdev, 309 - rdev->sb_start + offset 310 - + page->index * (PAGE_SIZE/512), 311 - size, 312 - page); 313 } 314 315 if (wait) ··· 347 bh = bh->b_this_page; 348 } 349 350 - if (wait) { 351 wait_event(bitmap->write_wait, 352 atomic_read(&bitmap->pending_writes)==0); 353 - } 354 } 355 if (bitmap->flags & BITMAP_WRITE_ERROR) 356 bitmap_file_kick(bitmap); ··· 406 struct buffer_head *bh; 407 sector_t block; 408 409 - PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, 410 (unsigned long long)index << PAGE_SHIFT); 411 412 page = alloc_page(GFP_KERNEL); ··· 460 } 461 out: 462 if (IS_ERR(page)) 463 - printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", 464 (int)PAGE_SIZE, 465 (unsigned long long)index << PAGE_SHIFT, 466 PTR_ERR(page)); ··· 646 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 647 old = le32_to_cpu(sb->state) & bits; 648 switch (op) { 649 - case MASK_SET: sb->state |= cpu_to_le32(bits); 650 - break; 651 - case MASK_UNSET: sb->state &= cpu_to_le32(~bits); 652 - break; 653 - default: BUG(); 654 } 655 kunmap_atomic(sb, KM_USER0); 656 return old; ··· 695 static inline struct page *filemap_get_page(struct bitmap *bitmap, 696 unsigned long chunk) 697 { 698 - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; 699 return bitmap->filemap[file_page_index(bitmap, chunk) 700 - file_page_index(bitmap, 0)]; 701 } 702 - 703 704 static void bitmap_file_unmap(struct bitmap *bitmap) 705 { ··· 753 } 754 } 755 756 - 757 /* 758 * bitmap_file_kick - if an error occurs while manipulating the bitmap file 759 * then it is no longer reliable, so we stop using it and we mark the file ··· 771 ptr = d_path(&bitmap->file->f_path, path, 772 PAGE_SIZE); 773 774 - 775 printk(KERN_ALERT 776 "%s: kicking failed bitmap file %s from array!\n", 777 bmname(bitmap), IS_ERR(ptr) ? 
"" : ptr); ··· 788 } 789 790 enum bitmap_page_attr { 791 - BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced 792 - BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared 793 - BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced 794 }; 795 796 static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 797 enum bitmap_page_attr attr) 798 { 799 - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); 800 } 801 802 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 803 enum bitmap_page_attr attr) 804 { 805 - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); 806 } 807 808 static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 809 enum bitmap_page_attr attr) 810 { 811 - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); 812 } 813 814 /* ··· 830 static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) 831 { 832 unsigned long bit; 833 - struct page *page; 834 void *kaddr; 835 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 836 837 if (!bitmap->filemap) { 838 - return; 839 } 840 - 841 - page = filemap_get_page(bitmap, chunk); 842 - if (!page) return; 843 - bit = file_page_offset(bitmap, chunk); 844 - 845 - /* set the bit */ 846 - kaddr = kmap_atomic(page, KM_USER0); 847 - if (bitmap->flags & BITMAP_HOSTENDIAN) 848 - set_bit(bit, kaddr); 849 - else 850 - ext2_set_bit(bit, kaddr); 851 - kunmap_atomic(kaddr, KM_USER0); 852 - PRINTK("set file bit %lu page %lu\n", bit, page->index); 853 - 854 /* record page number so it gets flushed to disk when unplug occurs */ 855 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 856 - 857 } 858 859 /* this gets called when the md device is ready to unplug its underlying ··· 870 871 if (!bitmap) 872 return; 873 874 /* look at each page to see if there are any set bits that need to be 875 * flushed out to disk */ ··· 898 wait = 1; 899 spin_unlock_irqrestore(&bitmap->lock, flags); 900 901 - if (dirty | need_write) 902 write_page(bitmap, page, 0); 903 } 904 if (wait) { /* if any writes were performed, we need to wait on them */ ··· 908 else 909 md_super_wait(bitmap->mddev); 910 } 911 if (bitmap->flags & BITMAP_WRITE_ERROR) 912 bitmap_file_kick(bitmap); 913 } 914 915 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); 916 /* * bitmap_init_from_disk -- called at bitmap_create time to initialize ··· 951 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 952 "recovery\n", bmname(bitmap)); 953 954 - bytes = (chunks + 7) / 8; 955 if (!bitmap->mddev->bitmap_info.external) 956 bytes += sizeof(bitmap_super_t); 957 958 - 959 - num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; 960 961 if (file && i_size_read(file->f_mapping->host) < bytes) { 962 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", ··· 973 974 /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ 975 bitmap->filemap_attr = kzalloc( 976 - roundup( DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), 977 GFP_KERNEL); 978 if (!bitmap->filemap_attr) 979 goto err; ··· 1028 if (outofdate) { 1029 /* 1030 * if bitmap is out of date, dirty the 1031 - * whole page and write it out 1032 */ 1033 paddr = kmap_atomic(page, KM_USER0); 1034 memset(paddr + offset, 0xff, ··· 1059 } 1060 } 1061 1062 - /* everything went OK */ 1063 ret = 0; 1064 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); 1065 ··· 1087 */ 1088 int i; 1089 1090 - for (i=0; i < bitmap->file_pages; i++) 1091 
set_page_attr(bitmap, bitmap->filemap[i], 1092 BITMAP_PAGE_NEEDWRITE); 1093 } 1094 - 1095 1096 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1097 { 1098 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1099 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1100 bitmap->bp[page].count += inc; 1101 - /* 1102 - if (page == 0) printk("count page 0, offset %llu: %d gives %d\n", 1103 - (unsigned long long)offset, inc, bitmap->bp[page].count); 1104 - */ 1105 bitmap_checkfree(bitmap, page); 1106 } 1107 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, ··· 1116 struct page *page = NULL, *lastpage = NULL; 1117 int blocks; 1118 void *paddr; 1119 1120 /* Use a mutex to guard daemon_work against 1121 * bitmap_destroy. ··· 1141 spin_lock_irqsave(&bitmap->lock, flags); 1142 for (j = 0; j < bitmap->chunks; j++) { 1143 bitmap_counter_t *bmc; 1144 - if (!bitmap->filemap) 1145 - /* error or shutdown */ 1146 - break; 1147 - 1148 - page = filemap_get_page(bitmap, j); 1149 1150 if (page != lastpage) { 1151 /* skip this page unless it's marked as needing cleaning */ ··· 1201 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1202 &blocks, 0); 1203 if (bmc) { 1204 - /* 1205 - if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); 1206 - */ 1207 if (*bmc) 1208 bitmap->allclean = 0; 1209 1210 if (*bmc == 2) { 1211 - *bmc=1; /* maybe clear the bit next time */ 1212 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1213 } else if (*bmc == 1 && !bitmap->need_sync) { 1214 /* we can clear the bit */ ··· 1215 -1); 1216 1217 /* clear the bit */ 1218 - paddr = kmap_atomic(page, KM_USER0); 1219 - if (bitmap->flags & BITMAP_HOSTENDIAN) 1220 - clear_bit(file_page_offset(bitmap, j), 1221 - paddr); 1222 - else 1223 - ext2_clear_bit(file_page_offset(bitmap, j), 1224 - paddr); 1225 - kunmap_atomic(paddr, KM_USER0); 1226 } 1227 } else 1228 j |= PAGE_COUNTER_MASK; ··· 1233 spin_unlock_irqrestore(&bitmap->lock, flags); 1234 1235 /* now sync the final page */ 1236 - if (lastpage != NULL) { 1237 spin_lock_irqsave(&bitmap->lock, flags); 1238 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1239 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1240 spin_unlock_irqrestore(&bitmap->lock, flags); 1241 - write_page(bitmap, lastpage, 0); 1242 } else { 1243 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1244 spin_unlock_irqrestore(&bitmap->lock, flags); ··· 1251 1252 done: 1253 if (bitmap->allclean == 0) 1254 - bitmap->mddev->thread->timeout = 1255 bitmap->mddev->bitmap_info.daemon_sleep; 1256 mutex_unlock(&mddev->bitmap_info.mutex); 1257 } ··· 1270 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1271 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1272 sector_t csize; 1273 1274 - if (bitmap_checkpage(bitmap, page, create) < 0) { 1275 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1276 - *blocks = csize - (offset & (csize- 1)); 1277 return NULL; 1278 - } 1279 /* now locked ... */ 1280 1281 if (bitmap->bp[page].hijacked) { /* hijacked pointer */ 1282 /* should we use the first or second counter field 1283 * of the hijacked pointer? 
*/ 1284 int hi = (pageoff > PAGE_COUNTER_MASK); 1285 - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + 1286 - PAGE_COUNTER_SHIFT - 1); 1287 - *blocks = csize - (offset & (csize- 1)); 1288 return &((bitmap_counter_t *) 1289 &bitmap->bp[page].map)[hi]; 1290 - } else { /* page is allocated */ 1291 - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1292 - *blocks = csize - (offset & (csize- 1)); 1293 return (bitmap_counter_t *) 1294 &(bitmap->bp[page].map[pageoff]); 1295 - } 1296 } 1297 1298 int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) 1299 { 1300 - if (!bitmap) return 0; 1301 1302 if (behind) { 1303 int bw; ··· 1334 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1335 TASK_UNINTERRUPTIBLE); 1336 spin_unlock_irq(&bitmap->lock); 1337 - blk_unplug(bitmap->mddev->queue); 1338 schedule(); 1339 finish_wait(&bitmap->overflow_wait, &__wait); 1340 continue; 1341 } 1342 1343 - switch(*bmc) { 1344 case 0: 1345 bitmap_file_set_bit(bitmap, offset); 1346 - bitmap_count_page(bitmap,offset, 1); 1347 - blk_plug_device_unlocked(bitmap->mddev->queue); 1348 /* fall through */ 1349 case 1: 1350 *bmc = 2; ··· 1356 offset += blocks; 1357 if (sectors > blocks) 1358 sectors -= blocks; 1359 - else sectors = 0; 1360 } 1361 bitmap->allclean = 0; 1362 return 0; 1363 } 1364 1365 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, 1366 int success, int behind) 1367 { 1368 - if (!bitmap) return; 1369 if (behind) { 1370 if (atomic_dec_and_test(&bitmap->behind_writes)) 1371 wake_up(&bitmap->behind_wait); ··· 1395 bitmap->events_cleared < bitmap->mddev->events) { 1396 bitmap->events_cleared = bitmap->mddev->events; 1397 bitmap->need_sync = 1; 1398 - sysfs_notify_dirent(bitmap->sysfs_can_clear); 1399 } 1400 1401 if (!success && ! 
(*bmc & NEEDED_MASK)) ··· 1405 wake_up(&bitmap->overflow_wait); 1406 1407 (*bmc)--; 1408 - if (*bmc <= 2) { 1409 set_page_attr(bitmap, 1410 - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1411 BITMAP_PAGE_CLEAN); 1412 - } 1413 spin_unlock_irqrestore(&bitmap->lock, flags); 1414 offset += blocks; 1415 if (sectors > blocks) 1416 sectors -= blocks; 1417 - else sectors = 0; 1418 } 1419 } 1420 1421 static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, 1422 int degraded) ··· 1473 } 1474 return rv; 1475 } 1476 1477 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) 1478 { 1479 bitmap_counter_t *bmc; 1480 unsigned long flags; 1481 - /* 1482 - if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted); 1483 - */ if (bitmap == NULL) { 1484 *blocks = 1024; 1485 return; 1486 } ··· 1489 if (bmc == NULL) 1490 goto unlock; 1491 /* locked */ 1492 - /* 1493 - if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks); 1494 - */ 1495 if (RESYNC(*bmc)) { 1496 *bmc &= ~RESYNC_MASK; 1497 1498 if (!NEEDED(*bmc) && aborted) 1499 *bmc |= NEEDED_MASK; 1500 else { 1501 - if (*bmc <= 2) { 1502 set_page_attr(bitmap, 1503 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1504 BITMAP_PAGE_CLEAN); 1505 - } 1506 } 1507 } 1508 unlock: 1509 spin_unlock_irqrestore(&bitmap->lock, flags); 1510 bitmap->allclean = 0; 1511 } 1512 1513 void bitmap_close_sync(struct bitmap *bitmap) 1514 { ··· 1522 sector += blocks; 1523 } 1524 } 1525 1526 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) 1527 { ··· 1542 atomic_read(&bitmap->mddev->recovery_active) == 0); 1543 1544 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; 1545 - set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1546 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1547 s = 0; 1548 while (s < sector && s < bitmap->mddev->resync_max_sectors) { ··· 1553 bitmap->last_end_sync = jiffies; 1554 sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); 1555 } 1556 1557 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1558 { ··· 1570 spin_unlock_irq(&bitmap->lock); 1571 return; 1572 } 1573 - if (! 
*bmc) { 1574 struct page *page; 1575 - *bmc = 1 | (needed?NEEDED_MASK:0); 1576 bitmap_count_page(bitmap, offset, 1); 1577 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1578 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); ··· 1681 unsigned long pages; 1682 struct file *file = mddev->bitmap_info.file; 1683 int err; 1684 - sector_t start; 1685 - struct sysfs_dirent *bm; 1686 1687 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1688 1689 - if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ 1690 return 0; 1691 1692 BUG_ON(file && mddev->bitmap_info.offset); 1693 1694 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1695 if (!bitmap) ··· 1705 1706 bitmap->mddev = mddev; 1707 1708 - bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); 1709 if (bm) { 1710 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); 1711 sysfs_put(bm); ··· 1740 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); 1741 1742 /* now that chunksize and chunkshift are set, we can use these macros */ 1743 - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1744 CHUNK_BLOCK_SHIFT(bitmap); 1745 - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1746 1747 BUG_ON(!pages); 1748 ··· 1762 if (!bitmap->bp) 1763 goto error; 1764 1765 - /* now that we have some pages available, initialize the in-memory 1766 - * bitmap from the on-disk bitmap */ 1767 - start = 0; 1768 - if (mddev->degraded == 0 1769 - || bitmap->events_cleared == mddev->events) 1770 - /* no need to keep dirty bits to optimise a re-add of a missing device */ 1771 - start = mddev->recovery_cp; 1772 - err = bitmap_init_from_disk(bitmap, start); 1773 - 1774 - if (err) 1775 - goto error; 1776 - 1777 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1778 pages, bmname(bitmap)); 1779 1780 mddev->bitmap = bitmap; 1781 1782 - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1783 - md_wakeup_thread(mddev->thread); 1784 - 1785 - bitmap_update_sb(bitmap); 1786 1787 return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; 1788 ··· 1775 return err; 1776 } 1777 1778 static ssize_t 1779 location_show(mddev_t *mddev, char *page) 1780 { 1781 ssize_t len; 1782 - if (mddev->bitmap_info.file) { 1783 len = sprintf(page, "file"); 1784 - } else if (mddev->bitmap_info.offset) { 1785 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); 1786 - } else 1787 len = sprintf(page, "none"); 1788 len += sprintf(page+len, "\n"); 1789 return len; ··· 1926 ssize_t len; 1927 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; 1928 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; 1929 - 1930 len = sprintf(page, "%lu", secs); 1931 if (jifs) 1932 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); ··· 2108 .attrs = md_bitmap_attrs, 2109 }; 2110 2111 - 2112 - /* the bitmap API -- for raid personalities */ 2113 - EXPORT_SYMBOL(bitmap_startwrite); 2114 - EXPORT_SYMBOL(bitmap_endwrite); 2115 - EXPORT_SYMBOL(bitmap_start_sync); 2116 - EXPORT_SYMBOL(bitmap_end_sync); 2117 - EXPORT_SYMBOL(bitmap_unplug); 2118 - EXPORT_SYMBOL(bitmap_close_sync); 2119 - EXPORT_SYMBOL(bitmap_cond_end_sync);
··· 13 * Still to do: 14 * 15 * flush after percent set rather than just time based. (maybe both). 16 */ 17 18 #include <linux/blkdev.h> ··· 30 #include "md.h" 31 #include "bitmap.h" 32 33 + #include <linux/dm-dirty-log.h> 34 /* debug macros */ 35 36 #define DEBUG 0 ··· 51 #define INJECT_FATAL_FAULT_3 0 /* undef */ 52 #endif 53 54 #ifndef PRINTK 55 # if DEBUG > 0 56 # define PRINTK(x...) printk(KERN_DEBUG x) ··· 62 # endif 63 #endif 64 65 + static inline char *bmname(struct bitmap *bitmap) 66 { 67 return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; 68 } 69 70 /* 71 * just a placeholder - calls kmalloc for bitmap pages ··· 78 #ifdef INJECT_FAULTS_1 79 page = NULL; 80 #else 81 + page = kzalloc(PAGE_SIZE, GFP_NOIO); 82 #endif 83 if (!page) 84 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); ··· 107 * if we find our page, we increment the page's refcount so that it stays 108 * allocated while we're using it 109 */ 110 + static int bitmap_checkpage(struct bitmap *bitmap, 111 + unsigned long page, int create) 112 __releases(bitmap->lock) 113 __acquires(bitmap->lock) 114 { ··· 121 return -EINVAL; 122 } 123 124 if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ 125 return 0; 126 ··· 131 if (!create) 132 return -ENOENT; 133 134 /* this page has not been allocated yet */ 135 136 + spin_unlock_irq(&bitmap->lock); 137 + mappage = bitmap_alloc_page(bitmap); 138 + spin_lock_irq(&bitmap->lock); 139 + 140 + if (mappage == NULL) { 141 PRINTK("%s: bitmap map page allocation failed, hijacking\n", 142 bmname(bitmap)); 143 /* failed - set the hijacked flag so that we can use the 144 * pointer as a counter */ 145 if (!bitmap->bp[page].map) 146 bitmap->bp[page].hijacked = 1; 147 + } else if (bitmap->bp[page].map || 148 + bitmap->bp[page].hijacked) { 149 /* somebody beat us to getting the page */ 150 bitmap_free_page(bitmap, mappage); 151 return 0; 152 + } else { 153 + 154 + /* no page was in place and we have one, so install it */ 155 + 156 + bitmap->bp[page].map = mappage; 157 + bitmap->missing_pages--; 158 } 159 return 0; 160 } 161 162 /* if page is completely empty, put it back on the free list, or dealloc it */ 163 /* if page was hijacked, unmark the flag so it might get alloced next time */ ··· 183 if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ 184 bitmap->bp[page].hijacked = 0; 185 bitmap->bp[page].map = NULL; 186 + } else { 187 + /* normal case, free the page */ 188 + ptr = bitmap->bp[page].map; 189 + bitmap->bp[page].map = NULL; 190 + bitmap->missing_pages++; 191 + bitmap_free_page(bitmap, ptr); 192 } 193 } 194 195 /* 196 * bitmap file handling - read and write the bitmap file and its superblock ··· 220 221 mdk_rdev_t *rdev; 222 sector_t target; 223 + int did_alloc = 0; 224 225 + if (!page) { 226 page = alloc_page(GFP_KERNEL); 227 + if (!page) 228 + return ERR_PTR(-ENOMEM); 229 + did_alloc = 1; 230 + } 231 232 list_for_each_entry(rdev, &mddev->disks, same_set) { 233 if (! 
test_bit(In_sync, &rdev->flags) ··· 242 return page; 243 } 244 } 245 + if (did_alloc) 246 + put_page(page); 247 return ERR_PTR(-EIO); 248 249 } ··· 286 mddev_t *mddev = bitmap->mddev; 287 288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 289 + int size = PAGE_SIZE; 290 + loff_t offset = mddev->bitmap_info.offset; 291 + if (page->index == bitmap->file_pages-1) 292 + size = roundup(bitmap->last_page_size, 293 + bdev_logical_block_size(rdev->bdev)); 294 + /* Just make sure we aren't corrupting data or 295 + * metadata 296 + */ 297 + if (mddev->external) { 298 + /* Bitmap could be anywhere. */ 299 + if (rdev->sb_start + offset + (page->index 300 + * (PAGE_SIZE/512)) 301 + > rdev->data_offset 302 + && 303 + rdev->sb_start + offset 304 + < (rdev->data_offset + mddev->dev_sectors 305 + + (PAGE_SIZE/512))) 306 + goto bad_alignment; 307 + } else if (offset < 0) { 308 + /* DATA BITMAP METADATA */ 309 + if (offset 310 + + (long)(page->index * (PAGE_SIZE/512)) 311 + + size/512 > 0) 312 + /* bitmap runs in to metadata */ 313 + goto bad_alignment; 314 + if (rdev->data_offset + mddev->dev_sectors 315 + > rdev->sb_start + offset) 316 + /* data runs in to bitmap */ 317 + goto bad_alignment; 318 + } else if (rdev->sb_start < rdev->data_offset) { 319 + /* METADATA BITMAP DATA */ 320 + if (rdev->sb_start 321 + + offset 322 + + page->index*(PAGE_SIZE/512) + size/512 323 + > rdev->data_offset) 324 + /* bitmap runs in to data */ 325 + goto bad_alignment; 326 + } else { 327 + /* DATA METADATA BITMAP - no problems */ 328 + } 329 + md_super_write(mddev, rdev, 330 + rdev->sb_start + offset 331 + + page->index * (PAGE_SIZE/512), 332 + size, 333 + page); 334 } 335 336 if (wait) ··· 364 bh = bh->b_this_page; 365 } 366 367 + if (wait) 368 wait_event(bitmap->write_wait, 369 atomic_read(&bitmap->pending_writes)==0); 370 } 371 if (bitmap->flags & BITMAP_WRITE_ERROR) 372 bitmap_file_kick(bitmap); ··· 424 struct buffer_head *bh; 425 sector_t block; 426 427 + PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, 428 (unsigned long long)index << PAGE_SHIFT); 429 430 page = alloc_page(GFP_KERNEL); ··· 478 } 479 out: 480 if (IS_ERR(page)) 481 + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", 482 (int)PAGE_SIZE, 483 (unsigned long long)index << PAGE_SHIFT, 484 PTR_ERR(page)); ··· 664 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 665 old = le32_to_cpu(sb->state) & bits; 666 switch (op) { 667 + case MASK_SET: 668 + sb->state |= cpu_to_le32(bits); 669 + break; 670 + case MASK_UNSET: 671 + sb->state &= cpu_to_le32(~bits); 672 + break; 673 + default: 674 + BUG(); 675 } 676 kunmap_atomic(sb, KM_USER0); 677 return old; ··· 710 static inline struct page *filemap_get_page(struct bitmap *bitmap, 711 unsigned long chunk) 712 { 713 + if (bitmap->filemap == NULL) 714 + return NULL; 715 + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) 716 + return NULL; 717 return bitmap->filemap[file_page_index(bitmap, chunk) 718 - file_page_index(bitmap, 0)]; 719 } 720 721 static void bitmap_file_unmap(struct bitmap *bitmap) 722 { ··· 766 } 767 } 768 769 /* 770 * bitmap_file_kick - if an error occurs while manipulating the bitmap file 771 * then it is no longer reliable, so we stop using it and we mark the file ··· 785 ptr = d_path(&bitmap->file->f_path, path, 786 PAGE_SIZE); 787 788 printk(KERN_ALERT 789 "%s: kicking failed bitmap file %s from array!\n", 790 bmname(bitmap), IS_ERR(ptr) ? 
"" : ptr); ··· 803 } 804 805 enum bitmap_page_attr { 806 + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ 807 + BITMAP_PAGE_CLEAN = 1, /* there are bits that might need to be cleared */ 808 + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ 809 }; 810 811 static inline void set_page_attr(struct bitmap *bitmap, struct page *page, 812 enum bitmap_page_attr attr) 813 { 814 + if (page) 815 + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); 816 + else 817 + __set_bit(attr, &bitmap->logattrs); 818 } 819 820 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, 821 enum bitmap_page_attr attr) 822 { 823 + if (page) 824 + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); 825 + else 826 + __clear_bit(attr, &bitmap->logattrs); 827 } 828 829 static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, 830 enum bitmap_page_attr attr) 831 { 832 + if (page) 833 + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); 834 + else 835 + return test_bit(attr, &bitmap->logattrs); 836 } 837 838 /* ··· 836 static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) 837 { 838 unsigned long bit; 839 + struct page *page = NULL; 840 void *kaddr; 841 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 842 843 if (!bitmap->filemap) { 844 + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; 845 + if (log) 846 + log->type->mark_region(log, chunk); 847 + } else { 848 + 849 + page = filemap_get_page(bitmap, chunk); 850 + if (!page) 851 + return; 852 + bit = file_page_offset(bitmap, chunk); 853 + 854 + /* set the bit */ 855 + kaddr = kmap_atomic(page, KM_USER0); 856 + if (bitmap->flags & BITMAP_HOSTENDIAN) 857 + set_bit(bit, kaddr); 858 + else 859 + ext2_set_bit(bit, kaddr); 860 + kunmap_atomic(kaddr, KM_USER0); 861 + PRINTK("set file bit %lu page %lu\n", bit, page->index); 862 } 863 /* record page number so it gets flushed to disk when unplug occurs */ 864 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 865 } 866 867 /* this gets called when the md device is ready to unplug its underlying ··· 874 875 if (!bitmap) 876 return; 877 + if (!bitmap->filemap) { 878 + /* Must be using a dirty_log */ 879 + struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; 880 + dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); 881 + need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); 882 + if (dirty || need_write) 883 + if (log->type->flush(log)) 884 + bitmap->flags |= BITMAP_WRITE_ERROR; 885 + goto out; 886 + } 887 888 /* look at each page to see if there are any set bits that need to be 889 * flushed out to disk */ ··· 892 wait = 1; 893 spin_unlock_irqrestore(&bitmap->lock, flags); 894 895 + if (dirty || need_write) 896 write_page(bitmap, page, 0); 897 } 898 if (wait) { /* if any writes were performed, we need to wait on them */ ··· 902 else 903 md_super_wait(bitmap->mddev); 904 } 905 + out: 906 if (bitmap->flags & BITMAP_WRITE_ERROR) 907 bitmap_file_kick(bitmap); 908 } 909 + EXPORT_SYMBOL(bitmap_unplug); 910 911 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); 912 /* * bitmap_init_from_disk -- called at bitmap_create time to initialize ··· 943 printk(KERN_INFO "%s: bitmap file is out of date, doing full " 944 "recovery\n", bmname(bitmap)); 945 946 + bytes = DIV_ROUND_UP(bitmap->chunks, 8); 947 if (!bitmap->mddev->bitmap_info.external) 948 bytes += sizeof(bitmap_super_t); 949 950 + num_pages = 
DIV_ROUND_UP(bytes, PAGE_SIZE); 951 952 if (file && i_size_read(file->f_mapping->host) < bytes) { 953 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", ··· 966 967 /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ 968 bitmap->filemap_attr = kzalloc( 969 + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), 970 GFP_KERNEL); 971 if (!bitmap->filemap_attr) 972 goto err; ··· 1021 if (outofdate) { 1022 /* 1023 * if bitmap is out of date, dirty the 1024 + * whole page and write it out 1025 */ 1026 paddr = kmap_atomic(page, KM_USER0); 1027 memset(paddr + offset, 0xff, ··· 1052 } 1053 } 1054 1055 + /* everything went OK */ 1056 ret = 0; 1057 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); 1058 ··· 1080 */ 1081 int i; 1082 1083 + for (i = 0; i < bitmap->file_pages; i++) 1084 set_page_attr(bitmap, bitmap->filemap[i], 1085 BITMAP_PAGE_NEEDWRITE); 1086 } 1087 1088 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1089 { 1090 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1091 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1092 bitmap->bp[page].count += inc; 1093 bitmap_checkfree(bitmap, page); 1094 } 1095 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, ··· 1114 struct page *page = NULL, *lastpage = NULL; 1115 int blocks; 1116 void *paddr; 1117 + struct dm_dirty_log *log = mddev->bitmap_info.log; 1118 1119 /* Use a mutex to guard daemon_work against 1120 * bitmap_destroy. ··· 1138 spin_lock_irqsave(&bitmap->lock, flags); 1139 for (j = 0; j < bitmap->chunks; j++) { 1140 bitmap_counter_t *bmc; 1141 + if (!bitmap->filemap) { 1142 + if (!log) 1143 + /* error or shutdown */ 1144 + break; 1145 + } else 1146 + page = filemap_get_page(bitmap, j); 1147 1148 if (page != lastpage) { 1149 /* skip this page unless it's marked as needing cleaning */ ··· 1197 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1198 &blocks, 0); 1199 if (bmc) { 1200 if (*bmc) 1201 bitmap->allclean = 0; 1202 1203 if (*bmc == 2) { 1204 + *bmc = 1; /* maybe clear the bit next time */ 1205 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1206 } else if (*bmc == 1 && !bitmap->need_sync) { 1207 /* we can clear the bit */ ··· 1214 -1); 1215 1216 /* clear the bit */ 1217 + if (page) { 1218 + paddr = kmap_atomic(page, KM_USER0); 1219 + if (bitmap->flags & BITMAP_HOSTENDIAN) 1220 + clear_bit(file_page_offset(bitmap, j), 1221 + paddr); 1222 + else 1223 + ext2_clear_bit(file_page_offset(bitmap, j), 1224 + paddr); 1225 + kunmap_atomic(paddr, KM_USER0); 1226 + } else 1227 + log->type->clear_region(log, j); 1228 } 1229 } else 1230 j |= PAGE_COUNTER_MASK; ··· 1229 spin_unlock_irqrestore(&bitmap->lock, flags); 1230 1231 /* now sync the final page */ 1232 + if (lastpage != NULL || log != NULL) { 1233 spin_lock_irqsave(&bitmap->lock, flags); 1234 if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { 1235 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1236 spin_unlock_irqrestore(&bitmap->lock, flags); 1237 + if (lastpage) 1238 + write_page(bitmap, lastpage, 0); 1239 + else 1240 + if (log->type->flush(log)) 1241 + bitmap->flags |= BITMAP_WRITE_ERROR; 1242 } else { 1243 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1244 spin_unlock_irqrestore(&bitmap->lock, flags); ··· 1243 1244 done: 1245 if (bitmap->allclean == 0) 1246 + bitmap->mddev->thread->timeout = 1247 bitmap->mddev->bitmap_info.daemon_sleep; 1248 mutex_unlock(&mddev->bitmap_info.mutex); 1249 } ··· 1262 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1263 unsigned 
long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1264 sector_t csize; 1265 + int err; 1266 1267 + err = bitmap_checkpage(bitmap, page, create); 1268 + 1269 + if (bitmap->bp[page].hijacked || 1270 + bitmap->bp[page].map == NULL) 1271 + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + 1272 + PAGE_COUNTER_SHIFT - 1); 1273 + else 1274 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1275 + *blocks = csize - (offset & (csize - 1)); 1276 + 1277 + if (err < 0) 1278 return NULL; 1279 + 1280 /* now locked ... */ 1281 1282 if (bitmap->bp[page].hijacked) { /* hijacked pointer */ 1283 /* should we use the first or second counter field 1284 * of the hijacked pointer? */ 1285 int hi = (pageoff > PAGE_COUNTER_MASK); 1286 return &((bitmap_counter_t *) 1287 &bitmap->bp[page].map)[hi]; 1288 + } else /* page is allocated */ 1289 return (bitmap_counter_t *) 1290 &(bitmap->bp[page].map[pageoff]); 1291 } 1292 1293 int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) 1294 { 1295 + if (!bitmap) 1296 + return 0; 1297 1298 if (behind) { 1299 int bw; ··· 1322 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1323 TASK_UNINTERRUPTIBLE); 1324 spin_unlock_irq(&bitmap->lock); 1325 + md_unplug(bitmap->mddev); 1326 schedule(); 1327 finish_wait(&bitmap->overflow_wait, &__wait); 1328 continue; 1329 } 1330 1331 + switch (*bmc) { 1332 case 0: 1333 bitmap_file_set_bit(bitmap, offset); 1334 + bitmap_count_page(bitmap, offset, 1); 1335 /* fall through */ 1336 case 1: 1337 *bmc = 2; ··· 1345 offset += blocks; 1346 if (sectors > blocks) 1347 sectors -= blocks; 1348 + else 1349 + sectors = 0; 1350 } 1351 bitmap->allclean = 0; 1352 return 0; 1353 } 1354 + EXPORT_SYMBOL(bitmap_startwrite); 1355 1356 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, 1357 int success, int behind) 1358 { 1359 + if (!bitmap) 1360 + return; 1361 if (behind) { 1362 if (atomic_dec_and_test(&bitmap->behind_writes)) 1363 wake_up(&bitmap->behind_wait); ··· 1381 bitmap->events_cleared < bitmap->mddev->events) { 1382 bitmap->events_cleared = bitmap->mddev->events; 1383 bitmap->need_sync = 1; 1384 + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); 1385 } 1386 1387 if (!success && ! 
(*bmc & NEEDED_MASK)) ··· 1391 wake_up(&bitmap->overflow_wait); 1392 1393 (*bmc)--; 1394 + if (*bmc <= 2) 1395 set_page_attr(bitmap, 1396 + filemap_get_page( 1397 + bitmap, 1398 + offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1399 BITMAP_PAGE_CLEAN); 1400 + 1401 spin_unlock_irqrestore(&bitmap->lock, flags); 1402 offset += blocks; 1403 if (sectors > blocks) 1404 sectors -= blocks; 1405 + else 1406 + sectors = 0; 1407 } 1408 } 1409 + EXPORT_SYMBOL(bitmap_endwrite); 1410 1411 static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, 1412 int degraded) ··· 1455 } 1456 return rv; 1457 } 1458 + EXPORT_SYMBOL(bitmap_start_sync); 1459 1460 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) 1461 { 1462 bitmap_counter_t *bmc; 1463 unsigned long flags; 1464 + 1465 + if (bitmap == NULL) { 1466 *blocks = 1024; 1467 return; 1468 } ··· 1471 if (bmc == NULL) 1472 goto unlock; 1473 /* locked */ 1474 if (RESYNC(*bmc)) { 1475 *bmc &= ~RESYNC_MASK; 1476 1477 if (!NEEDED(*bmc) && aborted) 1478 *bmc |= NEEDED_MASK; 1479 else { 1480 + if (*bmc <= 2) 1481 set_page_attr(bitmap, 1482 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1483 BITMAP_PAGE_CLEAN); 1484 } 1485 } 1486 unlock: 1487 spin_unlock_irqrestore(&bitmap->lock, flags); 1488 bitmap->allclean = 0; 1489 } 1490 + EXPORT_SYMBOL(bitmap_end_sync); 1491 1492 void bitmap_close_sync(struct bitmap *bitmap) 1493 { ··· 1507 sector += blocks; 1508 } 1509 } 1510 + EXPORT_SYMBOL(bitmap_close_sync); 1511 1512 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) 1513 { ··· 1526 atomic_read(&bitmap->mddev->recovery_active) == 0); 1527 1528 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; 1529 + if (bitmap->mddev->persistent) 1530 + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1531 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1532 s = 0; 1533 while (s < sector && s < bitmap->mddev->resync_max_sectors) { ··· 1536 bitmap->last_end_sync = jiffies; 1537 sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); 1538 } 1539 + EXPORT_SYMBOL(bitmap_cond_end_sync); 1540 1541 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1542 { ··· 1552 spin_unlock_irq(&bitmap->lock); 1553 return; 1554 } 1555 + if (!*bmc) { 1556 struct page *page; 1557 + *bmc = 1 | (needed ? 
NEEDED_MASK : 0); 1558 bitmap_count_page(bitmap, offset, 1); 1559 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1560 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); ··· 1663 unsigned long pages; 1664 struct file *file = mddev->bitmap_info.file; 1665 int err; 1666 + struct sysfs_dirent *bm = NULL; 1667 1668 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1669 1670 + if (!file 1671 + && !mddev->bitmap_info.offset 1672 + && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ 1673 return 0; 1674 1675 BUG_ON(file && mddev->bitmap_info.offset); 1676 + BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); 1677 1678 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1679 if (!bitmap) ··· 1685 1686 bitmap->mddev = mddev; 1687 1688 + if (mddev->kobj.sd) 1689 + bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); 1690 if (bm) { 1691 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); 1692 sysfs_put(bm); ··· 1719 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); 1720 1721 /* now that chunksize and chunkshift are set, we can use these macros */ 1722 + chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1723 CHUNK_BLOCK_SHIFT(bitmap); 1724 + pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1725 1726 BUG_ON(!pages); 1727 ··· 1741 if (!bitmap->bp) 1742 goto error; 1743 1744 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1745 pages, bmname(bitmap)); 1746 1747 mddev->bitmap = bitmap; 1748 1749 1750 return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; 1751 ··· 1770 return err; 1771 } 1772 1773 + int bitmap_load(mddev_t *mddev) 1774 + { 1775 + int err = 0; 1776 + sector_t sector = 0; 1777 + struct bitmap *bitmap = mddev->bitmap; 1778 + 1779 + if (!bitmap) 1780 + goto out; 1781 + 1782 + /* Clear out old bitmap info first: Either there is none, or we 1783 + * are resuming after someone else has possibly changed things, 1784 + * so we should forget old cached info. 1785 + * All chunks should be clean, but some might need_sync. 
1786 + */ 1787 + while (sector < mddev->resync_max_sectors) { 1788 + int blocks; 1789 + bitmap_start_sync(bitmap, sector, &blocks, 0); 1790 + sector += blocks; 1791 + } 1792 + bitmap_close_sync(bitmap); 1793 + 1794 + if (mddev->bitmap_info.log) { 1795 + unsigned long i; 1796 + struct dm_dirty_log *log = mddev->bitmap_info.log; 1797 + for (i = 0; i < bitmap->chunks; i++) 1798 + if (!log->type->in_sync(log, i, 1)) 1799 + bitmap_set_memory_bits(bitmap, 1800 + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1801 + 1); 1802 + } else { 1803 + sector_t start = 0; 1804 + if (mddev->degraded == 0 1805 + || bitmap->events_cleared == mddev->events) 1806 + /* no need to keep dirty bits to optimise a 1807 + * re-add of a missing device */ 1808 + start = mddev->recovery_cp; 1809 + 1810 + err = bitmap_init_from_disk(bitmap, start); 1811 + } 1812 + if (err) 1813 + goto out; 1814 + 1815 + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; 1816 + md_wakeup_thread(mddev->thread); 1817 + 1818 + bitmap_update_sb(bitmap); 1819 + 1820 + if (bitmap->flags & BITMAP_WRITE_ERROR) 1821 + err = -EIO; 1822 + out: 1823 + return err; 1824 + } 1825 + EXPORT_SYMBOL_GPL(bitmap_load); 1826 + 1827 static ssize_t 1828 location_show(mddev_t *mddev, char *page) 1829 { 1830 ssize_t len; 1831 + if (mddev->bitmap_info.file) 1832 len = sprintf(page, "file"); 1833 + else if (mddev->bitmap_info.offset) 1834 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); 1835 + else 1836 len = sprintf(page, "none"); 1837 len += sprintf(page+len, "\n"); 1838 return len; ··· 1867 ssize_t len; 1868 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; 1869 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; 1870 + 1871 len = sprintf(page, "%lu", secs); 1872 if (jifs) 1873 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); ··· 2049 .attrs = md_bitmap_attrs, 2050 }; 2051
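To make the new dm-dirty-log path in the bitmap.c diff above easier to follow, here is a condensed paraphrase (not the literal patch text) of the bitmap_file_set_bit() and bitmap_unplug() hunks: when no bitmap file is mapped, per-chunk dirtiness is handed to the dm_dirty_log supplied in mddev->bitmap_info.log, and the page-attribute bits are tracked in the new bitmap->logattrs word instead of filemap_attr.

static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);

	if (!bitmap->filemap) {
		/* no bitmap file: remember the dirty chunk in the dm dirty log */
		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
		if (log)
			log->type->mark_region(log, chunk);
		/* with a NULL page, this sets the bit in bitmap->logattrs */
		set_page_attr(bitmap, NULL, BITMAP_PAGE_DIRTY);
	} else {
		/* ... unchanged page-based path: set the bit in the filemap
		 * page and mark that page BITMAP_PAGE_DIRTY ... */
	}
}

void bitmap_unplug(struct bitmap *bitmap)
{
	if (!bitmap->filemap) {
		/* must be using a dirty_log: flush it if anything was marked */
		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
		int dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
		int need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);

		if ((dirty || need_write) && log->type->flush(log))
			bitmap->flags |= BITMAP_WRITE_ERROR;
		return;
	}
	/* ... unchanged page-based flush of dirty/need-write filemap pages ... */
}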
drivers/md/bitmap.h (+6)
···
 	unsigned long file_pages; /* number of pages in the file */
 	int last_page_size; /* bytes in the last page */
 
+	unsigned long logattrs; /* used when filemap_attr doesn't exist
+				 * because we are working with a dirty_log
+				 */
+
 	unsigned long flags;
 
 	int allclean;
···
 	wait_queue_head_t behind_wait;
 
 	struct sysfs_dirent *sysfs_can_clear;
+
 };
 
 /* the bitmap API */
 
 /* these are used only by md/bitmap */
 int bitmap_create(mddev_t *mddev);
+int bitmap_load(mddev_t *mddev);
 void bitmap_flush(mddev_t *mddev);
 void bitmap_destroy(mddev_t *mddev);
 
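The header change above splits bitmap start-up into two exported steps: bitmap_create() builds the in-memory structures, and the new bitmap_load() reads (or resynthesises) the recorded state once the array is running. A minimal caller sketch of the intended ordering follows; the wrapper function name is illustrative only, and error handling is abbreviated, but the create-then-load pairing mirrors how md's own array start-up path uses these interfaces.

/* Illustrative only: intended sequencing of the split bitmap interfaces. */
static int example_start_with_bitmap(mddev_t *mddev)
{
	int err;

	err = md_run(mddev);		/* assemble and start the personality */
	if (err)
		return err;

	err = bitmap_create(mddev);	/* allocate counters, filemap, sysfs hooks */
	if (!err)
		err = bitmap_load(mddev); /* read bits from file/superblock or dm dirty log */
	if (err) {
		bitmap_destroy(mddev);
		md_stop(mddev);
		return err;
	}
	return 0;
}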
drivers/md/md.c (+176 -110)
··· 262 * Once ->stop is called and completes, the module will be completely 263 * unused. 264 */ 265 - static void mddev_suspend(mddev_t *mddev) 266 { 267 BUG_ON(mddev->suspended); 268 mddev->suspended = 1; ··· 270 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 271 mddev->pers->quiesce(mddev, 1); 272 } 273 274 - static void mddev_resume(mddev_t *mddev) 275 { 276 mddev->suspended = 0; 277 wake_up(&mddev->sb_wait); 278 mddev->pers->quiesce(mddev, 0); 279 } 280 281 int mddev_congested(mddev_t *mddev, int bits) 282 { ··· 387 } 388 EXPORT_SYMBOL(md_barrier_request); 389 390 static inline mddev_t *mddev_get(mddev_t *mddev) 391 { 392 atomic_inc(&mddev->active); ··· 464 spin_unlock(&all_mddevs_lock); 465 } 466 467 - static void mddev_init(mddev_t *mddev) 468 { 469 mutex_init(&mddev->open_mutex); 470 mutex_init(&mddev->reconfig_mutex); ··· 484 mddev->resync_max = MaxSector; 485 mddev->level = LEVEL_NONE; 486 } 487 488 static mddev_t * mddev_find(dev_t unit) 489 { ··· 581 * an access to the files will try to take reconfig_mutex 582 * while holding the file unremovable, which leads to 583 * a deadlock. 584 - * So hold open_mutex instead - we are allowed to take 585 - * it while holding reconfig_mutex, and md_run can 586 - * use it to wait for the remove to complete. 587 */ 588 struct attribute_group *to_remove = mddev->to_remove; 589 mddev->to_remove = NULL; 590 - mutex_lock(&mddev->open_mutex); 591 mutex_unlock(&mddev->reconfig_mutex); 592 593 - if (to_remove != &md_redundancy_group) 594 - sysfs_remove_group(&mddev->kobj, to_remove); 595 - if (mddev->pers == NULL || 596 - mddev->pers->sync_request == NULL) { 597 - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 598 - if (mddev->sysfs_action) 599 - sysfs_put(mddev->sysfs_action); 600 - mddev->sysfs_action = NULL; 601 } 602 - mutex_unlock(&mddev->open_mutex); 603 } else 604 mutex_unlock(&mddev->reconfig_mutex); 605 ··· 1866 goto fail; 1867 1868 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1869 - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1870 - kobject_del(&rdev->kobj); 1871 - goto fail; 1872 - } 1873 - rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state"); 1874 1875 list_add_rcu(&rdev->same_set, &mddev->disks); 1876 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); ··· 2387 set_bit(In_sync, &rdev->flags); 2388 err = 0; 2389 } 2390 - if (!err && rdev->sysfs_state) 2391 - sysfs_notify_dirent(rdev->sysfs_state); 2392 return err ? err : len; 2393 } 2394 static struct rdev_sysfs_entry rdev_state = ··· 2483 rdev->raid_disk = -1; 2484 return err; 2485 } else 2486 - sysfs_notify_dirent(rdev->sysfs_state); 2487 sprintf(nm, "rd%d", rdev->raid_disk); 2488 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2489 - printk(KERN_WARNING 2490 - "md: cannot register " 2491 - "%s for %s\n", 2492 - nm, mdname(rdev->mddev)); 2493 - 2494 /* don't wakeup anyone, leave that to userspace. */ 2495 } else { 2496 if (slot >= rdev->mddev->raid_disks) ··· 2496 clear_bit(Faulty, &rdev->flags); 2497 clear_bit(WriteMostly, &rdev->flags); 2498 set_bit(In_sync, &rdev->flags); 2499 - sysfs_notify_dirent(rdev->sysfs_state); 2500 } 2501 return len; 2502 } ··· 2744 .default_attrs = rdev_default_attrs, 2745 }; 2746 2747 /* 2748 * Import a device. 
If 'super_format' >= 0, then sanity check the superblock 2749 * ··· 2785 return ERR_PTR(-ENOMEM); 2786 } 2787 2788 if ((err = alloc_disk_sb(rdev))) 2789 goto abort_free; 2790 ··· 2794 goto abort_free; 2795 2796 kobject_init(&rdev->kobj, &rdev_ktype); 2797 - 2798 - rdev->desc_nr = -1; 2799 - rdev->saved_raid_disk = -1; 2800 - rdev->raid_disk = -1; 2801 - rdev->flags = 0; 2802 - rdev->data_offset = 0; 2803 - rdev->sb_events = 0; 2804 - rdev->last_read_error.tv_sec = 0; 2805 - rdev->last_read_error.tv_nsec = 0; 2806 - atomic_set(&rdev->nr_pending, 0); 2807 - atomic_set(&rdev->read_errors, 0); 2808 - atomic_set(&rdev->corrected_errors, 0); 2809 2810 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2811 if (!size) { ··· 2822 goto abort_free; 2823 } 2824 } 2825 - 2826 - INIT_LIST_HEAD(&rdev->same_set); 2827 - init_waitqueue_head(&rdev->blocked_wait); 2828 2829 return rdev; 2830 ··· 3013 * - new personality will access other array. 3014 */ 3015 3016 - if (mddev->sync_thread || mddev->reshape_position != MaxSector) 3017 return -EBUSY; 3018 3019 if (!mddev->pers->quiesce) { ··· 3492 if (err) 3493 return err; 3494 else { 3495 - sysfs_notify_dirent(mddev->sysfs_state); 3496 return len; 3497 } 3498 } ··· 3790 } 3791 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3792 md_wakeup_thread(mddev->thread); 3793 - sysfs_notify_dirent(mddev->sysfs_action); 3794 return len; 3795 } 3796 ··· 4336 disk->disk_name); 4337 error = 0; 4338 } 4339 - if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4340 printk(KERN_DEBUG "pointless warning\n"); 4341 abort: 4342 mutex_unlock(&disks_mutex); 4343 - if (!error) { 4344 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4345 - mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state"); 4346 } 4347 mddev_put(mddev); 4348 return error; ··· 4381 if (!atomic_read(&mddev->writes_pending)) { 4382 mddev->safemode = 1; 4383 if (mddev->external) 4384 - sysfs_notify_dirent(mddev->sysfs_state); 4385 } 4386 md_wakeup_thread(mddev->thread); 4387 } 4388 4389 static int start_dirty_degraded; 4390 4391 - static int md_run(mddev_t *mddev) 4392 { 4393 int err; 4394 mdk_rdev_t *rdev; ··· 4400 4401 if (mddev->pers) 4402 return -EBUSY; 4403 - 4404 - /* These two calls synchronise us with the 4405 - * sysfs_remove_group calls in mddev_unlock, 4406 - * so they must have completed. 
4407 - */ 4408 - mutex_lock(&mddev->open_mutex); 4409 - mutex_unlock(&mddev->open_mutex); 4410 4411 /* 4412 * Analyze all RAID superblock(s) ··· 4449 return -EINVAL; 4450 } 4451 } 4452 - sysfs_notify_dirent(rdev->sysfs_state); 4453 } 4454 4455 spin_lock(&pers_lock); ··· 4548 return err; 4549 } 4550 if (mddev->pers->sync_request) { 4551 - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4552 printk(KERN_WARNING 4553 "md: cannot register extra attributes for %s\n", 4554 mdname(mddev)); 4555 - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 4556 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4557 mddev->ro = 0; 4558 ··· 4571 char nm[20]; 4572 sprintf(nm, "rd%d", rdev->raid_disk); 4573 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4574 - printk("md: cannot register %s for %s\n", 4575 - nm, mdname(mddev)); 4576 } 4577 4578 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ··· 4583 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4584 4585 md_new_event(mddev); 4586 - sysfs_notify_dirent(mddev->sysfs_state); 4587 - if (mddev->sysfs_action) 4588 - sysfs_notify_dirent(mddev->sysfs_action); 4589 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4590 return 0; 4591 } 4592 4593 static int do_md_run(mddev_t *mddev) 4594 { ··· 4597 err = md_run(mddev); 4598 if (err) 4599 goto out; 4600 - 4601 set_capacity(mddev->gendisk, mddev->array_sectors); 4602 revalidate_disk(mddev->gendisk); 4603 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); ··· 4629 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4630 md_wakeup_thread(mddev->thread); 4631 md_wakeup_thread(mddev->sync_thread); 4632 - sysfs_notify_dirent(mddev->sysfs_state); 4633 return 0; 4634 } 4635 ··· 4700 mddev->bitmap_info.chunksize = 0; 4701 mddev->bitmap_info.daemon_sleep = 0; 4702 mddev->bitmap_info.max_write_behind = 0; 4703 } 4704 4705 - static void md_stop_writes(mddev_t *mddev) 4706 { 4707 if (mddev->sync_thread) { 4708 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ··· 4723 md_update_sb(mddev, 1); 4724 } 4725 } 4726 4727 - static void md_stop(mddev_t *mddev) 4728 { 4729 - md_stop_writes(mddev); 4730 - 4731 mddev->pers->stop(mddev); 4732 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4733 mddev->to_remove = &md_redundancy_group; ··· 4734 mddev->pers = NULL; 4735 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4736 } 4737 4738 static int md_set_readonly(mddev_t *mddev, int is_open) 4739 { ··· 4754 mddev->ro = 1; 4755 set_disk_ro(mddev->gendisk, 1); 4756 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4757 - sysfs_notify_dirent(mddev->sysfs_state); 4758 err = 0; 4759 } 4760 out: ··· 4768 */ 4769 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4770 { 4771 - int err = 0; 4772 struct gendisk *disk = mddev->gendisk; 4773 mdk_rdev_t *rdev; 4774 4775 mutex_lock(&mddev->open_mutex); 4776 - if (atomic_read(&mddev->openers) > is_open) { 4777 printk("md: %s still in use.\n",mdname(mddev)); 4778 - err = -EBUSY; 4779 - } else if (mddev->pers) { 4780 4781 if (mddev->ro) 4782 set_disk_ro(disk, 0); 4783 4784 md_stop(mddev); 4785 mddev->queue->merge_bvec_fn = NULL; 4786 mddev->queue->unplug_fn = NULL; 4787 mddev->queue->backing_dev_info.congested_fn = NULL; 4788 4789 /* tell userspace to handle 'inactive' */ 4790 - sysfs_notify_dirent(mddev->sysfs_state); 4791 4792 list_for_each_entry(rdev, &mddev->disks, same_set) 4793 if (rdev->raid_disk >= 0) { ··· 4800 } 4801 4802 set_capacity(disk, 0); 4803 revalidate_disk(disk); 4804 4805 if (mddev->ro) 4806 
mddev->ro = 0; 4807 - 4808 - err = 0; 4809 - } 4810 - mutex_unlock(&mddev->open_mutex); 4811 - if (err) 4812 - return err; 4813 /* 4814 * Free resources if final stop 4815 */ 4816 if (mode == 0) { 4817 - 4818 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4819 4820 bitmap_destroy(mddev); ··· 4827 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4828 if (mddev->hold_active == UNTIL_STOP) 4829 mddev->hold_active = 0; 4830 - 4831 } 4832 - err = 0; 4833 blk_integrity_unregister(disk); 4834 md_new_event(mddev); 4835 - sysfs_notify_dirent(mddev->sysfs_state); 4836 - return err; 4837 } 4838 4839 #ifndef MODULE ··· 5192 if (err) 5193 export_rdev(rdev); 5194 else 5195 - sysfs_notify_dirent(rdev->sysfs_state); 5196 5197 md_update_sb(mddev, 1); 5198 if (mddev->degraded) ··· 5385 err = 0; 5386 if (mddev->pers) { 5387 mddev->pers->quiesce(mddev, 1); 5388 - if (fd >= 0) 5389 err = bitmap_create(mddev); 5390 if (fd < 0 || err) { 5391 bitmap_destroy(mddev); 5392 fd = -1; /* make sure to put the file */ ··· 5638 mddev->bitmap_info.default_offset; 5639 mddev->pers->quiesce(mddev, 1); 5640 rv = bitmap_create(mddev); 5641 if (rv) 5642 bitmap_destroy(mddev); 5643 mddev->pers->quiesce(mddev, 0); ··· 5872 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5873 if (mddev->ro == 2) { 5874 mddev->ro = 0; 5875 - sysfs_notify_dirent(mddev->sysfs_state); 5876 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5877 md_wakeup_thread(mddev->thread); 5878 } else { ··· 6123 mddev->pers->error_handler(mddev,rdev); 6124 if (mddev->degraded) 6125 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6126 - sysfs_notify_dirent(rdev->sysfs_state); 6127 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6128 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6129 md_wakeup_thread(mddev->thread); 6130 md_new_event_inintr(mddev); 6131 } 6132 ··· 6586 spin_unlock_irq(&mddev->write_lock); 6587 } 6588 if (did_change) 6589 - sysfs_notify_dirent(mddev->sysfs_state); 6590 wait_event(mddev->sb_wait, 6591 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6592 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); ··· 6629 mddev->safemode = 1; 6630 spin_unlock_irq(&mddev->write_lock); 6631 md_update_sb(mddev, 0); 6632 - sysfs_notify_dirent(mddev->sysfs_state); 6633 } else 6634 spin_unlock_irq(&mddev->write_lock); 6635 ··· 6639 return 0; 6640 } 6641 EXPORT_SYMBOL_GPL(md_allow_write); 6642 6643 #define SYNC_MARKS 10 6644 #define SYNC_MARK_STEP (3*HZ) ··· 6826 >= mddev->resync_max - mddev->curr_resync_completed 6827 )) { 6828 /* time to update curr_resync_completed */ 6829 - blk_unplug(mddev->queue); 6830 wait_event(mddev->recovery_wait, 6831 atomic_read(&mddev->recovery_active) == 0); 6832 mddev->curr_resync_completed = 6833 mddev->curr_resync; 6834 - set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6835 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6836 } 6837 ··· 6904 * about not overloading the IO subsystem. 
(things like an 6905 * e2fsck being done on the RAID array should execute fast) 6906 */ 6907 - blk_unplug(mddev->queue); 6908 cond_resched(); 6909 6910 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 ··· 6923 * this also signals 'finished resyncing' to md_stop 6924 */ 6925 out: 6926 - blk_unplug(mddev->queue); 6927 6928 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6929 ··· 7025 sprintf(nm, "rd%d", rdev->raid_disk); 7026 if (sysfs_create_link(&mddev->kobj, 7027 &rdev->kobj, nm)) 7028 - printk(KERN_WARNING 7029 - "md: cannot register " 7030 - "%s for %s\n", 7031 - nm, mdname(mddev)); 7032 spares++; 7033 md_new_event(mddev); 7034 set_bit(MD_CHANGE_DEVS, &mddev->flags); ··· 7118 mddev->safemode = 0; 7119 spin_unlock_irq(&mddev->write_lock); 7120 if (did_change) 7121 - sysfs_notify_dirent(mddev->sysfs_state); 7122 } 7123 7124 if (mddev->flags) ··· 7157 mddev->recovery = 0; 7158 /* flag recovery needed just to double check */ 7159 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7160 - sysfs_notify_dirent(mddev->sysfs_action); 7161 md_new_event(mddev); 7162 goto unlock; 7163 } ··· 7219 mddev->recovery = 0; 7220 } else 7221 md_wakeup_thread(mddev->sync_thread); 7222 - sysfs_notify_dirent(mddev->sysfs_action); 7223 md_new_event(mddev); 7224 } 7225 unlock: ··· 7228 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 7229 &mddev->recovery)) 7230 if (mddev->sysfs_action) 7231 - sysfs_notify_dirent(mddev->sysfs_action); 7232 } 7233 mddev_unlock(mddev); 7234 } ··· 7236 7237 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 7238 { 7239 - sysfs_notify_dirent(rdev->sysfs_state); 7240 wait_event_timeout(rdev->blocked_wait, 7241 !test_bit(Blocked, &rdev->flags), 7242 msecs_to_jiffies(5000));
··· 262 * Once ->stop is called and completes, the module will be completely 263 * unused. 264 */ 265 + void mddev_suspend(mddev_t *mddev) 266 { 267 BUG_ON(mddev->suspended); 268 mddev->suspended = 1; ··· 270 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 271 mddev->pers->quiesce(mddev, 1); 272 } 273 + EXPORT_SYMBOL_GPL(mddev_suspend); 274 275 + void mddev_resume(mddev_t *mddev) 276 { 277 mddev->suspended = 0; 278 wake_up(&mddev->sb_wait); 279 mddev->pers->quiesce(mddev, 0); 280 } 281 + EXPORT_SYMBOL_GPL(mddev_resume); 282 283 int mddev_congested(mddev_t *mddev, int bits) 284 { ··· 385 } 386 EXPORT_SYMBOL(md_barrier_request); 387 388 + /* Support for plugging. 389 + * This mirrors the plugging support in request_queue, but does not 390 + * require having a whole queue 391 + */ 392 + static void plugger_work(struct work_struct *work) 393 + { 394 + struct plug_handle *plug = 395 + container_of(work, struct plug_handle, unplug_work); 396 + plug->unplug_fn(plug); 397 + } 398 + static void plugger_timeout(unsigned long data) 399 + { 400 + struct plug_handle *plug = (void *)data; 401 + kblockd_schedule_work(NULL, &plug->unplug_work); 402 + } 403 + void plugger_init(struct plug_handle *plug, 404 + void (*unplug_fn)(struct plug_handle *)) 405 + { 406 + plug->unplug_flag = 0; 407 + plug->unplug_fn = unplug_fn; 408 + init_timer(&plug->unplug_timer); 409 + plug->unplug_timer.function = plugger_timeout; 410 + plug->unplug_timer.data = (unsigned long)plug; 411 + INIT_WORK(&plug->unplug_work, plugger_work); 412 + } 413 + EXPORT_SYMBOL_GPL(plugger_init); 414 + 415 + void plugger_set_plug(struct plug_handle *plug) 416 + { 417 + if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) 418 + mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); 419 + } 420 + EXPORT_SYMBOL_GPL(plugger_set_plug); 421 + 422 + int plugger_remove_plug(struct plug_handle *plug) 423 + { 424 + if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { 425 + del_timer(&plug->unplug_timer); 426 + return 1; 427 + } else 428 + return 0; 429 + } 430 + EXPORT_SYMBOL_GPL(plugger_remove_plug); 431 + 432 + 433 static inline mddev_t *mddev_get(mddev_t *mddev) 434 { 435 atomic_inc(&mddev->active); ··· 417 spin_unlock(&all_mddevs_lock); 418 } 419 420 + void mddev_init(mddev_t *mddev) 421 { 422 mutex_init(&mddev->open_mutex); 423 mutex_init(&mddev->reconfig_mutex); ··· 437 mddev->resync_max = MaxSector; 438 mddev->level = LEVEL_NONE; 439 } 440 + EXPORT_SYMBOL_GPL(mddev_init); 441 442 static mddev_t * mddev_find(dev_t unit) 443 { ··· 533 * an access to the files will try to take reconfig_mutex 534 * while holding the file unremovable, which leads to 535 * a deadlock. 536 + * So hold set sysfs_active while the remove in happeing, 537 + * and anything else which might set ->to_remove or my 538 + * otherwise change the sysfs namespace will fail with 539 + * -EBUSY if sysfs_active is still set. 540 + * We set sysfs_active under reconfig_mutex and elsewhere 541 + * test it under the same mutex to ensure its correct value 542 + * is seen. 
543 */ 544 struct attribute_group *to_remove = mddev->to_remove; 545 mddev->to_remove = NULL; 546 + mddev->sysfs_active = 1; 547 mutex_unlock(&mddev->reconfig_mutex); 548 549 + if (mddev->kobj.sd) { 550 + if (to_remove != &md_redundancy_group) 551 + sysfs_remove_group(&mddev->kobj, to_remove); 552 + if (mddev->pers == NULL || 553 + mddev->pers->sync_request == NULL) { 554 + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 555 + if (mddev->sysfs_action) 556 + sysfs_put(mddev->sysfs_action); 557 + mddev->sysfs_action = NULL; 558 + } 559 } 560 + mddev->sysfs_active = 0; 561 } else 562 mutex_unlock(&mddev->reconfig_mutex); 563 ··· 1812 goto fail; 1813 1814 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1815 + if (sysfs_create_link(&rdev->kobj, ko, "block")) 1816 + /* failure here is OK */; 1817 + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 1818 1819 list_add_rcu(&rdev->same_set, &mddev->disks); 1820 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); ··· 2335 set_bit(In_sync, &rdev->flags); 2336 err = 0; 2337 } 2338 + if (!err) 2339 + sysfs_notify_dirent_safe(rdev->sysfs_state); 2340 return err ? err : len; 2341 } 2342 static struct rdev_sysfs_entry rdev_state = ··· 2431 rdev->raid_disk = -1; 2432 return err; 2433 } else 2434 + sysfs_notify_dirent_safe(rdev->sysfs_state); 2435 sprintf(nm, "rd%d", rdev->raid_disk); 2436 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2437 + /* failure here is OK */; 2438 /* don't wakeup anyone, leave that to userspace. */ 2439 } else { 2440 if (slot >= rdev->mddev->raid_disks) ··· 2448 clear_bit(Faulty, &rdev->flags); 2449 clear_bit(WriteMostly, &rdev->flags); 2450 set_bit(In_sync, &rdev->flags); 2451 + sysfs_notify_dirent_safe(rdev->sysfs_state); 2452 } 2453 return len; 2454 } ··· 2696 .default_attrs = rdev_default_attrs, 2697 }; 2698 2699 + void md_rdev_init(mdk_rdev_t *rdev) 2700 + { 2701 + rdev->desc_nr = -1; 2702 + rdev->saved_raid_disk = -1; 2703 + rdev->raid_disk = -1; 2704 + rdev->flags = 0; 2705 + rdev->data_offset = 0; 2706 + rdev->sb_events = 0; 2707 + rdev->last_read_error.tv_sec = 0; 2708 + rdev->last_read_error.tv_nsec = 0; 2709 + atomic_set(&rdev->nr_pending, 0); 2710 + atomic_set(&rdev->read_errors, 0); 2711 + atomic_set(&rdev->corrected_errors, 0); 2712 + 2713 + INIT_LIST_HEAD(&rdev->same_set); 2714 + init_waitqueue_head(&rdev->blocked_wait); 2715 + } 2716 + EXPORT_SYMBOL_GPL(md_rdev_init); 2717 /* 2718 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2719 * ··· 2719 return ERR_PTR(-ENOMEM); 2720 } 2721 2722 + md_rdev_init(rdev); 2723 if ((err = alloc_disk_sb(rdev))) 2724 goto abort_free; 2725 ··· 2727 goto abort_free; 2728 2729 kobject_init(&rdev->kobj, &rdev_ktype); 2730 2731 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2732 if (!size) { ··· 2767 goto abort_free; 2768 } 2769 } 2770 2771 return rdev; 2772 ··· 2961 * - new personality will access other array. 
2962 */ 2963 2964 + if (mddev->sync_thread || 2965 + mddev->reshape_position != MaxSector || 2966 + mddev->sysfs_active) 2967 return -EBUSY; 2968 2969 if (!mddev->pers->quiesce) { ··· 3438 if (err) 3439 return err; 3440 else { 3441 + sysfs_notify_dirent_safe(mddev->sysfs_state); 3442 return len; 3443 } 3444 } ··· 3736 } 3737 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3738 md_wakeup_thread(mddev->thread); 3739 + sysfs_notify_dirent_safe(mddev->sysfs_action); 3740 return len; 3741 } 3742 ··· 4282 disk->disk_name); 4283 error = 0; 4284 } 4285 + if (mddev->kobj.sd && 4286 + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4287 printk(KERN_DEBUG "pointless warning\n"); 4288 abort: 4289 mutex_unlock(&disks_mutex); 4290 + if (!error && mddev->kobj.sd) { 4291 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4292 + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 4293 } 4294 mddev_put(mddev); 4295 return error; ··· 4326 if (!atomic_read(&mddev->writes_pending)) { 4327 mddev->safemode = 1; 4328 if (mddev->external) 4329 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4330 } 4331 md_wakeup_thread(mddev->thread); 4332 } 4333 4334 static int start_dirty_degraded; 4335 4336 + int md_run(mddev_t *mddev) 4337 { 4338 int err; 4339 mdk_rdev_t *rdev; ··· 4345 4346 if (mddev->pers) 4347 return -EBUSY; 4348 + /* Cannot run until previous stop completes properly */ 4349 + if (mddev->sysfs_active) 4350 + return -EBUSY; 4351 4352 /* 4353 * Analyze all RAID superblock(s) ··· 4398 return -EINVAL; 4399 } 4400 } 4401 + sysfs_notify_dirent_safe(rdev->sysfs_state); 4402 } 4403 4404 spin_lock(&pers_lock); ··· 4497 return err; 4498 } 4499 if (mddev->pers->sync_request) { 4500 + if (mddev->kobj.sd && 4501 + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4502 printk(KERN_WARNING 4503 "md: cannot register extra attributes for %s\n", 4504 mdname(mddev)); 4505 + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 4506 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4507 mddev->ro = 0; 4508 ··· 4519 char nm[20]; 4520 sprintf(nm, "rd%d", rdev->raid_disk); 4521 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4522 + /* failure here is OK */; 4523 } 4524 4525 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); ··· 4532 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4533 4534 md_new_event(mddev); 4535 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4536 + sysfs_notify_dirent_safe(mddev->sysfs_action); 4537 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4538 return 0; 4539 } 4540 + EXPORT_SYMBOL_GPL(md_run); 4541 4542 static int do_md_run(mddev_t *mddev) 4543 { ··· 4546 err = md_run(mddev); 4547 if (err) 4548 goto out; 4549 + err = bitmap_load(mddev); 4550 + if (err) { 4551 + bitmap_destroy(mddev); 4552 + goto out; 4553 + } 4554 set_capacity(mddev->gendisk, mddev->array_sectors); 4555 revalidate_disk(mddev->gendisk); 4556 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); ··· 4574 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4575 md_wakeup_thread(mddev->thread); 4576 md_wakeup_thread(mddev->sync_thread); 4577 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4578 return 0; 4579 } 4580 ··· 4645 mddev->bitmap_info.chunksize = 0; 4646 mddev->bitmap_info.daemon_sleep = 0; 4647 mddev->bitmap_info.max_write_behind = 0; 4648 + mddev->plug = NULL; 4649 } 4650 4651 + void md_stop_writes(mddev_t *mddev) 4652 { 4653 if (mddev->sync_thread) { 4654 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ··· 4667 md_update_sb(mddev, 1); 4668 } 
4669 } 4670 + EXPORT_SYMBOL_GPL(md_stop_writes); 4671 4672 + void md_stop(mddev_t *mddev) 4673 { 4674 mddev->pers->stop(mddev); 4675 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4676 mddev->to_remove = &md_redundancy_group; ··· 4679 mddev->pers = NULL; 4680 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4681 } 4682 + EXPORT_SYMBOL_GPL(md_stop); 4683 4684 static int md_set_readonly(mddev_t *mddev, int is_open) 4685 { ··· 4698 mddev->ro = 1; 4699 set_disk_ro(mddev->gendisk, 1); 4700 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4701 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4702 err = 0; 4703 } 4704 out: ··· 4712 */ 4713 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4714 { 4715 struct gendisk *disk = mddev->gendisk; 4716 mdk_rdev_t *rdev; 4717 4718 mutex_lock(&mddev->open_mutex); 4719 + if (atomic_read(&mddev->openers) > is_open || 4720 + mddev->sysfs_active) { 4721 printk("md: %s still in use.\n",mdname(mddev)); 4722 + mutex_unlock(&mddev->open_mutex); 4723 + return -EBUSY; 4724 + } 4725 4726 + if (mddev->pers) { 4727 if (mddev->ro) 4728 set_disk_ro(disk, 0); 4729 4730 + md_stop_writes(mddev); 4731 md_stop(mddev); 4732 mddev->queue->merge_bvec_fn = NULL; 4733 mddev->queue->unplug_fn = NULL; 4734 mddev->queue->backing_dev_info.congested_fn = NULL; 4735 4736 /* tell userspace to handle 'inactive' */ 4737 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4738 4739 list_for_each_entry(rdev, &mddev->disks, same_set) 4740 if (rdev->raid_disk >= 0) { ··· 4741 } 4742 4743 set_capacity(disk, 0); 4744 + mutex_unlock(&mddev->open_mutex); 4745 revalidate_disk(disk); 4746 4747 if (mddev->ro) 4748 mddev->ro = 0; 4749 + } else 4750 + mutex_unlock(&mddev->open_mutex); 4751 /* 4752 * Free resources if final stop 4753 */ 4754 if (mode == 0) { 4755 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4756 4757 bitmap_destroy(mddev); ··· 4772 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4773 if (mddev->hold_active == UNTIL_STOP) 4774 mddev->hold_active = 0; 4775 } 4776 blk_integrity_unregister(disk); 4777 md_new_event(mddev); 4778 + sysfs_notify_dirent_safe(mddev->sysfs_state); 4779 + return 0; 4780 } 4781 4782 #ifndef MODULE ··· 5139 if (err) 5140 export_rdev(rdev); 5141 else 5142 + sysfs_notify_dirent_safe(rdev->sysfs_state); 5143 5144 md_update_sb(mddev, 1); 5145 if (mddev->degraded) ··· 5332 err = 0; 5333 if (mddev->pers) { 5334 mddev->pers->quiesce(mddev, 1); 5335 + if (fd >= 0) { 5336 err = bitmap_create(mddev); 5337 + if (!err) 5338 + err = bitmap_load(mddev); 5339 + } 5340 if (fd < 0 || err) { 5341 bitmap_destroy(mddev); 5342 fd = -1; /* make sure to put the file */ ··· 5582 mddev->bitmap_info.default_offset; 5583 mddev->pers->quiesce(mddev, 1); 5584 rv = bitmap_create(mddev); 5585 + if (!rv) 5586 + rv = bitmap_load(mddev); 5587 if (rv) 5588 bitmap_destroy(mddev); 5589 mddev->pers->quiesce(mddev, 0); ··· 5814 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5815 if (mddev->ro == 2) { 5816 mddev->ro = 0; 5817 + sysfs_notify_dirent_safe(mddev->sysfs_state); 5818 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5819 md_wakeup_thread(mddev->thread); 5820 } else { ··· 6065 mddev->pers->error_handler(mddev,rdev); 6066 if (mddev->degraded) 6067 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6068 + sysfs_notify_dirent_safe(rdev->sysfs_state); 6069 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6070 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6071 md_wakeup_thread(mddev->thread); 6072 + if (mddev->event_work.func) 6073 + 
schedule_work(&mddev->event_work); 6074 md_new_event_inintr(mddev); 6075 } 6076 ··· 6526 spin_unlock_irq(&mddev->write_lock); 6527 } 6528 if (did_change) 6529 + sysfs_notify_dirent_safe(mddev->sysfs_state); 6530 wait_event(mddev->sb_wait, 6531 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6532 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); ··· 6569 mddev->safemode = 1; 6570 spin_unlock_irq(&mddev->write_lock); 6571 md_update_sb(mddev, 0); 6572 + sysfs_notify_dirent_safe(mddev->sysfs_state); 6573 } else 6574 spin_unlock_irq(&mddev->write_lock); 6575 ··· 6579 return 0; 6580 } 6581 EXPORT_SYMBOL_GPL(md_allow_write); 6582 + 6583 + void md_unplug(mddev_t *mddev) 6584 + { 6585 + if (mddev->queue) 6586 + blk_unplug(mddev->queue); 6587 + if (mddev->plug) 6588 + mddev->plug->unplug_fn(mddev->plug); 6589 + } 6590 6591 #define SYNC_MARKS 10 6592 #define SYNC_MARK_STEP (3*HZ) ··· 6758 >= mddev->resync_max - mddev->curr_resync_completed 6759 )) { 6760 /* time to update curr_resync_completed */ 6761 + md_unplug(mddev); 6762 wait_event(mddev->recovery_wait, 6763 atomic_read(&mddev->recovery_active) == 0); 6764 mddev->curr_resync_completed = 6765 mddev->curr_resync; 6766 + if (mddev->persistent) 6767 + set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6768 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6769 } 6770 ··· 6835 * about not overloading the IO subsystem. (things like an 6836 * e2fsck being done on the RAID array should execute fast) 6837 */ 6838 + md_unplug(mddev); 6839 cond_resched(); 6840 6841 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 ··· 6854 * this also signals 'finished resyncing' to md_stop 6855 */ 6856 out: 6857 + md_unplug(mddev); 6858 6859 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6860 ··· 6956 sprintf(nm, "rd%d", rdev->raid_disk); 6957 if (sysfs_create_link(&mddev->kobj, 6958 &rdev->kobj, nm)) 6959 + /* failure here is OK */; 6960 spares++; 6961 md_new_event(mddev); 6962 set_bit(MD_CHANGE_DEVS, &mddev->flags); ··· 7052 mddev->safemode = 0; 7053 spin_unlock_irq(&mddev->write_lock); 7054 if (did_change) 7055 + sysfs_notify_dirent_safe(mddev->sysfs_state); 7056 } 7057 7058 if (mddev->flags) ··· 7091 mddev->recovery = 0; 7092 /* flag recovery needed just to double check */ 7093 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7094 + sysfs_notify_dirent_safe(mddev->sysfs_action); 7095 md_new_event(mddev); 7096 goto unlock; 7097 } ··· 7153 mddev->recovery = 0; 7154 } else 7155 md_wakeup_thread(mddev->sync_thread); 7156 + sysfs_notify_dirent_safe(mddev->sysfs_action); 7157 md_new_event(mddev); 7158 } 7159 unlock: ··· 7162 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 7163 &mddev->recovery)) 7164 if (mddev->sysfs_action) 7165 + sysfs_notify_dirent_safe(mddev->sysfs_action); 7166 } 7167 mddev_unlock(mddev); 7168 } ··· 7170 7171 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 7172 { 7173 + sysfs_notify_dirent_safe(rdev->sysfs_state); 7174 wait_event_timeout(rdev->blocked_wait, 7175 !test_bit(Blocked, &rdev->flags), 7176 msecs_to_jiffies(5000));
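md.c now exports the pieces that do_md_run()/do_md_stop() were built from: mddev_init(), md_run(), md_stop_writes(), md_stop(), md_rdev_init() and the mddev_suspend()/mddev_resume() pair. md_run() is the core of do_md_run() without the gendisk, bitmap_load() and uevent handling, so an array can be driven by code that owns no block device node. A minimal sketch of an external user, assuming a hypothetical wrapper structure (struct my_target and its callbacks are illustrative, not part of this series):

#include "md.h"	/* drivers/md/md.h */

struct my_target {
	mddev_t mddev;
};

static int my_target_start(struct my_target *t)
{
	mddev_init(&t->mddev);		/* locks, lists, default limits */
	/*
	 * ... the caller would set level/layout/chunk, initialise each
	 * component device with md_rdev_init() and link it into
	 * t->mddev.disks before starting the array ...
	 */
	return md_run(&t->mddev);	/* do_md_run() minus gendisk/bitmap work */
}

static void my_target_stop(struct my_target *t)
{
	md_stop_writes(&t->mddev);	/* freeze recovery, flush the superblock */
	md_stop(&t->mddev);		/* tear down the personality */
}

static void my_target_quiesce(struct my_target *t, int suspend)
{
	if (suspend)
		mddev_suspend(&t->mddev);	/* wait for in-flight array I/O */
	else
		mddev_resume(&t->mddev);
}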
+54 -1
drivers/md/md.h
··· 29 typedef struct mddev_s mddev_t; 30 typedef struct mdk_rdev_s mdk_rdev_t; 31 32 /* 33 * MD's 'extended' device 34 */ ··· 145 int suspended; 146 atomic_t active_io; 147 int ro; 148 149 struct gendisk *gendisk; 150 ··· 321 * hot-adding a bitmap. It should 322 * eventually be settable by sysfs. 323 */ 324 struct mutex mutex; 325 unsigned long chunksize; 326 - unsigned long daemon_sleep; /* how many seconds between updates? */ 327 unsigned long max_write_behind; /* write-behind mode */ 328 int external; 329 } bitmap_info; ··· 337 struct list_head all_mddevs; 338 339 struct attribute_group *to_remove; 340 /* Generic barrier handling. 341 * If there is a pending barrier request, all other 342 * writes are blocked while the devices are flushed. ··· 349 struct bio *barrier; 350 atomic_t flush_pending; 351 struct work_struct barrier_work; 352 }; 353 354 ··· 413 ssize_t (*store)(mddev_t *, const char *, size_t); 414 }; 415 extern struct attribute_group md_bitmap_group; 416 417 static inline char * mdname (mddev_t * mddev) 418 { ··· 518 extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 519 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 520 extern void restore_bitmap_write_access(struct file *file); 521 522 #endif /* _MD_MD_H */
··· 29 typedef struct mddev_s mddev_t; 30 typedef struct mdk_rdev_s mdk_rdev_t; 31 32 + /* generic plugging support - like that provided with request_queue, 33 + * but does not require a request_queue 34 + */ 35 + struct plug_handle { 36 + void (*unplug_fn)(struct plug_handle *); 37 + struct timer_list unplug_timer; 38 + struct work_struct unplug_work; 39 + unsigned long unplug_flag; 40 + }; 41 + #define PLUGGED_FLAG 1 42 + void plugger_init(struct plug_handle *plug, 43 + void (*unplug_fn)(struct plug_handle *)); 44 + void plugger_set_plug(struct plug_handle *plug); 45 + int plugger_remove_plug(struct plug_handle *plug); 46 + static inline void plugger_flush(struct plug_handle *plug) 47 + { 48 + del_timer_sync(&plug->unplug_timer); 49 + cancel_work_sync(&plug->unplug_work); 50 + } 51 + 52 /* 53 * MD's 'extended' device 54 */ ··· 125 int suspended; 126 atomic_t active_io; 127 int ro; 128 + int sysfs_active; /* set when sysfs deletes 129 + * are happening, so run/ 130 + * takeover/stop are not safe 131 + */ 132 133 struct gendisk *gendisk; 134 ··· 297 * hot-adding a bitmap. It should 298 * eventually be settable by sysfs. 299 */ 300 + /* When md is serving under dm, it might use a 301 + * dirty_log to store the bits. 302 + */ 303 + struct dm_dirty_log *log; 304 + 305 struct mutex mutex; 306 unsigned long chunksize; 307 + unsigned long daemon_sleep; /* how many jiffies between updates? */ 308 unsigned long max_write_behind; /* write-behind mode */ 309 int external; 310 } bitmap_info; ··· 308 struct list_head all_mddevs; 309 310 struct attribute_group *to_remove; 311 + struct plug_handle *plug; /* if used by personality */ 312 + 313 /* Generic barrier handling. 314 * If there is a pending barrier request, all other 315 * writes are blocked while the devices are flushed. ··· 318 struct bio *barrier; 319 atomic_t flush_pending; 320 struct work_struct barrier_work; 321 + struct work_struct event_work; /* used by dm to report failure event */ 322 }; 323 324 ··· 381 ssize_t (*store)(mddev_t *, const char *, size_t); 382 }; 383 extern struct attribute_group md_bitmap_group; 384 + 385 + static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) 386 + { 387 + if (sd) 388 + return sysfs_get_dirent(sd, NULL, name); 389 + return sd; 390 + } 391 + static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) 392 + { 393 + if (sd) 394 + sysfs_notify_dirent(sd); 395 + } 396 397 static inline char * mdname (mddev_t * mddev) 398 { ··· 474 extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 475 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 476 extern void restore_bitmap_write_access(struct file *file); 477 + extern void md_unplug(mddev_t *mddev); 478 479 + extern void mddev_init(mddev_t *mddev); 480 + extern int md_run(mddev_t *mddev); 481 + extern void md_stop(mddev_t *mddev); 482 + extern void md_stop_writes(mddev_t *mddev); 483 + extern void md_rdev_init(mdk_rdev_t *rdev); 484 + 485 + extern void mddev_suspend(mddev_t *mddev); 486 + extern void mddev_resume(mddev_t *mddev); 487 #endif /* _MD_MD_H */
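The plug_handle above gives any md component the same plug/unplug batching that request_queue provides, without needing a queue: plugger_set_plug() arms a short (roughly 3ms) timer, plugger_remove_plug() disarms it and reports whether it was armed, and when the timer fires the work item calls unplug_fn. A rough usage sketch for a hypothetical driver that batches deferred work (my_conf, my_unplug and the defer/flush helpers are invented for illustration; raid5.c below is the real in-tree user):

#include "md.h"	/* drivers/md/md.h: struct plug_handle, plugger_*() */

struct my_conf {
	struct plug_handle plug;
	/* ... state that the batched work operates on ... */
};

/* Runs from the plug timer/work item, or from an explicit unplug. */
static void my_unplug(struct plug_handle *plug)
{
	struct my_conf *conf = container_of(plug, struct my_conf, plug);

	if (plugger_remove_plug(&conf->plug)) {
		/* the plug was set: release the deferred requests now */
		/* my_handle_batch(conf);  -- hypothetical */
	}
}

static void my_conf_init(struct my_conf *conf)
{
	plugger_init(&conf->plug, my_unplug);
}

static void my_defer(struct my_conf *conf)
{
	/* queue the request internally, then arm the plug so my_unplug()
	 * runs a few milliseconds later if nothing flushes it sooner */
	plugger_set_plug(&conf->plug);
}

static void my_conf_free(struct my_conf *conf)
{
	plugger_flush(&conf->plug);	/* timer and work must be idle first */
}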
drivers/md/mktables.c => lib/raid6/mktables.c
+18
drivers/md/raid10.c
··· 825 */ 826 bp = bio_split(bio, 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 828 if (make_request(mddev, &bp->bio1)) 829 generic_make_request(&bp->bio1); 830 if (make_request(mddev, &bp->bio2)) 831 generic_make_request(&bp->bio2); 832 833 bio_pair_release(bp); 834 return 0;
··· 825 */ 826 bp = bio_split(bio, 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 828 + 829 + /* Each of these 'make_request' calls will call 'wait_barrier'. 830 + * If the first succeeds but the second blocks due to the resync 831 + * thread raising the barrier, we will deadlock because the 832 + * IO to the underlying device will be queued in generic_make_request 833 + * and will never complete, so will never reduce nr_pending. 834 + * So increment nr_waiting here so no new raise_barriers will 835 + * succeed, and so the second wait_barrier cannot block. 836 + */ 837 + spin_lock_irq(&conf->resync_lock); 838 + conf->nr_waiting++; 839 + spin_unlock_irq(&conf->resync_lock); 840 + 841 if (make_request(mddev, &bp->bio1)) 842 generic_make_request(&bp->bio1); 843 if (make_request(mddev, &bp->bio2)) 844 generic_make_request(&bp->bio2); 845 + 846 + spin_lock_irq(&conf->resync_lock); 847 + conf->nr_waiting--; 848 + wake_up(&conf->wait_barrier); 849 + spin_unlock_irq(&conf->resync_lock); 850 851 bio_pair_release(bp); 852 return 0;
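For context, the barrier scheme this hunk has to cooperate with looks roughly like the sketch below (greatly simplified, locking and unplugging elided; the real wait_barrier()/raise_barrier() live elsewhere in raid10.c and are not part of this diff):

static void wait_barrier(conf_t *conf)		/* normal I/O path */
{
	if (conf->barrier) {			/* resync has the barrier up */
		conf->nr_waiting++;
		wait_event(conf->wait_barrier, !conf->barrier);
		conf->nr_waiting--;
	}
	conf->nr_pending++;			/* this request is now in flight */
}

static void raise_barrier(conf_t *conf)		/* resync path */
{
	/* never raise the barrier over the head of waiting normal I/O */
	wait_event(conf->wait_barrier, !conf->nr_waiting);
	conf->barrier++;
	/* then let already-submitted normal I/O drain */
	wait_event(conf->wait_barrier, !conf->nr_pending);
}

After the first make_request() above returns, nr_pending has been raised but the low-level bios it generated are still parked on the caller's generic_make_request() list. If raise_barrier() got past its first wait at that point, the second make_request() would block in wait_barrier(), the parked bios would never be issued, nr_pending would never drop, and raise_barrier() would never complete either. Holding nr_waiting non-zero across both calls keeps raise_barrier() stuck at its first wait, so the second wait_barrier() cannot block.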
+108 -72
drivers/md/raid5.c
··· 201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state)) { 203 list_add_tail(&sh->lru, &conf->delayed_list); 204 - blk_plug_device(conf->mddev->queue); 205 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 206 sh->bm_seq - conf->seq_write > 0) { 207 list_add_tail(&sh->lru, &conf->bitmap_list); 208 - blk_plug_device(conf->mddev->queue); 209 } else { 210 clear_bit(STRIPE_BIT_DELAY, &sh->state); 211 list_add_tail(&sh->lru, &conf->handle_list); ··· 434 } 435 436 static void unplug_slaves(mddev_t *mddev); 437 - static void raid5_unplug_device(struct request_queue *q); 438 439 static struct stripe_head * 440 get_active_stripe(raid5_conf_t *conf, sector_t sector, ··· 463 < (conf->max_nr_stripes *3/4) 464 || !conf->inactive_blocked), 465 conf->device_lock, 466 - raid5_unplug_device(conf->mddev->queue) 467 ); 468 conf->inactive_blocked = 0; 469 } else ··· 1336 struct kmem_cache *sc; 1337 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1338 1339 - sprintf(conf->cache_name[0], 1340 - "raid%d-%s", conf->level, mdname(conf->mddev)); 1341 - sprintf(conf->cache_name[1], 1342 - "raid%d-%s-alt", conf->level, mdname(conf->mddev)); 1343 conf->active_name = 0; 1344 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1345 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), ··· 3617 list_add_tail(&sh->lru, &conf->hold_list); 3618 } 3619 } else 3620 - blk_plug_device(conf->mddev->queue); 3621 } 3622 3623 static void activate_bit_delay(raid5_conf_t *conf) ··· 3658 rcu_read_unlock(); 3659 } 3660 3661 - static void raid5_unplug_device(struct request_queue *q) 3662 { 3663 - mddev_t *mddev = q->queuedata; 3664 - raid5_conf_t *conf = mddev->private; 3665 unsigned long flags; 3666 3667 spin_lock_irqsave(&conf->device_lock, flags); 3668 3669 - if (blk_remove_plug(q)) { 3670 conf->seq_flush++; 3671 raid5_activate_delayed(conf); 3672 } 3673 - md_wakeup_thread(mddev->thread); 3674 3675 spin_unlock_irqrestore(&conf->device_lock, flags); 3676 3677 - unplug_slaves(mddev); 3678 } 3679 3680 - static int raid5_congested(void *data, int bits) 3681 { 3682 - mddev_t *mddev = data; 3683 raid5_conf_t *conf = mddev->private; 3684 3685 /* No difference between reads and writes. Just check 3686 * how busy the stripe_cache is 3687 */ 3688 3689 - if (mddev_congested(mddev, bits)) 3690 - return 1; 3691 if (conf->inactive_blocked) 3692 return 1; 3693 if (conf->quiesce) ··· 3704 return 1; 3705 3706 return 0; 3707 } 3708 3709 /* We want read requests to align with chunks where possible, ··· 4095 * add failed due to overlap. 
Flush everything 4096 * and wait a while 4097 */ 4098 - raid5_unplug_device(mddev->queue); 4099 release_stripe(sh); 4100 schedule(); 4101 goto retry; ··· 4586 return 0; 4587 } 4588 4589 static ssize_t 4590 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4591 { ··· 4626 4627 if (strict_strtoul(page, 10, &new)) 4628 return -EINVAL; 4629 - if (new <= 16 || new > 32768) 4630 - return -EINVAL; 4631 - while (new < conf->max_nr_stripes) { 4632 - if (drop_one_stripe(conf)) 4633 - conf->max_nr_stripes--; 4634 - else 4635 - break; 4636 - } 4637 - err = md_allow_write(mddev); 4638 if (err) 4639 return err; 4640 - while (new > conf->max_nr_stripes) { 4641 - if (grow_one_stripe(conf)) 4642 - conf->max_nr_stripes++; 4643 - else break; 4644 - } 4645 return len; 4646 } 4647 ··· 4991 static int run(mddev_t *mddev) 4992 { 4993 raid5_conf_t *conf; 4994 - int working_disks = 0, chunk_size; 4995 int dirty_parity_disks = 0; 4996 mdk_rdev_t *rdev; 4997 sector_t reshape_offset = 0; ··· 5177 "reshape"); 5178 } 5179 5180 - /* read-ahead size must cover two whole stripes, which is 5181 - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5182 - */ 5183 - { 5184 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5185 int stripe = data_disks * 5186 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5187 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5188 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5189 } 5190 - 5191 - /* Ok, everything is just fine now */ 5192 - if (mddev->to_remove == &raid5_attrs_group) 5193 - mddev->to_remove = NULL; 5194 - else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5195 - printk(KERN_WARNING 5196 - "md/raid:%s: failed to create sysfs attributes.\n", 5197 - mdname(mddev)); 5198 - 5199 - mddev->queue->queue_lock = &conf->device_lock; 5200 - 5201 - mddev->queue->unplug_fn = raid5_unplug_device; 5202 - mddev->queue->backing_dev_info.congested_data = mddev; 5203 - mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5204 - 5205 - md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5206 - 5207 - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5208 - chunk_size = mddev->chunk_sectors << 9; 5209 - blk_queue_io_min(mddev->queue, chunk_size); 5210 - blk_queue_io_opt(mddev->queue, chunk_size * 5211 - (conf->raid_disks - conf->max_degraded)); 5212 - 5213 - list_for_each_entry(rdev, &mddev->disks, same_set) 5214 - disk_stack_limits(mddev->gendisk, rdev->bdev, 5215 - rdev->data_offset << 9); 5216 5217 return 0; 5218 abort: ··· 5238 5239 md_unregister_thread(mddev->thread); 5240 mddev->thread = NULL; 5241 - mddev->queue->backing_dev_info.congested_fn = NULL; 5242 - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5243 free_conf(conf); 5244 mddev->private = NULL; 5245 mddev->to_remove = &raid5_attrs_group; ··· 5584 sprintf(nm, "rd%d", rdev->raid_disk); 5585 if (sysfs_create_link(&mddev->kobj, 5586 &rdev->kobj, nm)) 5587 - printk(KERN_WARNING 5588 - "md/raid:%s: failed to create " 5589 - " link %s\n", 5590 - mdname(mddev), nm); 5591 } else 5592 break; 5593 } ··· 5639 /* read-ahead size must cover two whole stripes, which is 5640 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5641 */ 5642 - { 5643 int data_disks = conf->raid_disks - conf->max_degraded; 5644 int stripe = data_disks * ((conf->chunk_sectors << 9) 5645 / PAGE_SIZE);
··· 201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state)) { 203 list_add_tail(&sh->lru, &conf->delayed_list); 204 + plugger_set_plug(&conf->plug); 205 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 206 sh->bm_seq - conf->seq_write > 0) { 207 list_add_tail(&sh->lru, &conf->bitmap_list); 208 + plugger_set_plug(&conf->plug); 209 } else { 210 clear_bit(STRIPE_BIT_DELAY, &sh->state); 211 list_add_tail(&sh->lru, &conf->handle_list); ··· 434 } 435 436 static void unplug_slaves(mddev_t *mddev); 437 438 static struct stripe_head * 439 get_active_stripe(raid5_conf_t *conf, sector_t sector, ··· 464 < (conf->max_nr_stripes *3/4) 465 || !conf->inactive_blocked), 466 conf->device_lock, 467 + md_raid5_unplug_device(conf) 468 ); 469 conf->inactive_blocked = 0; 470 } else ··· 1337 struct kmem_cache *sc; 1338 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1339 1340 + if (conf->mddev->gendisk) 1341 + sprintf(conf->cache_name[0], 1342 + "raid%d-%s", conf->level, mdname(conf->mddev)); 1343 + else 1344 + sprintf(conf->cache_name[0], 1345 + "raid%d-%p", conf->level, conf->mddev); 1346 + sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1347 + 1348 conf->active_name = 0; 1349 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1350 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), ··· 3614 list_add_tail(&sh->lru, &conf->hold_list); 3615 } 3616 } else 3617 + plugger_set_plug(&conf->plug); 3618 } 3619 3620 static void activate_bit_delay(raid5_conf_t *conf) ··· 3655 rcu_read_unlock(); 3656 } 3657 3658 + void md_raid5_unplug_device(raid5_conf_t *conf) 3659 { 3660 unsigned long flags; 3661 3662 spin_lock_irqsave(&conf->device_lock, flags); 3663 3664 + if (plugger_remove_plug(&conf->plug)) { 3665 conf->seq_flush++; 3666 raid5_activate_delayed(conf); 3667 } 3668 + md_wakeup_thread(conf->mddev->thread); 3669 3670 spin_unlock_irqrestore(&conf->device_lock, flags); 3671 3672 + unplug_slaves(conf->mddev); 3673 + } 3674 + EXPORT_SYMBOL_GPL(md_raid5_unplug_device); 3675 + 3676 + static void raid5_unplug(struct plug_handle *plug) 3677 + { 3678 + raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); 3679 + md_raid5_unplug_device(conf); 3680 } 3681 3682 + static void raid5_unplug_queue(struct request_queue *q) 3683 { 3684 + mddev_t *mddev = q->queuedata; 3685 + md_raid5_unplug_device(mddev->private); 3686 + } 3687 + 3688 + int md_raid5_congested(mddev_t *mddev, int bits) 3689 + { 3690 raid5_conf_t *conf = mddev->private; 3691 3692 /* No difference between reads and writes. Just check 3693 * how busy the stripe_cache is 3694 */ 3695 3696 if (conf->inactive_blocked) 3697 return 1; 3698 if (conf->quiesce) ··· 3693 return 1; 3694 3695 return 0; 3696 + } 3697 + EXPORT_SYMBOL_GPL(md_raid5_congested); 3698 + 3699 + static int raid5_congested(void *data, int bits) 3700 + { 3701 + mddev_t *mddev = data; 3702 + 3703 + return mddev_congested(mddev, bits) || 3704 + md_raid5_congested(mddev, bits); 3705 } 3706 3707 /* We want read requests to align with chunks where possible, ··· 4075 * add failed due to overlap. 
Flush everything 4076 * and wait a while 4077 */ 4078 + md_raid5_unplug_device(conf); 4079 release_stripe(sh); 4080 schedule(); 4081 goto retry; ··· 4566 return 0; 4567 } 4568 4569 + int 4570 + raid5_set_cache_size(mddev_t *mddev, int size) 4571 + { 4572 + raid5_conf_t *conf = mddev->private; 4573 + int err; 4574 + 4575 + if (size <= 16 || size > 32768) 4576 + return -EINVAL; 4577 + while (size < conf->max_nr_stripes) { 4578 + if (drop_one_stripe(conf)) 4579 + conf->max_nr_stripes--; 4580 + else 4581 + break; 4582 + } 4583 + err = md_allow_write(mddev); 4584 + if (err) 4585 + return err; 4586 + while (size > conf->max_nr_stripes) { 4587 + if (grow_one_stripe(conf)) 4588 + conf->max_nr_stripes++; 4589 + else break; 4590 + } 4591 + return 0; 4592 + } 4593 + EXPORT_SYMBOL(raid5_set_cache_size); 4594 + 4595 static ssize_t 4596 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4597 { ··· 4580 4581 if (strict_strtoul(page, 10, &new)) 4582 return -EINVAL; 4583 + err = raid5_set_cache_size(mddev, new); 4584 if (err) 4585 return err; 4586 return len; 4587 } 4588 ··· 4958 static int run(mddev_t *mddev) 4959 { 4960 raid5_conf_t *conf; 4961 + int working_disks = 0; 4962 int dirty_parity_disks = 0; 4963 mdk_rdev_t *rdev; 4964 sector_t reshape_offset = 0; ··· 5144 "reshape"); 5145 } 5146 5147 + 5148 + /* Ok, everything is just fine now */ 5149 + if (mddev->to_remove == &raid5_attrs_group) 5150 + mddev->to_remove = NULL; 5151 + else if (mddev->kobj.sd && 5152 + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5153 + printk(KERN_WARNING 5154 + "raid5: failed to create sysfs attributes for %s\n", 5155 + mdname(mddev)); 5156 + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5157 + 5158 + plugger_init(&conf->plug, raid5_unplug); 5159 + mddev->plug = &conf->plug; 5160 + if (mddev->queue) { 5161 + int chunk_size; 5162 + /* read-ahead size must cover two whole stripes, which 5163 + * is 2 * (datadisks) * chunksize where 'n' is the 5164 + * number of raid devices 5165 + */ 5166 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5167 int stripe = data_disks * 5168 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5169 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5170 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5171 + 5172 + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5173 + 5174 + mddev->queue->backing_dev_info.congested_data = mddev; 5175 + mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5176 + mddev->queue->queue_lock = &conf->device_lock; 5177 + mddev->queue->unplug_fn = raid5_unplug_queue; 5178 + 5179 + chunk_size = mddev->chunk_sectors << 9; 5180 + blk_queue_io_min(mddev->queue, chunk_size); 5181 + blk_queue_io_opt(mddev->queue, chunk_size * 5182 + (conf->raid_disks - conf->max_degraded)); 5183 + 5184 + list_for_each_entry(rdev, &mddev->disks, same_set) 5185 + disk_stack_limits(mddev->gendisk, rdev->bdev, 5186 + rdev->data_offset << 9); 5187 } 5188 5189 return 0; 5190 abort: ··· 5200 5201 md_unregister_thread(mddev->thread); 5202 mddev->thread = NULL; 5203 + if (mddev->queue) 5204 + mddev->queue->backing_dev_info.congested_fn = NULL; 5205 + plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ 5206 free_conf(conf); 5207 mddev->private = NULL; 5208 mddev->to_remove = &raid5_attrs_group; ··· 5545 sprintf(nm, "rd%d", rdev->raid_disk); 5546 if (sysfs_create_link(&mddev->kobj, 5547 &rdev->kobj, nm)) 5548 + /* Failure here is OK */; 5549 } else 5550 break; 5551 } ··· 5603 /* read-ahead size must cover two 
whole stripes, which is 5604 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5605 */ 5606 + if (conf->mddev->queue) { 5607 int data_disks = conf->raid_disks - conf->max_degraded; 5608 int stripe = data_disks * ((conf->chunk_sectors << 9) 5609 / PAGE_SIZE);
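The congestion test is now split: md_raid5_congested() holds only the raid5-specific checks (stripe-cache exhaustion, quiesce) and is exported, while the static raid5_congested() keeps the request_queue-facing (void *data, int bits) signature and also folds in mddev_congested(). That lets a user with no request_queue of its own answer congestion queries directly; a minimal sketch, assuming the caller registered a backing_dev_info-style callback with an mddev_t as its data pointer (the wrapper name is illustrative):

static int my_wrapper_congested(void *data, int bits)
{
	mddev_t *mddev = data;

	/* raid5-only state; whether to also consult mddev_congested()
	 * is up to the embedding code */
	return md_raid5_congested(mddev, bits);
}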
+8 -1
drivers/md/raid5.h
··· 388 * two caches. 389 */ 390 int active_name; 391 - char cache_name[2][20]; 392 struct kmem_cache *slab_cache; /* for allocating stripes */ 393 394 int seq_flush, seq_write; ··· 398 * (fresh device added). 399 * Cleared when a sync completes. 400 */ 401 /* per cpu variables */ 402 struct raid5_percpu { 403 struct page *spare_page; /* Used when checking P/Q in raid6 */ ··· 500 { 501 return layout >= 8 && layout <= 10; 502 } 503 #endif
··· 388 * two caches. 389 */ 390 int active_name; 391 + char cache_name[2][32]; 392 struct kmem_cache *slab_cache; /* for allocating stripes */ 393 394 int seq_flush, seq_write; ··· 398 * (fresh device added). 399 * Cleared when a sync completes. 400 */ 401 + 402 + struct plug_handle plug; 403 + 404 /* per cpu variables */ 405 struct raid5_percpu { 406 struct page *spare_page; /* Used when checking P/Q in raid6 */ ··· 497 { 498 return layout >= 8 && layout <= 10; 499 } 500 + 501 + extern int md_raid5_congested(mddev_t *mddev, int bits); 502 + extern void md_raid5_unplug_device(raid5_conf_t *conf); 503 + extern int raid5_set_cache_size(mddev_t *mddev, int size); 504 #endif
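The three externs added here are the raid5 pieces an embedding driver (the dm case this series prepares for) needs even when mddev->queue is NULL: congestion state, an explicit unplug, and stripe-cache sizing without going through sysfs. A short sketch of the latter two, assuming the caller already holds a running raid5 mddev (the helper names are invented):

#include "md.h"
#include "raid5.h"

/* Resize the stripe cache from outside md's sysfs store routine.
 * Accepted sizes match the sysfs path: more than 16, at most 32768. */
static int my_tune_stripe_cache(mddev_t *mddev, int nr_stripes)
{
	return raid5_set_cache_size(mddev, nr_stripes);
}

/* Kick plugged/delayed stripes loose, e.g. before quiescing the array. */
static void my_flush_stripes(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev->private;

	md_raid5_unplug_device(conf);
}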
drivers/md/raid6algos.c => lib/raid6/raid6algos.c
drivers/md/raid6altivec.uc => lib/raid6/raid6altivec.uc
drivers/md/raid6int.uc => lib/raid6/raid6int.uc
drivers/md/raid6mmx.c => lib/raid6/raid6mmx.c
drivers/md/raid6recov.c => lib/raid6/raid6recov.c
drivers/md/raid6sse1.c => lib/raid6/raid6sse1.c
drivers/md/raid6sse2.c => lib/raid6/raid6sse2.c
drivers/md/raid6test/Makefile => lib/raid6/raid6test/Makefile
drivers/md/raid6test/test.c => lib/raid6/raid6test/test.c
drivers/md/raid6x86.h => lib/raid6/raid6x86.h
drivers/md/unroll.awk => lib/raid6/unroll.awk
+3
lib/Kconfig
··· 7 8 menu "Library routines" 9 10 config BITREVERSE 11 tristate 12
··· 7 8 menu "Library routines" 9 10 + config RAID6_PQ 11 + tristate 12 + 13 config BITREVERSE 14 tristate 15
+1
lib/Makefile
··· 69 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 70 obj-$(CONFIG_LZO_COMPRESS) += lzo/ 71 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ 72 73 lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o 74 lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
··· 69 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 70 obj-$(CONFIG_LZO_COMPRESS) += lzo/ 71 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ 72 + obj-$(CONFIG_RAID6_PQ) += raid6/ 73 74 lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o 75 lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
+78
lib/raid6/Makefile
···
··· 1 + obj-$(CONFIG_RAID6_PQ) += raid6_pq.o 2 + 3 + raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ 4 + raid6int1.o raid6int2.o raid6int4.o \ 5 + raid6int8.o raid6int16.o raid6int32.o \ 6 + raid6altivec1.o raid6altivec2.o raid6altivec4.o \ 7 + raid6altivec8.o \ 8 + raid6mmx.o raid6sse1.o raid6sse2.o 9 + hostprogs-y += mktables 10 + 11 + quiet_cmd_unroll = UNROLL $@ 12 + cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ 13 + < $< > $@ || ( rm -f $@ && exit 1 ) 14 + 15 + ifeq ($(CONFIG_ALTIVEC),y) 16 + altivec_flags := -maltivec -mabi=altivec 17 + endif 18 + 19 + targets += raid6int1.c 20 + $(obj)/raid6int1.c: UNROLL := 1 21 + $(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 22 + $(call if_changed,unroll) 23 + 24 + targets += raid6int2.c 25 + $(obj)/raid6int2.c: UNROLL := 2 26 + $(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 27 + $(call if_changed,unroll) 28 + 29 + targets += raid6int4.c 30 + $(obj)/raid6int4.c: UNROLL := 4 31 + $(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 32 + $(call if_changed,unroll) 33 + 34 + targets += raid6int8.c 35 + $(obj)/raid6int8.c: UNROLL := 8 36 + $(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 37 + $(call if_changed,unroll) 38 + 39 + targets += raid6int16.c 40 + $(obj)/raid6int16.c: UNROLL := 16 41 + $(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 42 + $(call if_changed,unroll) 43 + 44 + targets += raid6int32.c 45 + $(obj)/raid6int32.c: UNROLL := 32 46 + $(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE 47 + $(call if_changed,unroll) 48 + 49 + CFLAGS_raid6altivec1.o += $(altivec_flags) 50 + targets += raid6altivec1.c 51 + $(obj)/raid6altivec1.c: UNROLL := 1 52 + $(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE 53 + $(call if_changed,unroll) 54 + 55 + CFLAGS_raid6altivec2.o += $(altivec_flags) 56 + targets += raid6altivec2.c 57 + $(obj)/raid6altivec2.c: UNROLL := 2 58 + $(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE 59 + $(call if_changed,unroll) 60 + 61 + CFLAGS_raid6altivec4.o += $(altivec_flags) 62 + targets += raid6altivec4.c 63 + $(obj)/raid6altivec4.c: UNROLL := 4 64 + $(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE 65 + $(call if_changed,unroll) 66 + 67 + CFLAGS_raid6altivec8.o += $(altivec_flags) 68 + targets += raid6altivec8.c 69 + $(obj)/raid6altivec8.c: UNROLL := 8 70 + $(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE 71 + $(call if_changed,unroll) 72 + 73 + quiet_cmd_mktable = TABLE $@ 74 + cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) 75 + 76 + targets += raid6tables.c 77 + $(obj)/raid6tables.c: $(obj)/mktables FORCE 78 + $(call if_changed,mktable)
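With the sources moved under lib/raid6 and built from this Makefile, the P/Q syndrome and recovery routines are selected with RAID6_PQ (see the lib/Kconfig hunk above) rather than the old MD_RAID6_PQ symbol, so users other than the raid456 personality can link against them. A hedged sketch of calling the library, assuming the declarations in include/linux/raid/pq.h (the wrapper functions are illustrative):

#include <linux/raid/pq.h>

/* ptrs[0..disks-3] are the data blocks; ptrs[disks-2] and
 * ptrs[disks-1] receive P and Q.  raid6_call points at the fastest
 * gen_syndrome implementation chosen when the module initialises. */
static void my_compute_pq(int disks, size_t bytes, void **ptrs)
{
	raid6_call.gen_syndrome(disks, bytes, ptrs);
}

/* Rebuild two failed data blocks from the survivors plus P and Q. */
static void my_recover_two(int disks, size_t bytes,
			   int faila, int failb, void **ptrs)
{
	raid6_2data_recov(disks, bytes, faila, failb, ptrs);
}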