Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'md/3.12' of git://neil.brown.name/md

Pull md update from Neil Brown:
"Headline item is multithreading for RAID5 so that more IO/sec can be
supported on fast (SSD) devices. Also TILE-Gx SIMD support for RAID6
calculations and an assortment of bug fixes"

* tag 'md/3.12' of git://neil.brown.name/md:
raid5: only wakeup necessary threads
md/raid5: flush out all pending requests before proceeding with reshape.
md/raid5: use seqcount to protect access to shape in make_request.
raid5: sysfs entry to control worker thread number
raid5: offload stripe handle to workqueue
raid5: fix stripe release order
raid5: make release_stripe lockless
md: avoid deadlock when dirty buffers during md_stop.
md: Don't test all of mddev->flags at once.
md: Fix apparent cut-and-paste error in super_90_validate
raid6/test: replace echo -e with printf
RAID: add tilegx SIMD implementation of raid6
md: fix safe_mode buglet.
md: don't call md_allow_write in get_bitmap_file.

+510 -41
+36 -18
drivers/md/md.c
··· 1180 1180 mddev->bitmap_info.offset = 1181 1181 mddev->bitmap_info.default_offset; 1182 1182 mddev->bitmap_info.space = 1183 - mddev->bitmap_info.space; 1183 + mddev->bitmap_info.default_space; 1184 1184 } 1185 1185 1186 1186 } else if (mddev->pers == NULL) { ··· 3429 3429 mddev->safemode_delay = (msec*HZ)/1000; 3430 3430 if (mddev->safemode_delay == 0) 3431 3431 mddev->safemode_delay = 1; 3432 - if (mddev->safemode_delay < old_delay) 3432 + if (mddev->safemode_delay < old_delay || old_delay == 0) 3433 3433 md_safemode_timeout((unsigned long)mddev); 3434 3434 } 3435 3435 return len; ··· 5144 5144 5145 5145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5146 5146 5147 - if (mddev->flags) 5147 + if (mddev->flags & MD_UPDATE_SB_FLAGS) 5148 5148 md_update_sb(mddev, 0); 5149 5149 5150 5150 md_new_event(mddev); ··· 5289 5289 md_super_wait(mddev); 5290 5290 5291 5291 if (mddev->ro == 0 && 5292 - (!mddev->in_sync || mddev->flags)) { 5292 + (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5293 5293 /* mark array as shutdown cleanly */ 5294 5294 mddev->in_sync = 1; 5295 5295 md_update_sb(mddev, 1); ··· 5337 5337 err = -EBUSY; 5338 5338 goto out; 5339 5339 } 5340 - if (bdev) 5341 - sync_blockdev(bdev); 5340 + if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { 5341 + /* Someone opened the device since we flushed it 5342 + * so page cache could be dirty and it is too late 5343 + * to flush. So abort 5344 + */ 5345 + mutex_unlock(&mddev->open_mutex); 5346 + return -EBUSY; 5347 + } 5342 5348 if (mddev->pers) { 5343 5349 __md_stop_writes(mddev); 5344 5350 ··· 5379 5373 mutex_unlock(&mddev->open_mutex); 5380 5374 return -EBUSY; 5381 5375 } 5382 - if (bdev) 5383 - /* It is possible IO was issued on some other 5384 - * open file which was closed before we took ->open_mutex. 5385 - * As that was not the last close __blkdev_put will not 5386 - * have called sync_blockdev, so we must. 
5376 + if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { 5377 + /* Someone opened the device since we flushed it 5378 + * so page cache could be dirty and it is too late 5379 + * to flush. So abort 5387 5380 */ 5388 - sync_blockdev(bdev); 5389 - 5381 + mutex_unlock(&mddev->open_mutex); 5382 + return -EBUSY; 5383 + } 5390 5384 if (mddev->pers) { 5391 5385 if (mddev->ro) 5392 5386 set_disk_ro(disk, 0); ··· 5634 5628 char *ptr, *buf = NULL; 5635 5629 int err = -ENOMEM; 5636 5630 5637 - if (md_allow_write(mddev)) 5638 - file = kmalloc(sizeof(*file), GFP_NOIO); 5639 - else 5640 - file = kmalloc(sizeof(*file), GFP_KERNEL); 5631 + file = kmalloc(sizeof(*file), GFP_NOIO); 5641 5632 5642 5633 if (!file) 5643 5634 goto out; ··· 6423 6420 !test_bit(MD_RECOVERY_NEEDED, 6424 6421 &mddev->flags), 6425 6422 msecs_to_jiffies(5000)); 6423 + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6424 + /* Need to flush page cache, and ensure no-one else opens 6425 + * and writes 6426 + */ 6427 + mutex_lock(&mddev->open_mutex); 6428 + if (atomic_read(&mddev->openers) > 1) { 6429 + mutex_unlock(&mddev->open_mutex); 6430 + err = -EBUSY; 6431 + goto abort; 6432 + } 6433 + set_bit(MD_STILL_CLOSED, &mddev->flags); 6434 + mutex_unlock(&mddev->open_mutex); 6435 + sync_blockdev(bdev); 6436 + } 6426 6437 err = mddev_lock(mddev); 6427 6438 if (err) { 6428 6439 printk(KERN_INFO ··· 6690 6673 6691 6674 err = 0; 6692 6675 atomic_inc(&mddev->openers); 6676 + clear_bit(MD_STILL_CLOSED, &mddev->flags); 6693 6677 mutex_unlock(&mddev->open_mutex); 6694 6678 6695 6679 check_disk_change(bdev); ··· 7835 7817 sysfs_notify_dirent_safe(mddev->sysfs_state); 7836 7818 } 7837 7819 7838 - if (mddev->flags) 7820 + if (mddev->flags & MD_UPDATE_SB_FLAGS) 7839 7821 md_update_sb(mddev, 0); 7840 7822 7841 7823 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+6 -2
drivers/md/md.h
··· 204 204 struct md_personality *pers; 205 205 dev_t unit; 206 206 int md_minor; 207 - struct list_head disks; 207 + struct list_head disks; 208 208 unsigned long flags; 209 209 #define MD_CHANGE_DEVS 0 /* Some device status has changed */ 210 210 #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ 211 211 #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ 212 + #define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ 212 213 #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ 214 + #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since 215 + * md_ioctl checked on it. 216 + */ 213 217 214 218 int suspended; 215 219 atomic_t active_io; ··· 222 218 * are happening, so run/ 223 219 * takeover/stop are not safe 224 220 */ 225 - int ready; /* See when safe to pass 221 + int ready; /* See when safe to pass 226 222 * IO requests down */ 227 223 struct gendisk *gendisk; 228 224
+342 -20
drivers/md/raid5.c
··· 53 53 #include <linux/cpu.h> 54 54 #include <linux/slab.h> 55 55 #include <linux/ratelimit.h> 56 + #include <linux/nodemask.h> 56 57 #include <trace/events/block.h> 57 58 58 59 #include "md.h" ··· 61 60 #include "raid0.h" 62 61 #include "bitmap.h" 63 62 63 + #define cpu_to_group(cpu) cpu_to_node(cpu) 64 + #define ANY_GROUP NUMA_NO_NODE 65 + 66 + static struct workqueue_struct *raid5_wq; 64 67 /* 65 68 * Stripe cache 66 69 */ ··· 77 72 #define BYPASS_THRESHOLD 1 78 73 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 79 74 #define HASH_MASK (NR_HASH - 1) 75 + #define MAX_STRIPE_BATCH 8 80 76 81 77 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 82 78 { ··· 206 200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 207 201 } 208 202 203 + static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 204 + { 205 + struct r5conf *conf = sh->raid_conf; 206 + struct r5worker_group *group; 207 + int thread_cnt; 208 + int i, cpu = sh->cpu; 209 + 210 + if (!cpu_online(cpu)) { 211 + cpu = cpumask_any(cpu_online_mask); 212 + sh->cpu = cpu; 213 + } 214 + 215 + if (list_empty(&sh->lru)) { 216 + struct r5worker_group *group; 217 + group = conf->worker_groups + cpu_to_group(cpu); 218 + list_add_tail(&sh->lru, &group->handle_list); 219 + group->stripes_cnt++; 220 + sh->group = group; 221 + } 222 + 223 + if (conf->worker_cnt_per_group == 0) { 224 + md_wakeup_thread(conf->mddev->thread); 225 + return; 226 + } 227 + 228 + group = conf->worker_groups + cpu_to_group(sh->cpu); 229 + 230 + group->workers[0].working = true; 231 + /* at least one worker should run to avoid race */ 232 + queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 233 + 234 + thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 235 + /* wakeup more workers */ 236 + for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 237 + if (group->workers[i].working == false) { 238 + group->workers[i].working = true; 239 + queue_work_on(sh->cpu, raid5_wq, 240 + 
&group->workers[i].work); 241 + thread_cnt--; 242 + } 243 + } 244 + } 245 + 209 246 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 210 247 { 211 248 BUG_ON(!list_empty(&sh->lru)); ··· 263 214 else { 264 215 clear_bit(STRIPE_DELAYED, &sh->state); 265 216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 266 - list_add_tail(&sh->lru, &conf->handle_list); 217 + if (conf->worker_cnt_per_group == 0) { 218 + list_add_tail(&sh->lru, &conf->handle_list); 219 + } else { 220 + raid5_wakeup_stripe_thread(sh); 221 + return; 222 + } 267 223 } 268 224 md_wakeup_thread(conf->mddev->thread); 269 225 } else { ··· 293 239 do_release_stripe(conf, sh); 294 240 } 295 241 242 + static struct llist_node *llist_reverse_order(struct llist_node *head) 243 + { 244 + struct llist_node *new_head = NULL; 245 + 246 + while (head) { 247 + struct llist_node *tmp = head; 248 + head = head->next; 249 + tmp->next = new_head; 250 + new_head = tmp; 251 + } 252 + 253 + return new_head; 254 + } 255 + 256 + /* should hold conf->device_lock already */ 257 + static int release_stripe_list(struct r5conf *conf) 258 + { 259 + struct stripe_head *sh; 260 + int count = 0; 261 + struct llist_node *head; 262 + 263 + head = llist_del_all(&conf->released_stripes); 264 + head = llist_reverse_order(head); 265 + while (head) { 266 + sh = llist_entry(head, struct stripe_head, release_list); 267 + head = llist_next(head); 268 + /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 269 + smp_mb(); 270 + clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 271 + /* 272 + * Don't worry the bit is set here, because if the bit is set 273 + * again, the count is always > 1. This is true for 274 + * STRIPE_ON_UNPLUG_LIST bit too. 
275 + */ 276 + __release_stripe(conf, sh); 277 + count++; 278 + } 279 + 280 + return count; 281 + } 282 + 296 283 static void release_stripe(struct stripe_head *sh) 297 284 { 298 285 struct r5conf *conf = sh->raid_conf; 299 286 unsigned long flags; 287 + bool wakeup; 300 288 289 + if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 290 + goto slow_path; 291 + wakeup = llist_add(&sh->release_list, &conf->released_stripes); 292 + if (wakeup) 293 + md_wakeup_thread(conf->mddev->thread); 294 + return; 295 + slow_path: 301 296 local_irq_save(flags); 297 + /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 302 298 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 303 299 do_release_stripe(conf, sh); 304 300 spin_unlock(&conf->device_lock); ··· 463 359 raid5_build_block(sh, i, previous); 464 360 } 465 361 insert_hash(conf, sh); 362 + sh->cpu = smp_processor_id(); 466 363 } 467 364 468 365 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, ··· 596 491 if (atomic_read(&sh->count)) { 597 492 BUG_ON(!list_empty(&sh->lru) 598 493 && !test_bit(STRIPE_EXPANDING, &sh->state) 599 - && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); 494 + && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 495 + && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 600 496 } else { 601 497 if (!test_bit(STRIPE_HANDLE, &sh->state)) 602 498 atomic_inc(&conf->active_stripes); ··· 605 499 !test_bit(STRIPE_EXPANDING, &sh->state)) 606 500 BUG(); 607 501 list_del_init(&sh->lru); 502 + if (sh->group) { 503 + sh->group->stripes_cnt--; 504 + sh->group = NULL; 505 + } 608 506 } 609 507 } 610 508 } while (sh == NULL); ··· 3889 3779 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3890 3780 atomic_inc(&conf->preread_active_stripes); 3891 3781 list_add_tail(&sh->lru, &conf->hold_list); 3782 + raid5_wakeup_stripe_thread(sh); 3892 3783 } 3893 3784 } 3894 3785 } ··· 4169 4058 * head of the hold_list has changed, i.e. 
the head was promoted to the 4170 4059 * handle_list. 4171 4060 */ 4172 - static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4061 + static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 4173 4062 { 4174 - struct stripe_head *sh; 4063 + struct stripe_head *sh = NULL, *tmp; 4064 + struct list_head *handle_list = NULL; 4065 + struct r5worker_group *wg = NULL; 4066 + 4067 + if (conf->worker_cnt_per_group == 0) { 4068 + handle_list = &conf->handle_list; 4069 + } else if (group != ANY_GROUP) { 4070 + handle_list = &conf->worker_groups[group].handle_list; 4071 + wg = &conf->worker_groups[group]; 4072 + } else { 4073 + int i; 4074 + for (i = 0; i < conf->group_cnt; i++) { 4075 + handle_list = &conf->worker_groups[i].handle_list; 4076 + wg = &conf->worker_groups[i]; 4077 + if (!list_empty(handle_list)) 4078 + break; 4079 + } 4080 + } 4175 4081 4176 4082 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4177 4083 __func__, 4178 - list_empty(&conf->handle_list) ? "empty" : "busy", 4084 + list_empty(handle_list) ? "empty" : "busy", 4179 4085 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4180 4086 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4181 4087 4182 - if (!list_empty(&conf->handle_list)) { 4183 - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4088 + if (!list_empty(handle_list)) { 4089 + sh = list_entry(handle_list->next, typeof(*sh), lru); 4184 4090 4185 4091 if (list_empty(&conf->hold_list)) 4186 4092 conf->bypass_count = 0; ··· 4215 4087 ((conf->bypass_threshold && 4216 4088 conf->bypass_count > conf->bypass_threshold) || 4217 4089 atomic_read(&conf->pending_full_writes) == 0)) { 4218 - sh = list_entry(conf->hold_list.next, 4219 - typeof(*sh), lru); 4220 - conf->bypass_count -= conf->bypass_threshold; 4221 - if (conf->bypass_count < 0) 4222 - conf->bypass_count = 0; 4223 - } else 4090 + 4091 + list_for_each_entry(tmp, &conf->hold_list, lru) { 4092 + if (conf->worker_cnt_per_group == 0 || 4093 + group == ANY_GROUP || 4094 + !cpu_online(tmp->cpu) || 4095 + cpu_to_group(tmp->cpu) == group) { 4096 + sh = tmp; 4097 + break; 4098 + } 4099 + } 4100 + 4101 + if (sh) { 4102 + conf->bypass_count -= conf->bypass_threshold; 4103 + if (conf->bypass_count < 0) 4104 + conf->bypass_count = 0; 4105 + } 4106 + wg = NULL; 4107 + } 4108 + 4109 + if (!sh) 4224 4110 return NULL; 4225 4111 4112 + if (wg) { 4113 + wg->stripes_cnt--; 4114 + sh->group = NULL; 4115 + } 4226 4116 list_del_init(&sh->lru); 4227 4117 atomic_inc(&sh->count); 4228 4118 BUG_ON(atomic_read(&sh->count) != 1); ··· 4273 4127 */ 4274 4128 smp_mb__before_clear_bit(); 4275 4129 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4130 + /* 4131 + * STRIPE_ON_RELEASE_LIST could be set here. 
In that 4132 + * case, the count is always > 1 here 4133 + */ 4276 4134 __release_stripe(conf, sh); 4277 4135 cnt++; 4278 4136 } ··· 4436 4286 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4437 4287 DEFINE_WAIT(w); 4438 4288 int previous; 4289 + int seq; 4439 4290 4440 4291 retry: 4292 + seq = read_seqcount_begin(&conf->gen_lock); 4441 4293 previous = 0; 4442 4294 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4443 4295 if (unlikely(conf->reshape_progress != MaxSector)) { ··· 4472 4320 previous, 4473 4321 &dd_idx, NULL); 4474 4322 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4475 - (unsigned long long)new_sector, 4323 + (unsigned long long)new_sector, 4476 4324 (unsigned long long)logical_sector); 4477 4325 4478 4326 sh = get_active_stripe(conf, new_sector, previous, ··· 4500 4348 schedule(); 4501 4349 goto retry; 4502 4350 } 4351 + } 4352 + if (read_seqcount_retry(&conf->gen_lock, seq)) { 4353 + /* Might have got the wrong stripe_head 4354 + * by accident 4355 + */ 4356 + release_stripe(sh); 4357 + goto retry; 4503 4358 } 4504 4359 4505 4360 if (rw == WRITE && ··· 4947 4788 return handled; 4948 4789 } 4949 4790 4950 - #define MAX_STRIPE_BATCH 8 4951 - static int handle_active_stripes(struct r5conf *conf) 4791 + static int handle_active_stripes(struct r5conf *conf, int group, 4792 + struct r5worker *worker) 4952 4793 { 4953 4794 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4954 4795 int i, batch_size = 0; 4955 4796 4956 4797 while (batch_size < MAX_STRIPE_BATCH && 4957 - (sh = __get_priority_stripe(conf)) != NULL) 4798 + (sh = __get_priority_stripe(conf, group)) != NULL) 4958 4799 batch[batch_size++] = sh; 4959 4800 4960 4801 if (batch_size == 0) ··· 4970 4811 for (i = 0; i < batch_size; i++) 4971 4812 __release_stripe(conf, batch[i]); 4972 4813 return batch_size; 4814 + } 4815 + 4816 + static void raid5_do_work(struct work_struct *work) 4817 + { 4818 + struct r5worker *worker = container_of(work, 
struct r5worker, work); 4819 + struct r5worker_group *group = worker->group; 4820 + struct r5conf *conf = group->conf; 4821 + int group_id = group - conf->worker_groups; 4822 + int handled; 4823 + struct blk_plug plug; 4824 + 4825 + pr_debug("+++ raid5worker active\n"); 4826 + 4827 + blk_start_plug(&plug); 4828 + handled = 0; 4829 + spin_lock_irq(&conf->device_lock); 4830 + while (1) { 4831 + int batch_size, released; 4832 + 4833 + released = release_stripe_list(conf); 4834 + 4835 + batch_size = handle_active_stripes(conf, group_id, worker); 4836 + worker->working = false; 4837 + if (!batch_size && !released) 4838 + break; 4839 + handled += batch_size; 4840 + } 4841 + pr_debug("%d stripes handled\n", handled); 4842 + 4843 + spin_unlock_irq(&conf->device_lock); 4844 + blk_finish_plug(&plug); 4845 + 4846 + pr_debug("--- raid5worker inactive\n"); 4973 4847 } 4974 4848 4975 4849 /* ··· 5028 4836 spin_lock_irq(&conf->device_lock); 5029 4837 while (1) { 5030 4838 struct bio *bio; 5031 - int batch_size; 4839 + int batch_size, released; 4840 + 4841 + released = release_stripe_list(conf); 5032 4842 5033 4843 if ( 5034 4844 !list_empty(&conf->bitmap_list)) { ··· 5054 4860 handled++; 5055 4861 } 5056 4862 5057 - batch_size = handle_active_stripes(conf); 5058 - if (!batch_size) 4863 + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 4864 + if (!batch_size && !released) 5059 4865 break; 5060 4866 handled += batch_size; 5061 4867 ··· 5183 4989 static struct md_sysfs_entry 5184 4990 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 5185 4991 4992 + static ssize_t 4993 + raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 4994 + { 4995 + struct r5conf *conf = mddev->private; 4996 + if (conf) 4997 + return sprintf(page, "%d\n", conf->worker_cnt_per_group); 4998 + else 4999 + return 0; 5000 + } 5001 + 5002 + static int alloc_thread_groups(struct r5conf *conf, int cnt); 5003 + static ssize_t 5004 + raid5_store_group_thread_cnt(struct mddev *mddev, const 
char *page, size_t len) 5005 + { 5006 + struct r5conf *conf = mddev->private; 5007 + unsigned long new; 5008 + int err; 5009 + struct r5worker_group *old_groups; 5010 + int old_group_cnt; 5011 + 5012 + if (len >= PAGE_SIZE) 5013 + return -EINVAL; 5014 + if (!conf) 5015 + return -ENODEV; 5016 + 5017 + if (kstrtoul(page, 10, &new)) 5018 + return -EINVAL; 5019 + 5020 + if (new == conf->worker_cnt_per_group) 5021 + return len; 5022 + 5023 + mddev_suspend(mddev); 5024 + 5025 + old_groups = conf->worker_groups; 5026 + old_group_cnt = conf->worker_cnt_per_group; 5027 + 5028 + conf->worker_groups = NULL; 5029 + err = alloc_thread_groups(conf, new); 5030 + if (err) { 5031 + conf->worker_groups = old_groups; 5032 + conf->worker_cnt_per_group = old_group_cnt; 5033 + } else { 5034 + if (old_groups) 5035 + kfree(old_groups[0].workers); 5036 + kfree(old_groups); 5037 + } 5038 + 5039 + mddev_resume(mddev); 5040 + 5041 + if (err) 5042 + return err; 5043 + return len; 5044 + } 5045 + 5046 + static struct md_sysfs_entry 5047 + raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 5048 + raid5_show_group_thread_cnt, 5049 + raid5_store_group_thread_cnt); 5050 + 5186 5051 static struct attribute *raid5_attrs[] = { 5187 5052 &raid5_stripecache_size.attr, 5188 5053 &raid5_stripecache_active.attr, 5189 5054 &raid5_preread_bypass_threshold.attr, 5055 + &raid5_group_thread_cnt.attr, 5190 5056 NULL, 5191 5057 }; 5192 5058 static struct attribute_group raid5_attrs_group = { 5193 5059 .name = NULL, 5194 5060 .attrs = raid5_attrs, 5195 5061 }; 5062 + 5063 + static int alloc_thread_groups(struct r5conf *conf, int cnt) 5064 + { 5065 + int i, j; 5066 + ssize_t size; 5067 + struct r5worker *workers; 5068 + 5069 + conf->worker_cnt_per_group = cnt; 5070 + if (cnt == 0) { 5071 + conf->worker_groups = NULL; 5072 + return 0; 5073 + } 5074 + conf->group_cnt = num_possible_nodes(); 5075 + size = sizeof(struct r5worker) * cnt; 5076 + workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5077 
+ conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5078 + conf->group_cnt, GFP_NOIO); 5079 + if (!conf->worker_groups || !workers) { 5080 + kfree(workers); 5081 + kfree(conf->worker_groups); 5082 + conf->worker_groups = NULL; 5083 + return -ENOMEM; 5084 + } 5085 + 5086 + for (i = 0; i < conf->group_cnt; i++) { 5087 + struct r5worker_group *group; 5088 + 5089 + group = &conf->worker_groups[i]; 5090 + INIT_LIST_HEAD(&group->handle_list); 5091 + group->conf = conf; 5092 + group->workers = workers + i * cnt; 5093 + 5094 + for (j = 0; j < cnt; j++) { 5095 + group->workers[j].group = group; 5096 + INIT_WORK(&group->workers[j].work, raid5_do_work); 5097 + } 5098 + } 5099 + 5100 + return 0; 5101 + } 5102 + 5103 + static void free_thread_groups(struct r5conf *conf) 5104 + { 5105 + if (conf->worker_groups) 5106 + kfree(conf->worker_groups[0].workers); 5107 + kfree(conf->worker_groups); 5108 + conf->worker_groups = NULL; 5109 + } 5196 5110 5197 5111 static sector_t 5198 5112 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) ··· 5342 5040 5343 5041 static void free_conf(struct r5conf *conf) 5344 5042 { 5043 + free_thread_groups(conf); 5345 5044 shrink_stripes(conf); 5346 5045 raid5_free_percpu(conf); 5347 5046 kfree(conf->disks); ··· 5471 5168 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5472 5169 if (conf == NULL) 5473 5170 goto abort; 5171 + /* Don't enable multi-threading by default*/ 5172 + if (alloc_thread_groups(conf, 0)) 5173 + goto abort; 5474 5174 spin_lock_init(&conf->device_lock); 5175 + seqcount_init(&conf->gen_lock); 5475 5176 init_waitqueue_head(&conf->wait_for_stripe); 5476 5177 init_waitqueue_head(&conf->wait_for_overlap); 5477 5178 INIT_LIST_HEAD(&conf->handle_list); ··· 5483 5176 INIT_LIST_HEAD(&conf->delayed_list); 5484 5177 INIT_LIST_HEAD(&conf->bitmap_list); 5485 5178 INIT_LIST_HEAD(&conf->inactive_list); 5179 + init_llist_head(&conf->released_stripes); 5486 5180 atomic_set(&conf->active_stripes, 0); 5487 5181 
atomic_set(&conf->preread_active_stripes, 0); 5488 5182 atomic_set(&conf->active_aligned_reads, 0); ··· 6288 5980 6289 5981 atomic_set(&conf->reshape_stripes, 0); 6290 5982 spin_lock_irq(&conf->device_lock); 5983 + write_seqcount_begin(&conf->gen_lock); 6291 5984 conf->previous_raid_disks = conf->raid_disks; 6292 5985 conf->raid_disks += mddev->delta_disks; 6293 5986 conf->prev_chunk_sectors = conf->chunk_sectors; ··· 6305 5996 else 6306 5997 conf->reshape_progress = 0; 6307 5998 conf->reshape_safe = conf->reshape_progress; 5999 + write_seqcount_end(&conf->gen_lock); 6308 6000 spin_unlock_irq(&conf->device_lock); 6001 + 6002 + /* Now make sure any requests that proceeded on the assumption 6003 + * the reshape wasn't running - like Discard or Read - have 6004 + * completed. 6005 + */ 6006 + mddev_suspend(mddev); 6007 + mddev_resume(mddev); 6309 6008 6310 6009 /* Add some new drives, as many as will fit. 6311 6010 * We know there are enough to make the newly sized array work. ··· 6789 6472 6790 6473 static int __init raid5_init(void) 6791 6474 { 6475 + raid5_wq = alloc_workqueue("raid5wq", 6476 + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 6477 + if (!raid5_wq) 6478 + return -ENOMEM; 6792 6479 register_md_personality(&raid6_personality); 6793 6480 register_md_personality(&raid5_personality); 6794 6481 register_md_personality(&raid4_personality); ··· 6804 6483 unregister_md_personality(&raid6_personality); 6805 6484 unregister_md_personality(&raid5_personality); 6806 6485 unregister_md_personality(&raid4_personality); 6486 + destroy_workqueue(raid5_wq); 6807 6487 } 6808 6488 6809 6489 module_init(raid5_init);
+22
drivers/md/raid5.h
··· 197 197 struct stripe_head { 198 198 struct hlist_node hash; 199 199 struct list_head lru; /* inactive_list or handle_list */ 200 + struct llist_node release_list; 200 201 struct r5conf *raid_conf; 201 202 short generation; /* increments with every 202 203 * reshape */ ··· 212 211 enum check_states check_state; 213 212 enum reconstruct_states reconstruct_state; 214 213 spinlock_t stripe_lock; 214 + int cpu; 215 + struct r5worker_group *group; 215 216 /** 216 217 * struct stripe_operations 217 218 * @target - STRIPE_OP_COMPUTE_BLK target ··· 324 321 STRIPE_OPS_REQ_PENDING, 325 322 STRIPE_ON_UNPLUG_LIST, 326 323 STRIPE_DISCARD, 324 + STRIPE_ON_RELEASE_LIST, 327 325 }; 328 326 329 327 /* ··· 367 363 struct md_rdev *rdev, *replacement; 368 364 }; 369 365 366 + struct r5worker { 367 + struct work_struct work; 368 + struct r5worker_group *group; 369 + bool working; 370 + }; 371 + 372 + struct r5worker_group { 373 + struct list_head handle_list; 374 + struct r5conf *conf; 375 + struct r5worker *workers; 376 + int stripes_cnt; 377 + }; 378 + 370 379 struct r5conf { 371 380 struct hlist_head *stripe_hashtbl; 372 381 struct mddev *mddev; ··· 403 386 int prev_chunk_sectors; 404 387 int prev_algo; 405 388 short generation; /* increments with every reshape */ 389 + seqcount_t gen_lock; /* lock against generation changes */ 406 390 unsigned long reshape_checkpoint; /* Time we last updated 407 391 * metadata */ 408 392 long long min_offset_diff; /* minimum difference between ··· 463 445 */ 464 446 atomic_t active_stripes; 465 447 struct list_head inactive_list; 448 + struct llist_head released_stripes; 466 449 wait_queue_head_t wait_for_stripe; 467 450 wait_queue_head_t wait_for_overlap; 468 451 int inactive_blocked; /* release of inactive stripes blocked, ··· 477 458 * the new thread here until we fully activate the array. 
478 459 */ 479 460 struct md_thread *thread; 461 + struct r5worker_group *worker_groups; 462 + int group_cnt; 463 + int worker_cnt_per_group; 480 464 }; 481 465 482 466 /*
+1
include/linux/raid/pq.h
··· 101 101 extern const struct raid6_calls raid6_avx2x1; 102 102 extern const struct raid6_calls raid6_avx2x2; 103 103 extern const struct raid6_calls raid6_avx2x4; 104 + extern const struct raid6_calls raid6_tilegx8; 104 105 105 106 struct raid6_recov_calls { 106 107 void (*data2)(int, size_t, int, int, void **);
+6
lib/raid6/Makefile
··· 6 6 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o 7 7 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o 8 8 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o 9 + raid6_pq-$(CONFIG_TILEGX) += tilegx8.o 9 10 10 11 hostprogs-y += mktables 11 12 ··· 109 108 targets += neon8.c 110 109 $(obj)/neon8.c: UNROLL := 8 111 110 $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE 111 + $(call if_changed,unroll) 112 + 113 + targets += tilegx8.c 114 + $(obj)/tilegx8.c: UNROLL := 8 115 + $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE 112 116 $(call if_changed,unroll) 113 117 114 118 quiet_cmd_mktable = TABLE $@
+3
lib/raid6/algos.c
··· 66 66 &raid6_altivec4, 67 67 &raid6_altivec8, 68 68 #endif 69 + #if defined(CONFIG_TILEGX) 70 + &raid6_tilegx8, 71 + #endif 69 72 &raid6_intx1, 70 73 &raid6_intx2, 71 74 &raid6_intx4,
+8 -1
lib/raid6/test/Makefile
··· 40 40 OBJS += neon.o neon1.o neon2.o neon4.o neon8.o 41 41 CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 42 42 else 43 - HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ 43 + HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\ 44 44 gcc -c -x c - >&/dev/null && \ 45 45 rm ./-.o && echo yes) 46 46 ifeq ($(HAS_ALTIVEC),yes) 47 47 OBJS += altivec1.o altivec2.o altivec4.o altivec8.o 48 48 endif 49 + endif 50 + ifeq ($(ARCH),tilegx) 51 + OBJS += tilegx8.o 49 52 endif 50 53 51 54 .c.o: ··· 112 109 int32.c: int.uc ../unroll.awk 113 110 $(AWK) ../unroll.awk -vN=32 < int.uc > $@ 114 111 112 + tilegx8.c: tilegx.uc ../unroll.awk 113 + $(AWK) ../unroll.awk -vN=8 < tilegx.uc > $@ 114 + 115 115 tables.c: mktables 116 116 ./mktables > tables.c 117 117 118 118 clean: 119 119 rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test 120 + rm -f tilegx*.c 120 121 121 122 spotless: clean 122 123 rm -f *~
+86
lib/raid6/tilegx.uc
··· 1 + /* -*- linux-c -*- ------------------------------------------------------- * 2 + * 3 + * Copyright 2002 H. Peter Anvin - All Rights Reserved 4 + * Copyright 2012 Tilera Corporation - All Rights Reserved 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 9 + * Boston MA 02111-1307, USA; either version 2 of the License, or 10 + * (at your option) any later version; incorporated herein by reference. 11 + * 12 + * ----------------------------------------------------------------------- */ 13 + 14 + /* 15 + * tilegx$#.c 16 + * 17 + * $#-way unrolled TILE-Gx SIMD for RAID-6 math. 18 + * 19 + * This file is postprocessed using unroll.awk. 20 + * 21 + */ 22 + 23 + #include <linux/raid/pq.h> 24 + 25 + /* Create 8 byte copies of constant byte */ 26 + # define NBYTES(x) (__insn_v1addi(0, x)) 27 + # define NSIZE 8 28 + 29 + /* 30 + * The SHLBYTE() operation shifts each byte left by 1, *not* 31 + * rolling over into the next byte 32 + */ 33 + static inline __attribute_const__ u64 SHLBYTE(u64 v) 34 + { 35 + /* Vector One Byte Shift Left Immediate. */ 36 + return __insn_v1shli(v, 1); 37 + } 38 + 39 + /* 40 + * The MASK() operation returns 0xFF in any byte for which the high 41 + * bit is 1, 0x00 for any byte for which the high bit is 0. 42 + */ 43 + static inline __attribute_const__ u64 MASK(u64 v) 44 + { 45 + /* Vector One Byte Shift Right Signed Immediate. 
*/ 46 + return __insn_v1shrsi(v, 7); 47 + } 48 + 49 + 50 + void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) 51 + { 52 + u8 **dptr = (u8 **)ptrs; 53 + u64 *p, *q; 54 + int d, z, z0; 55 + 56 + u64 wd$$, wq$$, wp$$, w1$$, w2$$; 57 + u64 x1d = NBYTES(0x1d); 58 + u64 * z0ptr; 59 + 60 + z0 = disks - 3; /* Highest data disk */ 61 + p = (u64 *)dptr[z0+1]; /* XOR parity */ 62 + q = (u64 *)dptr[z0+2]; /* RS syndrome */ 63 + 64 + z0ptr = (u64 *)&dptr[z0][0]; 65 + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { 66 + wq$$ = wp$$ = *z0ptr++; 67 + for ( z = z0-1 ; z >= 0 ; z-- ) { 68 + wd$$ = *(u64 *)&dptr[z][d+$$*NSIZE]; 69 + wp$$ = wp$$ ^ wd$$; 70 + w2$$ = MASK(wq$$); 71 + w1$$ = SHLBYTE(wq$$); 72 + w2$$ = w2$$ & x1d; 73 + w1$$ = w1$$ ^ w2$$; 74 + wq$$ = w1$$ ^ wd$$; 75 + } 76 + *p++ = wp$$; 77 + *q++ = wq$$; 78 + } 79 + } 80 + 81 + const struct raid6_calls raid6_tilegx$# = { 82 + raid6_tilegx$#_gen_syndrome, 83 + NULL, 84 + "tilegx$#", 85 + 0 86 + };