drivers/md/multipath.c at v2.6.24

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / md / multipath.c
at v2.6.24 559 lines 14 kB view raw
wrap content
  1/*
  2 * multipath.c : Multiple Devices driver for Linux
  3 *
  4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
  5 *
  6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  7 *
  8 * MULTIPATH management functions.
  9 *
 10 * derived from raid1.c.
 11 *
 12 * This program is free software; you can redistribute it and/or modify
 13 * it under the terms of the GNU General Public License as published by
 14 * the Free Software Foundation; either version 2, or (at your option)
 15 * any later version.
 16 *
 17 * You should have received a copy of the GNU General Public License
 18 * (for example /usr/src/linux/COPYING); if not, write to the Free
 19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 20 */
 21
 22#include <linux/module.h>
 23#include <linux/slab.h>
 24#include <linux/spinlock.h>
 25#include <linux/raid/multipath.h>
 26#include <linux/buffer_head.h>
 27#include <asm/atomic.h>
 28
 29#define MAJOR_NR MD_MAJOR
 30#define MD_DRIVER
 31#define MD_PERSONALITY
 32
 33#define MAX_WORK_PER_DISK 128
 34
 35#define	NR_RESERVED_BUFS	32
 36
 37
 38static int multipath_map (multipath_conf_t *conf)
 39{
 40	int i, disks = conf->raid_disks;
 41
 42	/*
 43	 * Later we do read balancing on the read side 
 44	 * now we use the first available disk.
 45	 */
 46
 47	rcu_read_lock();
 48	for (i = 0; i < disks; i++) {
 49		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
 50		if (rdev && test_bit(In_sync, &rdev->flags)) {
 51			atomic_inc(&rdev->nr_pending);
 52			rcu_read_unlock();
 53			return i;
 54		}
 55	}
 56	rcu_read_unlock();
 57
 58	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
 59	return (-1);
 60}
 61
 62static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 63{
 64	unsigned long flags;
 65	mddev_t *mddev = mp_bh->mddev;
 66	multipath_conf_t *conf = mddev_to_conf(mddev);
 67
 68	spin_lock_irqsave(&conf->device_lock, flags);
 69	list_add(&mp_bh->retry_list, &conf->retry_list);
 70	spin_unlock_irqrestore(&conf->device_lock, flags);
 71	md_wakeup_thread(mddev->thread);
 72}
 73
 74
 75/*
 76 * multipath_end_bh_io() is called when we have finished servicing a multipathed
 77 * operation and are ready to return a success/failure code to the buffer
 78 * cache layer.
 79 */
 80static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 81{
 82	struct bio *bio = mp_bh->master_bio;
 83	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
 84
 85	bio_endio(bio, err);
 86	mempool_free(mp_bh, conf->pool);
 87}
 88
 89static void multipath_end_request(struct bio *bio, int error)
 90{
 91	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 92	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
 93	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
 94	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 95
 96	if (uptodate)
 97		multipath_end_bh_io(mp_bh, 0);
 98	else if (!bio_rw_ahead(bio)) {
 99		/*
100		 * oops, IO error:
101		 */
102		char b[BDEVNAME_SIZE];
103		md_error (mp_bh->mddev, rdev);
104		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 
105		       bdevname(rdev->bdev,b), 
106		       (unsigned long long)bio->bi_sector);
107		multipath_reschedule_retry(mp_bh);
108	} else
109		multipath_end_bh_io(mp_bh, error);
110	rdev_dec_pending(rdev, conf->mddev);
111}
112
113static void unplug_slaves(mddev_t *mddev)
114{
115	multipath_conf_t *conf = mddev_to_conf(mddev);
116	int i;
117
118	rcu_read_lock();
119	for (i=0; i<mddev->raid_disks; i++) {
120		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
121		if (rdev && !test_bit(Faulty, &rdev->flags)
122		    && atomic_read(&rdev->nr_pending)) {
123			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
124
125			atomic_inc(&rdev->nr_pending);
126			rcu_read_unlock();
127
128			blk_unplug(r_queue);
129
130			rdev_dec_pending(rdev, mddev);
131			rcu_read_lock();
132		}
133	}
134	rcu_read_unlock();
135}
136
137static void multipath_unplug(struct request_queue *q)
138{
139	unplug_slaves(q->queuedata);
140}
141
142
143static int multipath_make_request (struct request_queue *q, struct bio * bio)
144{
145	mddev_t *mddev = q->queuedata;
146	multipath_conf_t *conf = mddev_to_conf(mddev);
147	struct multipath_bh * mp_bh;
148	struct multipath_info *multipath;
149	const int rw = bio_data_dir(bio);
150
151	if (unlikely(bio_barrier(bio))) {
152		bio_endio(bio, -EOPNOTSUPP);
153		return 0;
154	}
155
156	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
157
158	mp_bh->master_bio = bio;
159	mp_bh->mddev = mddev;
160
161	disk_stat_inc(mddev->gendisk, ios[rw]);
162	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
163
164	mp_bh->path = multipath_map(conf);
165	if (mp_bh->path < 0) {
166		bio_endio(bio, -EIO);
167		mempool_free(mp_bh, conf->pool);
168		return 0;
169	}
170	multipath = conf->multipaths + mp_bh->path;
171
172	mp_bh->bio = *bio;
173	mp_bh->bio.bi_sector += multipath->rdev->data_offset;
174	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
175	mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST);
176	mp_bh->bio.bi_end_io = multipath_end_request;
177	mp_bh->bio.bi_private = mp_bh;
178	generic_make_request(&mp_bh->bio);
179	return 0;
180}
181
182static void multipath_status (struct seq_file *seq, mddev_t *mddev)
183{
184	multipath_conf_t *conf = mddev_to_conf(mddev);
185	int i;
186	
187	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
188						 conf->working_disks);
189	for (i = 0; i < conf->raid_disks; i++)
190		seq_printf (seq, "%s",
191			       conf->multipaths[i].rdev && 
192			       test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
193	seq_printf (seq, "]");
194}
195
196static int multipath_congested(void *data, int bits)
197{
198	mddev_t *mddev = data;
199	multipath_conf_t *conf = mddev_to_conf(mddev);
200	int i, ret = 0;
201
202	rcu_read_lock();
203	for (i = 0; i < mddev->raid_disks ; i++) {
204		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
205		if (rdev && !test_bit(Faulty, &rdev->flags)) {
206			struct request_queue *q = bdev_get_queue(rdev->bdev);
207
208			ret |= bdi_congested(&q->backing_dev_info, bits);
209			/* Just like multipath_map, we just check the
210			 * first available device
211			 */
212			break;
213		}
214	}
215	rcu_read_unlock();
216	return ret;
217}
218
219/*
220 * Careful, this can execute in IRQ contexts as well!
221 */
222static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
223{
224	multipath_conf_t *conf = mddev_to_conf(mddev);
225
226	if (conf->working_disks <= 1) {
227		/*
228		 * Uh oh, we can do nothing if this is our last path, but
229		 * first check if this is a queued request for a device
230		 * which has just failed.
231		 */
232		printk(KERN_ALERT 
233			"multipath: only one IO path left and IO error.\n");
234		/* leave it active... it's all we have */
235	} else {
236		/*
237		 * Mark disk as unusable
238		 */
239		if (!test_bit(Faulty, &rdev->flags)) {
240			char b[BDEVNAME_SIZE];
241			clear_bit(In_sync, &rdev->flags);
242			set_bit(Faulty, &rdev->flags);
243			set_bit(MD_CHANGE_DEVS, &mddev->flags);
244			conf->working_disks--;
245			mddev->degraded++;
246			printk(KERN_ALERT "multipath: IO failure on %s,"
247				" disabling IO path. \n	Operation continuing"
248				" on %d IO paths.\n",
249				bdevname (rdev->bdev,b),
250				conf->working_disks);
251		}
252	}
253}
254
255static void print_multipath_conf (multipath_conf_t *conf)
256{
257	int i;
258	struct multipath_info *tmp;
259
260	printk("MULTIPATH conf printout:\n");
261	if (!conf) {
262		printk("(conf==NULL)\n");
263		return;
264	}
265	printk(" --- wd:%d rd:%d\n", conf->working_disks,
266			 conf->raid_disks);
267
268	for (i = 0; i < conf->raid_disks; i++) {
269		char b[BDEVNAME_SIZE];
270		tmp = conf->multipaths + i;
271		if (tmp->rdev)
272			printk(" disk%d, o:%d, dev:%s\n",
273				i,!test_bit(Faulty, &tmp->rdev->flags),
274			       bdevname(tmp->rdev->bdev,b));
275	}
276}
277
278
279static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
280{
281	multipath_conf_t *conf = mddev->private;
282	struct request_queue *q;
283	int found = 0;
284	int path;
285	struct multipath_info *p;
286
287	print_multipath_conf(conf);
288
289	for (path=0; path<mddev->raid_disks; path++) 
290		if ((p=conf->multipaths+path)->rdev == NULL) {
291			q = rdev->bdev->bd_disk->queue;
292			blk_queue_stack_limits(mddev->queue, q);
293
294		/* as we don't honour merge_bvec_fn, we must never risk
295		 * violating it, so limit ->max_sector to one PAGE, as
296		 * a one page request is never in violation.
297		 * (Note: it is very unlikely that a device with
298		 * merge_bvec_fn will be involved in multipath.)
299		 */
300			if (q->merge_bvec_fn &&
301			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
302				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
303
304			conf->working_disks++;
305			mddev->degraded--;
306			rdev->raid_disk = path;
307			set_bit(In_sync, &rdev->flags);
308			rcu_assign_pointer(p->rdev, rdev);
309			found = 1;
310		}
311
312	print_multipath_conf(conf);
313	return found;
314}
315
316static int multipath_remove_disk(mddev_t *mddev, int number)
317{
318	multipath_conf_t *conf = mddev->private;
319	int err = 0;
320	mdk_rdev_t *rdev;
321	struct multipath_info *p = conf->multipaths + number;
322
323	print_multipath_conf(conf);
324
325	rdev = p->rdev;
326	if (rdev) {
327		if (test_bit(In_sync, &rdev->flags) ||
328		    atomic_read(&rdev->nr_pending)) {
329			printk(KERN_ERR "hot-remove-disk, slot %d is identified"				" but is still operational!\n", number);
330			err = -EBUSY;
331			goto abort;
332		}
333		p->rdev = NULL;
334		synchronize_rcu();
335		if (atomic_read(&rdev->nr_pending)) {
336			/* lost the race, try later */
337			err = -EBUSY;
338			p->rdev = rdev;
339		}
340	}
341abort:
342
343	print_multipath_conf(conf);
344	return err;
345}
346
347
348
349/*
350 * This is a kernel thread which:
351 *
352 *	1.	Retries failed read operations on working multipaths.
353 *	2.	Updates the raid superblock when problems encounter.
354 *	3.	Performs writes following reads for array syncronising.
355 */
356
357static void multipathd (mddev_t *mddev)
358{
359	struct multipath_bh *mp_bh;
360	struct bio *bio;
361	unsigned long flags;
362	multipath_conf_t *conf = mddev_to_conf(mddev);
363	struct list_head *head = &conf->retry_list;
364
365	md_check_recovery(mddev);
366	for (;;) {
367		char b[BDEVNAME_SIZE];
368		spin_lock_irqsave(&conf->device_lock, flags);
369		if (list_empty(head))
370			break;
371		mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
372		list_del(head->prev);
373		spin_unlock_irqrestore(&conf->device_lock, flags);
374
375		bio = &mp_bh->bio;
376		bio->bi_sector = mp_bh->master_bio->bi_sector;
377		
378		if ((mp_bh->path = multipath_map (conf))<0) {
379			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
380				" error for block %llu\n",
381				bdevname(bio->bi_bdev,b),
382				(unsigned long long)bio->bi_sector);
383			multipath_end_bh_io(mp_bh, -EIO);
384		} else {
385			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
386				" to another IO path\n",
387				bdevname(bio->bi_bdev,b),
388				(unsigned long long)bio->bi_sector);
389			*bio = *(mp_bh->master_bio);
390			bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
391			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
392			bio->bi_rw |= (1 << BIO_RW_FAILFAST);
393			bio->bi_end_io = multipath_end_request;
394			bio->bi_private = mp_bh;
395			generic_make_request(bio);
396		}
397	}
398	spin_unlock_irqrestore(&conf->device_lock, flags);
399}
400
401static int multipath_run (mddev_t *mddev)
402{
403	multipath_conf_t *conf;
404	int disk_idx;
405	struct multipath_info *disk;
406	mdk_rdev_t *rdev;
407	struct list_head *tmp;
408
409	if (mddev->level != LEVEL_MULTIPATH) {
410		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
411		       mdname(mddev), mddev->level);
412		goto out;
413	}
414	/*
415	 * copy the already verified devices into our private MULTIPATH
416	 * bookkeeping area. [whatever we allocate in multipath_run(),
417	 * should be freed in multipath_stop()]
418	 */
419
420	conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
421	mddev->private = conf;
422	if (!conf) {
423		printk(KERN_ERR 
424			"multipath: couldn't allocate memory for %s\n",
425			mdname(mddev));
426		goto out;
427	}
428
429	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
430				   GFP_KERNEL);
431	if (!conf->multipaths) {
432		printk(KERN_ERR 
433			"multipath: couldn't allocate memory for %s\n",
434			mdname(mddev));
435		goto out_free_conf;
436	}
437
438	conf->working_disks = 0;
439	ITERATE_RDEV(mddev,rdev,tmp) {
440		disk_idx = rdev->raid_disk;
441		if (disk_idx < 0 ||
442		    disk_idx >= mddev->raid_disks)
443			continue;
444
445		disk = conf->multipaths + disk_idx;
446		disk->rdev = rdev;
447
448		blk_queue_stack_limits(mddev->queue,
449				       rdev->bdev->bd_disk->queue);
450		/* as we don't honour merge_bvec_fn, we must never risk
451		 * violating it, not that we ever expect a device with
452		 * a merge_bvec_fn to be involved in multipath */
453		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
454		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
455			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
456
457		if (!test_bit(Faulty, &rdev->flags))
458			conf->working_disks++;
459	}
460
461	conf->raid_disks = mddev->raid_disks;
462	conf->mddev = mddev;
463	spin_lock_init(&conf->device_lock);
464	INIT_LIST_HEAD(&conf->retry_list);
465
466	if (!conf->working_disks) {
467		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
468			mdname(mddev));
469		goto out_free_conf;
470	}
471	mddev->degraded = conf->raid_disks - conf->working_disks;
472
473	conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
474						 sizeof(struct multipath_bh));
475	if (conf->pool == NULL) {
476		printk(KERN_ERR 
477			"multipath: couldn't allocate memory for %s\n",
478			mdname(mddev));
479		goto out_free_conf;
480	}
481
482	{
483		mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
484		if (!mddev->thread) {
485			printk(KERN_ERR "multipath: couldn't allocate thread"
486				" for %s\n", mdname(mddev));
487			goto out_free_conf;
488		}
489	}
490
491	printk(KERN_INFO 
492		"multipath: array %s active with %d out of %d IO paths\n",
493		mdname(mddev), conf->working_disks, mddev->raid_disks);
494	/*
495	 * Ok, everything is just fine now
496	 */
497	mddev->array_size = mddev->size;
498
499	mddev->queue->unplug_fn = multipath_unplug;
500	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
501	mddev->queue->backing_dev_info.congested_data = mddev;
502
503	return 0;
504
505out_free_conf:
506	if (conf->pool)
507		mempool_destroy(conf->pool);
508	kfree(conf->multipaths);
509	kfree(conf);
510	mddev->private = NULL;
511out:
512	return -EIO;
513}
514
515
516static int multipath_stop (mddev_t *mddev)
517{
518	multipath_conf_t *conf = mddev_to_conf(mddev);
519
520	md_unregister_thread(mddev->thread);
521	mddev->thread = NULL;
522	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
523	mempool_destroy(conf->pool);
524	kfree(conf->multipaths);
525	kfree(conf);
526	mddev->private = NULL;
527	return 0;
528}
529
530static struct mdk_personality multipath_personality =
531{
532	.name		= "multipath",
533	.level		= LEVEL_MULTIPATH,
534	.owner		= THIS_MODULE,
535	.make_request	= multipath_make_request,
536	.run		= multipath_run,
537	.stop		= multipath_stop,
538	.status		= multipath_status,
539	.error_handler	= multipath_error,
540	.hot_add_disk	= multipath_add_disk,
541	.hot_remove_disk= multipath_remove_disk,
542};
543
544static int __init multipath_init (void)
545{
546	return register_md_personality (&multipath_personality);
547}
548
549static void __exit multipath_exit (void)
550{
551	unregister_md_personality (&multipath_personality);
552}
553
554module_init(multipath_init);
555module_exit(multipath_exit);
556MODULE_LICENSE("GPL");
557MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
558MODULE_ALIAS("md-multipath");
559MODULE_ALIAS("md-level--4");