[PATCH] (1/6) blksize_size[] removal
[opensuse:kernel.git] drivers/md/multipath.c
/*
 * multipath.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * MULTIPATH management functions.
 *
 * derived from raid1.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/raid/multipath.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS	32


/*
 * The following can be used to debug the driver
 */
#define MULTIPATH_DEBUG	0

#if MULTIPATH_DEBUG
#define PRINTK(x...)   printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif
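
/*
 * With MULTIPATH_DEBUG set to 1, PRINTK() expands to printk() and the
 * inline keywords above are compiled away, so every helper keeps its own
 * symbol in backtraces.  A hypothetical call site (not present in this
 * driver) would look like:
 *
 *	PRINTK(KERN_DEBUG "multipath: retrying sector %lu\n", sector);
 */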


static mdk_personality_t multipath_personality;
static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;

static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);



static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
	struct multipath_bh *mp_bh = NULL;

	do {
		spin_lock_irq(&conf->device_lock);
		if (!conf->freer1_blocked && conf->freer1) {
			mp_bh = conf->freer1;
			conf->freer1 = mp_bh->next_mp;
			conf->freer1_cnt--;
			mp_bh->next_mp = NULL;
			mp_bh->state = (1 << MPBH_PreAlloc);
		}
		spin_unlock_irq(&conf->device_lock);
		if (mp_bh)
			return mp_bh;
		mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
					GFP_NOIO);
		if (mp_bh) {
			memset(mp_bh, 0, sizeof(*mp_bh));
			return mp_bh;
		}
		conf->freer1_blocked = 1;
		wait_disk_event(conf->wait_buffer,
				!conf->freer1_blocked ||
				conf->freer1_cnt > NR_RESERVED_BUFS/2);
		conf->freer1_blocked = 0;
	} while (1);
}
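
/*
 * Note on the allocator above: it first tries to pop a pre-allocated
 * buffer off the conf->freer1 reserve list (the "r1" in the name is
 * presumably inherited from raid1.c, which this file derives from),
 * then falls back to a GFP_NOIO kmalloc(), and only as a last resort
 * marks the reserve blocked and sleeps until at least half of the
 * NR_RESERVED_BUFS reserve has been returned.  The freer1_blocked flag
 * keeps later allocations from draining the reserve out from under the
 * sleeper.
 */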

static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);

	if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
		unsigned long flags;
		mp_bh->bio = NULL;
		spin_lock_irqsave(&conf->device_lock, flags);
		mp_bh->next_mp = conf->freer1;
		conf->freer1 = mp_bh;
		conf->freer1_cnt++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		wake_up(&conf->wait_buffer);
	} else {
		kfree(mp_bh);
	}
}

static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
{
	int i = 0;

	while (i < cnt) {
		struct multipath_bh *mp_bh;
		mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
		if (!mp_bh)
			break;
		memset(mp_bh, 0, sizeof(*mp_bh));
		set_bit(MPBH_PreAlloc, &mp_bh->state);
		mp_bh->mddev = conf->mddev;

		multipath_free_mpbh(mp_bh);
		i++;
	}
	return i;
}

static void multipath_shrink_mpbh(multipath_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	while (conf->freer1) {
		struct multipath_bh *mp_bh = conf->freer1;
		conf->freer1 = mp_bh->next_mp;
		conf->freer1_cnt--;
		kfree(mp_bh);
	}
	spin_unlock_irq(&conf->device_lock);
}
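
/*
 * Pool lifecycle, in outline: multipath_run() primes the reserve,
 * every request takes a multipath_bh from multipath_alloc_mpbh() and
 * hands it back through multipath_free_mpbh(), and multipath_stop()
 * drains whatever is left.  A minimal sketch of that sequence, using
 * only the helpers defined above:
 */
#if 0	/* illustrative only, not compiled */
	if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS)
		goto out_free_conf;			/* at setup time */

	mp_bh = multipath_alloc_mpbh(conf);		/* per request */
	/* ... submit and complete the IO ... */
	multipath_free_mpbh(mp_bh);			/* on completion */

	multipath_shrink_mpbh(conf);			/* at teardown */
#endif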


static int multipath_map (mddev_t *mddev, struct block_device **bdev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int i, disks = MD_SB_DISKS;

	/*
	 * Later we will do read balancing on the read side;
	 * for now we use the first available disk.
	 */

	for (i = 0; i < disks; i++) {
		if (conf->multipaths[i].operational) {
			*bdev = conf->multipaths[i].bdev;
			return 0;
		}
	}

	printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
	return -1;
}

static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
	unsigned long flags;
	mddev_t *mddev = mp_bh->mddev;
	multipath_conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&retry_list_lock, flags);
	if (multipath_retry_list == NULL)
		multipath_retry_tail = &multipath_retry_list;
	*multipath_retry_tail = mp_bh;
	multipath_retry_tail = &mp_bh->next_mp;
	mp_bh->next_mp = NULL;
	spin_unlock_irqrestore(&retry_list_lock, flags);
	md_wakeup_thread(conf->thread);
}
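
/*
 * The retry list above is a singly linked list with a pointer-to-pointer
 * tail (multipath_retry_tail), which makes appends O(1) without walking
 * the list.  The same idiom in isolation, a sketch with hypothetical
 * names:
 */
#if 0	/* illustrative only, not compiled */
struct node { struct node *next; };
static struct node *head = NULL, **tail = &head;

static void append(struct node *n)
{
	n->next = NULL;
	*tail = n;		/* link after the current last node */
	tail = &n->next;	/* remember the new tail's next pointer */
}
#endif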


/*
 * multipath_end_bh_io() is called when we have finished servicing a
 * multipathed operation and are ready to return a success/failure code
 * to the buffer cache layer.
 */
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
	struct bio *bio = mp_bh->master_bio;

	bio_endio(bio, uptodate);
	bio_put(mp_bh->bio);
	multipath_free_mpbh(mp_bh);
}

void multipath_end_request(struct bio *bio)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct multipath_bh *mp_bh = (struct multipath_bh *)(bio->bi_private);

	/*
	 * this branch is our 'one multipath IO has finished' event handler:
	 */
	if (!uptodate)
		md_error (mp_bh->mddev, bio->bi_bdev);
	else
		/*
		 * Set MPBH_Uptodate in our master buffer_head, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other multipathed buffer fails.
		 *
		 * The 'master' represents the complex operation to
		 * the user side. So if something waits for IO, then it
		 * will wait for the 'master' buffer_head.
		 */
		set_bit (MPBH_Uptodate, &mp_bh->state);

	if (uptodate) {
		multipath_end_bh_io(mp_bh, uptodate);
		return;
	}
	/*
	 * oops, IO error:
	 */
	printk(KERN_ERR "multipath: %s: rescheduling sector %lu\n",
		 bdev_partition_name(bio->bi_bdev), bio->bi_sector);
	multipath_reschedule_retry(mp_bh);
	return;
}

/*
 * This routine returns the disk from which the requested read should
 * be done.
 */

static int multipath_read_balance (multipath_conf_t *conf)
{
	int disk;

	for (disk = 0; disk < conf->raid_disks; disk++)
		if (conf->multipaths[disk].operational)
			return disk;
	BUG();
	return 0;
}
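
/*
 * No real balancing happens yet: the routine above simply picks the
 * first operational path.  For comparison, a hypothetical round-robin
 * variant might look like the sketch below (not what this driver does;
 * 'last_used' would be a new field in multipath_conf_t):
 */
#if 0	/* illustrative only, not compiled */
static int multipath_read_balance_rr (multipath_conf_t *conf)
{
	int i, disk = conf->last_used;

	for (i = 0; i < conf->raid_disks; i++) {
		disk = (disk + 1) % conf->raid_disks;
		if (conf->multipaths[disk].operational) {
			conf->last_used = disk;
			return disk;
		}
	}
	BUG();
	return 0;
}
#endif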

static int multipath_make_request (mddev_t *mddev, int rw, struct bio *bio)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	struct bio *real_bio;
	struct multipath_bh *mp_bh;
	struct multipath_info *multipath;

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
	if (rw == READA)
		rw = READ;

	mp_bh = multipath_alloc_mpbh (conf);

	mp_bh->master_bio = bio;
	mp_bh->mddev = mddev;
	mp_bh->cmd = rw;

	/*
	 * read balancing logic:
	 */
	multipath = conf->multipaths + multipath_read_balance(conf);

	real_bio = bio_clone(bio, GFP_NOIO);
	real_bio->bi_bdev = multipath->bdev;
	real_bio->bi_rw = rw;
	real_bio->bi_end_io = multipath_end_request;
	real_bio->bi_private = mp_bh;
	mp_bh->bio = real_bio;
	generic_make_request(real_bio);
	return 0;
}

static int multipath_status (char *page, mddev_t *mddev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int sz = 0, i;

	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
						 conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s",
			conf->multipaths[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
	return sz;
}
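
/*
 * Given the format strings above, a conf with raid_disks == 2 and
 * working_disks == 1 where only the first path is operational would
 * contribute " [2/1] [U_]" to the md status line ('U' = operational
 * path, '_' = failed path).
 */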

#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"

#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path.\n" \
"       Operation continuing on %d IO paths.\n"

static void mark_disk_bad (mddev_t *mddev, int failed)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	struct multipath_info *multipath = conf->multipaths+failed;
	mdp_super_t *sb = mddev->sb;

	multipath->operational = 0;
	mark_disk_faulty(sb->disks+multipath->number);
	mark_disk_nonsync(sb->disks+multipath->number);
	mark_disk_inactive(sb->disks+multipath->number);
	sb->active_disks--;
	sb->working_disks--;
	sb->failed_disks++;
	mddev->sb_dirty = 1;
	md_wakeup_thread(conf->thread);
	conf->working_disks--;
	printk (DISK_FAILED, partition_name (multipath->dev),
				 conf->working_disks);
}

/*
 * Careful, this can execute in IRQ contexts as well!
 */
static int multipath_error (mddev_t *mddev, kdev_t dev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	struct multipath_info *multipaths = conf->multipaths;
	int disks = MD_SB_DISKS;
	int other_paths = 1;
	int i;

	if (conf->working_disks == 1) {
		other_paths = 0;
		for (i = 0; i < disks; i++) {
			if (multipaths[i].spare) {
				other_paths = 1;
				break;
			}
		}
	}

	if (!other_paths) {
		/*
		 * Uh oh, we can do nothing if this is our last path, but
		 * first check if this is a queued request for a device
		 * which has just failed.
		 */
		for (i = 0; i < disks; i++) {
			if (kdev_same(multipaths[i].dev, dev) && !multipaths[i].operational)
				return 0;
		}
		printk (LAST_DISK);
	} else {
		/*
		 * Mark disk as unusable
		 */
		for (i = 0; i < disks; i++) {
			if (kdev_same(multipaths[i].dev, dev) && multipaths[i].operational) {
				mark_disk_bad(mddev, i);
				break;
			}
		}
		if (!conf->working_disks) {
			int err = 1;
			mdp_disk_t *spare;
			mdp_super_t *sb = mddev->sb;

			spare = get_spare(mddev);
			if (spare) {
				err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
				printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
			}
			if (!err && !disk_faulty(spare)) {
				multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
				mark_disk_sync(spare);
				mark_disk_active(spare);
				sb->active_disks++;
				sb->spare_disks--;
			}
		}
	}
	return 0;
}
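
/*
 * To summarise the error path above: as long as another operational or
 * spare path exists, the failing path is marked bad, and if that leaves
 * no working paths at all, a spare is promoted on the spot via
 * DISKOP_SPARE_WRITE followed by DISKOP_SPARE_ACTIVE.  Only when the
 * very last path fails with no spares left does the error become
 * unrecoverable (the LAST_DISK message).
 */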

#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED


static void print_multipath_conf (multipath_conf_t *conf)
{
	int i;
	struct multipath_info *tmp;

	printk("MULTIPATH conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
			 conf->raid_disks, conf->nr_disks);

	for (i = 0; i < MD_SB_DISKS; i++) {
		tmp = conf->multipaths + i;
		if (tmp->spare || tmp->operational || tmp->number ||
				tmp->raid_disk || tmp->used_slot)
			printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
				i, tmp->spare, tmp->operational,
				tmp->number, tmp->raid_disk, tmp->used_slot,
				partition_name(tmp->dev));
	}
}
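
/*
 * Sample printout (values purely illustrative):
 *
 *	MULTIPATH conf printout:
 *	 --- wd:2 rd:1 nd:2
 *	 disk0, s:0, o:1, n:0 rd:0 us:1 dev:sdc1
 *	 disk1, s:1, o:0, n:1 rd:1 us:1 dev:sdd1
 */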

static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
	int err = 0;
	int i, failed_disk = -1, spare_disk = -1, removed_disk = -1, added_disk = -1;
	multipath_conf_t *conf = mddev->private;
	struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
	mdk_rdev_t *spare_rdev, *failed_rdev;
	struct block_device *bdev;

	print_multipath_conf(conf);
	spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	switch (state) {

	case DISKOP_SPARE_ACTIVE:

		/*
		 * Find the failed disk within the MULTIPATH configuration ...
		 * (this can only be in the first conf->working_disks part)
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			tmp = conf->multipaths + i;
			if ((!tmp->operational && !tmp->spare) ||
					!tmp->used_slot) {
				failed_disk = i;
				break;
			}
		}
		/*
		 * When we activate a spare disk we _must_ have a disk in
		 * the lower (active) part of the array to replace.
		 */
		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		/* fall through */

	case DISKOP_SPARE_WRITE:
	case DISKOP_SPARE_INACTIVE:

		/*
		 * Find the spare disk ... (can only be in the 'high'
		 * area of the array)
		 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->multipaths + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->multipaths + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					printk(KERN_ERR "hot-remove-disk, slot %d holds the requested disk (number %d), but it is still operational!\n", i, (*d)->number);
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->multipaths + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		sdisk = conf->multipaths + spare_disk;
		sdisk->operational = 1;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->multipaths + spare_disk;
		sdisk->operational = 0;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->nr_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:
		sdisk = conf->multipaths + spare_disk;
		fdisk = conf->multipaths + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
		xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
		spare_rdev->alias_device = 0;
		failed_rdev->alias_device = 1;

		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */

		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		if (!sdisk->bdev)
			sdisk->used_slot = 0;
		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */

		conf->working_disks++;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->multipaths + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		bdev = rdisk->bdev;
		rdisk->dev = NODEV;
		rdisk->bdev = NULL;
		rdisk->used_slot = 0;
		conf->nr_disks--;
		bdput(bdev);
		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->multipaths + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = mk_kdev(added_desc->major, added_desc->minor);
		/* it will be held open by rdev */
		adisk->bdev = bdget(kdev_t_to_nr(adisk->dev));

		adisk->operational = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;
		conf->nr_disks++;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	spin_unlock_irq(&conf->device_lock);

	print_multipath_conf(conf);
	return err;
}
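
/*
 * xchg_values() is supplied by the md core headers; the code above only
 * relies on it being a plain value swap of two lvalues of the same type.
 * A minimal equivalent would be something like the sketch below (the
 * real definition lives in the raid headers and may differ):
 */
#if 0	/* illustrative only, not compiled */
#define xchg_values(x, y) do {			\
	__typeof__(x) __tmp = (x);		\
	(x) = (y);				\
	(y) = __tmp;				\
} while (0)
#endif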


#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working multipaths.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */

static void multipathd (void *data)
{
	struct multipath_bh *mp_bh;
	struct bio *bio;
	unsigned long flags;
	mddev_t *mddev;
	struct block_device *bdev;

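	/*
	 * Lock discipline for the loop below: retry_list_lock is taken
	 * at the top of each iteration and dropped as soon as one
	 * multipath_bh has been unlinked; the 'break' on an empty list
	 * leaves the loop with the lock still held, which is what the
	 * unlock after the loop pairs with.
	 */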
	for (;;) {
		spin_lock_irqsave(&retry_list_lock, flags);
		mp_bh = multipath_retry_list;
		if (!mp_bh)
			break;
		multipath_retry_list = mp_bh->next_mp;
		spin_unlock_irqrestore(&retry_list_lock, flags);

		mddev = mp_bh->mddev;
		if (mddev->sb_dirty) {
			printk(KERN_INFO "dirty sb detected, updating.\n");
			mddev->sb_dirty = 0;
			md_update_sb(mddev);
		}
		bio = mp_bh->bio;
		bdev = bio->bi_bdev;

		multipath_map (mddev, &bio->bi_bdev);
		if (bio->bi_bdev == bdev) {
			printk(IO_ERROR,
				bdev_partition_name(bio->bi_bdev), bio->bi_sector);
			multipath_end_bh_io(mp_bh, 0);
		} else {
			printk(REDIRECT_SECTOR,
				bdev_partition_name(bio->bi_bdev), bio->bi_sector);
			generic_make_request(bio);
		}
	}
	spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR

/*
 * This will catch the scenario in which one of the multipaths was
 * mounted as a normal device rather than as a part of a raid set.
 *
 * check_consistency is very personality-dependent, e.g. RAID5 cannot
 * do this check, it uses another method.
 */
static int __check_consistency (mddev_t *mddev, int row)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int disks = MD_SB_DISKS;
	struct block_device *bdev;
	int i, rc = 0;
	char *buffer;
	struct page *page = NULL;
	int first = 1;
	int order = PAGE_CACHE_SHIFT-PAGE_SHIFT;

	buffer = (char *) __get_free_pages(GFP_KERNEL, order);
	if (!buffer)
		return rc;

	for (i = 0; i < disks; i++) {
		struct address_space *mapping;
		char *p;
		if (!conf->multipaths[i].operational)
			continue;
		printk("(checking disk %d)\n", i);
		bdev = conf->multipaths[i].bdev;
		mapping = bdev->bd_inode->i_mapping;
		page = read_cache_page(mapping, row/(PAGE_CACHE_SIZE/1024),
				(filler_t *)mapping->a_ops->readpage, NULL);
		if (IS_ERR(page)) {
			page = NULL;
			break;
		}
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			break;
		if (PageError(page))
			break;
		p = page_address(page);
		if (first) {
			memcpy(buffer, p, PAGE_CACHE_SIZE);
			first = 0;
		} else if (memcmp(buffer, p, PAGE_CACHE_SIZE)) {
			rc = 1;
			break;
		}
		page_cache_release(page);
		fsync_bdev(bdev);
		invalidate_bdev(bdev, 0);
		page = NULL;
	}
	if (page) {
		bdev = page->mapping->host->i_bdev;
		page_cache_release(page);
		fsync_bdev(bdev);
		invalidate_bdev(bdev, 0);
	}
	free_pages((unsigned long) buffer, order);
	return rc;
}

static int check_consistency (mddev_t *mddev)
{
	if (__check_consistency(mddev, 0))
/*
 * we do not do this currently, as it's perfectly possible to
 * have an inconsistent array when it's freshly created. Only
 * newly written data has to be consistent.
 */
		return 0;

	return 0;
}

#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"

#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"

#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"

#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"

#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"

static int multipath_run (mddev_t *mddev)
{
	multipath_conf_t *conf;
	int i, j, disk_idx;
	struct multipath_info *disk, *disk2;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *desc, *desc2;
	mdk_rdev_t *rdev, *def_rdev = NULL;
	struct list_head *tmp;
	int num_rdevs = 0;

	MOD_INC_USE_COUNT;

	if (sb->level != -4) {
		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
		goto out;
	}
	/*
	 * copy the already verified devices into our private MULTIPATH
	 * bookkeeping area. [whatever we allocate in multipath_run(),
	 * should be freed in multipath_stop()]
	 */

	conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
	mddev->private = conf;
	if (!conf) {
		printk(MEM_ERROR, mdidx(mddev));
		goto out;
	}
	memset(conf, 0, sizeof(*conf));

	ITERATE_RDEV(mddev, rdev, tmp) {
		if (rdev->faulty) {
			/* this is a "should never happen" case and if it */
			/* ever does happen, a continue; won't help */
			printk(ERRORS, partition_name(rdev->dev));
			continue;
		} else {
			/* this is a "should never happen" case and if it */
			/* ever does happen, a continue; won't help */
			if (!rdev->sb) {
				MD_BUG();
				continue;
			}
		}
		if (rdev->desc_nr == -1) {
			MD_BUG();
			continue;
		}

		desc = &sb->disks[rdev->desc_nr];
		disk_idx = desc->raid_disk;
		disk = conf->multipaths + disk_idx;

		if (!disk_sync(desc))
			printk(NOT_IN_SYNC, partition_name(rdev->dev));

		/*
		 * Mark all disks as spare to start with, then pick our
		 * active disk.  If we have a disk that is marked active
		 * in the sb, then use it, else use the first rdev.
		 */
		disk->number = desc->number;
		disk->raid_disk = desc->raid_disk;
		disk->dev = rdev->dev;
		disk->bdev = rdev->bdev;
		atomic_inc(&rdev->bdev->bd_count);
		disk->operational = 0;
		disk->spare = 1;
		disk->used_slot = 1;
		mark_disk_sync(desc);

		if (disk_active(desc)) {
			if (!conf->working_disks) {
				printk(OPERATIONAL, partition_name(rdev->dev),
					desc->raid_disk);
				disk->operational = 1;
				disk->spare = 0;
				conf->working_disks++;
				def_rdev = rdev;
			} else {
				mark_disk_spare(desc);
			}
		} else
			mark_disk_spare(desc);

		if (!num_rdevs++)
			def_rdev = rdev;
	}
	if (!conf->working_disks && num_rdevs) {
		desc = &sb->disks[def_rdev->desc_nr];
		disk = conf->multipaths + desc->raid_disk;
		printk(OPERATIONAL, partition_name(def_rdev->dev),
			disk->raid_disk);
		disk->operational = 1;
		disk->spare = 0;
		conf->working_disks++;
		mark_disk_active(desc);
	}
	/*
	 * Make sure our active path is in desc spot 0
	 */
	if (def_rdev->desc_nr != 0) {
		rdev = find_rdev_nr(mddev, 0);
		desc = &sb->disks[def_rdev->desc_nr];
		desc2 = sb->disks;
		disk = conf->multipaths + desc->raid_disk;
		disk2 = conf->multipaths + desc2->raid_disk;
		xchg_values(*desc2, *desc);
		xchg_values(*disk2, *disk);
		xchg_values(desc2->number, desc->number);
		xchg_values(disk2->number, disk->number);
		xchg_values(desc2->raid_disk, desc->raid_disk);
		xchg_values(disk2->raid_disk, disk->raid_disk);
		if (rdev) {
			xchg_values(def_rdev->desc_nr, rdev->desc_nr);
		} else {
			def_rdev->desc_nr = 0;
		}
	}
	conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
	conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
	sb->failed_disks = 0;
	sb->spare_disks = num_rdevs - 1;
	mddev->sb_dirty = 1;
	conf->mddev = mddev;
	conf->device_lock = SPIN_LOCK_UNLOCKED;

	init_waitqueue_head(&conf->wait_buffer);

	if (!conf->working_disks) {
		printk(NONE_OPERATIONAL, mdidx(mddev));
		goto out_free_conf;
	}

	/* pre-allocate some multipath_bh structures.
	 * As a minimum, one mp_bh would probably get us by in tight
	 * memory situations, but a few more is probably a good idea.
	 * For now, try NR_RESERVED_BUFS; this will allow at least
	 * NR_RESERVED_BUFS concurrent reads or writes even if kmalloc
	 * starts failing.
	 */
	if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
		printk(MEM_ERROR, mdidx(mddev));
		goto out_free_conf;
	}

	if ((sb->state & (1 << MD_SB_CLEAN))) {
		/*
		 * we do sanity checks even if the device says
		 * it's clean ...
		 */
		if (check_consistency(mddev)) {
			printk(SB_DIFFERENCES);
			sb->state &= ~(1 << MD_SB_CLEAN);
		}
	}

	{
		const char *name = "multipathd";

		conf->thread = md_register_thread(multipathd, conf, name);
		if (!conf->thread) {
			printk(THREAD_ERROR, mdidx(mddev));
			goto out_free_conf;
		}
	}

	/*
	 * Regenerate the "device is in sync with the raid set" bit for
	 * each device.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mark_disk_nonsync(sb->disks+i);
		for (j = 0; j < sb->raid_disks; j++) {
			if (sb->disks[i].number == conf->multipaths[j].number)
				mark_disk_sync(sb->disks+i);
		}
	}

	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
			sb->raid_disks, sb->spare_disks);
	/*
	 * Ok, everything is just fine now
	 */
	return 0;

out_free_conf:
	multipath_shrink_mpbh(conf);
	for (i = 0; i < MD_SB_DISKS; i++)
		if (conf->multipaths[i].bdev)
			bdput(conf->multipaths[i].bdev);
	kfree(conf);
	mddev->private = NULL;
out:
	MOD_DEC_USE_COUNT;
	return -EIO;
}

#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE

static int multipath_stop (mddev_t *mddev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int i;

	md_unregister_thread(conf->thread);
	multipath_shrink_mpbh(conf);
	for (i = 0; i < MD_SB_DISKS; i++)
		if (conf->multipaths[i].bdev)
			bdput(conf->multipaths[i].bdev);
	kfree(conf);
	mddev->private = NULL;
	MOD_DEC_USE_COUNT;
	return 0;
}

static mdk_personality_t multipath_personality =
{
	name:		"multipath",
	make_request:	multipath_make_request,
	run:		multipath_run,
	stop:		multipath_stop,
	status:		multipath_status,
	error_handler:	multipath_error,
	diskop:		multipath_diskop,
};
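
/*
 * The 'name:' field labels above are the old GNU C designated-initializer
 * syntax.  In ISO C99 the same table would be written with '.name ='
 * initializers, e.g. (sketch):
 *
 *	static mdk_personality_t multipath_personality = {
 *		.name		= "multipath",
 *		.make_request	= multipath_make_request,
 *		...
 *	};
 */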

static int __init multipath_init (void)
{
	return register_md_personality (MULTIPATH, &multipath_personality);
}

static void __exit multipath_exit (void)
{
	unregister_md_personality (MULTIPATH);
}

module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");