[opensuse:kernel.git] / drivers / md / raid1.c
1 /*
2  * raid1.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5  *
6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7  *
8  * RAID-1 management functions.
9  *
10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11  *
12  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2, or (at your option)
18  * any later version.
19  *
20  * You should have received a copy of the GNU General Public License
21  * (for example /usr/src/linux/COPYING); if not, write to the Free
22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25 #include <linux/module.h>
26 #include <linux/slab.h>
27 #include <linux/raid/raid1.h>
28 #include <asm/atomic.h>
29
30 #define MAJOR_NR MD_MAJOR
31 #define MD_DRIVER
32 #define MD_PERSONALITY
33
34 #define MAX_WORK_PER_DISK 128
35
36 #define NR_RESERVED_BUFS        32
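/*
 * Rough, purely illustrative arithmetic (assuming a 2-disk mirror):
 * raid1_alloc_bh() below only stops waiting once freebh_cnt climbs above
 * raid_disks * NR_RESERVED_BUFS / 2, i.e. above 2 * 32 / 2 = 32 free
 * buffer_heads, and raid1_alloc_r1bh() similarly waits for more than
 * 32 / 2 = 16 free raid1_bh structures.
 */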
37
38
39 /*
40  * The following can be used to debug the driver
41  */
42 #define RAID1_DEBUG     0
43
44 #if RAID1_DEBUG
45 #define PRINTK(x...)   printk(x)
46 #define inline
47 #define __inline__
48 #else
49 #define PRINTK(x...)  do { } while (0)
50 #endif
51
52
53 static mdk_personality_t raid1_personality;
54 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
55 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
56
57 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
58 {
59         /* return a linked list of "cnt" struct buffer_heads.
60          * don't take any off the free list unless we know we can
61          * get all we need, otherwise we could deadlock
62          */
63         struct buffer_head *bh=NULL;
64
65         while(cnt) {
66                 struct buffer_head *t;
67                 md_spin_lock_irq(&conf->device_lock);
68                 if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
69                         while (cnt) {
70                                 t = conf->freebh;
71                                 conf->freebh = t->b_next;
72                                 t->b_next = bh;
73                                 bh = t;
74                                 t->b_state = 0;
75                                 conf->freebh_cnt--;
76                                 cnt--;
77                         }
78                 md_spin_unlock_irq(&conf->device_lock);
79                 if (cnt == 0)
80                         break;
81                 t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
82                 if (t) {
83                         t->b_next = bh;
84                         bh = t;
85                         cnt--;
86                 } else {
87                         PRINTK("raid1: waiting for %d bh\n", cnt);
88                         conf->freebh_blocked = 1;
89                         wait_disk_event(conf->wait_buffer,
90                                         !conf->freebh_blocked ||
91                                         conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
92                         conf->freebh_blocked = 0;
93                 }
94         }
95         return bh;
96 }
97
98 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
99 {
100         unsigned long flags;
101         spin_lock_irqsave(&conf->device_lock, flags);
102         while (bh) {
103                 struct buffer_head *t = bh;
104                 bh=bh->b_next;
105                 if (t->b_pprev == NULL)
106                         kmem_cache_free(bh_cachep, t);
107                 else {
108                         t->b_next= conf->freebh;
109                         conf->freebh = t;
110                         conf->freebh_cnt++;
111                 }
112         }
113         spin_unlock_irqrestore(&conf->device_lock, flags);
114         wake_up(&conf->wait_buffer);
115 }
116
117 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
118 {
119         /* allocate cnt buffer_heads, possibly less if kmalloc fails */
120         int i = 0;
121
122         while (i < cnt) {
123                 struct buffer_head *bh;
124                 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
125                 if (!bh) break;
126
127                 md_spin_lock_irq(&conf->device_lock);
128                 bh->b_pprev = &conf->freebh;
129                 bh->b_next = conf->freebh;
130                 conf->freebh = bh;
131                 conf->freebh_cnt++;
132                 md_spin_unlock_irq(&conf->device_lock);
133
134                 i++;
135         }
136         return i;
137 }
138
139 static void raid1_shrink_bh(raid1_conf_t *conf)
140 {
141         /* discard all buffer_heads */
142
143         md_spin_lock_irq(&conf->device_lock);
144         while (conf->freebh) {
145                 struct buffer_head *bh = conf->freebh;
146                 conf->freebh = bh->b_next;
147                 kmem_cache_free(bh_cachep, bh);
148                 conf->freebh_cnt--;
149         }
150         md_spin_unlock_irq(&conf->device_lock);
151 }
152                 
153
154 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
155 {
156         struct raid1_bh *r1_bh = NULL;
157
158         do {
159                 md_spin_lock_irq(&conf->device_lock);
160                 if (!conf->freer1_blocked && conf->freer1) {
161                         r1_bh = conf->freer1;
162                         conf->freer1 = r1_bh->next_r1;
163                         conf->freer1_cnt--;
164                         r1_bh->next_r1 = NULL;
165                         r1_bh->state = 0;
166                         r1_bh->bh_req.b_state = 0;
167                 }
168                 md_spin_unlock_irq(&conf->device_lock);
169                 if (r1_bh)
170                         return r1_bh;
171                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
172                 if (r1_bh) {
173                         memset(r1_bh, 0, sizeof(*r1_bh));
174                         return r1_bh;
175                 }
176                 conf->freer1_blocked = 1;
177                 wait_disk_event(conf->wait_buffer,
178                                 !conf->freer1_blocked ||
179                                 conf->freer1_cnt > NR_RESERVED_BUFS/2
180                         );
181                 conf->freer1_blocked = 0;
182         } while (1);
183 }
184
185 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
186 {
187         struct buffer_head *bh = r1_bh->mirror_bh_list;
188         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
189
190         r1_bh->mirror_bh_list = NULL;
191
192         if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
193                 unsigned long flags;
194                 spin_lock_irqsave(&conf->device_lock, flags);
195                 r1_bh->next_r1 = conf->freer1;
196                 conf->freer1 = r1_bh;
197                 conf->freer1_cnt++;
198                 spin_unlock_irqrestore(&conf->device_lock, flags);
199                 /* don't need to wake up wait_buffer because
200                  *  raid1_free_bh below will do that
201                  */
202         } else {
203                 kfree(r1_bh);
204         }
205         raid1_free_bh(conf, bh);
206 }
207
208 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
209 {
210         int i = 0;
211
212         while (i < cnt) {
213                 struct raid1_bh *r1_bh;
214                 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
215                 if (!r1_bh)
216                         break;
217                 memset(r1_bh, 0, sizeof(*r1_bh));
218                 set_bit(R1BH_PreAlloc, &r1_bh->state);
219                 r1_bh->mddev = conf->mddev;
220
221                 raid1_free_r1bh(r1_bh);
222                 i++;
223         }
224         return i;
225 }
226
227 static void raid1_shrink_r1bh(raid1_conf_t *conf)
228 {
229         md_spin_lock_irq(&conf->device_lock);
230         while (conf->freer1) {
231                 struct raid1_bh *r1_bh = conf->freer1;
232                 conf->freer1 = r1_bh->next_r1;
233                 conf->freer1_cnt--;
234                 kfree(r1_bh);
235         }
236         md_spin_unlock_irq(&conf->device_lock);
237 }
238
239
240
241 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
242 {
243         unsigned long flags;
244         struct buffer_head *bh = r1_bh->mirror_bh_list;
245         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
246         r1_bh->mirror_bh_list = NULL;
247         
248         spin_lock_irqsave(&conf->device_lock, flags);
249         r1_bh->next_r1 = conf->freebuf;
250         conf->freebuf = r1_bh;
251         spin_unlock_irqrestore(&conf->device_lock, flags);
252         raid1_free_bh(conf, bh);
253 }
254
255 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
256 {
257         struct raid1_bh *r1_bh;
258
259         md_spin_lock_irq(&conf->device_lock);
260         wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
261         r1_bh = conf->freebuf;
262         conf->freebuf = r1_bh->next_r1;
263         r1_bh->next_r1= NULL;
264         md_spin_unlock_irq(&conf->device_lock);
265
266         return r1_bh;
267 }
268
269 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
270 {
271         int i = 0;
272
273         md_spin_lock_irq(&conf->device_lock);
274         while (i < cnt) {
275                 struct raid1_bh *r1_bh;
276                 struct page *page;
277
278                 page = alloc_page(GFP_KERNEL);
279                 if (!page)
280                         break;
281
282                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
283                 if (!r1_bh) {
284                         __free_page(page);
285                         break;
286                 }
287                 memset(r1_bh, 0, sizeof(*r1_bh));
288                 r1_bh->bh_req.b_page = page;
289                 r1_bh->bh_req.b_data = page_address(page);
290                 r1_bh->next_r1 = conf->freebuf;
291                 conf->freebuf = r1_bh;
292                 i++;
293         }
294         md_spin_unlock_irq(&conf->device_lock);
295         return i;
296 }
297
298 static void raid1_shrink_buffers (raid1_conf_t *conf)
299 {
300         md_spin_lock_irq(&conf->device_lock);
301         while (conf->freebuf) {
302                 struct raid1_bh *r1_bh = conf->freebuf;
303                 conf->freebuf = r1_bh->next_r1;
304                 __free_page(r1_bh->bh_req.b_page);
305                 kfree(r1_bh);
306         }
307         md_spin_unlock_irq(&conf->device_lock);
308 }
309
310 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
311 {
312         raid1_conf_t *conf = mddev_to_conf(mddev);
313         int i, disks = MD_SB_DISKS;
314
315         /*
316          * Later we do read balancing on the read side;
317          * for now we use the first available disk.
318          */
319
320         for (i = 0; i < disks; i++) {
321                 if (conf->mirrors[i].operational) {
322                         *rdev = conf->mirrors[i].dev;
323                         return (0);
324                 }
325         }
326
327         printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
328         return (-1);
329 }
330
331 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
332 {
333         unsigned long flags;
334         mddev_t *mddev = r1_bh->mddev;
335         raid1_conf_t *conf = mddev_to_conf(mddev);
336
337         md_spin_lock_irqsave(&retry_list_lock, flags);
338         if (raid1_retry_list == NULL)
339                 raid1_retry_tail = &raid1_retry_list;
340         *raid1_retry_tail = r1_bh;
341         raid1_retry_tail = &r1_bh->next_r1;
342         r1_bh->next_r1 = NULL;
343         md_spin_unlock_irqrestore(&retry_list_lock, flags);
344         md_wakeup_thread(conf->thread);
345 }
346
347
348 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
349 {
350         unsigned long flags;
351         spin_lock_irqsave(&conf->segment_lock, flags);
352         if (sector < conf->start_active)
353                 conf->cnt_done--;
354         else if (sector >= conf->start_future && conf->phase == phase)
355                 conf->cnt_future--;
356         else if (!--conf->cnt_pending)
357                 wake_up(&conf->wait_ready);
358
359         spin_unlock_irqrestore(&conf->segment_lock, flags);
360 }
361
362 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
363 {
364         unsigned long flags;
365         spin_lock_irqsave(&conf->segment_lock, flags);
366         if (sector >= conf->start_ready)
367                 --conf->cnt_ready;
368         else if (sector >= conf->start_active) {
369                 if (!--conf->cnt_active) {
370                         conf->start_active = conf->start_ready;
371                         wake_up(&conf->wait_done);
372                 }
373         }
374         spin_unlock_irqrestore(&conf->segment_lock, flags);
375 }
376
377 /*
378  * raid1_end_bh_io() is called when we have finished servicing a mirrored
379  * operation and are ready to return a success/failure code to the buffer
380  * cache layer.
381  */
382 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
383 {
384         struct buffer_head *bh = r1_bh->master_bh;
385
386         io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
387                         test_bit(R1BH_SyncPhase, &r1_bh->state));
388
389         bh->b_end_io(bh, uptodate);
390         raid1_free_r1bh(r1_bh);
391 }
392 void raid1_end_request (struct buffer_head *bh, int uptodate)
393 {
394         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
395
396         /*
397          * this branch is our 'one mirror IO has finished' event handler:
398          */
399         if (!uptodate)
400                 md_error (r1_bh->mddev, bh->b_dev);
401         else
402                 /*
403                  * Set R1BH_Uptodate in our master buffer_head, so that
404                  * we will return a good error code to the higher
405                  * levels even if IO on some other mirrored buffer fails.
406                  *
407                  * The 'master' represents the complex operation to 
408                  * user-side. So if something waits for IO, then it will
409                  * wait for the 'master' buffer_head.
410                  */
411                 set_bit (R1BH_Uptodate, &r1_bh->state);
412
413         /*
414          * We split up the read and write side; imho they are
415          * conceptually different.
416          */
417
418         if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
419                 /*
420                  * we have only one buffer_head on the read side
421                  */
422                 
423                 if (uptodate) {
424                         raid1_end_bh_io(r1_bh, uptodate);
425                         return;
426                 }
427                 /*
428                  * oops, read error:
429                  */
430                 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
431                          partition_name(bh->b_dev), bh->b_blocknr);
432                 raid1_reschedule_retry(r1_bh);
433                 return;
434         }
435
436         /*
437          * WRITE:
438          *
439          * Let's see if all mirrored write operations have finished 
440          * already.
441          */
442
443         if (atomic_dec_and_test(&r1_bh->remaining))
444                 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
445 }
446
447 /*
448  * This routine returns the disk from which the requested read should
449  * be done. It keeps track of the last read position for every disk
450  * in the array, and when a new read request comes in, the disk whose
451  * last position is nearest to the request is chosen.
452  *
453  * TODO: if there are 2 mirrors on the same 2 devices, performance
454  * degrades dramatically because the position is mirror based, not
455  * device based. This should be changed to be device based. Also
456  * atomic sequential reads should be somehow balanced.
457  */
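/*
 * A small worked example of the distance heuristic below (numbers are
 * invented): with two operational mirrors whose head_position is 1000
 * and 5000, a read at sector 4990 gives distances |4990 - 1000| = 3990
 * and |4990 - 5000| = 10, so the second mirror is picked (assuming no
 * resync is in progress and sect_limit has not been exceeded), and its
 * head_position is then advanced to 4990 plus the request size in
 * sectors.
 */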
458
459 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
460 {
461         int new_disk = conf->last_used;
462         const int sectors = bh->b_size >> 9;
463         const unsigned long this_sector = bh->b_rsector;
464         int disk = new_disk;
465         unsigned long new_distance;
466         unsigned long current_distance;
467         
468         /*
469          * Check if it is sane at all to balance
470          */
471         
472         if (conf->resync_mirrors)
473                 goto rb_out;
474         
475
476         /* make sure that disk is operational */
477         while( !conf->mirrors[new_disk].operational) {
478                 if (new_disk <= 0) new_disk = conf->raid_disks;
479                 new_disk--;
480                 if (new_disk == disk) {
481                         /*
482                          * This means no working disk was found.
483                          * Nothing much to do; let's not change anything
484                          * and hope for the best...
485                          */
486                         
487                         new_disk = conf->last_used;
488
489                         goto rb_out;
490                 }
491         }
492         disk = new_disk;
493         /* now disk == new_disk == starting point for search */
494         
495         /*
496          * Don't touch anything for sequential reads.
497          */
498
499         if (this_sector == conf->mirrors[new_disk].head_position)
500                 goto rb_out;
501         
502         /*
503          * If reads have been done only on a single disk
504          * for a while, let's give another disk a chance.
505          * This kicks idling disks so that they can find
506          * work near some hotspot.
507          */
508         
509         if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
510                 conf->sect_count = 0;
511
512                 do {
513                         if (new_disk<=0)
514                                 new_disk = conf->raid_disks;
515                         new_disk--;
516                         if (new_disk == disk)
517                                 break;
518                 } while ((conf->mirrors[new_disk].write_only) ||
519                          (!conf->mirrors[new_disk].operational));
520
521                 goto rb_out;
522         }
523         
524         current_distance = abs(this_sector -
525                                 conf->mirrors[disk].head_position);
526         
527         /* Find the disk which is closest */
528         
529         do {
530                 if (disk <= 0)
531                         disk = conf->raid_disks;
532                 disk--;
533                 
534                 if ((conf->mirrors[disk].write_only) ||
535                                 (!conf->mirrors[disk].operational))
536                         continue;
537                 
538                 new_distance = abs(this_sector -
539                                         conf->mirrors[disk].head_position);
540                 
541                 if (new_distance < current_distance) {
542                         conf->sect_count = 0;
543                         current_distance = new_distance;
544                         new_disk = disk;
545                 }
546         } while (disk != conf->last_used);
547
548 rb_out:
549         conf->mirrors[new_disk].head_position = this_sector + sectors;
550
551         conf->last_used = new_disk;
552         conf->sect_count += sectors;
553
554         return new_disk;
555 }
556
557 static int raid1_make_request (mddev_t *mddev, int rw,
558                                struct buffer_head * bh)
559 {
560         raid1_conf_t *conf = mddev_to_conf(mddev);
561         struct buffer_head *bh_req, *bhl;
562         struct raid1_bh * r1_bh;
563         int disks = MD_SB_DISKS;
564         int i, sum_bhs = 0;
565         struct mirror_info *mirror;
566
567         if (!buffer_locked(bh))
568                 BUG();
569         
570 /*
571  * make_request() can abort the operation when READA is being
572  * used and no empty request is available.
573  *
574  * Currently, just replace the command with READ/WRITE.
575  */
576         if (rw == READA)
577                 rw = READ;
578
579         r1_bh = raid1_alloc_r1bh (conf);
580
581         spin_lock_irq(&conf->segment_lock);
582         wait_event_lock_irq(conf->wait_done,
583                         bh->b_rsector < conf->start_active ||
584                         bh->b_rsector >= conf->start_future,
585                         conf->segment_lock);
586         if (bh->b_rsector < conf->start_active) 
587                 conf->cnt_done++;
588         else {
589                 conf->cnt_future++;
590                 if (conf->phase)
591                         set_bit(R1BH_SyncPhase, &r1_bh->state);
592         }
593         spin_unlock_irq(&conf->segment_lock);
594         
595         /*
596          * I think the read and write branches should be separated completely,
597          * since we want to do read balancing on the read side for example.
598          * Alternative implementations? :) --mingo
599          */
600
601         r1_bh->master_bh = bh;
602         r1_bh->mddev = mddev;
603         r1_bh->cmd = rw;
604
605         if (rw == READ) {
606                 /*
607                  * read balancing logic:
608                  */
609                 mirror = conf->mirrors + raid1_read_balance(conf, bh);
610
611                 bh_req = &r1_bh->bh_req;
612                 memcpy(bh_req, bh, sizeof(*bh));
613                 bh_req->b_blocknr = bh->b_rsector;
614                 bh_req->b_dev = mirror->dev;
615                 bh_req->b_rdev = mirror->dev;
616         /*      bh_req->b_rsector = bh->n_rsector; */
617                 bh_req->b_end_io = raid1_end_request;
618                 bh_req->b_private = r1_bh;
619                 generic_make_request (rw, bh_req);
620                 return 0;
621         }
622
623         /*
624          * WRITE:
625          */
626
627         bhl = raid1_alloc_bh(conf, conf->raid_disks);
628         for (i = 0; i < disks; i++) {
629                 struct buffer_head *mbh;
630                 if (!conf->mirrors[i].operational) 
631                         continue;
632  
633         /*
634          * We should use a private pool (size depending on NR_REQUEST)
635          * to avoid writes filling up memory with bhs.
636          *
637          * Such pools are much faster than kmalloc anyway (so we waste
638          * almost nothing by not using the master bh when writing and
639          * win a lot of cleanliness), but for now we are cool enough. --mingo
640          *
641          * It's safe to sleep here, buffer heads cannot be used in a shared
642          * manner in the write branch. Look how we lock the buffer at the
643          * beginning of this function to grok the difference ;)
644          */
645                 mbh = bhl;
646                 if (mbh == NULL) {
647                         MD_BUG();
648                         break;
649                 }
650                 bhl = mbh->b_next;
651                 mbh->b_next = NULL;
652                 mbh->b_this_page = (struct buffer_head *)1;
653                 
654         /*
655          * prepare mirrored mbh (fields ordered for max mem throughput):
656          */
657                 mbh->b_blocknr    = bh->b_rsector;
658                 mbh->b_dev        = conf->mirrors[i].dev;
659                 mbh->b_rdev       = conf->mirrors[i].dev;
660                 mbh->b_rsector    = bh->b_rsector;
661                 mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
662                                                 (1<<BH_Mapped) | (1<<BH_Lock);
663
664                 atomic_set(&mbh->b_count, 1);
665                 mbh->b_size       = bh->b_size;
666                 mbh->b_page       = bh->b_page;
667                 mbh->b_data       = bh->b_data;
668                 mbh->b_list       = BUF_LOCKED;
669                 mbh->b_end_io     = raid1_end_request;
670                 mbh->b_private    = r1_bh;
671
672                 mbh->b_next = r1_bh->mirror_bh_list;
673                 r1_bh->mirror_bh_list = mbh;
674                 sum_bhs++;
675         }
676         if (bhl) raid1_free_bh(conf,bhl);
677         if (!sum_bhs) {
678                 /* Gag - all mirrors non-operational.. */
679                 raid1_end_bh_io(r1_bh, 0);
680                 return 0;
681         }
682         md_atomic_set(&r1_bh->remaining, sum_bhs);
683
684         /*
685          * We have to be a bit careful about the semaphore above; that's
686          * why we start the requests separately. Since kmalloc() could
687          * fail or sleep, and make_request() can sleep too, this is the
688          * safer solution. Imagine end_request decreasing the semaphore
689          * before we could have set it up ... We could play tricks with
690          * the semaphore (presetting it and correcting at the end if
691          * sum_bhs is not 'n', but then we would have to do end_request by
692          * hand if all requests finish before we have had a chance to set
693          * up the semaphore correctly ... lots of races).
694          */
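        /*
         * A concrete (hypothetical) two-mirror scenario: had we queued
         * the first mbh before md_atomic_set() above, that write could
         * complete and call raid1_end_request() -- decrementing
         * ->remaining from whatever stale value it held -- before the
         * second mbh was even submitted, ending the master request early.
         */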
695         bh = r1_bh->mirror_bh_list;
696         while(bh) {
697                 struct buffer_head *bh2 = bh;
698                 bh = bh->b_next;
699                 generic_make_request(rw, bh2);
700         }
701         return (0);
702 }
703
704 static int raid1_status (char *page, mddev_t *mddev)
705 {
706         raid1_conf_t *conf = mddev_to_conf(mddev);
707         int sz = 0, i;
708         
709         sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
710                                                  conf->working_disks);
711         for (i = 0; i < conf->raid_disks; i++)
712                 sz += sprintf (page+sz, "%s",
713                         conf->mirrors[i].operational ? "U" : "_");
714         sz += sprintf (page+sz, "]");
715         return sz;
716 }
717
718 #define LAST_DISK KERN_ALERT \
719 "raid1: only one disk left and IO error.\n"
720
721 #define NO_SPARE_DISK KERN_ALERT \
722 "raid1: no spare disk left, degrading mirror level by one.\n"
723
724 #define DISK_FAILED KERN_ALERT \
725 "raid1: Disk failure on %s, disabling device. \n" \
726 "       Operation continuing on %d devices\n"
727
728 #define START_SYNCING KERN_ALERT \
729 "raid1: start syncing spare disk.\n"
730
731 #define ALREADY_SYNCING KERN_INFO \
732 "raid1: syncing already in progress.\n"
733
734 static void mark_disk_bad (mddev_t *mddev, int failed)
735 {
736         raid1_conf_t *conf = mddev_to_conf(mddev);
737         struct mirror_info *mirror = conf->mirrors+failed;
738         mdp_super_t *sb = mddev->sb;
739
740         mirror->operational = 0;
741         mark_disk_faulty(sb->disks+mirror->number);
742         mark_disk_nonsync(sb->disks+mirror->number);
743         mark_disk_inactive(sb->disks+mirror->number);
744         if (!mirror->write_only)
745                 sb->active_disks--;
746         sb->working_disks--;
747         sb->failed_disks++;
748         mddev->sb_dirty = 1;
749         md_wakeup_thread(conf->thread);
750         if (!mirror->write_only)
751                 conf->working_disks--;
752         printk (DISK_FAILED, partition_name (mirror->dev),
753                                  conf->working_disks);
754 }
755
756 static int raid1_error (mddev_t *mddev, kdev_t dev)
757 {
758         raid1_conf_t *conf = mddev_to_conf(mddev);
759         struct mirror_info * mirrors = conf->mirrors;
760         int disks = MD_SB_DISKS;
761         int i;
762
763         /* Find the drive.
764          * If it is not operational, then we have already marked it as dead
765          * else if it is the last working disk, ignore the error, let the
766          * next level up know.
767          * else mark the drive as failed
768          */
769
770         for (i = 0; i < disks; i++)
771                 if (mirrors[i].dev==dev && mirrors[i].operational)
772                         break;
773         if (i == disks)
774                 return 0;
775
776         if (i < conf->raid_disks && conf->working_disks == 1) {
777                 /* Don't fail the drive, act as though we were just a
778                  * normal single drive
779                  */
780
781                 return 1;
782         }
783         mark_disk_bad(mddev, i);
784         return 0;
785 }
786
787 #undef LAST_DISK
788 #undef NO_SPARE_DISK
789 #undef DISK_FAILED
790 #undef START_SYNCING
791
792
793 static void print_raid1_conf (raid1_conf_t *conf)
794 {
795         int i;
796         struct mirror_info *tmp;
797
798         printk("RAID1 conf printout:\n");
799         if (!conf) {
800                 printk("(conf==NULL)\n");
801                 return;
802         }
803         printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
804                          conf->raid_disks, conf->nr_disks);
805
806         for (i = 0; i < MD_SB_DISKS; i++) {
807                 tmp = conf->mirrors + i;
808                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
809                         i, tmp->spare,tmp->operational,
810                         tmp->number,tmp->raid_disk,tmp->used_slot,
811                         partition_name(tmp->dev));
812         }
813 }
814
815 static void close_sync(raid1_conf_t *conf)
816 {
817         mddev_t *mddev = conf->mddev;
818         /* If reconstruction was interrupted, we need to close the "active" and "pending"
819          * holes.
820          * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
821          */
822         /* this is really needed when recovery stops too... */
823         spin_lock_irq(&conf->segment_lock);
824         conf->start_active = conf->start_pending;
825         conf->start_ready = conf->start_pending;
826         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
827         conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
828         conf->start_future = mddev->sb->size+1;
829         conf->cnt_pending = conf->cnt_future;
830         conf->cnt_future = 0;
831         conf->phase = conf->phase ^1;
832         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
833         conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
834         conf->phase = 0;
835         conf->cnt_future = conf->cnt_done;
836         conf->cnt_done = 0;
837         spin_unlock_irq(&conf->segment_lock);
838         wake_up(&conf->wait_done);
839 }
840
841 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
842 {
843         int err = 0;
844         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
845         raid1_conf_t *conf = mddev->private;
846         struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
847         mdp_super_t *sb = mddev->sb;
848         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
849         mdk_rdev_t *spare_rdev, *failed_rdev;
850
851         print_raid1_conf(conf);
852         md_spin_lock_irq(&conf->device_lock);
853         /*
854          * find the disk ...
855          */
856         switch (state) {
857
858         case DISKOP_SPARE_ACTIVE:
859
860                 /*
861                  * Find the failed disk within the RAID1 configuration ...
862                  * (this can only be in the first conf->working_disks part)
863                  */
864                 for (i = 0; i < conf->raid_disks; i++) {
865                         tmp = conf->mirrors + i;
866                         if ((!tmp->operational && !tmp->spare) ||
867                                         !tmp->used_slot) {
868                                 failed_disk = i;
869                                 break;
870                         }
871                 }
872                 /*
873                  * When we activate a spare disk we _must_ have a disk in
874                  * the lower (active) part of the array to replace. 
875                  */
876                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
877                         MD_BUG();
878                         err = 1;
879                         goto abort;
880                 }
881                 /* fall through */
882
883         case DISKOP_SPARE_WRITE:
884         case DISKOP_SPARE_INACTIVE:
885
886                 /*
887                  * Find the spare disk ... (can only be in the 'high'
888                  * area of the array)
889                  */
890                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
891                         tmp = conf->mirrors + i;
892                         if (tmp->spare && tmp->number == (*d)->number) {
893                                 spare_disk = i;
894                                 break;
895                         }
896                 }
897                 if (spare_disk == -1) {
898                         MD_BUG();
899                         err = 1;
900                         goto abort;
901                 }
902                 break;
903
904         case DISKOP_HOT_REMOVE_DISK:
905
906                 for (i = 0; i < MD_SB_DISKS; i++) {
907                         tmp = conf->mirrors + i;
908                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
909                                 if (tmp->operational) {
910                                         err = -EBUSY;
911                                         goto abort;
912                                 }
913                                 removed_disk = i;
914                                 break;
915                         }
916                 }
917                 if (removed_disk == -1) {
918                         MD_BUG();
919                         err = 1;
920                         goto abort;
921                 }
922                 break;
923
924         case DISKOP_HOT_ADD_DISK:
925
926                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
927                         tmp = conf->mirrors + i;
928                         if (!tmp->used_slot) {
929                                 added_disk = i;
930                                 break;
931                         }
932                 }
933                 if (added_disk == -1) {
934                         MD_BUG();
935                         err = 1;
936                         goto abort;
937                 }
938                 break;
939         }
940
941         switch (state) {
942         /*
943          * Switch the spare disk to write-only mode:
944          */
945         case DISKOP_SPARE_WRITE:
946                 sdisk = conf->mirrors + spare_disk;
947                 sdisk->operational = 1;
948                 sdisk->write_only = 1;
949                 break;
950         /*
951          * Deactivate a spare disk:
952          */
953         case DISKOP_SPARE_INACTIVE:
954                 close_sync(conf);
955                 sdisk = conf->mirrors + spare_disk;
956                 sdisk->operational = 0;
957                 sdisk->write_only = 0;
958                 break;
959         /*
960          * Activate (mark read-write) the (now in-sync) spare disk,
961          * which means we switch its 'raid position' (->raid_disk)
962          * with the failed disk. (only the first 'conf->nr_disks'
963          * slots are used for 'real' disks and we must preserve this
964          * property)
965          */
966         case DISKOP_SPARE_ACTIVE:
967                 close_sync(conf);
968                 sdisk = conf->mirrors + spare_disk;
969                 fdisk = conf->mirrors + failed_disk;
970
971                 spare_desc = &sb->disks[sdisk->number];
972                 failed_desc = &sb->disks[fdisk->number];
973
974                 if (spare_desc != *d) {
975                         MD_BUG();
976                         err = 1;
977                         goto abort;
978                 }
979
980                 if (spare_desc->raid_disk != sdisk->raid_disk) {
981                         MD_BUG();
982                         err = 1;
983                         goto abort;
984                 }
985                         
986                 if (sdisk->raid_disk != spare_disk) {
987                         MD_BUG();
988                         err = 1;
989                         goto abort;
990                 }
991
992                 if (failed_desc->raid_disk != fdisk->raid_disk) {
993                         MD_BUG();
994                         err = 1;
995                         goto abort;
996                 }
997
998                 if (fdisk->raid_disk != failed_disk) {
999                         MD_BUG();
1000                         err = 1;
1001                         goto abort;
1002                 }
1003
1004                 /*
1005                  * do the switch finally
1006                  */
1007                 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1008                 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1009
1010                 /* There must be a spare_rdev, but there may not be a
1011                  * failed_rdev.  That slot might be empty...
1012                  */
1013                 spare_rdev->desc_nr = failed_desc->number;
1014                 if (failed_rdev)
1015                         failed_rdev->desc_nr = spare_desc->number;
1016                 
1017                 xchg_values(*spare_desc, *failed_desc);
1018                 xchg_values(*fdisk, *sdisk);
1019
1020                 /*
1021                  * (careful, 'failed' and 'spare' are switched from now on)
1022                  *
1023                  * we want to preserve linear numbering and we want to
1024                  * give the proper raid_disk number to the now activated
1025                  * disk. (this means we switch back these values)
1026                  */
1027         
1028                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1029                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1030                 xchg_values(spare_desc->number, failed_desc->number);
1031                 xchg_values(sdisk->number, fdisk->number);
1032
1033                 *d = failed_desc;
1034
1035                 if (sdisk->dev == MKDEV(0,0))
1036                         sdisk->used_slot = 0;
1037                 /*
1038                  * this really activates the spare.
1039                  */
1040                 fdisk->spare = 0;
1041                 fdisk->write_only = 0;
1042
1043                 /*
1044                  * if we activate a spare, we definitely replace a
1045                  * non-operational disk slot in the 'low' area of
1046                  * the disk array.
1047                  */
1048
1049                 conf->working_disks++;
1050
1051                 break;
1052
1053         case DISKOP_HOT_REMOVE_DISK:
1054                 rdisk = conf->mirrors + removed_disk;
1055
1056                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1057                         MD_BUG();       
1058                         err = 1;
1059                         goto abort;
1060                 }
1061                 rdisk->dev = MKDEV(0,0);
1062                 rdisk->used_slot = 0;
1063                 conf->nr_disks--;
1064                 break;
1065
1066         case DISKOP_HOT_ADD_DISK:
1067                 adisk = conf->mirrors + added_disk;
1068                 added_desc = *d;
1069
1070                 if (added_disk != added_desc->number) {
1071                         MD_BUG();       
1072                         err = 1;
1073                         goto abort;
1074                 }
1075
1076                 adisk->number = added_desc->number;
1077                 adisk->raid_disk = added_desc->raid_disk;
1078                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1079
1080                 adisk->operational = 0;
1081                 adisk->write_only = 0;
1082                 adisk->spare = 1;
1083                 adisk->used_slot = 1;
1084                 adisk->head_position = 0;
1085                 conf->nr_disks++;
1086
1087                 break;
1088
1089         default:
1090                 MD_BUG();       
1091                 err = 1;
1092                 goto abort;
1093         }
1094 abort:
1095         md_spin_unlock_irq(&conf->device_lock);
1096         if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1097                 /* should move to "END_REBUILD" when such exists */
1098                 raid1_shrink_buffers(conf);
1099
1100         print_raid1_conf(conf);
1101         return err;
1102 }
1103
1104
1105 #define IO_ERROR KERN_ALERT \
1106 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1107
1108 #define REDIRECT_SECTOR KERN_ERR \
1109 "raid1: %s: redirecting sector %lu to another mirror\n"
1110
1111 /*
1112  * This is a kernel thread which:
1113  *
1114  *      1.      Retries failed read operations on working mirrors.
1115  *      2.      Updates the raid superblock when problems are encountered.
1116  *      3.      Performs the writes following the reads for array synchronising.
1117  */
1118 static void end_sync_write(struct buffer_head *bh, int uptodate);
1119 static void end_sync_read(struct buffer_head *bh, int uptodate);
1120
1121 static void raid1d (void *data)
1122 {
1123         struct raid1_bh *r1_bh;
1124         struct buffer_head *bh;
1125         unsigned long flags;
1126         mddev_t *mddev;
1127         kdev_t dev;
1128
1129
1130         for (;;) {
1131                 md_spin_lock_irqsave(&retry_list_lock, flags);
1132                 r1_bh = raid1_retry_list;
1133                 if (!r1_bh)
1134                         break;
1135                 raid1_retry_list = r1_bh->next_r1;
1136                 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1137
1138                 mddev = r1_bh->mddev;
1139                 if (mddev->sb_dirty) {
1140                         printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
1141                         mddev->sb_dirty = 0;
1142                         md_update_sb(mddev);
1143                 }
1144                 bh = &r1_bh->bh_req;
1145                 switch(r1_bh->cmd) {
1146                 case SPECIAL:
1147                         /* have to allocate lots of bh structures and
1148                          * schedule writes
1149                          */
1150                         if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1151                                 int i, sum_bhs = 0;
1152                                 int disks = MD_SB_DISKS;
1153                                 struct buffer_head *bhl, *mbh;
1154                                 raid1_conf_t *conf;
1155                                 
1156                                 conf = mddev_to_conf(mddev);
1157                                 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1158                                 for (i = 0; i < disks ; i++) {
1159                                         if (!conf->mirrors[i].operational)
1160                                                 continue;
1161                                         if (i==conf->last_used)
1162                                                 /* we read from here, no need to write */
1163                                                 continue;
1164                                         if (i < conf->raid_disks
1165                                             && !conf->resync_mirrors)
1166                                                 /* don't need to write this,
1167                                                  * we are just rebuilding */
1168                                                 continue;
1169                                         mbh = bhl;
1170                                         if (!mbh) {
1171                                                 MD_BUG();
1172                                                 break;
1173                                         }
1174                                         bhl = mbh->b_next;
1175                                         mbh->b_this_page = (struct buffer_head *)1;
1176
1177                                                 
1178                                 /*
1179                                  * prepare mirrored bh (fields ordered for max mem throughput):
1180                                  */
1181                                         mbh->b_blocknr    = bh->b_blocknr;
1182                                         mbh->b_dev        = conf->mirrors[i].dev;
1183                                         mbh->b_rdev       = conf->mirrors[i].dev;
1184                                         mbh->b_rsector    = bh->b_blocknr;
1185                                         mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1186                                                 (1<<BH_Mapped) | (1<<BH_Lock);
1187                                         atomic_set(&mbh->b_count, 1);
1188                                         mbh->b_size       = bh->b_size;
1189                                         mbh->b_page       = bh->b_page;
1190                                         mbh->b_data       = bh->b_data;
1191                                         mbh->b_list       = BUF_LOCKED;
1192                                         mbh->b_end_io     = end_sync_write;
1193                                         mbh->b_private    = r1_bh;
1194
1195                                         mbh->b_next = r1_bh->mirror_bh_list;
1196                                         r1_bh->mirror_bh_list = mbh;
1197
1198                                         sum_bhs++;
1199                                 }
1200                                 md_atomic_set(&r1_bh->remaining, sum_bhs);
1201                                 if (bhl) raid1_free_bh(conf, bhl);
1202                                 mbh = r1_bh->mirror_bh_list;
1203
1204                                 if (!sum_bhs) {
1205                                         /* nowhere to write this to... I guess we
1206                                          * must be done
1207                                          */
1208                                         sync_request_done(bh->b_blocknr, conf);
1209                                         md_done_sync(mddev, bh->b_size>>9, 0);
1210                                         raid1_free_buf(r1_bh);
1211                                 } else
1212                                 while (mbh) {
1213                                         struct buffer_head *bh1 = mbh;
1214                                         mbh = mbh->b_next;
1215                                         generic_make_request(WRITE, bh1);
1216                                         md_sync_acct(bh1->b_dev, bh1->b_size/512);
1217                                 }
1218                         } else {
1219                                 /* There is no point trying a read-for-reconstruct
1220                                  * as reconstruct is about to be aborted
1221                                  */
1222
1223                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1224                                 md_done_sync(mddev, bh->b_size>>9, 0);
1225                         }
1226
1227                         break;
1228                 case READ:
1229                 case READA:
1230                         dev = bh->b_dev;
1231                         raid1_map (mddev, &bh->b_dev);
1232                         if (bh->b_dev == dev) {
1233                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1234                                 raid1_end_bh_io(r1_bh, 0);
1235                         } else {
1236                                 printk (REDIRECT_SECTOR,
1237                                         partition_name(bh->b_dev), bh->b_blocknr);
1238                                 bh->b_rdev = bh->b_dev;
1239                                 bh->b_rsector = bh->b_blocknr;
1240                                 generic_make_request (r1_bh->cmd, bh);
1241                         }
1242                         break;
1243                 }
1244         }
1245         md_spin_unlock_irqrestore(&retry_list_lock, flags);
1246 }
1247 #undef IO_ERROR
1248 #undef REDIRECT_SECTOR
1249
1250 /*
1251  * Private kernel thread to reconstruct mirrors after an unclean
1252  * shutdown.
1253  */
1254 static void raid1syncd (void *data)
1255 {
1256         raid1_conf_t *conf = data;
1257         mddev_t *mddev = conf->mddev;
1258
1259         if (!conf->resync_mirrors)
1260                 return;
1261         if (conf->resync_mirrors == 2)
1262                 return;
1263         down(&mddev->recovery_sem);
1264         if (!md_do_sync(mddev, NULL)) {
1265                 /*
1266                  * Only if everything went Ok.
1267                  */
1268                 conf->resync_mirrors = 0;
1269         }
1270
1271         close_sync(conf);
1272
1273         up(&mddev->recovery_sem);
1274         raid1_shrink_buffers(conf);
1275 }
1276
1277 /*
1278  * perform a "sync" on one "block"
1279  *
1280  * We need to make sure that no normal I/O request - particularly write
1281  * requests - conflict with active sync requests.
1282  * This is achieved by conceptually dividing the device space into a
1283  * number of sections:
1284  *  DONE: 0 .. a-1     These blocks are in-sync
1285  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1286  *                     no normal IO requests
1287  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1288  *                     request may be happening
1289  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1290  *                     ones will be added
1291  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1292  *                     be happening, but not sync
1293  *
1294  * We keep a
1295  *   phase    which flips (0 or 1) each time d moves and
1296  * a count of:
1297  *   z =  active io requests in FUTURE since d moved - marked with
1298  *        current phase
1299  *   y =  active io requests in FUTURE before d moved, or PENDING -
1300  *        marked with previous phase
1301  *   x =  active sync requests in READY
1302  *   w =  active sync requests in ACTIVE
1303  *   v =  active io requests in DONE
1304  *
1305  * Normally, a=b=c=d=0 and z= active io requests
1306  *   or a=b=c=d=END and v= active io requests
1307  * Allowed changes to a,b,c,d:
1308  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1309  * B:  y==0 -> c=d
1310  * C:   b=c, w+=x, x=0
1311  * D:  w==0 -> a=b
1312  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1313  *
1314  * At start of sync we apply A.
1315  * When y reaches 0, we apply B then A, then begin sync requests.
1316  * When the sync point reaches c-1, we wait for y==0 and w==0, and
1317  * then apply B then A then D then C.
1318  * Finally, we apply E.
1319  *
1320  * The sync request simply issues a "read" against a working drive
1321  * This is marked so that on completion the raid1d thread is woken to
1322  * issue suitable write requests
1323  */
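/*
 * A rough walk-through with invented numbers: with the 128-sector window
 * set up in raid1_sync_request(), starting from a=b=c=d=0, step A moves d
 * to 128 and flips the phase, so new normal writes beyond sector 128 are
 * counted in z.  Once the older requests counted in y have drained and the
 * previous ACTIVE sync requests (w) have finished, the loop slides every
 * boundary forward one window (a=b, b=c, c=d, d=256), and resync walks the
 * device one window at a time until E resets everything at the end.
 */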
1324
1325 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1326 {
1327         raid1_conf_t *conf = mddev_to_conf(mddev);
1328         struct mirror_info *mirror;
1329         struct raid1_bh *r1_bh;
1330         struct buffer_head *bh;
1331         int bsize;
1332         int disk;
1333         int block_nr;
1334
1335         spin_lock_irq(&conf->segment_lock);
1336         if (!sector_nr) {
1337                 /* initialize ...*/
1338                 int buffs;
1339                 conf->start_active = 0;
1340                 conf->start_ready = 0;
1341                 conf->start_pending = 0;
1342                 conf->start_future = 0;
1343                 conf->phase = 0;
1344                 /* we want enough buffers to hold twice the window of 128 */
1345                 buffs = 128 *2 / (PAGE_SIZE>>9);
1346                 buffs = raid1_grow_buffers(conf, buffs);
1347                 if (buffs < 2)
1348                         goto nomem;
1349                 
1350                 conf->window = buffs*(PAGE_SIZE>>9)/2;
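                /* For instance, assuming 4K pages: PAGE_SIZE>>9 is 8
                 * sectors per page, so buffs = 256/8 = 32 pages and, if
                 * they are all allocated, window = 32*8/2 = 128 sectors.
                 */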
1351                 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1352                 conf->cnt_done = conf->cnt_pending = 0;
1353                 if (conf->cnt_ready || conf->cnt_active)
1354                         MD_BUG();
1355         }
1356         while (sector_nr >= conf->start_pending) {
1357                 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1358                         sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1359                         conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1360                 wait_event_lock_irq(conf->wait_done,
1361                                         !conf->cnt_active,
1362                                         conf->segment_lock);
1363                 wait_event_lock_irq(conf->wait_ready,
1364                                         !conf->cnt_pending,
1365                                         conf->segment_lock);
1366                 conf->start_active = conf->start_ready;
1367                 conf->start_ready = conf->start_pending;
1368                 conf->start_pending = conf->start_future;
1369                 conf->start_future = conf->start_future+conf->window;
1370                 // Note: falling off the end is not a problem
1371                 conf->phase = conf->phase ^1;
1372                 conf->cnt_active = conf->cnt_ready;
1373                 conf->cnt_ready = 0;
1374                 conf->cnt_pending = conf->cnt_future;
1375                 conf->cnt_future = 0;
1376                 wake_up(&conf->wait_done);
1377         }
1378         conf->cnt_ready++;
1379         spin_unlock_irq(&conf->segment_lock);
1380                 
1381
1382         /* If reconstructing, and there is more than one working disk,
1383          * we could dedicate one to the rebuild and the others to
1384          * servicing read requests ..
1385          */
1386         disk = conf->last_used;
1387         /* make sure disk is operational */
1388         while (!conf->mirrors[disk].operational) {
1389                 if (disk <= 0) disk = conf->raid_disks;
1390                 disk--;
1391                 if (disk == conf->last_used)
1392                         break;
1393         }
1394         conf->last_used = disk;
1395         
1396         mirror = conf->mirrors+conf->last_used;
1397         
1398         r1_bh = raid1_alloc_buf (conf);
1399         r1_bh->master_bh = NULL;
1400         r1_bh->mddev = mddev;
1401         r1_bh->cmd = SPECIAL;
1402         bh = &r1_bh->bh_req;
1403
1404         block_nr = sector_nr;
1405         bsize = 512;
1406         while (!(block_nr & 1) && bsize < PAGE_SIZE
1407                         && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1408                 block_nr >>= 1;
1409                 bsize <<= 1;
1410         }
1411         bh->b_size = bsize;
1412         bh->b_list = BUF_LOCKED;
1413         bh->b_dev = mirror->dev;
1414         bh->b_rdev = mirror->dev;
1415         bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1416         if (!bh->b_page)
1417                 BUG();
1418         if (!bh->b_data)
1419                 BUG();
1420         if (bh->b_data != page_address(bh->b_page))
1421                 BUG();
1422         bh->b_end_io = end_sync_read;
1423         bh->b_private = r1_bh;
1424         bh->b_blocknr = sector_nr;
1425         bh->b_rsector = sector_nr;
1426         init_waitqueue_head(&bh->b_wait);
1427
1428         generic_make_request(READ, bh);
1429         md_sync_acct(bh->b_dev, bh->b_size/512);
1430
1431         return (bsize >> 9);
1432
1433 nomem:
1434         raid1_shrink_buffers(conf);
1435         spin_unlock_irq(&conf->segment_lock);
1436         return -ENOMEM;
1437 }
1438
1439 static void end_sync_read(struct buffer_head *bh, int uptodate)
1440 {
1441         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1442
1443         /* we have read a block, now it needs to be re-written,
1444          * or re-read if the read failed.
1445          * We don't do much here, just schedule handling by raid1d
1446          */
1447         if (!uptodate)
1448                 md_error (r1_bh->mddev, bh->b_dev);
1449         else
1450                 set_bit(R1BH_Uptodate, &r1_bh->state);
1451         raid1_reschedule_retry(r1_bh);
1452 }
1453
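/*
 * Completion handler for the writes that bring the other mirrors up to
 * date with a freshly read resync block.  A failed write is reported
 * through md_error(); the last completion to drop 'remaining' frees the
 * raid1_bh, marks this part of the resync window done and tells the md
 * core how many sectors were handled.
 */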
1454 static void end_sync_write(struct buffer_head *bh, int uptodate)
1455 {
1456         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1457         
1458         if (!uptodate)
1459                 md_error (r1_bh->mddev, bh->b_dev);
1460         if (atomic_dec_and_test(&r1_bh->remaining)) {
1461                 mddev_t *mddev = r1_bh->mddev;
1462                 unsigned long sect = bh->b_blocknr;
1463                 int size = bh->b_size;
1464                 raid1_free_buf(r1_bh);
1465                 sync_request_done(sect, mddev_to_conf(mddev));
1466                 md_done_sync(mddev,size>>9, uptodate);
1467         }
1468 }
1469
1470 #define INVALID_LEVEL KERN_WARNING \
1471 "raid1: md%d: raid level not set to mirroring (%d)\n"
1472
1473 #define NO_SB KERN_ERR \
1474 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1475
1476 #define ERRORS KERN_ERR \
1477 "raid1: disabled mirror %s (errors detected)\n"
1478
1479 #define NOT_IN_SYNC KERN_ERR \
1480 "raid1: disabled mirror %s (not in sync)\n"
1481
1482 #define INCONSISTENT KERN_ERR \
1483 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1484
1485 #define ALREADY_RUNNING KERN_ERR \
1486 "raid1: disabled mirror %s (mirror %d already operational)\n"
1487
1488 #define OPERATIONAL KERN_INFO \
1489 "raid1: device %s operational as mirror %d\n"
1490
1491 #define MEM_ERROR KERN_ERR \
1492 "raid1: couldn't allocate memory for md%d\n"
1493
1494 #define SPARE KERN_INFO \
1495 "raid1: spare disk %s\n"
1496
1497 #define NONE_OPERATIONAL KERN_ERR \
1498 "raid1: no operational mirrors for md%d\n"
1499
1500 #define ARRAY_IS_ACTIVE KERN_INFO \
1501 "raid1: raid set md%d active with %d out of %d mirrors\n"
1502
1503 #define THREAD_ERROR KERN_ERR \
1504 "raid1: couldn't allocate thread for md%d\n"
1505
1506 #define START_RESYNC KERN_WARNING \
1507 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1508
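/*
 * raid1_run is the personality 'run' method: it builds the private
 * raid1_conf_t from the already verified superblock, pre-allocates the
 * reserved buffer pools, starts the raid1d thread, and either schedules
 * recovery (if mirrors are missing) or a resync (if the array is not
 * marked clean).
 */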
1509 static int raid1_run (mddev_t *mddev)
1510 {
1511         raid1_conf_t *conf;
1512         int i, j, disk_idx;
1513         struct mirror_info *disk;
1514         mdp_super_t *sb = mddev->sb;
1515         mdp_disk_t *descriptor;
1516         mdk_rdev_t *rdev;
1517         struct md_list_head *tmp;
1518         int start_recovery = 0;
1519
1520         MOD_INC_USE_COUNT;
1521
1522         if (sb->level != 1) {
1523                 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1524                 goto out;
1525         }
1526         /*
1527          * copy the already verified devices into our private RAID1
1528          * bookkeeping area. [whatever we allocate in raid1_run()
1529          * should be freed in raid1_stop()]
1530          */
1531
1532         conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1533         mddev->private = conf;
1534         if (!conf) {
1535                 printk(MEM_ERROR, mdidx(mddev));
1536                 goto out;
1537         }
1538         memset(conf, 0, sizeof(*conf));
1539
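        /*
         * Classify every rdev of this array and fill in the matching
         * conf->mirrors[] slot: faulty devices are recorded but left
         * non-operational, active in-sync devices become working
         * mirrors, and anything else is treated as a spare.
         */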
1540         ITERATE_RDEV(mddev,rdev,tmp) {
1541                 if (rdev->faulty) {
1542                         printk(ERRORS, partition_name(rdev->dev));
1543                 } else {
1544                         if (!rdev->sb) {
1545                                 MD_BUG();
1546                                 continue;
1547                         }
1548                 }
1549                 if (rdev->desc_nr == -1) {
1550                         MD_BUG();
1551                         continue;
1552                 }
1553                 descriptor = &sb->disks[rdev->desc_nr];
1554                 disk_idx = descriptor->raid_disk;
1555                 disk = conf->mirrors + disk_idx;
1556
1557                 if (disk_faulty(descriptor)) {
1558                         disk->number = descriptor->number;
1559                         disk->raid_disk = disk_idx;
1560                         disk->dev = rdev->dev;
1561                         disk->sect_limit = MAX_WORK_PER_DISK;
1562                         disk->operational = 0;
1563                         disk->write_only = 0;
1564                         disk->spare = 0;
1565                         disk->used_slot = 1;
1566                         disk->head_position = 0;
1567                         continue;
1568                 }
1569                 if (disk_active(descriptor)) {
1570                         if (!disk_sync(descriptor)) {
1571                                 printk(NOT_IN_SYNC,
1572                                         partition_name(rdev->dev));
1573                                 continue;
1574                         }
1575                         if ((descriptor->number > MD_SB_DISKS) ||
1576                                          (disk_idx > sb->raid_disks)) {
1577
1578                                 printk(INCONSISTENT,
1579                                         partition_name(rdev->dev));
1580                                 continue;
1581                         }
1582                         if (disk->operational) {
1583                                 printk(ALREADY_RUNNING,
1584                                         partition_name(rdev->dev),
1585                                         disk_idx);
1586                                 continue;
1587                         }
1588                         printk(OPERATIONAL, partition_name(rdev->dev),
1589                                         disk_idx);
1590                         disk->number = descriptor->number;
1591                         disk->raid_disk = disk_idx;
1592                         disk->dev = rdev->dev;
1593                         disk->sect_limit = MAX_WORK_PER_DISK;
1594                         disk->operational = 1;
1595                         disk->write_only = 0;
1596                         disk->spare = 0;
1597                         disk->used_slot = 1;
1598                         disk->head_position = 0;
1599                         conf->working_disks++;
1600                 } else {
1601                         /*
1602                          * Must be a spare disk ..
1603                          */
1604                         printk(SPARE, partition_name(rdev->dev));
1605                         disk->number = descriptor->number;
1606                         disk->raid_disk = disk_idx;
1607                         disk->dev = rdev->dev;
1608                         disk->sect_limit = MAX_WORK_PER_DISK;
1609                         disk->operational = 0;
1610                         disk->write_only = 0;
1611                         disk->spare = 1;
1612                         disk->used_slot = 1;
1613                         disk->head_position = 0;
1614                 }
1615         }
1616         conf->raid_disks = sb->raid_disks;
1617         conf->nr_disks = sb->nr_disks;
1618         conf->mddev = mddev;
1619         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1620
1621         conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1622         init_waitqueue_head(&conf->wait_buffer);
1623         init_waitqueue_head(&conf->wait_done);
1624         init_waitqueue_head(&conf->wait_ready);
1625
1626         if (!conf->working_disks) {
1627                 printk(NONE_OPERATIONAL, mdidx(mddev));
1628                 goto out_free_conf;
1629         }
1630
1631
1632         /* pre-allocate some buffer_head structures.
1633          * As a minimum, 1 r1bh and raid_disks buffer_heads
1634          * would probably get us by in tight memory situations,
1635          * but a few more is probably a good idea.
1636          * For now, try NR_RESERVED_BUFS r1bh and
1637          * NR_RESERVED_BUFS*raid_disks bufferheads
1638          * This will allow at least NR_RESERVED_BUFS concurrent
1639          * reads or writes even if kmalloc starts failing
1640          */
1641         if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1642             raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1643                               < NR_RESERVED_BUFS*conf->raid_disks) {
1644                 printk(MEM_ERROR, mdidx(mddev));
1645                 goto out_free_conf;
1646         }
1647
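        /*
         * Also claim mirror slots for disks the superblock marks as
         * faulty but for which no rdev was found above: the slot is
         * flagged as used, with no device attached and not operational.
         */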
1648         for (i = 0; i < MD_SB_DISKS; i++) {
1649                 
1650                 descriptor = sb->disks+i;
1651                 disk_idx = descriptor->raid_disk;
1652                 disk = conf->mirrors + disk_idx;
1653
1654                 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1655                                 !disk->used_slot) {
1656
1657                         disk->number = descriptor->number;
1658                         disk->raid_disk = disk_idx;
1659                         disk->dev = MKDEV(0,0);
1660
1661                         disk->operational = 0;
1662                         disk->write_only = 0;
1663                         disk->spare = 0;
1664                         disk->used_slot = 1;
1665                         disk->head_position = 0;
1666                 }
1667         }
1668
1669         /*
1670          * find the first working one and use it as a starting point
1671          * for read balancing.
1672          */
1673         for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1674                 /* nothing */;
1675         conf->last_used = j;
1676
1677
1678         if (conf->working_disks != sb->raid_disks) {
1679                 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1680                 start_recovery = 1;
1681         }
1682
1683         {
1684                 const char * name = "raid1d";
1685
1686                 conf->thread = md_register_thread(raid1d, conf, name);
1687                 if (!conf->thread) {
1688                         printk(THREAD_ERROR, mdidx(mddev));
1689                         goto out_free_conf;
1690                 }
1691         }
1692
1693         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1694                 const char * name = "raid1syncd";
1695
1696                 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1697                 if (!conf->resync_thread) {
1698                         printk(THREAD_ERROR, mdidx(mddev));
1699                         goto out_free_conf;
1700                 }
1701
1702                 printk(START_RESYNC, mdidx(mddev));
1703                 conf->resync_mirrors = 1;
1704                 md_wakeup_thread(conf->resync_thread);
1705         }
1706
1707         /*
1708          * Regenerate the "device is in sync with the raid set" bit for
1709          * each device.
1710          */
1711         for (i = 0; i < MD_SB_DISKS; i++) {
1712                 mark_disk_nonsync(sb->disks+i);
1713                 for (j = 0; j < sb->raid_disks; j++) {
1714                         if (!conf->mirrors[j].operational)
1715                                 continue;
1716                         if (sb->disks[i].number == conf->mirrors[j].number)
1717                                 mark_disk_sync(sb->disks+i);
1718                 }
1719         }
1720         sb->active_disks = conf->working_disks;
1721
1722         if (start_recovery)
1723                 md_recover_arrays();
1724
1725
1726         printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1727         /*
1728          * Ok, everything is just fine now
1729          */
1730         return 0;
1731
1732 out_free_conf:
1733         raid1_shrink_r1bh(conf);
1734         raid1_shrink_bh(conf);
1735         raid1_shrink_buffers(conf);
1736         kfree(conf);
1737         mddev->private = NULL;
1738 out:
1739         MOD_DEC_USE_COUNT;
1740         return -EIO;
1741 }
1742
1743 #undef INVALID_LEVEL
1744 #undef NO_SB
1745 #undef ERRORS
1746 #undef NOT_IN_SYNC
1747 #undef INCONSISTENT
1748 #undef ALREADY_RUNNING
1749 #undef OPERATIONAL
1750 #undef SPARE
1751 #undef NONE_OPERATIONAL
1752 #undef ARRAY_IS_ACTIVE
1753
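/*
 * resync_mirrors is used as a small state flag: 1 means a mirror
 * resync is wanted or in progress; setting it to 2 below records that
 * the resync was interrupted so it can be restarted later instead of
 * being treated as complete.
 */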
1754 static int raid1_stop_resync (mddev_t *mddev)
1755 {
1756         raid1_conf_t *conf = mddev_to_conf(mddev);
1757
1758         if (conf->resync_thread) {
1759                 if (conf->resync_mirrors) {
1760                         conf->resync_mirrors = 2;
1761                         md_interrupt_thread(conf->resync_thread);
1762
1763                         printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1764                         return 1;
1765                 }
1766                 return 0;
1767         }
1768         return 0;
1769 }
1770
1771 static int raid1_restart_resync (mddev_t *mddev)
1772 {
1773         raid1_conf_t *conf = mddev_to_conf(mddev);
1774
1775         if (conf->resync_mirrors) {
1776                 if (!conf->resync_thread) {
1777                         MD_BUG();
1778                         return 0;
1779                 }
1780                 conf->resync_mirrors = 1;
1781                 md_wakeup_thread(conf->resync_thread);
1782                 return 1;
1783         }
1784         return 0;
1785 }
1786
1787 static int raid1_stop (mddev_t *mddev)
1788 {
1789         raid1_conf_t *conf = mddev_to_conf(mddev);
1790
1791         md_unregister_thread(conf->thread);
1792         if (conf->resync_thread)
1793                 md_unregister_thread(conf->resync_thread);
1794         raid1_shrink_r1bh(conf);
1795         raid1_shrink_bh(conf);
1796         raid1_shrink_buffers(conf);
1797         kfree(conf);
1798         mddev->private = NULL;
1799         MOD_DEC_USE_COUNT;
1800         return 0;
1801 }
1802
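/*
 * The method table that gets registered with the md core and makes the
 * functions above available as the "raid1" personality.
 */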
1803 static mdk_personality_t raid1_personality=
1804 {
1805         name:           "raid1",
1806         make_request:   raid1_make_request,
1807         run:            raid1_run,
1808         stop:           raid1_stop,
1809         status:         raid1_status,
1810         error_handler:  raid1_error,
1811         diskop:         raid1_diskop,
1812         stop_resync:    raid1_stop_resync,
1813         restart_resync: raid1_restart_resync,
1814         sync_request:   raid1_sync_request
1815 };
1816
1817 static int md__init raid1_init (void)
1818 {
1819         return register_md_personality (RAID1, &raid1_personality);
1820 }
1821
1822 static void raid1_exit (void)
1823 {
1824         unregister_md_personality (RAID1);
1825 }
1826
1827 module_init(raid1_init);
1828 module_exit(raid1_exit);
1829