1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6
7 /*
8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
18
19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
22
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24  * - RMK
25  */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/completion.h>
49
50 #include <asm/uaccess.h>
51 #include <asm/io.h>
52 #include <asm/bitops.h>
53 #include <asm/mmu_context.h>
54
55 #define NR_SIZES 7
56 static char buffersize_index[65] =
57 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
58   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
60  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
61   6};
62
63 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
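/*
 * Worked example (illustrative): the table above maps (size >> 9) to a
 * buffer-size index, so BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1,
 * BUFSIZE_INDEX(4096) == 3 and BUFSIZE_INDEX(32768) == 6, while any size
 * that is not a power of two in the 512..32k range yields -1.
 */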
64 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
65 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
66 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
67                                              number of unused buffer heads */
68
69 /* Anti-deadlock ordering:
70  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
71  */
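/*
 * Illustrative nesting sketch (not lifted verbatim from any one place in
 * this file): code that needs more than one of these locks must take them
 * in the order above, e.g.
 *
 *	spin_lock(&lru_list_lock);
 *	write_lock(&hash_table_lock);
 *	... manipulate hash chains and lru lists ...
 *	write_unlock(&hash_table_lock);
 *	spin_unlock(&lru_list_lock);
 *
 * as __bforget() and __invalidate_buffers() below do; taking them in the
 * opposite order could deadlock against such a path.
 */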
72
73 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
74
75 /*
76  * Hash table gook..
77  */
78 static unsigned int bh_hash_mask;
79 static unsigned int bh_hash_shift;
80 static struct buffer_head **hash_table;
81 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
82
83 static struct buffer_head *lru_list[NR_LIST];
84 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
85 static int nr_buffers_type[NR_LIST];
86 static unsigned long size_buffers_type[NR_LIST];
87
88 static struct buffer_head * unused_list;
89 static int nr_unused_buffer_heads;
90 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
91 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
92
93 struct bh_free_head {
94         struct buffer_head *list;
95         spinlock_t lock;
96 };
97 static struct bh_free_head free_list[NR_SIZES];
98
99 static int grow_buffers(int size);
100 static void __refile_buffer(struct buffer_head *);
101
102 /* This is used by some architectures to estimate available memory. */
103 atomic_t buffermem_pages = ATOMIC_INIT(0);
104
105 /* Here is the parameter block for the bdflush process. If you add or
106  * remove any of the parameters, make sure to update kernel/sysctl.c
107  * and the documentation at linux/Documentation/sysctl/vm.txt.
108  */
109
110 #define N_PARAM 9
111
112 /* The dummy values in this structure are left in there for compatibility
113  * with old programs that play with the /proc entries.
114  */
115 union bdflush_param {
116         struct {
117                 int nfract;     /* Percentage of buffer cache dirty to 
118                                    activate bdflush */
119                 int dummy1;     /* old "ndirty" */
120                 int dummy2;     /* old "nrefill" */
121                 int dummy3;     /* unused */
122                 int interval;   /* jiffies delay between kupdate flushes */
123                 int age_buffer; /* Time for normal buffer to age before we flush it */
124                 int nfract_sync;/* Percentage of buffer cache dirty to 
125                                    activate bdflush synchronously */
126                 int dummy4;     /* unused */
127                 int dummy5;     /* unused */
128         } b_un;
129         unsigned int data[N_PARAM];
130 } bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};
131
132 /* These are the min and max parameter values that we will allow to be assigned */
133 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
134 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
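/*
 * For illustration only: the nine data[] slots above are what appear in
 * /proc/sys/vm/bdflush (see kernel/sysctl.c), so with HZ == 100 the
 * defaults read back as
 *
 *	# cat /proc/sys/vm/bdflush
 *	30 64 64 256 500 3000 60 0 0
 *
 * and e.g. "echo 40 64 64 256 500 3000 80 0 0 > /proc/sys/vm/bdflush"
 * would raise nfract to 40% and nfract_sync to 80%.
 */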
135
136 inline void unlock_buffer(struct buffer_head *bh)
137 {
138         clear_bit(BH_Wait_IO, &bh->b_state);
139         clear_bit(BH_Lock, &bh->b_state);
140         smp_mb__after_clear_bit();
141         if (waitqueue_active(&bh->b_wait))
142                 wake_up(&bh->b_wait);
143 }
144
145 /*
146  * Rewrote the wait-routines to use the "new" wait-queue functionality,
147  * and got rid of the cli-sti pairs. The wait-queue routines still
148  * need cli-sti, but now it's just a couple of 386 instructions or so.
149  *
150  * Note that the real wait_on_buffer() is an inline function that checks
151  * if 'b_wait' is set before calling this, so that the queues aren't set
152  * up unnecessarily.
153  */
154 void __wait_on_buffer(struct buffer_head * bh)
155 {
156         struct task_struct *tsk = current;
157         DECLARE_WAITQUEUE(wait, tsk);
158
159         get_bh(bh);
160         add_wait_queue(&bh->b_wait, &wait);
161         do {
162                 run_task_queue(&tq_disk);
163                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
164                 if (!buffer_locked(bh))
165                         break;
166                 schedule();
167         } while (buffer_locked(bh));
168         tsk->state = TASK_RUNNING;
169         remove_wait_queue(&bh->b_wait, &wait);
170         put_bh(bh);
171 }
172
173 /*
174  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
175  * unlock the buffer. This is what ll_rw_block uses too.
176  */
177 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
178 {
179         mark_buffer_uptodate(bh, uptodate);
180         unlock_buffer(bh);
181         put_bh(bh);
182 }
183
184 /*
185  * The buffers have been marked clean and locked.  Just submit the dang
186  * things.. 
187  */
188 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
189 {
190         do {
191                 struct buffer_head * bh = *array++;
192                 bh->b_end_io = end_buffer_io_sync;
193                 submit_bh(WRITE, bh);
194         } while (--count);
195 }
196
197 /*
198  * Write some buffers from the head of the dirty queue.
199  *
200  * This must be called with the LRU lock held, and will
201  * return without it!
202  */
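/*
 * Return convention (descriptive note): a non-zero return means a full
 * batch of NRSYNC locked, clean buffers was submitted and the lru_list
 * lock was dropped early, so the caller should re-take the lock and call
 * again; zero means the dirty list was scanned (any partial batch is
 * still submitted before returning).
 */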
203 #define NRSYNC (32)
204 static int write_some_buffers(kdev_t dev)
205 {
206         struct buffer_head *next;
207         struct buffer_head *array[NRSYNC];
208         unsigned int count;
209         int nr;
210
211         next = lru_list[BUF_DIRTY];
212         nr = nr_buffers_type[BUF_DIRTY];
213         count = 0;
214         while (next && --nr >= 0) {
215                 struct buffer_head * bh = next;
216                 next = bh->b_next_free;
217
218                 if (dev && bh->b_dev != dev)
219                         continue;
220                 if (test_and_set_bit(BH_Lock, &bh->b_state))
221                         continue;
222                 if (atomic_set_buffer_clean(bh)) {
223                         __refile_buffer(bh);
224                         get_bh(bh);
225                         array[count++] = bh;
226                         if (count < NRSYNC)
227                                 continue;
228
229                         spin_unlock(&lru_list_lock);
230                         write_locked_buffers(array, count);
231                         return -EAGAIN;
232                 }
233                 unlock_buffer(bh);
234                 __refile_buffer(bh);
235         }
236         spin_unlock(&lru_list_lock);
237
238         if (count)
239                 write_locked_buffers(array, count);
240         return 0;
241 }
242
243 /*
244  * Write out all buffers on the dirty list.
245  */
246 static void write_unlocked_buffers(kdev_t dev)
247 {
248         do {
249                 spin_lock(&lru_list_lock);
250         } while (write_some_buffers(dev));
251         run_task_queue(&tq_disk);
252 }
253
254 /*
255  * Wait for a buffer on the proper list.
256  *
257  * This must be called with the LRU lock held, and
258  * will return with it released.
259  */
260 static int wait_for_buffers(kdev_t dev, int index, int refile)
261 {
262         struct buffer_head * next;
263         int nr;
264
265         next = lru_list[index];
266         nr = nr_buffers_type[index];
267         while (next && --nr >= 0) {
268                 struct buffer_head *bh = next;
269                 next = bh->b_next_free;
270
271                 if (!buffer_locked(bh)) {
272                         if (refile)
273                                 __refile_buffer(bh);
274                         continue;
275                 }
276                 if (dev && bh->b_dev != dev)
277                         continue;
278
279                 get_bh(bh);
280                 spin_unlock(&lru_list_lock);
281                 wait_on_buffer (bh);
282                 put_bh(bh);
283                 return -EAGAIN;
284         }
285         spin_unlock(&lru_list_lock);
286         return 0;
287 }
288
289 static inline void wait_for_some_buffers(kdev_t dev)
290 {
291         spin_lock(&lru_list_lock);
292         wait_for_buffers(dev, BUF_LOCKED, 1);
293 }
294
295 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
296 {
297         do {
298                 spin_lock(&lru_list_lock);
299         } while (wait_for_buffers(dev, index, refile));
300         return 0;
301 }
302
303 /* Call sync_buffers with wait!=0 to ensure that the call does not
304  * return until all buffer writes have completed.  Sync() may return
305  * before the writes have finished; fsync() may not.
306  */
307
308 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
309  * spontaneously dirty themselves without ever brelse being called.
310  * We will ultimately want to put these in a separate list, but for
311  * now we search all of the lists for dirty buffers.
312  */
313 int sync_buffers(kdev_t dev, int wait)
314 {
315         int err = 0;
316
317         /* One pass for no-wait, three for wait:
318          * 0) write out all dirty, unlocked buffers;
319          * 1) wait for all dirty locked buffers;
320          * 2) write out all dirty, unlocked buffers;
321          * 3) wait for completion by waiting for all buffers to unlock.
322          */
323         write_unlocked_buffers(dev);
324         if (wait) {
325                 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
326                 write_unlocked_buffers(dev);
327                 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
328         }
329         return err;
330 }
331
332 int fsync_super(struct super_block *sb)
333 {
334         kdev_t dev = sb->s_dev;
335         sync_buffers(dev, 0);
336
337         lock_kernel();
338         sync_inodes_sb(sb);
339         DQUOT_SYNC(dev);
340         lock_super(sb);
341         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
342                 sb->s_op->write_super(sb);
343         unlock_super(sb);
344         unlock_kernel();
345
346         return sync_buffers(dev, 1);
347 }
348
349 int fsync_no_super(kdev_t dev)
350 {
351         sync_buffers(dev, 0);
352         return sync_buffers(dev, 1);
353 }
354
355 int fsync_dev(kdev_t dev)
356 {
357         sync_buffers(dev, 0);
358
359         lock_kernel();
360         sync_inodes(dev);
361         DQUOT_SYNC(dev);
362         sync_supers(dev);
363         unlock_kernel();
364
365         return sync_buffers(dev, 1);
366 }
367
368 /*
369  * There's no real reason to pretend we should
370  * ever do anything differently
371  */
372 void sync_dev(kdev_t dev)
373 {
374         fsync_dev(dev);
375 }
376
377 asmlinkage long sys_sync(void)
378 {
379         fsync_dev(0);
380         return 0;
381 }
382
383 /*
384  *      filp may be NULL if called via the msync of a vma.
385  */
386  
387 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
388 {
389         struct inode * inode = dentry->d_inode;
390         struct super_block * sb;
391         kdev_t dev;
392         int ret;
393
394         lock_kernel();
395         /* sync the inode to buffers */
396         write_inode_now(inode, 0);
397
398         /* sync the superblock to buffers */
399         sb = inode->i_sb;
400         lock_super(sb);
401         if (sb->s_op && sb->s_op->write_super)
402                 sb->s_op->write_super(sb);
403         unlock_super(sb);
404
405         /* .. finally sync the buffers to disk */
406         dev = inode->i_dev;
407         ret = sync_buffers(dev, 1);
408         unlock_kernel();
409         return ret;
410 }
411
412 asmlinkage long sys_fsync(unsigned int fd)
413 {
414         struct file * file;
415         struct dentry * dentry;
416         struct inode * inode;
417         int err;
418
419         err = -EBADF;
420         file = fget(fd);
421         if (!file)
422                 goto out;
423
424         dentry = file->f_dentry;
425         inode = dentry->d_inode;
426
427         err = -EINVAL;
428         if (!file->f_op || !file->f_op->fsync)
429                 goto out_putf;
430
431         /* We need to protect against concurrent writers.. */
432         down(&inode->i_sem);
433         filemap_fdatasync(inode->i_mapping);
434         err = file->f_op->fsync(file, dentry, 0);
435         filemap_fdatawait(inode->i_mapping);
436         up(&inode->i_sem);
437
438 out_putf:
439         fput(file);
440 out:
441         return err;
442 }
443
444 asmlinkage long sys_fdatasync(unsigned int fd)
445 {
446         struct file * file;
447         struct dentry * dentry;
448         struct inode * inode;
449         int err;
450
451         err = -EBADF;
452         file = fget(fd);
453         if (!file)
454                 goto out;
455
456         dentry = file->f_dentry;
457         inode = dentry->d_inode;
458
459         err = -EINVAL;
460         if (!file->f_op || !file->f_op->fsync)
461                 goto out_putf;
462
463         down(&inode->i_sem);
464         filemap_fdatasync(inode->i_mapping);
465         err = file->f_op->fsync(file, dentry, 1);
466         filemap_fdatawait(inode->i_mapping);
467         up(&inode->i_sem);
468
469 out_putf:
470         fput(file);
471 out:
472         return err;
473 }
474
475 /* After several hours of tedious analysis, the following hash
476  * function won.  Do not mess with it... -DaveM
477  */
478 #define _hashfn(dev,block)      \
479         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
480          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
481           ((block) << (bh_hash_shift - 12))))
482 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
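/*
 * Note: hash(dev, block) evaluates to the hash_table[] slot heading one
 * chain of buffer_heads; lookups walk that chain via b_next under
 * hash_table_lock, as __get_hash_table() below does.
 */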
483
484 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
485 {
486         if ((bh->b_next = *head) != NULL)
487                 bh->b_next->b_pprev = &bh->b_next;
488         *head = bh;
489         bh->b_pprev = head;
490 }
491
492 static __inline__ void __hash_unlink(struct buffer_head *bh)
493 {
494         if (bh->b_pprev) {
495                 if (bh->b_next)
496                         bh->b_next->b_pprev = bh->b_pprev;
497                 *(bh->b_pprev) = bh->b_next;
498                 bh->b_pprev = NULL;
499         }
500 }
501
502 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
503 {
504         struct buffer_head **bhp = &lru_list[blist];
505
506         if(!*bhp) {
507                 *bhp = bh;
508                 bh->b_prev_free = bh;
509         }
510         bh->b_next_free = *bhp;
511         bh->b_prev_free = (*bhp)->b_prev_free;
512         (*bhp)->b_prev_free->b_next_free = bh;
513         (*bhp)->b_prev_free = bh;
514         nr_buffers_type[blist]++;
515         size_buffers_type[blist] += bh->b_size;
516 }
517
518 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
519 {
520         if (bh->b_prev_free || bh->b_next_free) {
521                 bh->b_prev_free->b_next_free = bh->b_next_free;
522                 bh->b_next_free->b_prev_free = bh->b_prev_free;
523                 if (lru_list[blist] == bh)
524                         lru_list[blist] = bh->b_next_free;
525                 if (lru_list[blist] == bh)
526                         lru_list[blist] = NULL;
527                 bh->b_next_free = bh->b_prev_free = NULL;
528                 nr_buffers_type[blist]--;
529                 size_buffers_type[blist] -= bh->b_size;
530         }
531 }
532
533 static void __remove_from_free_list(struct buffer_head * bh, int index)
534 {
535         if(bh->b_next_free == bh)
536                  free_list[index].list = NULL;
537         else {
538                 bh->b_prev_free->b_next_free = bh->b_next_free;
539                 bh->b_next_free->b_prev_free = bh->b_prev_free;
540                 if (free_list[index].list == bh)
541                          free_list[index].list = bh->b_next_free;
542         }
543         bh->b_next_free = bh->b_prev_free = NULL;
544 }
545
546 /* must be called with both the hash_table_lock and the lru_list_lock
547    held */
548 static void __remove_from_queues(struct buffer_head *bh)
549 {
550         __hash_unlink(bh);
551         __remove_from_lru_list(bh, bh->b_list);
552 }
553
554 static void __insert_into_queues(struct buffer_head *bh)
555 {
556         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
557
558         __hash_link(bh, head);
559         __insert_into_lru_list(bh, bh->b_list);
560 }
561
562 /* This function must only run if there are no other
563  * references _anywhere_ to this buffer head.
564  */
565 static void put_last_free(struct buffer_head * bh)
566 {
567         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
568         struct buffer_head **bhp = &head->list;
569
570         bh->b_state = 0;
571
572         spin_lock(&head->lock);
573         bh->b_dev = B_FREE;
574         if(!*bhp) {
575                 *bhp = bh;
576                 bh->b_prev_free = bh;
577         }
578         bh->b_next_free = *bhp;
579         bh->b_prev_free = (*bhp)->b_prev_free;
580         (*bhp)->b_prev_free->b_next_free = bh;
581         (*bhp)->b_prev_free = bh;
582         spin_unlock(&head->lock);
583 }
584
585 /*
586  * Why like this, I hear you say... The reason is race-conditions.
587  * As we don't lock buffers (unless we are reading them, that is),
588  * something might happen to one of them while we sleep (i.e. a read-error
589  * will force it bad). This shouldn't really happen currently, but
590  * the code is ready.
591  */
592 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
593 {
594         struct buffer_head *bh = hash(dev, block);
595
596         for (; bh; bh = bh->b_next)
597                 if (bh->b_blocknr == block      &&
598                     bh->b_size    == size       &&
599                     bh->b_dev     == dev)
600                         break;
601         if (bh)
602                 get_bh(bh);
603
604         return bh;
605 }
606
607 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
608 {
609         struct buffer_head *bh;
610
611         read_lock(&hash_table_lock);
612         bh = __get_hash_table(dev, block, size);
613         read_unlock(&hash_table_lock);
614
615         return bh;
616 }
617
618 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
619 {
620         spin_lock(&lru_list_lock);
621         if (bh->b_inode)
622                 list_del(&bh->b_inode_buffers);
623         bh->b_inode = inode;
624         list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
625         spin_unlock(&lru_list_lock);
626 }
627
628 void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
629 {
630         spin_lock(&lru_list_lock);
631         if (bh->b_inode)
632                 list_del(&bh->b_inode_buffers);
633         bh->b_inode = inode;
634         list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
635         spin_unlock(&lru_list_lock);
636 }
637
638 /* The caller must have the lru_list lock before calling the 
639    remove_inode_queue functions.  */
640 static void __remove_inode_queue(struct buffer_head *bh)
641 {
642         bh->b_inode = NULL;
643         list_del(&bh->b_inode_buffers);
644 }
645
646 static inline void remove_inode_queue(struct buffer_head *bh)
647 {
648         if (bh->b_inode)
649                 __remove_inode_queue(bh);
650 }
651
652 int inode_has_buffers(struct inode *inode)
653 {
654         int ret;
655         
656         spin_lock(&lru_list_lock);
657         ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
658         spin_unlock(&lru_list_lock);
659         
660         return ret;
661 }
662
663 /* If invalidate_buffers() trashes dirty buffers, it means some kind
664    of fs corruption is going on. Trashing dirty data always implies losing
665    information that was supposed to be stored on the physical layer
666    by the user.
667
668    Thus invalidate_buffers in general usage is not allowed to trash dirty
669    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
670
671    NOTE: In the case where the user removes a removable-media disk while
672    there is still dirty data not synced to disk (due to a bug in the device
673    driver or to an error of the user), not destroying the dirty buffers could
674    corrupt the next media inserted as well, so a parameter is necessary to
675    handle this case as safely as possible (trying not to corrupt the newly
676    inserted disk with data belonging to the old, now corrupted one). For the
677    ramdisk, on the other hand, the natural way to release the ramdisk memory
678    is to destroy the dirty buffers.
679
680    These are two special cases. Normal usage implies that the device driver
681    issues a sync on the device (without waiting for I/O completion) and
682    then calls invalidate_buffers in a way that doesn't trash dirty buffers.
683
684    For handling cache coherency with the blkdev pagecache the 'update' case
685    has been introduced. It is needed to re-read from disk any pinned
686    buffer. NOTE: re-reading from disk is destructive, so we can do it only
687    when we assume nobody is changing the buffercache under our I/O and when
688    we think the disk contains more recent information than the buffercache.
689    The update == 1 pass marks the buffers we need to update, the update == 2
690    pass does the actual I/O. */
691 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update)
692 {
693         int i, nlist, slept;
694         struct buffer_head * bh, * bh_next;
695
696  retry:
697         slept = 0;
698         spin_lock(&lru_list_lock);
699         for(nlist = 0; nlist < NR_LIST; nlist++) {
700                 bh = lru_list[nlist];
701                 if (!bh)
702                         continue;
703                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
704                         bh_next = bh->b_next_free;
705
706                         /* Another device? */
707                         if (bh->b_dev != dev)
708                                 continue;
709                         /* Not hashed? */
710                         if (!bh->b_pprev)
711                                 continue;
712                         if (buffer_locked(bh)) {
713                                 get_bh(bh);
714                                 spin_unlock(&lru_list_lock);
715                                 wait_on_buffer(bh);
716                                 slept = 1;
717                                 spin_lock(&lru_list_lock);
718                                 put_bh(bh);
719                         }
720
721                         write_lock(&hash_table_lock);
722                         /* All buffers in the lru lists are mapped */
723                         if (!buffer_mapped(bh))
724                                 BUG();
725                         if (!atomic_read(&bh->b_count)) {
726                                 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
727                                         remove_inode_queue(bh);
728                                         __remove_from_queues(bh);
729                                         put_last_free(bh);
730                                 }
731                         } else if (update) {
732                                 if ((update == 2) ^ buffer_uptodate(bh)  &&
733                                     (update == 2) ^ buffer_req(bh)) {
734                                         write_unlock(&hash_table_lock);
735                                         atomic_inc(&bh->b_count);
736                                         spin_unlock(&lru_list_lock);
737
738                                         if (update == 2) {
739                                                 ll_rw_block(READ, 1, &bh);
740                                                 wait_on_buffer(bh);
741                                         } else {
742                                                 lock_buffer(bh);
743                                                 clear_bit(BH_Uptodate, &bh->b_state);
744                                                 clear_bit(BH_Req, &bh->b_state);
745                                                 unlock_buffer(bh);
746                                         }                                               
747
748                                         atomic_dec(&bh->b_count);
749                                         goto retry;
750                                 }
751                         }
752
753                         write_unlock(&hash_table_lock);
754                         if (slept)
755                                 goto out;
756                 }
757         }
758 out:
759         spin_unlock(&lru_list_lock);
760         if (slept)
761                 goto retry;
762 }
763
764 void set_blocksize(kdev_t dev, int size)
765 {
766         extern int *blksize_size[];
767         int i, nlist, slept;
768         struct buffer_head * bh, * bh_next;
769
770         if (!blksize_size[MAJOR(dev)])
771                 return;
772
773         /* Size must be a power of two, and between 512 and PAGE_SIZE */
774         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
775                 panic("Invalid blocksize passed to set_blocksize");
776
777         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
778                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
779                 return;
780         }
781         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
782                 return;
783         sync_buffers(dev, 2);
784         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
785
786  retry:
787         slept = 0;
788         spin_lock(&lru_list_lock);
789         for(nlist = 0; nlist < NR_LIST; nlist++) {
790                 bh = lru_list[nlist];
791                 if (!bh)
792                         continue;
793                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
794                         bh_next = bh->b_next_free;
795                         if (bh->b_dev != dev || bh->b_size == size)
796                                 continue;
797                         /* Unhashed? */
798                         if (!bh->b_pprev)
799                                 continue;
800                         if (buffer_locked(bh)) {
801                                 get_bh(bh);
802                                 spin_unlock(&lru_list_lock);
803                                 wait_on_buffer(bh);
804                                 slept = 1;
805                                 spin_lock(&lru_list_lock);
806                                 put_bh(bh);
807                         }
808
809                         write_lock(&hash_table_lock);
810                         if (!atomic_read(&bh->b_count)) {
811                                 if (buffer_dirty(bh))
812                                         printk(KERN_WARNING
813                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
814                                                kdevname(dev), bh->b_blocknr, bh->b_size);
815                                 remove_inode_queue(bh);
816                                 __remove_from_queues(bh);
817                                 put_last_free(bh);
818                         } else {
819                                 if (atomic_set_buffer_clean(bh))
820                                         __refile_buffer(bh);
821                                 clear_bit(BH_Uptodate, &bh->b_state);
822                                 printk(KERN_WARNING
823                                        "set_blocksize: "
824                                        "b_count %d, dev %s, block %lu, from %p\n",
825                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
826                                        bh->b_blocknr, __builtin_return_address(0));
827                         }
828                         write_unlock(&hash_table_lock);
829                         if (slept)
830                                 goto out;
831                 }
832         }
833  out:
834         spin_unlock(&lru_list_lock);
835         if (slept)
836                 goto retry;
837 }
838
839 static void free_more_memory(void)
840 {
841         balance_dirty();
842         wakeup_bdflush();
843         current->policy |= SCHED_YIELD;
844         __set_current_state(TASK_RUNNING);
845         schedule();
846 }
847
848 /*
849  * We used to try various strange things. Let's not.
850  * We'll just try to balance dirty buffers, and possibly
851  * launder some pages and do our best to make more memory
852  * available.
853  */
854 static void refill_freelist(int size)
855 {
856         if (!grow_buffers(size))
857                 free_more_memory();
858 }
859
860 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
861 {
862         bh->b_list = BUF_CLEAN;
863         bh->b_end_io = handler;
864         bh->b_private = private;
865 }
866
867 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
868 {
869         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
870         unsigned long flags;
871         struct buffer_head *tmp;
872         struct page *page;
873
874         mark_buffer_uptodate(bh, uptodate);
875
876         /* This is a temporary buffer used for page I/O. */
877         page = bh->b_page;
878
879         if (!uptodate)
880                 SetPageError(page);
881
882         /*
883          * Be _very_ careful from here on. Bad things can happen if
884          * two buffer heads end IO at almost the same time and both
885          * decide that the page is now completely done.
886          *
887          * Async buffer_heads are here only as labels for IO, and get
888          * thrown away once the IO for this page is complete.  IO is
889          * deemed complete once all buffers have been visited
890          * (b_count==0) and are now unlocked. We must make sure that
891          * only the _last_ buffer that decrements its count is the one
892          * that unlocks the page.
893          */
894         spin_lock_irqsave(&page_uptodate_lock, flags);
895         mark_buffer_async(bh, 0);
896         unlock_buffer(bh);
897         tmp = bh->b_this_page;
898         while (tmp != bh) {
899                 if (buffer_async(tmp) && buffer_locked(tmp))
900                         goto still_busy;
901                 tmp = tmp->b_this_page;
902         }
903
904         /* OK, the async IO on this page is complete. */
905         spin_unlock_irqrestore(&page_uptodate_lock, flags);
906
907         /*
908          * if none of the buffers had errors then we can set the
909          * page uptodate:
910          */
911         if (!PageError(page))
912                 SetPageUptodate(page);
913
914         /*
915          * Run the hooks that have to be done when a page I/O has completed.
916          */
917         if (PageTestandClearDecrAfter(page))
918                 atomic_dec(&nr_async_pages);
919
920         UnlockPage(page);
921
922         return;
923
924 still_busy:
925         spin_unlock_irqrestore(&page_uptodate_lock, flags);
926         return;
927 }
928
929 inline void set_buffer_async_io(struct buffer_head *bh) {
930     bh->b_end_io = end_buffer_io_async ;
931     mark_buffer_async(bh, 1);
932 }
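/*
 * Sketch of how the async completion path is typically armed for per-page
 * I/O (simplified; the real users are the block_* page functions further
 * down in this file):
 *
 *	struct buffer_head *bh = page->buffers;
 *	do {
 *		lock_buffer(bh);
 *		set_buffer_async_io(bh);
 *		bh = bh->b_this_page;
 *	} while (bh != page->buffers);
 *
 * each buffer is then passed to submit_bh(), and end_buffer_io_async()
 * unlocks the page once the last async buffer on it completes.
 */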
933
934 /*
935  * Synchronise all the inode's dirty buffers to the disk.
936  *
937  * We have conflicting pressures: we want to make sure that all
938  * initially dirty buffers get waited on, but that any subsequently
939  * dirtied buffers don't.  After all, we don't want fsync to last
940  * forever if somebody is actively writing to the file.
941  *
942  * Do this in two main stages: first we copy dirty buffers to a
943  * temporary inode list, queueing the writes as we go.  Then we clean
944  * up, waiting for those writes to complete.
945  * 
946  * During this second stage, any subsequent updates to the file may end
947  * up refiling the buffer on the original inode's dirty list again, so
948  * there is a chance we will end up with a buffer queued for write but
949  * not yet completed on that list.  So, as a final cleanup we go through
950  * the osync code to catch these locked, dirty buffers without requeuing
951  * any newly dirty buffers for write.
952  */
953
954 int fsync_inode_buffers(struct inode *inode)
955 {
956         struct buffer_head *bh;
957         struct inode tmp;
958         int err = 0, err2;
959         
960         INIT_LIST_HEAD(&tmp.i_dirty_buffers);
961         
962         spin_lock(&lru_list_lock);
963
964         while (!list_empty(&inode->i_dirty_buffers)) {
965                 bh = BH_ENTRY(inode->i_dirty_buffers.next);
966                 list_del(&bh->b_inode_buffers);
967                 if (!buffer_dirty(bh) && !buffer_locked(bh))
968                         bh->b_inode = NULL;
969                 else {
970                         bh->b_inode = &tmp;
971                         list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
972                         if (buffer_dirty(bh)) {
973                                 get_bh(bh);
974                                 spin_unlock(&lru_list_lock);
975                                 ll_rw_block(WRITE, 1, &bh);
976                                 brelse(bh);
977                                 spin_lock(&lru_list_lock);
978                         }
979                 }
980         }
981
982         while (!list_empty(&tmp.i_dirty_buffers)) {
983                 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
984                 remove_inode_queue(bh);
985                 get_bh(bh);
986                 spin_unlock(&lru_list_lock);
987                 wait_on_buffer(bh);
988                 if (!buffer_uptodate(bh))
989                         err = -EIO;
990                 brelse(bh);
991                 spin_lock(&lru_list_lock);
992         }
993         
994         spin_unlock(&lru_list_lock);
995         err2 = osync_inode_buffers(inode);
996
997         if (err)
998                 return err;
999         else
1000                 return err2;
1001 }
1002
1003 int fsync_inode_data_buffers(struct inode *inode)
1004 {
1005         struct buffer_head *bh;
1006         struct inode tmp;
1007         int err = 0, err2;
1008         
1009         INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
1010         
1011         spin_lock(&lru_list_lock);
1012
1013         while (!list_empty(&inode->i_dirty_data_buffers)) {
1014                 bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
1015                 list_del(&bh->b_inode_buffers);
1016                 if (!buffer_dirty(bh) && !buffer_locked(bh))
1017                         bh->b_inode = NULL;
1018                 else {
1019                         bh->b_inode = &tmp;
1020                         list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
1021                         if (buffer_dirty(bh)) {
1022                                 get_bh(bh);
1023                                 spin_unlock(&lru_list_lock);
1024                                 ll_rw_block(WRITE, 1, &bh);
1025                                 brelse(bh);
1026                                 spin_lock(&lru_list_lock);
1027                         }
1028                 }
1029         }
1030
1031         while (!list_empty(&tmp.i_dirty_data_buffers)) {
1032                 bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
1033                 remove_inode_queue(bh);
1034                 get_bh(bh);
1035                 spin_unlock(&lru_list_lock);
1036                 wait_on_buffer(bh);
1037                 if (!buffer_uptodate(bh))
1038                         err = -EIO;
1039                 brelse(bh);
1040                 spin_lock(&lru_list_lock);
1041         }
1042         
1043         spin_unlock(&lru_list_lock);
1044         err2 = osync_inode_data_buffers(inode);
1045
1046         if (err)
1047                 return err;
1048         else
1049                 return err2;
1050 }
1051
1052 /*
1053  * osync is designed to support O_SYNC io.  It waits synchronously for
1054  * all already-submitted IO to complete, but does not queue any new
1055  * writes to the disk.
1056  *
1057  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
1058  * you dirty the buffers, and then use osync_inode_buffers to wait for
1059  * completion.  Any other dirty buffers which are not yet queued for
1060  * write will not be flushed to disk by the osync.
1061  */
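/*
 * Minimal O_SYNC-style sketch using these primitives, as described above
 * (illustrative only; "bh" is assumed to be on inode->i_dirty_buffers):
 *
 *	mark_buffer_dirty(bh);
 *	ll_rw_block(WRITE, 1, &bh);
 *	...
 *	err = osync_inode_buffers(inode);
 */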
1062
1063 int osync_inode_buffers(struct inode *inode)
1064 {
1065         struct buffer_head *bh;
1066         struct list_head *list;
1067         int err = 0;
1068
1069         spin_lock(&lru_list_lock);
1070         
1071  repeat:
1072         
1073         for (list = inode->i_dirty_buffers.prev; 
1074              bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
1075              list = bh->b_inode_buffers.prev) {
1076                 if (buffer_locked(bh)) {
1077                         get_bh(bh);
1078                         spin_unlock(&lru_list_lock);
1079                         wait_on_buffer(bh);
1080                         if (!buffer_uptodate(bh))
1081                                 err = -EIO;
1082                         brelse(bh);
1083                         spin_lock(&lru_list_lock);
1084                         goto repeat;
1085                 }
1086         }
1087
1088         spin_unlock(&lru_list_lock);
1089         return err;
1090 }
1091
1092 int osync_inode_data_buffers(struct inode *inode)
1093 {
1094         struct buffer_head *bh;
1095         struct list_head *list;
1096         int err = 0;
1097
1098         spin_lock(&lru_list_lock);
1099         
1100  repeat:
1101
1102         for (list = inode->i_dirty_data_buffers.prev; 
1103              bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
1104              list = bh->b_inode_buffers.prev) {
1105                 if (buffer_locked(bh)) {
1106                         get_bh(bh);
1107                         spin_unlock(&lru_list_lock);
1108                         wait_on_buffer(bh);
1109                         if (!buffer_uptodate(bh))
1110                                 err = -EIO;
1111                         brelse(bh);
1112                         spin_lock(&lru_list_lock);
1113                         goto repeat;
1114                 }
1115         }
1116
1117         spin_unlock(&lru_list_lock);
1118         return err;
1119 }
1120
1121
1122 /*
1123  * Invalidate any and all dirty buffers on a given inode.  We are
1124  * probably unmounting the fs, but that doesn't mean we have already
1125  * done a sync().  Just drop the buffers from the inode list.
1126  */
1127 void invalidate_inode_buffers(struct inode *inode)
1128 {
1129         struct list_head * entry;
1130         
1131         spin_lock(&lru_list_lock);
1132         while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
1133                 remove_inode_queue(BH_ENTRY(entry));
1134         while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
1135                 remove_inode_queue(BH_ENTRY(entry));
1136         spin_unlock(&lru_list_lock);
1137 }
1138
1139
1140 /*
1141  * Ok, this is getblk, and it isn't very clear, again to hinder
1142  * race-conditions. Most of the code is seldom used, (ie repeating),
1143  * so it should be much more efficient than it looks.
1144  *
1145  * The algorithm is changed: hopefully better, and an elusive bug removed.
1146  *
1147  * 14.02.92: changed it to sync dirty buffers a bit: better performance
1148  * when the filesystem starts to get full of dirty blocks (I hope).
1149  */
1150 struct buffer_head * getblk(kdev_t dev, int block, int size)
1151 {
1152         struct buffer_head * bh;
1153         int isize;
1154
1155 repeat:
1156         spin_lock(&lru_list_lock);
1157         write_lock(&hash_table_lock);
1158         bh = __get_hash_table(dev, block, size);
1159         if (bh)
1160                 goto out;
1161
1162         isize = BUFSIZE_INDEX(size);
1163         spin_lock(&free_list[isize].lock);
1164         bh = free_list[isize].list;
1165         if (bh) {
1166                 __remove_from_free_list(bh, isize);
1167                 atomic_set(&bh->b_count, 1);
1168         }
1169         spin_unlock(&free_list[isize].lock);
1170
1171         /*
1172          * OK, FINALLY we know that this buffer is the only one of
1173          * its kind, we hold a reference (b_count>0), it is unlocked,
1174          * and it is clean.
1175          */
1176         if (bh) {
1177                 init_buffer(bh, NULL, NULL);
1178                 bh->b_dev = dev;
1179                 bh->b_blocknr = block;
1180                 bh->b_state = 1 << BH_Mapped;
1181
1182                 /* Insert the buffer into the regular lists */
1183                 __insert_into_queues(bh);
1184         out:
1185                 write_unlock(&hash_table_lock);
1186                 spin_unlock(&lru_list_lock);
1187                 touch_buffer(bh);
1188                 return bh;
1189         }
1190
1191         /*
1192          * If we block while refilling the free list, somebody may
1193          * create the buffer first ... search the hashes again.
1194          */
1195         write_unlock(&hash_table_lock);
1196         spin_unlock(&lru_list_lock);
1197         refill_freelist(size);
1198         /* FIXME: getblk should fail if there's not enough memory */
1199         goto repeat;
1200 }
1201
1202 /* -1 -> no need to flush
1203     0 -> async flush
1204     1 -> sync flush (wait for I/O completion) */
1205 static int balance_dirty_state(void)
1206 {
1207         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1208
1209         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1210         tot = nr_free_buffer_pages();
1211
1212         dirty *= 100;
1213         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1214         hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1215
1216         /* First, check for the "real" dirty limit. */
1217         if (dirty > soft_dirty_limit) {
1218                 if (dirty > hard_dirty_limit)
1219                         return 1;
1220                 return 0;
1221         }
1222
1223         return -1;
1224 }
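/*
 * Worked example (numbers invented for illustration): with the default
 * nfract = 30 and nfract_sync = 60, a machine where nr_free_buffer_pages()
 * returns 10000 and 4000 pages worth of buffers are dirty gives
 * dirty*100 == 400000 against a soft limit of 300000 and a hard limit of
 * 600000, so balance_dirty_state() returns 0: start async write-out, but
 * don't block the caller.
 */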
1225
1226 /*
1227  * if a new dirty buffer is created we need to balance bdflush.
1228  *
1229  * in the future we might want to make bdflush aware of different
1230  * pressures on different devices - thus the (currently unused)
1231  * 'dev' parameter.
1232  */
1233 void balance_dirty(void)
1234 {
1235         int state = balance_dirty_state();
1236
1237         if (state < 0)
1238                 return;
1239
1240         /* If we're getting into imbalance, start write-out */
1241         spin_lock(&lru_list_lock);
1242         write_some_buffers(NODEV);
1243
1244         /*
1245          * And if we're _really_ out of balance, wait for
1246          * some of the dirty/locked buffers ourselves and
1247          * start bdflush.
1248          * This will throttle heavy writers.
1249          */
1250         if (state > 0) {
1251                 wait_for_some_buffers(NODEV);
1252                 wakeup_bdflush();
1253         }
1254 }
1255
1256 inline void __mark_dirty(struct buffer_head *bh)
1257 {
1258         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1259         refile_buffer(bh);
1260 }
1261
1262 /* atomic version, the user must call balance_dirty() by hand
1263    as soon as it becomes possible to block */
1264 void __mark_buffer_dirty(struct buffer_head *bh)
1265 {
1266         if (!atomic_set_buffer_dirty(bh))
1267                 __mark_dirty(bh);
1268 }
1269
1270 void mark_buffer_dirty(struct buffer_head *bh)
1271 {
1272         if (!atomic_set_buffer_dirty(bh)) {
1273                 __mark_dirty(bh);
1274                 balance_dirty();
1275         }
1276 }
1277
1278 /*
1279  * A buffer may need to be moved from one buffer list to another
1280  * (e.g. in case it is not shared any more). Handle this.
1281  */
1282 static void __refile_buffer(struct buffer_head *bh)
1283 {
1284         int dispose = BUF_CLEAN;
1285         if (buffer_locked(bh))
1286                 dispose = BUF_LOCKED;
1287         if (buffer_dirty(bh))
1288                 dispose = BUF_DIRTY;
1289         if (dispose != bh->b_list) {
1290                 __remove_from_lru_list(bh, bh->b_list);
1291                 bh->b_list = dispose;
1292                 if (dispose == BUF_CLEAN)
1293                         remove_inode_queue(bh);
1294                 __insert_into_lru_list(bh, dispose);
1295         }
1296 }
1297
1298 void refile_buffer(struct buffer_head *bh)
1299 {
1300         spin_lock(&lru_list_lock);
1301         __refile_buffer(bh);
1302         spin_unlock(&lru_list_lock);
1303 }
1304
1305 /*
1306  * Release a buffer head
1307  */
1308 void __brelse(struct buffer_head * buf)
1309 {
1310         if (atomic_read(&buf->b_count)) {
1311                 put_bh(buf);
1312                 return;
1313         }
1314         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1315 }
1316
1317 /*
1318  * bforget() is like brelse(), except it puts the buffer on the
1319  * free list if it can.. We can NOT free the buffer if:
1320  *  - there are other users of it
1321  *  - it is locked and thus can have active IO
1322  */
1323 void __bforget(struct buffer_head * buf)
1324 {
1325         /* grab the lru lock here to block bdflush. */
1326         spin_lock(&lru_list_lock);
1327         write_lock(&hash_table_lock);
1328         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1329                 goto in_use;
1330         __hash_unlink(buf);
1331         write_unlock(&hash_table_lock);
1332         remove_inode_queue(buf);
1333         __remove_from_lru_list(buf, buf->b_list);
1334         spin_unlock(&lru_list_lock);
1335         put_last_free(buf);
1336         return;
1337
1338  in_use:
1339         write_unlock(&hash_table_lock);
1340         spin_unlock(&lru_list_lock);
1341 }
1342
1343 /*
1344  * bread() reads a specified block and returns the buffer that contains
1345  * it. It returns NULL if the block was unreadable.
1346  */
1347 struct buffer_head * bread(kdev_t dev, int block, int size)
1348 {
1349         struct buffer_head * bh;
1350
1351         bh = getblk(dev, block, size);
1352         if (buffer_uptodate(bh))
1353                 return bh;
1354         ll_rw_block(READ, 1, &bh);
1355         wait_on_buffer(bh);
1356         if (buffer_uptodate(bh))
1357                 return bh;
1358         brelse(bh);
1359         return NULL;
1360 }
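/*
 * Typical caller-side sketch ("blocknr" and "buf" are made up for the
 * example; the size passed should match the device's current blocksize):
 *
 *	struct buffer_head *bh;
 *
 *	bh = bread(dev, blocknr, BLOCK_SIZE);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(buf, bh->b_data, bh->b_size);
 *	brelse(bh);
 */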
1361
1362 /*
1363  * Note: the caller should wake up the buffer_wait list if needed.
1364  */
1365 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1366 {
1367         if (bh->b_inode)
1368                 BUG();
1369         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1370                 kmem_cache_free(bh_cachep, bh);
1371         } else {
1372                 bh->b_blocknr = -1;
1373                 bh->b_this_page = NULL;
1374
1375                 nr_unused_buffer_heads++;
1376                 bh->b_next_free = unused_list;
1377                 unused_list = bh;
1378         }
1379 }
1380
1381 /*
1382  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1383  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1384  * buffer heads is now handled in create_buffers().
1385  */ 
1386 static struct buffer_head * get_unused_buffer_head(int async)
1387 {
1388         struct buffer_head * bh;
1389
1390         spin_lock(&unused_list_lock);
1391         if (nr_unused_buffer_heads > NR_RESERVED) {
1392                 bh = unused_list;
1393                 unused_list = bh->b_next_free;
1394                 nr_unused_buffer_heads--;
1395                 spin_unlock(&unused_list_lock);
1396                 return bh;
1397         }
1398         spin_unlock(&unused_list_lock);
1399
1400         /* This is critical.  We can't call out to the FS
1401          * to get more buffer heads, because the FS may need
1402          * more buffer-heads itself.  Thus SLAB_NOFS.
1403          */
1404         if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1405                 bh->b_blocknr = -1;
1406                 bh->b_this_page = NULL;
1407                 return bh;
1408         }
1409
1410         /*
1411          * If we need an async buffer, use the reserved buffer heads.
1412          */
1413         if (async) {
1414                 spin_lock(&unused_list_lock);
1415                 if (unused_list) {
1416                         bh = unused_list;
1417                         unused_list = bh->b_next_free;
1418                         nr_unused_buffer_heads--;
1419                         spin_unlock(&unused_list_lock);
1420                         return bh;
1421                 }
1422                 spin_unlock(&unused_list_lock);
1423         }
1424 #if 0
1425         /*
1426          * (Pending further analysis ...)
1427          * Ordinary (non-async) requests can use a different memory priority
1428          * to free up pages. Any swapping thus generated will use async
1429          * buffer heads.
1430          */
1431         if(!async &&
1432            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1433                 memset(bh, 0, sizeof(*bh));
1434                 init_waitqueue_head(&bh->b_wait);
1435                 return bh;
1436         }
1437 #endif
1438
1439         return NULL;
1440 }
1441
1442 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1443 {
1444         bh->b_page = page;
1445         if (offset >= PAGE_SIZE)
1446                 BUG();
1447         if (PageHighMem(page))
1448                 /*
1449                  * This catches illegal uses and preserves the offset:
1450                  */
1451                 bh->b_data = (char *)(0 + offset);
1452         else
1453                 bh->b_data = page_address(page) + offset;
1454 }
1455
1456 /*
1457  * Create the appropriate buffers when given a page for data area and
1458  * the size of each buffer.. Use the bh->b_this_page linked list to
1459  * follow the buffers created.  Return NULL if unable to create more
1460  * buffers.
1461  * The async flag is used to differentiate async IO (paging, swapping)
1462  * from ordinary buffer allocations, and only async requests are allowed
1463  * to sleep waiting for buffer heads. 
1464  */
1465 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1466 {
1467         struct buffer_head *bh, *head;
1468         long offset;
1469
1470 try_again:
1471         head = NULL;
1472         offset = PAGE_SIZE;
1473         while ((offset -= size) >= 0) {
1474                 bh = get_unused_buffer_head(async);
1475                 if (!bh)
1476                         goto no_grow;
1477
1478                 bh->b_dev = B_FREE;  /* Flag as unused */
1479                 bh->b_this_page = head;
1480                 head = bh;
1481
1482                 bh->b_state = 0;
1483                 bh->b_next_free = NULL;
1484                 bh->b_pprev = NULL;
1485                 atomic_set(&bh->b_count, 0);
1486                 bh->b_size = size;
1487
1488                 set_bh_page(bh, page, offset);
1489
1490                 bh->b_list = BUF_CLEAN;
1491                 bh->b_end_io = NULL;
1492         }
1493         return head;
1494 /*
1495  * In case anything failed, we just free everything we got.
1496  */
1497 no_grow:
1498         if (head) {
1499                 spin_lock(&unused_list_lock);
1500                 do {
1501                         bh = head;
1502                         head = head->b_this_page;
1503                         __put_unused_buffer_head(bh);
1504                 } while (head);
1505                 spin_unlock(&unused_list_lock);
1506
1507                 /* Wake up any waiters ... */
1508                 wake_up(&buffer_wait);
1509         }
1510
1511         /*
1512          * Return failure for non-async IO requests.  Async IO requests
1513          * are not allowed to fail, so we have to wait until buffer heads
1514          * become available.  But we don't want tasks sleeping with 
1515          * partially complete buffers, so all were released above.
1516          */
1517         if (!async)
1518                 return NULL;
1519
1520         /* We're _really_ low on memory. Now we just
1521          * wait for old buffer heads to become free due to
1522          * finishing IO.  Since this is an async request and
1523          * the reserve list is empty, we're sure there are 
1524          * async buffer heads in use.
1525          */
1526         run_task_queue(&tq_disk);
1527
1528         free_more_memory();
1529         goto try_again;
1530 }
1531
1532 static void unmap_buffer(struct buffer_head * bh)
1533 {
1534         if (buffer_mapped(bh)) {
1535                 mark_buffer_clean(bh);
1536                 lock_buffer(bh);
1537                 clear_bit(BH_Uptodate, &bh->b_state);
1538                 clear_bit(BH_Mapped, &bh->b_state);
1539                 clear_bit(BH_Req, &bh->b_state);
1540                 clear_bit(BH_New, &bh->b_state);
1541                 unlock_buffer(bh);
1542         }
1543 }
1544
1545 /*
1546  * We don't have to release all buffers here, but
1547  * we have to be sure that no dirty buffer is left
1548  * and no IO is going on (no buffer is locked), because
1549  * we have truncated the file and are going to free the
1550  * blocks on-disk..
1551  */
1552 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1553 {
1554         struct buffer_head *head, *bh, *next;
1555         unsigned int curr_off = 0;
1556
1557         if (!PageLocked(page))
1558                 BUG();
1559         if (!page->buffers)
1560                 return 1;
1561
1562         head = page->buffers;
1563         bh = head;
1564         do {
1565                 unsigned int next_off = curr_off + bh->b_size;
1566                 next = bh->b_this_page;
1567
1568                 /*
1569                  * is this block fully flushed?
1570                  */
1571                 if (offset <= curr_off)
1572                         unmap_buffer(bh);
1573                 curr_off = next_off;
1574                 bh = next;
1575         } while (bh != head);
1576
1577         /*
1578          * subtle. We release buffer-heads only if this is
1579          * the 'final' flushpage. We have invalidated the get_block
1580          * cached value unconditionally, so real IO is not
1581          * possible anymore.
1582          *
1583          * If the free doesn't work out, the buffers can be
1584          * left around - they just turn into anonymous buffers
1585          * instead.
1586          */
1587         if (!offset) {
1588                 if (!try_to_free_buffers(page, 0)) {
1589                         if (drop_pagecache)
1590                                 atomic_inc(&buffermem_pages);
1591                         return 0;
1592                 }
1593         }
1594
1595         return 1;
1596 }
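
/*
 * Worked example (illustrative only, not from the original source):
 * take a 4096-byte page carved into four 1024-byte buffers and a
 * truncate that leaves offset == 1536 within this page.  The loop
 * above visits curr_off = 0, 1024, 2048, 3072; only the buffers at
 * 2048 and 3072 satisfy "offset <= curr_off" and get unmapped, since
 * the first two still cover data that survives the truncate.  With
 * offset == 0 the whole page lies beyond the new end of file, so we
 * additionally try to strip the buffer heads via try_to_free_buffers().
 */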
1597
1598 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1599 {
1600         struct buffer_head *bh, *head, *tail;
1601
1602         /* FIXME: create_buffers should fail if there's not enough memory */
1603         head = create_buffers(page, blocksize, 1);
1604         if (page->buffers)
1605                 BUG();
1606
1607         bh = head;
1608         do {
1609                 bh->b_dev = dev;
1610                 bh->b_blocknr = 0;
1611                 bh->b_end_io = NULL;
1612                 tail = bh;
1613                 bh = bh->b_this_page;
1614         } while (bh);
1615         tail->b_this_page = head;
1616         page->buffers = head;
1617         page_cache_get(page);
1618 }
1619
1620 /*
1621  * We are taking a block for data and we don't want any output from any
1622  * buffer-cache aliases from the moment this function returns until
1623  * something explicitly marks the buffer dirty (hopefully that will not
1624  * happen until we free that block ;-).
1625  * We don't even need to mark it not-uptodate - nobody can expect
1626  * anything from a newly allocated buffer anyway. We used to use
1627  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1628  * don't want to mark the alias unmapped, for example - it would confuse
1629  * anyone who might pick it up with bread() afterwards...
1630  */
1631
1632 static void unmap_underlying_metadata(struct buffer_head * bh)
1633 {
1634         struct buffer_head *old_bh;
1635
1636         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1637         if (old_bh) {
1638                 mark_buffer_clean(old_bh);
1639                 wait_on_buffer(old_bh);
1640                 clear_bit(BH_Req, &old_bh->b_state);
1641                 /* Here we could run brelse or bforget. We use
1642                    bforget because it will try to put the buffer
1643                    in the freelist. */
1644                 __bforget(old_bh);
1645         }
1646 }
1647
1648 /*
1649  * NOTE! All mapped/uptodate combinations are valid:
1650  *
1651  *      Mapped  Uptodate        Meaning
1652  *
1653  *      No      No              "unknown" - must do get_block()
1654  *      No      Yes             "hole" - zero-filled
1655  *      Yes     No              "allocated" - allocated on disk, not read in
1656  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1657  *
1658  * "Dirty" is valid only with the last case (mapped+uptodate).
1659  */
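
/*
 * A minimal sketch (not part of the original file) of how a caller can
 * act on the table above, using the real buffer_mapped() and
 * buffer_uptodate() predicates:
 *
 *	if (!buffer_mapped(bh)) {
 *		if (buffer_uptodate(bh))
 *			... "hole": treat the contents as zeroes ...
 *		else
 *			... "unknown": must call get_block() first ...
 *	} else {
 *		if (buffer_uptodate(bh))
 *			... "valid": may be used (and dirtied) as is ...
 *		else
 *			... "allocated": must be read in from disk ...
 *	}
 */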
1660
1661 /*
1662  * block_write_full_page() is SMP-safe - currently it's still
1663  * being called with the kernel lock held, but the code is ready.
1664  */
1665 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1666 {
1667         int err, i;
1668         unsigned long block;
1669         struct buffer_head *bh, *head;
1670
1671         if (!PageLocked(page))
1672                 BUG();
1673
1674         if (!page->buffers)
1675                 create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1676         head = page->buffers;
1677
1678         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1679
1680         bh = head;
1681         i = 0;
1682
1683         /* Stage 1: make sure we have all the buffers mapped! */
1684         do {
1685                 /*
1686                  * If the buffer isn't up-to-date, we can't be sure
1687                  * that the buffer has been initialized with the proper
1688                  * block number information etc..
1689                  *
1690                  * Leave it to the low-level FS to make all those
1691                  * decisions (block #0 may actually be a valid block)
1692                  */
1693                 if (!buffer_mapped(bh)) {
1694                         err = get_block(inode, block, bh, 1);
1695                         if (err)
1696                                 goto out;
1697                         if (buffer_new(bh))
1698                                 unmap_underlying_metadata(bh);
1699                 }
1700                 bh = bh->b_this_page;
1701                 block++;
1702         } while (bh != head);
1703
1704         /* Stage 2: lock the buffers, mark them clean */
1705         do {
1706                 lock_buffer(bh);
1707                 set_buffer_async_io(bh);
1708                 set_bit(BH_Uptodate, &bh->b_state);
1709                 clear_bit(BH_Dirty, &bh->b_state);
1710                 bh = bh->b_this_page;
1711         } while (bh != head);
1712
1713         /* Stage 3: submit the IO */
1714         do {
1715                 struct buffer_head *next = bh->b_this_page;
1716                 submit_bh(WRITE, bh);
1717                 bh = next;
1718         } while (bh != head);
1719
1720         /* Done - end_buffer_io_async will unlock */
1721         SetPageUptodate(page);
1722         return 0;
1723
1724 out:
1725         ClearPageUptodate(page);
1726         UnlockPage(page);
1727         return err;
1728 }
1729
1730 static int __block_prepare_write(struct inode *inode, struct page *page,
1731                 unsigned from, unsigned to, get_block_t *get_block)
1732 {
1733         unsigned block_start, block_end;
1734         unsigned long block;
1735         int err = 0;
1736         unsigned blocksize, bbits;
1737         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1738         char *kaddr = kmap(page);
1739
1740         blocksize = inode->i_sb->s_blocksize;
1741         if (!page->buffers)
1742                 create_empty_buffers(page, inode->i_dev, blocksize);
1743         head = page->buffers;
1744
1745         bbits = inode->i_sb->s_blocksize_bits;
1746         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1747
1748         for(bh = head, block_start = 0; bh != head || !block_start;
1749             block++, block_start=block_end, bh = bh->b_this_page) {
1750                 if (!bh)
1751                         BUG();
1752                 block_end = block_start+blocksize;
1753                 if (block_end <= from)
1754                         continue;
1755                 if (block_start >= to)
1756                         break;
1757                 if (!buffer_mapped(bh)) {
1758                         err = get_block(inode, block, bh, 1);
1759                         if (err)
1760                                 goto out;
1761                         if (buffer_new(bh)) {
1762                                 unmap_underlying_metadata(bh);
1763                                 if (Page_Uptodate(page)) {
1764                                         set_bit(BH_Uptodate, &bh->b_state);
1765                                         continue;
1766                                 }
1767                                 if (block_end > to)
1768                                         memset(kaddr+to, 0, block_end-to);
1769                                 if (block_start < from)
1770                                         memset(kaddr+block_start, 0, from-block_start);
1771                                 if (block_end > to || block_start < from)
1772                                         flush_dcache_page(page);
1773                                 continue;
1774                         }
1775                 }
1776                 if (Page_Uptodate(page)) {
1777                         set_bit(BH_Uptodate, &bh->b_state);
1778                         continue; 
1779                 }
1780                 if (!buffer_uptodate(bh) &&
1781                      (block_start < from || block_end > to)) {
1782                         ll_rw_block(READ, 1, &bh);
1783                         *wait_bh++=bh;
1784                 }
1785         }
1786         /*
1787          * If we issued read requests - let them complete.
1788          */
1789         while(wait_bh > wait) {
1790                 wait_on_buffer(*--wait_bh);
1791                 err = -EIO;
1792                 if (!buffer_uptodate(*wait_bh))
1793                         goto out;
1794         }
1795         return 0;
1796 out:
1797         return err;
1798 }
1799
1800 static int __block_commit_write(struct inode *inode, struct page *page,
1801                 unsigned from, unsigned to)
1802 {
1803         unsigned block_start, block_end;
1804         int partial = 0, need_balance_dirty = 0;
1805         unsigned blocksize;
1806         struct buffer_head *bh, *head;
1807
1808         blocksize = inode->i_sb->s_blocksize;
1809
1810         for(bh = head = page->buffers, block_start = 0;
1811             bh != head || !block_start;
1812             block_start=block_end, bh = bh->b_this_page) {
1813                 block_end = block_start + blocksize;
1814                 if (block_end <= from || block_start >= to) {
1815                         if (!buffer_uptodate(bh))
1816                                 partial = 1;
1817                 } else {
1818                         set_bit(BH_Uptodate, &bh->b_state);
1819                         if (!atomic_set_buffer_dirty(bh)) {
1820                                 __mark_dirty(bh);
1821                                 buffer_insert_inode_data_queue(bh, inode);
1822                                 need_balance_dirty = 1;
1823                         }
1824                 }
1825         }
1826
1827         if (need_balance_dirty)
1828                 balance_dirty();
1829         /*
1830          * If this is a partial write that happened to make all buffers
1831          * uptodate then we can optimize away a bogus readpage() for
1832          * the next read(). Here we 'discover' whether the page went
1833          * uptodate as a result of this (potentially partial) write.
1834          */
1835         if (!partial)
1836                 SetPageUptodate(page);
1837         return 0;
1838 }
1839
1840 /*
1841  * Generic "read page" function for block devices that have the normal
1842  * get_block functionality. This is most of the block device filesystems.
1843  * Reads the page asynchronously --- the unlock_buffer() and
1844  * mark_buffer_uptodate() functions propagate buffer state into the
1845  * page struct once IO has completed.
1846  */
1847 int block_read_full_page(struct page *page, get_block_t *get_block)
1848 {
1849         struct inode *inode = page->mapping->host;
1850         unsigned long iblock, lblock;
1851         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1852         unsigned int blocksize, blocks;
1853         int nr, i;
1854
1855         if (!PageLocked(page))
1856                 PAGE_BUG(page);
1857         blocksize = inode->i_sb->s_blocksize;
1858         if (!page->buffers)
1859                 create_empty_buffers(page, inode->i_dev, blocksize);
1860         head = page->buffers;
1861
1862         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1863         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1864         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1865         bh = head;
1866         nr = 0;
1867         i = 0;
1868
1869         do {
1870                 if (buffer_uptodate(bh))
1871                         continue;
1872
1873                 if (!buffer_mapped(bh)) {
1874                         if (iblock < lblock) {
1875                                 if (get_block(inode, iblock, bh, 0))
1876                                         continue;
1877                         }
1878                         if (!buffer_mapped(bh)) {
1879                                 memset(kmap(page) + i*blocksize, 0, blocksize);
1880                                 flush_dcache_page(page);
1881                                 kunmap(page);
1882                                 set_bit(BH_Uptodate, &bh->b_state);
1883                                 continue;
1884                         }
1885                         /* get_block() might have updated the buffer synchronously */
1886                         if (buffer_uptodate(bh))
1887                                 continue;
1888                 }
1889
1890                 arr[nr] = bh;
1891                 nr++;
1892         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1893
1894         if (!nr) {
1895                 /*
1896                  * all buffers are uptodate - we can set the page
1897                  * uptodate as well.
1898                  */
1899                 SetPageUptodate(page);
1900                 UnlockPage(page);
1901                 return 0;
1902         }
1903
1904         /* Stage two: lock the buffers */
1905         for (i = 0; i < nr; i++) {
1906                 struct buffer_head * bh = arr[i];
1907                 lock_buffer(bh);
1908                 set_buffer_async_io(bh);
1909         }
1910
1911         /* Stage 3: start the IO */
1912         for (i = 0; i < nr; i++)
1913                 submit_bh(READ, arr[i]);
1914
1915         return 0;
1916 }
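
/*
 * Typical use (sketch only, with a hypothetical foo_get_block callback):
 * a block-based filesystem's readpage method is usually nothing more
 * than a thin wrapper around block_read_full_page():
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, foo_get_block);
 *	}
 */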
1917
1918 /*
1919  * For moronic filesystems that do not allow holes in a file.
1920  * We may have to extend the file.
1921  */
1922
1923 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1924 {
1925         struct address_space *mapping = page->mapping;
1926         struct inode *inode = mapping->host;
1927         struct page *new_page;
1928         unsigned long pgpos;
1929         long status;
1930         unsigned zerofrom;
1931         unsigned blocksize = inode->i_sb->s_blocksize;
1932         char *kaddr;
1933
1934         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1935                 status = -ENOMEM;
1936                 new_page = grab_cache_page(mapping, pgpos);
1937                 if (!new_page)
1938                         goto out;
1939                 /* we might sleep */
1940                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1941                         UnlockPage(new_page);
1942                         page_cache_release(new_page);
1943                         continue;
1944                 }
1945                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1946                 if (zerofrom & (blocksize-1)) {
1947                         *bytes |= (blocksize-1);
1948                         (*bytes)++;
1949                 }
1950                 status = __block_prepare_write(inode, new_page, zerofrom,
1951                                                 PAGE_CACHE_SIZE, get_block);
1952                 if (status)
1953                         goto out_unmap;
1954                 kaddr = page_address(new_page);
1955                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1956                 flush_dcache_page(new_page);
1957                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1958                 kunmap(new_page);
1959                 UnlockPage(new_page);
1960                 page_cache_release(new_page);
1961         }
1962
1963         if (page->index < pgpos) {
1964                 /* completely inside the area */
1965                 zerofrom = offset;
1966         } else {
1967                 /* page covers the boundary, find the boundary offset */
1968                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1969
1970                 /* if we will expand the thing last block will be filled */
1971                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1972                         *bytes |= (blocksize-1);
1973                         (*bytes)++;
1974                 }
1975
1976                 /* starting below the boundary? Nothing to zero out */
1977                 if (offset <= zerofrom)
1978                         zerofrom = offset;
1979         }
1980         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1981         if (status)
1982                 goto out1;
1983         kaddr = page_address(page);
1984         if (zerofrom < offset) {
1985                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1986                 flush_dcache_page(page);
1987                 __block_commit_write(inode, page, zerofrom, offset);
1988         }
1989         return 0;
1990 out1:
1991         ClearPageUptodate(page);
1992         kunmap(page);
1993         return status;
1994
1995 out_unmap:
1996         ClearPageUptodate(new_page);
1997         kunmap(new_page);
1998         UnlockPage(new_page);
1999         page_cache_release(new_page);
2000 out:
2001         return status;
2002 }
2003
2004 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2005                         get_block_t *get_block)
2006 {
2007         struct inode *inode = page->mapping->host;
2008         int err = __block_prepare_write(inode, page, from, to, get_block);
2009         if (err) {
2010                 ClearPageUptodate(page);
2011                 kunmap(page);
2012         }
2013         return err;
2014 }
2015
2016 int generic_commit_write(struct file *file, struct page *page,
2017                 unsigned from, unsigned to)
2018 {
2019         struct inode *inode = page->mapping->host;
2020         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2021         __block_commit_write(inode,page,from,to);
2022         kunmap(page);
2023         if (pos > inode->i_size) {
2024                 inode->i_size = pos;
2025                 mark_inode_dirty(inode);
2026         }
2027         return 0;
2028 }
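
/*
 * Illustrative sketch only (the foo_* names are hypothetical): a
 * filesystem built on the generic helpers above typically wires them
 * into its address_space_operations, supplying only its own get_block
 * callback:
 *
 *	static int foo_writepage(struct page *page)
 *	{
 *		return block_write_full_page(page, foo_get_block);
 *	}
 *	static int foo_prepare_write(struct file *file, struct page *page,
 *				     unsigned from, unsigned to)
 *	{
 *		return block_prepare_write(page, from, to, foo_get_block);
 *	}
 *	static struct address_space_operations foo_aops = {
 *		readpage:	foo_readpage,
 *		writepage:	foo_writepage,
 *		sync_page:	block_sync_page,
 *		prepare_write:	foo_prepare_write,
 *		commit_write:	generic_commit_write,
 *	};
 */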
2029
2030 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2031 {
2032         unsigned long index = from >> PAGE_CACHE_SHIFT;
2033         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2034         unsigned blocksize, iblock, length, pos;
2035         struct inode *inode = mapping->host;
2036         struct page *page;
2037         struct buffer_head *bh;
2038         int err;
2039
2040         blocksize = inode->i_sb->s_blocksize;
2041         length = offset & (blocksize - 1);
2042
2043         /* Block boundary? Nothing to do */
2044         if (!length)
2045                 return 0;
2046
2047         length = blocksize - length;
2048         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2049         
2050         page = grab_cache_page(mapping, index);
2051         err = -ENOMEM;
2052         if (!page)
2053                 goto out;
2054
2055         if (!page->buffers)
2056                 create_empty_buffers(page, inode->i_dev, blocksize);
2057
2058         /* Find the buffer that contains "offset" */
2059         bh = page->buffers;
2060         pos = blocksize;
2061         while (offset >= pos) {
2062                 bh = bh->b_this_page;
2063                 iblock++;
2064                 pos += blocksize;
2065         }
2066
2067         err = 0;
2068         if (!buffer_mapped(bh)) {
2069                 /* Hole? Nothing to do */
2070                 if (buffer_uptodate(bh))
2071                         goto unlock;
2072                 get_block(inode, iblock, bh, 0);
2073                 /* Still unmapped? Nothing to do */
2074                 if (!buffer_mapped(bh))
2075                         goto unlock;
2076         }
2077
2078         /* Ok, it's mapped. Make sure it's up-to-date */
2079         if (Page_Uptodate(page))
2080                 set_bit(BH_Uptodate, &bh->b_state);
2081
2082         if (!buffer_uptodate(bh)) {
2083                 err = -EIO;
2084                 ll_rw_block(READ, 1, &bh);
2085                 wait_on_buffer(bh);
2086                 /* Uhhuh. Read error. Complain and punt. */
2087                 if (!buffer_uptodate(bh))
2088                         goto unlock;
2089         }
2090
2091         memset(kmap(page) + offset, 0, length);
2092         flush_dcache_page(page);
2093         kunmap(page);
2094
2095         __mark_buffer_dirty(bh);
2096         err = 0;
2097
2098 unlock:
2099         UnlockPage(page);
2100         page_cache_release(page);
2101 out:
2102         return err;
2103 }
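
/*
 * Worked example (illustrative only): truncating to from == 10000 with a
 * 1024-byte block size and 4096-byte pages gives index == 2 and
 * offset == 10000 & 4095 == 1808.  length starts as 1808 & 1023 == 784,
 * i.e. the new size ends 784 bytes into its block, so blocksize - length
 * == 240 bytes must be zeroed.  The buffer walk stops at the buffer
 * covering page bytes 1024-2047, and the memset() clears page bytes
 * 1808-2047 - the tail of the partially truncated block - before the
 * buffer is marked dirty.
 */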
2104
2105 int block_write_full_page(struct page *page, get_block_t *get_block)
2106 {
2107         struct inode *inode = page->mapping->host;
2108         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2109         unsigned offset;
2110         int err;
2111
2112         /* easy case */
2113         if (page->index < end_index)
2114                 return __block_write_full_page(inode, page, get_block);
2115
2116         /* things got complicated... */
2117         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2118         /* OK, are we completely out? */
2119         if (page->index >= end_index+1 || !offset) {
2120                 UnlockPage(page);
2121                 return -EIO;
2122         }
2123
2124         /* Sigh... will have to work, then... */
2125         err = __block_prepare_write(inode, page, 0, offset, get_block);
2126         if (!err) {
2127                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2128                 flush_dcache_page(page);
2129                 __block_commit_write(inode,page,0,offset);
2130 done:
2131                 kunmap(page);
2132                 UnlockPage(page);
2133                 return err;
2134         }
2135         ClearPageUptodate(page);
2136         goto done;
2137 }
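
/*
 * Worked example (illustrative only): with 4096-byte pages and
 * i_size == 10000, end_index == 2.  Pages 0 and 1 lie fully inside the
 * file and take the easy path.  Page 2 is the partial last page:
 * offset == 10000 & 4095 == 1808, so bytes 0-1807 go through the
 * prepare/commit pair and bytes 1808-4095 of the page are zeroed before
 * __block_commit_write().  A page at index 3 or beyond (or a last page
 * with offset == 0) is entirely outside i_size and is rejected with -EIO.
 */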
2138
2139 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2140 {
2141         struct buffer_head tmp;
2142         struct inode *inode = mapping->host;
2143         tmp.b_state = 0;
2144         tmp.b_blocknr = 0;
2145         get_block(inode, block, &tmp, 0);
2146         return tmp.b_blocknr;
2147 }
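
/*
 * Sketch (hypothetical foo_* names): the ->bmap method a filesystem
 * exposes for FIBMAP-style queries usually just forwards here:
 *
 *	static int foo_bmap(struct address_space *mapping, long block)
 *	{
 *		return generic_block_bmap(mapping, block, foo_get_block);
 *	}
 */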
2148
2149 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2150 {
2151         int i, nr_blocks, retval;
2152         unsigned long * blocks = iobuf->blocks;
2153
2154         nr_blocks = iobuf->length / blocksize;
2155         /* build the blocklist */
2156         for (i = 0; i < nr_blocks; i++, blocknr++) {
2157                 struct buffer_head bh;
2158
2159                 bh.b_state = 0;
2160                 bh.b_dev = inode->i_dev;
2161                 bh.b_size = blocksize;
2162
2163                 retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
2164                 if (retval)
2165                         goto out;
2166
2167                 if (rw == READ) {
2168                         if (buffer_new(&bh))
2169                                 BUG();
2170                         if (!buffer_mapped(&bh)) {
2171                                 /* there was a hole in the filesystem */
2172                                 blocks[i] = -1UL;
2173                                 continue;
2174                         }
2175                 } else {
2176                         if (buffer_new(&bh))
2177                                 unmap_underlying_metadata(&bh);
2178                         if (!buffer_mapped(&bh))
2179                                 BUG();
2180                 }
2181                 blocks[i] = bh.b_blocknr;
2182         }
2183
2184         retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2185
2186  out:
2187         return retval;
2188 }
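
/*
 * Sketch (hypothetical foo_* names, assuming the 2.4 ->direct_IO aop
 * signature): a filesystem's direct-IO method typically just passes its
 * own get_block callback down to generic_direct_IO():
 *
 *	static int foo_direct_IO(int rw, struct inode *inode,
 *				 struct kiobuf *iobuf,
 *				 unsigned long blocknr, int blocksize)
 *	{
 *		return generic_direct_IO(rw, inode, iobuf, blocknr,
 *					 blocksize, foo_get_block);
 *	}
 */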
2189
2190 /*
2191  * IO completion routine for a buffer_head being used for kiobuf IO: we
2192  * can't dispatch the kiobuf callback until io_count reaches 0.  
2193  */
2194
2195 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2196 {
2197         struct kiobuf *kiobuf;
2198         
2199         mark_buffer_uptodate(bh, uptodate);
2200
2201         kiobuf = bh->b_private;
2202         unlock_buffer(bh);
2203         end_kio_request(kiobuf, uptodate);
2204 }
2205
2206 /*
2207  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2208  * for them to complete.  Clean up the buffer_heads afterwards.  
2209  */
2210
2211 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2212 {
2213         int iosize, err;
2214         int i;
2215         struct buffer_head *tmp;
2216
2217         iosize = 0;
2218         err = 0;
2219
2220         for (i = nr; --i >= 0; ) {
2221                 iosize += size;
2222                 tmp = bh[i];
2223                 if (buffer_locked(tmp)) {
2224                         wait_on_buffer(tmp);
2225                 }
2226                 
2227                 if (!buffer_uptodate(tmp)) {
2228                         /* We are traversing bh'es in reverse order so
2229                            clearing iosize on error calculates the
2230                            amount of IO before the first error. */
2231                         iosize = 0;
2232                         err = -EIO;
2233                 }
2234         }
2235         
2236         if (iosize)
2237                 return iosize;
2238         return err;
2239 }
2240
2241 /*
2242  * Start I/O on a physical range of kernel memory, defined by a vector
2243  * of kiobuf structs (much like a user-space iovec list).
2244  *
2245  * The kiobuf must already be locked for IO.  IO is submitted
2246  * asynchronously: you need to check page->locked, page->uptodate, and
2247  * maybe wait on page->wait.
2248  *
2249  * It is up to the caller to make sure that there are enough blocks
2250  * passed in to completely map the iobufs to disk.
2251  */
2252
2253 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2254                kdev_t dev, unsigned long b[], int size)
2255 {
2256         int             err;
2257         int             length;
2258         int             transferred;
2259         int             i;
2260         int             bufind;
2261         int             pageind;
2262         int             bhind;
2263         int             offset;
2264         unsigned long   blocknr;
2265         struct kiobuf * iobuf = NULL;
2266         struct page *   map;
2267         struct buffer_head *tmp, **bhs = NULL;
2268
2269         if (!nr)
2270                 return 0;
2271         
2272         /* 
2273          * First, do some alignment and validity checks 
2274          */
2275         for (i = 0; i < nr; i++) {
2276                 iobuf = iovec[i];
2277                 if ((iobuf->offset & (size-1)) ||
2278                     (iobuf->length & (size-1)))
2279                         return -EINVAL;
2280                 if (!iobuf->nr_pages)
2281                         panic("brw_kiovec: iobuf not initialised");
2282         }
2283
2284         /* 
2285          * OK to walk down the iovec doing page IO on each page we find. 
2286          */
2287         bufind = bhind = transferred = err = 0;
2288         for (i = 0; i < nr; i++) {
2289                 iobuf = iovec[i];
2290                 offset = iobuf->offset;
2291                 length = iobuf->length;
2292                 iobuf->errno = 0;
2293                 if (!bhs)
2294                         bhs = iobuf->bh;
2295                 
2296                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2297                         map  = iobuf->maplist[pageind];
2298                         if (!map) {
2299                                 err = -EFAULT;
2300                                 goto finished;
2301                         }
2302                         
2303                         while (length > 0) {
2304                                 blocknr = b[bufind++];
2305                                 if (blocknr == -1UL) {
2306                                         if (rw == READ) {
2307                                         /* there was a hole in the filesystem */
2308                                                 memset(kmap(map) + offset, 0, size);
2309                                                 flush_dcache_page(map);
2310                                                 kunmap(map);
2311
2312                                                 transferred += size;
2313                                                 goto skip_block;
2314                                         } else
2315                                                 BUG();
2316                                 }
2317                                 tmp = bhs[bhind++];
2318
2319                                 tmp->b_dev = B_FREE;
2320                                 tmp->b_size = size;
2321                                 set_bh_page(tmp, map, offset);
2322                                 tmp->b_this_page = tmp;
2323
2324                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2325                                 tmp->b_dev = dev;
2326                                 tmp->b_blocknr = blocknr;
2327                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2328
2329                                 if (rw == WRITE) {
2330                                         set_bit(BH_Uptodate, &tmp->b_state);
2331                                         clear_bit(BH_Dirty, &tmp->b_state);
2332                                 } else
2333                                         set_bit(BH_Uptodate, &tmp->b_state);
2334
2335                                 atomic_inc(&iobuf->io_count);
2336                                 submit_bh(rw, tmp);
2337                                 /* 
2338                                  * Wait for IO if we have got too much 
2339                                  */
2340                                 if (bhind >= KIO_MAX_SECTORS) {
2341                                         kiobuf_wait_for_io(iobuf); /* wake-one */
2342                                         err = wait_kio(rw, bhind, bhs, size);
2343                                         if (err >= 0)
2344                                                 transferred += err;
2345                                         else
2346                                                 goto finished;
2347                                         bhind = 0;
2348                                 }
2349
2350                         skip_block:
2351                                 length -= size;
2352                                 offset += size;
2353
2354                                 if (offset >= PAGE_SIZE) {
2355                                         offset = 0;
2356                                         break;
2357                                 }
2358                         } /* End of block loop */
2359                 } /* End of page loop */                
2360         } /* End of iovec loop */
2361
2362         /* Is there any IO still left to submit? */
2363         if (bhind) {
2364                 kiobuf_wait_for_io(iobuf); /* wake-one */
2365                 err = wait_kio(rw, bhind, bhs, size);
2366                 if (err >= 0)
2367                         transferred += err;
2368                 else
2369                         goto finished;
2370         }
2371
2372  finished:
2373         if (transferred)
2374                 return transferred;
2375         return err;
2376 }
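
/*
 * One plausible calling sequence (sketch only; error handling trimmed,
 * and the helper calls are the generic 2.4 kiobuf routines, not anything
 * defined in this file).  The caller maps the user buffer into a kiobuf,
 * builds the block list itself and then hands everything to brw_kiovec():
 *
 *	struct kiobuf *iobuf;
 *	unsigned long *blocks;	-- one device block number per chunk --
 *
 *	err = alloc_kiovec(1, &iobuf);
 *	if (err)
 *		return err;
 *	err = map_user_kiobuf(rw, iobuf, user_addr, len);
 *	if (!err) {
 *		... allocate and fill blocks[], one entry per blocksize ...
 *		err = brw_kiovec(rw, 1, &iobuf, dev, blocks, blocksize);
 *		unmap_kiobuf(iobuf);
 *	}
 *	free_kiovec(1, &iobuf);
 */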
2377
2378 /*
2379  * Start I/O on a page.
2380  * This function expects the page to be locked and may return
2381  * before I/O is complete. You then have to check page->locked,
2382  * page->uptodate, and maybe wait on page->wait.
2383  *
2384  * brw_page() is SMP-safe, although it's being called with the
2385  * kernel lock held - but the code is ready.
2386  *
2387  * FIXME: we need a swapper_inode->get_block function to remove
2388  *        some of the bmap kludges and interface ugliness here.
2389  */
2390 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2391 {
2392         struct buffer_head *head, *bh;
2393
2394         if (!PageLocked(page))
2395                 panic("brw_page: page not locked for I/O");
2396
2397         if (!page->buffers)
2398                 create_empty_buffers(page, dev, size);
2399         head = bh = page->buffers;
2400
2401         /* Stage 1: lock all the buffers */
2402         do {
2403                 lock_buffer(bh);
2404                 bh->b_blocknr = *(b++);
2405                 set_bit(BH_Mapped, &bh->b_state);
2406                 set_buffer_async_io(bh);
2407                 bh = bh->b_this_page;
2408         } while (bh != head);
2409
2410         /* Stage 2: start the IO */
2411         do {
2412                 struct buffer_head *next = bh->b_this_page;
2413                 submit_bh(rw, bh);
2414                 bh = next;
2415         } while (bh != head);
2416         return 0;
2417 }
2418
2419 int block_symlink(struct inode *inode, const char *symname, int len)
2420 {
2421         struct address_space *mapping = inode->i_mapping;
2422         struct page *page = grab_cache_page(mapping, 0);
2423         int err = -ENOMEM;
2424         char *kaddr;
2425
2426         if (!page)
2427                 goto fail;
2428         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2429         if (err)
2430                 goto fail_map;
2431         kaddr = page_address(page);
2432         memcpy(kaddr, symname, len-1);
2433         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2434         /*
2435          * Notice that we are _not_ going to block here - the end of the page
2436          * is unmapped, so this will only try to map the rest of the page, see
2437          * that it is unmapped (typically it will not even look into the inode -
2438          * ->i_size will be enough for everything) and zero it out.
2439          * OTOH it's obviously correct and should make the page up-to-date.
2440          */
2441         err = mapping->a_ops->readpage(NULL, page);
2442         wait_on_page(page);
2443         page_cache_release(page);
2444         if (err < 0)
2445                 goto fail;
2446         mark_inode_dirty(inode);
2447         return 0;
2448 fail_map:
2449         UnlockPage(page);
2450         page_cache_release(page);
2451 fail:
2452         return err;
2453 }
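
/*
 * Sketch (hypothetical foo_symlink, modelled on how in-tree filesystems
 * use this helper): after allocating the inode and setting up its
 * page-cache based symlink operations, the filesystem simply stores the
 * target string, including its trailing NUL, with block_symlink():
 *
 *	err = block_symlink(inode, symname, strlen(symname) + 1);
 *	if (err)
 *		... drop the half-built inode ...
 */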
2454
2455 /*
2456  * Try to increase the number of buffers available: the size argument
2457  * is used to determine what kind of buffers we want.
2458  */
2459 static int grow_buffers(int size)
2460 {
2461         struct page * page;
2462         struct buffer_head *bh, *tmp;
2463         struct buffer_head * insert_point;
2464         int isize;
2465
2466         if ((size & 511) || (size > PAGE_SIZE)) {
2467                 printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size);
2468                 return 0;
2469         }
2470
2471         page = alloc_page(GFP_NOFS);
2472         if (!page)
2473                 goto out;
2474         LockPage(page);
2475         bh = create_buffers(page, size, 0);
2476         if (!bh)
2477                 goto no_buffer_head;
2478
2479         isize = BUFSIZE_INDEX(size);
2480
2481         spin_lock(&free_list[isize].lock);
2482         insert_point = free_list[isize].list;
2483         tmp = bh;
2484         while (1) {
2485                 if (insert_point) {
2486                         tmp->b_next_free = insert_point->b_next_free;
2487                         tmp->b_prev_free = insert_point;
2488                         insert_point->b_next_free->b_prev_free = tmp;
2489                         insert_point->b_next_free = tmp;
2490                 } else {
2491                         tmp->b_prev_free = tmp;
2492                         tmp->b_next_free = tmp;
2493                 }
2494                 insert_point = tmp;
2495                 if (tmp->b_this_page)
2496                         tmp = tmp->b_this_page;
2497                 else
2498                         break;
2499         }
2500         tmp->b_this_page = bh;
2501         free_list[isize].list = bh;
2502         spin_unlock(&free_list[isize].lock);
2503
2504         page->buffers = bh;
2505         page->flags &= ~(1 << PG_referenced);
2506         lru_cache_add(page);
2507         UnlockPage(page);
2508         atomic_inc(&buffermem_pages);
2509         return 1;
2510
2511 no_buffer_head:
2512         UnlockPage(page);
2513         page_cache_release(page);
2514 out:
2515         return 0;
2516 }
2517
2518 static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
2519 {
2520         struct buffer_head * p = bh;
2521         int tryagain = 1;
2522
2523         do {
2524                 if (buffer_dirty(p) || buffer_locked(p)) {
2525                         if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
2526                                 if (buffer_dirty(p)) {
2527                                         ll_rw_block(WRITE, 1, &p);
2528                                         tryagain = 0;
2529                                 } else if (buffer_locked(p)) {
2530                                         if (gfp_mask & __GFP_WAIT) {
2531                                                 wait_on_buffer(p);
2532                                                 tryagain = 1;
2533                                         } else
2534                                                 tryagain = 0;
2535                                 }
2536                         } else
2537                                 tryagain = 0;
2538                 }
2539                 p = p->b_this_page;
2540         } while (p != bh);
2541
2542         return tryagain;
2543 }
2544
2545 /*
2546  * Can the buffer be thrown out?
2547  */
2548 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock))
2549 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2550
2551 /*
2552  * try_to_free_buffers() checks if all the buffers on this particular page
2553  * are unused, and frees the page if so.
2554  *
2555  * Wake up bdflush() if this fails - if we're running low on memory due
2556  * to dirty buffers, we need to flush them out as quickly as possible.
2557  *
2558  * NOTE: There are quite a number of ways that threads of control can
2559  *       obtain a reference to a buffer head within a page.  So we must
2560  *       lock out all of these paths to cleanly toss the page.
2561  */
2562 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2563 {
2564         struct buffer_head * tmp, * bh = page->buffers;
2565         int index = BUFSIZE_INDEX(bh->b_size);
2566
2567 cleaned_buffers_try_again:
2568         spin_lock(&lru_list_lock);
2569         write_lock(&hash_table_lock);
2570         spin_lock(&free_list[index].lock);
2571         tmp = bh;
2572         do {
2573                 if (buffer_busy(tmp))
2574                         goto busy_buffer_page;
2575                 tmp = tmp->b_this_page;
2576         } while (tmp != bh);
2577
2578         spin_lock(&unused_list_lock);
2579         tmp = bh;
2580         do {
2581                 struct buffer_head * p = tmp;
2582                 tmp = tmp->b_this_page;
2583
2584                 /* The buffer can be either on the regular
2585                  * queues or on the free list..
2586                  */
2587                 if (p->b_dev != B_FREE) {
2588                         remove_inode_queue(p);
2589                         __remove_from_queues(p);
2590                 } else
2591                         __remove_from_free_list(p, index);
2592                 __put_unused_buffer_head(p);
2593         } while (tmp != bh);
2594         spin_unlock(&unused_list_lock);
2595
2596         /* Wake up anyone waiting for buffer heads */
2597         wake_up(&buffer_wait);
2598
2599         /* And free the page */
2600         page->buffers = NULL;
2601         page_cache_release(page);
2602         spin_unlock(&free_list[index].lock);
2603         write_unlock(&hash_table_lock);
2604         spin_unlock(&lru_list_lock);
2605         return 1;
2606
2607 busy_buffer_page:
2608         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2609         spin_unlock(&free_list[index].lock);
2610         write_unlock(&hash_table_lock);
2611         spin_unlock(&lru_list_lock);
2612         if (gfp_mask & __GFP_IO) {
2613                 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2614                         if (sync_page_buffers(bh, gfp_mask)) {
2615                                 /* no IO or waiting next time */
2616                                 gfp_mask = 0;
2617                                 goto cleaned_buffers_try_again;
2618                         }
2619                 }
2620         }
2621         if (balance_dirty_state() >= 0)
2622                 wakeup_bdflush();
2623         return 0;
2624 }
2625
2626 /* ================== Debugging =================== */
2627
2628 void show_buffers(void)
2629 {
2630 #ifdef CONFIG_SMP
2631         struct buffer_head * bh;
2632         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2633         int nlist;
2634         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2635 #endif
2636
2637         printk("Buffer memory:   %6dkB\n",
2638                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2639
2640 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2641         if (!spin_trylock(&lru_list_lock))
2642                 return;
2643         for(nlist = 0; nlist < NR_LIST; nlist++) {
2644                 found = locked = dirty = used = lastused = 0;
2645                 bh = lru_list[nlist];
2646                 if(!bh) continue;
2647
2648                 do {
2649                         found++;
2650                         if (buffer_locked(bh))
2651                                 locked++;
2652                         if (buffer_dirty(bh))
2653                                 dirty++;
2654                         if (atomic_read(&bh->b_count))
2655                                 used++, lastused = found;
2656                         bh = bh->b_next_free;
2657                 } while (bh != lru_list[nlist]);
2658                 {
2659                         int tmp = nr_buffers_type[nlist];
2660                         if (found != tmp)
2661                                 printk("%9s: BUG -> found %d, reported %d\n",
2662                                        buf_types[nlist], found, tmp);
2663                 }
2664                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2665                        "%d locked, %d dirty\n",
2666                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2667                        used, lastused, locked, dirty);
2668         }
2669         spin_unlock(&lru_list_lock);
2670 #endif
2671 }
2672
2673 /* ===================== Init ======================= */
2674
2675 /*
2676  * allocate the hash table and init the free list
2677  * Use gfp() for the hash table to decrease TLB misses, use
2678  * SLAB cache for buffer heads.
2679  */
2680 void __init buffer_init(unsigned long mempages)
2681 {
2682         int order, i;
2683         unsigned int nr_hash;
2684
2685         /* The buffer cache hash table is less important these days,
2686          * trim it a bit.
2687          */
2688         mempages >>= 14;
2689
2690         mempages *= sizeof(struct buffer_head *);
2691
2692         for (order = 0; (1 << order) < mempages; order++)
2693                 ;
2694
2695         /* try to allocate something until we get it or we're asking
2696            for something that is really too small */
2697
2698         do {
2699                 unsigned long tmp;
2700
2701                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2702                 bh_hash_mask = (nr_hash - 1);
2703
2704                 tmp = nr_hash;
2705                 bh_hash_shift = 0;
2706                 while((tmp >>= 1UL) != 0UL)
2707                         bh_hash_shift++;
2708
2709                 hash_table = (struct buffer_head **)
2710                     __get_free_pages(GFP_ATOMIC, order);
2711         } while (hash_table == NULL && --order > 0);
2712         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2713                nr_hash, order, (PAGE_SIZE << order));
2714
2715         if (!hash_table)
2716                 panic("Failed to allocate buffer hash table\n");
2717
2718         /* Setup hash chains. */
2719         for(i = 0; i < nr_hash; i++)
2720                 hash_table[i] = NULL;
2721
2722         /* Setup free lists. */
2723         for(i = 0; i < NR_SIZES; i++) {
2724                 free_list[i].list = NULL;
2725                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2726         }
2727
2728         /* Setup lru lists. */
2729         for(i = 0; i < NR_LIST; i++)
2730                 lru_list[i] = NULL;
2731
2732 }
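
/*
 * Worked example (illustrative only): on a 32-bit box with 128 MiB of
 * 4 KiB pages, mempages starts at 32768; ">>= 14" leaves 2, and
 * multiplying by sizeof(struct buffer_head *) gives 8, so order ends up
 * as 3.  The table then holds (PAGE_SIZE << 3) / 4 == 8192 hash chain
 * heads, with bh_hash_mask == 8191 and bh_hash_shift == 13.
 */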
2733
2734
2735 /* ====================== bdflush support =================== */
2736
2737 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2738  * response to dirty buffers.  Once this process is activated, we write back
2739  * a limited number of buffers to the disks and then go back to sleep again.
2740  */
2741
2742 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2743
2744 void wakeup_bdflush(void)
2745 {
2746         wake_up_interruptible(&bdflush_wait);
2747 }
2748
2749 /* 
2750  * Here we attempt to write back old buffers.  We also try to flush inodes
2751  * and superblocks, since this function is essentially "update", and
2752  * otherwise there would be no way of ensuring that these quantities ever
2753  * get written back.  Ideally, we would have a timestamp on the inodes
2754  * and superblocks so that we could write back only the old ones.
2755  */
2756
2757 static int sync_old_buffers(void)
2758 {
2759         lock_kernel();
2760         sync_unlocked_inodes();
2761         sync_supers(0);
2762         unlock_kernel();
2763
2764         for (;;) {
2765                 struct buffer_head *bh;
2766
2767                 spin_lock(&lru_list_lock);
2768                 bh = lru_list[BUF_DIRTY];
2769                 if (!bh || time_before(jiffies, bh->b_flushtime))
2770                         break;
2771                 if (write_some_buffers(NODEV))
2772                         continue;
2773                 return 0;
2774         }
2775         spin_unlock(&lru_list_lock);
2776         return 0;
2777 }
2778
2779 int block_sync_page(struct page *page)
2780 {
2781         run_task_queue(&tq_disk);
2782         return 0;
2783 }
2784
2785 /* This is the interface to bdflush.  As we get more sophisticated, we can
2786  * pass tuning parameters to this "process", to adjust how it behaves. 
2787  * We would want to verify each parameter, however, to make sure that it 
2788  * is reasonable. */
2789
2790 asmlinkage long sys_bdflush(int func, long data)
2791 {
2792         if (!capable(CAP_SYS_ADMIN))
2793                 return -EPERM;
2794
2795         if (func == 1) {
2796                 /* do_exit directly and let kupdate do its work alone. */
2797                 do_exit(0);
2798 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2799          a syscall that doesn't care about the current mm context. */
2800                 int error;
2801                 struct mm_struct *user_mm;
2802
2803                 /*
2804                  * bdflush will spend all of its time in kernel-space,
2805                  * without touching user-space, so we can switch it into
2806                  * 'lazy TLB mode' to reduce the cost of context-switches
2807                  * to and from bdflush.
2808                  */
2809                 user_mm = start_lazy_tlb();
2810                 error = sync_old_buffers();
2811                 end_lazy_tlb(user_mm);
2812                 return error;
2813 #endif
2814         }
2815
2816         /* Even func reads a param, odd func writes it: 2 reads param 0, 3 writes param 0, etc */
2817         if (func >= 2) {
2818                 int i = (func-2) >> 1;
2819                 if (i >= 0 && i < N_PARAM) {
2820                         if ((func & 1) == 0)
2821                                 return put_user(bdf_prm.data[i], (int*)data);
2822
2823                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2824                                 bdf_prm.data[i] = data;
2825                                 return 0;
2826                         }
2827                 }
2828                 return -EINVAL;
2829         }
2830
2831         /* Func 0 used to launch the actual bdflush and then never
2832          * return (unless explicitly killed). We return zero here to
2833          * remain semi-compatible with present update(8) programs.
2834          */
2835         return 0;
2836 }
2837
2838 /*
2839  * This is the actual bdflush daemon itself. It used to be started from
2840  * the syscall above, but now we launch it ourselves internally with
2841  * kernel_thread(...)  directly after the first thread in init/main.c
2842  */
2843 int bdflush(void *startup)
2844 {
2845         struct task_struct *tsk = current;
2846
2847         /*
2848          *      We have a bare-bones task_struct, and really should fill
2849          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2850          *      display semi-sane things. Not really crucial though...
2851          */
2852
2853         tsk->session = 1;
2854         tsk->pgrp = 1;
2855         strcpy(tsk->comm, "bdflush");
2856
2857         /* avoid getting signals */
2858         spin_lock_irq(&tsk->sigmask_lock);
2859         flush_signals(tsk);
2860         sigfillset(&tsk->blocked);
2861         recalc_sigpending(tsk);
2862         spin_unlock_irq(&tsk->sigmask_lock);
2863
2864         complete((struct completion *)startup);
2865
2866         for (;;) {
2867                 CHECK_EMERGENCY_SYNC
2868
2869                 spin_lock(&lru_list_lock);
2870                 if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
2871                         wait_for_some_buffers(NODEV);
2872                         interruptible_sleep_on(&bdflush_wait);
2873                 }
2874         }
2875 }
2876
2877 /*
2878  * This is the kernel update daemon. It used to live in userspace
2879  * but since it needs to run safely we don't want it killable by mistake.
2880  * You don't need to change your userspace configuration since
2881  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2882  */
2883 int kupdate(void *startup)
2884 {
2885         struct task_struct * tsk = current;