/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#if defined(CLUSTERDEBUG)
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
    "Debug VFS clustering code");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
static struct buf *
	cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
	    daddr_t blkno, long size, int run, struct buf *fbp);
static void cluster_callback(struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind; 0: disable, 1: enable, 2: backed off");

static int read_max = 8;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
    "Cluster read-ahead max block count");

/* Page expended to mark partially backed buffers */
extern vm_page_t	bogus_page;

/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
 */
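/*
 * Illustrative usage sketch only; the names ip, lbn, and seqcount are
 * assumptions, loosely modeled on a file system read path such as
 * ffs_read():
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *	    uio->uio_resid, seqcount, &bp);
 */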
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
    struct ucred *cred, long totread, int seqcount, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int maxra, racluster;
	int error, ncontig;
	int i;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;
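	/*
	 * Illustrative numbers only, not normative: with a 16K-block file
	 * system and a common 128K mnt_iosize_max, racluster is
	 * 128K / 16K = 8.  A heuristic seqcount of 16 is then clamped by
	 * read_max (default 8) and by nbuf/8, so maxra ends up at 8
	 * unless end of file cuts it shorter.
	 */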

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
	origblkno = lblkno;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return (0);
		} else if ((bp->b_flags & B_RAM) == 0) {
			return (0);
		} else {
			bp->b_flags &= ~B_RAM;
			VI_LOCK(vp);
			for (i = 1; i < maxra; i++) {
				/*
				 * Stop if the buffer does not exist or it
				 * is invalid (about to go away?)
				 */
				rbp = gbincore(&vp->v_bufobj, lblkno+i);
				if (rbp == NULL || (rbp->b_flags & B_INVAL))
					break;

				/*
				 * Set another read-ahead mark so we know
				 * to check again, provided we can lock the
				 * buffer without waiting.
				 */
				if ((((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1))) &&
				    (0 == BUF_LOCK(rbp,
					LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
					rbp->b_flags |= B_RAM;
					BUF_UNLOCK(rbp);
				}
			}
			VI_UNLOCK(vp);
			if (i >= maxra) {
				return (0);
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
	} else {
		off_t firstread = bp->b_offset;
		int nblks;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		ncontig = 0;

		/*
		 * Compute the total number of blocks that we should read
		 * synchronously.
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;

		/*
		 * Now compute the number of contiguous blocks.
		 */
		if (nblks > 1) {
			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontig, NULL);
			/*
			 * If this failed to map just do the original block.
			 */
			if (error || blkno == -1)
				ncontig = 0;
		}

		/*
		 * If we have contiguous data available do a cluster
		 * otherwise just read the requested block.
		 */
		if (ncontig) {
			/* Account for our first block. */
			ncontig = min(ncontig + 1, nblks);
			if (ncontig < nblks)
				nblks = ncontig;
			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if (bp) {
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		curthread->td_ru.ru_inblock++;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		if (error)
			break;

		if (blkno == -1)
			break;

		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous.  We're throttled
		 * by racluster anyway.
		 */
		if (ncontig) {
			ncontig = min(ncontig + 1, racluster);
			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
				size, ncontig, NULL);
			lblkno += (rbp->b_bufsize / size);
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
		} else {
			rbp = getblk(vp, lblkno, size, 0, 0, 0);
			lblkno += 1;
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		}
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
			continue;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		bstrategy(rbp);
		curthread->td_ru.ru_inblock++;
	}

	if (reqbp)
		return (bufwait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
    daddr_t blkno, long size, int run, struct buf *fbp)
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != f_iosize %jd\n",
	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));

	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}
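	/*
	 * Illustrative only: with 16K blocks, lbn 10 and run 8, the loop
	 * trims run until (10 + run) * 16K no longer extends past EOF,
	 * e.g. a 200K file (12.5 blocks) reduces run from 8 to 2.
	 */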

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return (tbp);
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}

	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return (tbp);

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == NULL)
		return (tbp);

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
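	/*
	 * Illustrative only: with a 6K block size on 4K pages, a block
	 * can begin 2K into a page; masking tbp->b_data with PAGE_MASK
	 * carries that 2K offset into the pbuf's KVA window.
	 */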
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_mount->mnt_iosize_max) {
				break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);

			/* Don't wait around for locked bufs. */
			if (tbp == NULL)
				break;

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			VI_LOCK(vp);
			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
			    (tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				VI_UNLOCK(vp);
				bqrelse(tbp);
				break;
			}
			VI_UNLOCK(vp);

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
			for (j = 0; j < tbp->b_npages; j++) {
				VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
				    MA_OWNED);
				if (tbp->b_pages[j]->valid)
					break;
			}
			VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
			if (j != tbp->b_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;

			/*
			 * Set the buffer up for an async read (XXX should
			 * we do this only if we do not wind up brelse()ing?).
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != size)
			printf("warning: tbp->b_bcount wrong %ld vs %ld\n",
			    tbp->b_bcount, size);
		if (tbp->b_bufsize != size)
			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n",
			    tbp->b_bufsize, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
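	/*
	 * Illustrative only: if, say, the third page of an eight-page
	 * cluster is already fully valid, the device still transfers the
	 * whole extent, but that slot maps bogus_page, so the DMA lands
	 * in a throwaway page and the cached copy is preserved.
	 */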
	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
	for (j = 0; j < bp->b_npages; j++) {
		VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_pages[j] = bogus_page;
		}
	}
	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
static void
cluster_callback(struct buf *bp)
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
	    tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		bufdone(tbp);
	}
	pbrelvp(bp);
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 *	cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *		write_behind = 0	write behind disabled
 *		write_behind = 1	write behind normal (default)
 *		write_behind = 2	write behind backed-off
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
{
	int r = 0;

	switch (write_behind) {
	case 2:
		if (start_lbn < len)
			break;
		start_lbn -= len;
		/* FALLTHROUGH */
	case 1:
		r = cluster_wbuild(vp, size, start_lbn, len);
		/* FALLTHROUGH */
	default:
		break;
	}
	return (r);
}
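
/*
 * Illustrative only: in the backed-off mode (write_behind == 2) a call
 * such as cluster_wbuild_wb(vp, 16384, 32, 8) first rewinds start_lbn
 * to 24, so the cluster just *behind* the current window (lbns 24-31)
 * is pushed out while the most recently dirtied blocks stay delayed.
 */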

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
{
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_kern_flag & MNTK_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
			    vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
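
/*
 * Illustrative walk-through only, not normative: with 16K blocks and a
 * 128K mnt_iosize_max, maxclen is 7.  A purely sequential writer then
 * sees lbn 0 begin a cluster (v_cstart = 0, v_clen = 7) and get
 * bdwrite()n; lbns 1-6 fall in the middle and are also delayed; at
 * lbn 7 (v_cstart + v_clen) the whole 8-block cluster is pushed via
 * cluster_wbuild_wb() and the next cluster begins at lbn 8.
 */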

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int dbsize = btodb(size);

	while (len > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		VI_LOCK(vp);
		if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
		    (tbp->b_vflags & BV_BKGRDINPROG)) {
			VI_UNLOCK(vp);
			++start_lbn;
			--len;
			continue;
		}
		if (BUF_LOCK(tbp,
		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(vp))) {
			++start_lbn;
			--len;
			continue;
		}
		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
			BUF_UNLOCK(tbp);
			++start_lbn;
			--len;
			continue;
		}
		if (tbp->b_pin_count > 0) {
			BUF_UNLOCK(tbp);
			++start_lbn;
			--len;
			continue;
		}
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
		     (B_CLUSTEROK | B_VMIO)) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != size) ||
		    (len == 1) ||
		    ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in, so initialize it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED)
			bp->b_wcred = crhold(tbp->b_wcred);

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CLUSTER |
				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				VI_LOCK(vp);
				if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
				    (tbp->b_vflags & BV_BKGRDINPROG)) {
					VI_UNLOCK(vp);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, or is locked (which
				 * means it could be undergoing a background
				 * I/O or be in a weird state), then don't
				 * cluster with it.
				 */
				if (BUF_LOCK(tbp,
				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
				    VI_MTX(vp)))
					break;

				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    tbp->b_wcred != bp->b_wcred) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				    ((bp->b_blkno + (dbsize * i)) !=
				      tbp->b_blkno) ||
				    ((tbp->b_npages + bp->b_npages) >
				      (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}
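
				/*
				 * Illustrative only: dbsize = btodb(size)
				 * is the block size in DEV_BSIZE (512 byte)
				 * units, so with 8K blocks dbsize is 16 and
				 * the i-th buffer must sit exactly 16 * i
				 * disk blocks past bp->b_blkno to qualify.
				 */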

				/*
				 * Do not pull in pinned buffers.
				 */
				if (tbp->b_pin_count > 0) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
			} /* end of code for non-first buffers only */
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->oflags & VPO_BUSY) {
							VM_OBJECT_UNLOCK(
							    tbp->b_bufobj->bo_object);
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}
				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					    (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
				VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;
			bundirty(tbp);
			tbp->b_flags &= ~B_DONE;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_WRITE;
			reassignbuf(tbp);		/* put on clean list */
			bufobj_wref(tbp->b_bufobj);
			BUF_KERNPROC(tbp);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return (totalwritten);
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp)
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}