1 /*-
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *      The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *      @(#)ffs_balloc.c        8.8 (Berkeley) 6/16/95
60  */
61
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/bio.h>
68 #include <sys/buf.h>
69 #include <sys/lock.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72
73 #include <ufs/ufs/quota.h>
74 #include <ufs/ufs/inode.h>
75 #include <ufs/ufs/ufs_extern.h>
76 #include <ufs/ufs/extattr.h>
77 #include <ufs/ufs/ufsmount.h>
78
79 #include <ufs/ffs/fs.h>
80 #include <ufs/ffs/ffs_extern.h>
81
82 /*
83  * Balloc defines the structure of filesystem storage
84  * by allocating the physical blocks on a device given
85  * the inode and the logical block number in a file.
86  * This is the allocation strategy for UFS1. Below is
87  * the allocation strategy for UFS2.
88  */
89 int
90 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91     struct ucred *cred, int flags, struct buf **bpp)
92 {
93         struct inode *ip;
94         struct ufs1_dinode *dp;
95         ufs_lbn_t lbn, lastlbn;
96         struct fs *fs;
97         ufs1_daddr_t nb;
98         struct buf *bp, *nbp;
99         struct ufsmount *ump;
100         struct indir indirs[NIADDR + 2];
101         int deallocated, osize, nsize, num, i, error;
102         ufs2_daddr_t newb;
103         ufs1_daddr_t *bap, pref;
104         ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105         ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106         int unwindidx = -1;
107         int saved_inbdflush;
108         static struct timeval lastfail;
109         static int curfail;
110         int gbflags, reclaimed;
111
112         ip = VTOI(vp);
113         dp = ip->i_din1;
114         fs = ip->i_fs;
115         ump = ip->i_ump;
116         lbn = lblkno(fs, startoffset);
117         size = blkoff(fs, startoffset) + size;
118         reclaimed = 0;
119         if (size > fs->fs_bsize)
120                 panic("ffs_balloc_ufs1: blk too big");
121         *bpp = NULL;
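            /*
             * UFS1 inodes have no external (extended attribute) data
             * area, so IO_EXT requests cannot be satisfied here.
             */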
122         if (flags & IO_EXT)
123                 return (EOPNOTSUPP);
124         if (lbn < 0)
125                 return (EFBIG);
126         gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
127
128         if (DOINGSOFTDEP(vp))
129                 softdep_prealloc(vp, MNT_WAIT);
130         /*
131          * If the next write will extend the file into a new block,
132          * and the file is currently composed of a fragment,
133          * this fragment has to be extended to be a full block.
134          */
135         lastlbn = lblkno(fs, ip->i_size);
136         if (lastlbn < NDADDR && lastlbn < lbn) {
137                 nb = lastlbn;
138                 osize = blksize(fs, ip, nb);
139                 if (osize < fs->fs_bsize && osize > 0) {
140                         UFS_LOCK(ump);
141                         error = ffs_realloccg(ip, nb, dp->di_db[nb],
142                            ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
143                            &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
144                            cred, &bp);
145                         if (error)
146                                 return (error);
147                         if (DOINGSOFTDEP(vp))
148                                 softdep_setup_allocdirect(ip, nb,
149                                     dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
150                                     fs->fs_bsize, osize, bp);
151                         ip->i_size = smalllblktosize(fs, nb + 1);
152                         dp->di_size = ip->i_size;
153                         dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
154                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
155                         if (flags & IO_SYNC)
156                                 bwrite(bp);
157                         else
158                                 bawrite(bp);
159                 }
160         }
161         /*
162          * The first NDADDR blocks are direct blocks
163          */
164         if (lbn < NDADDR) {
165                 if (flags & BA_METAONLY)
166                         panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
167                 nb = dp->di_db[lbn];
168                 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
169                         error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
170                         if (error) {
171                                 brelse(bp);
172                                 return (error);
173                         }
174                         bp->b_blkno = fsbtodb(fs, nb);
175                         *bpp = bp;
176                         return (0);
177                 }
178                 if (nb != 0) {
179                         /*
180                          * Consider need to reallocate a fragment.
181                          */
182                         osize = fragroundup(fs, blkoff(fs, ip->i_size));
183                         nsize = fragroundup(fs, size);
184                         if (nsize <= osize) {
185                                 error = bread(vp, lbn, osize, NOCRED, &bp);
186                                 if (error) {
187                                         brelse(bp);
188                                         return (error);
189                                 }
190                                 bp->b_blkno = fsbtodb(fs, nb);
191                         } else {
192                                 UFS_LOCK(ump);
193                                 error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
194                                     ffs_blkpref_ufs1(ip, lbn, (int)lbn,
195                                     &dp->di_db[0]), osize, nsize, flags,
196                                     cred, &bp);
197                                 if (error)
198                                         return (error);
199                                 if (DOINGSOFTDEP(vp))
200                                         softdep_setup_allocdirect(ip, lbn,
201                                             dbtofsb(fs, bp->b_blkno), nb,
202                                             nsize, osize, bp);
203                         }
204                 } else {
205                         if (ip->i_size < smalllblktosize(fs, lbn + 1))
206                                 nsize = fragroundup(fs, size);
207                         else
208                                 nsize = fs->fs_bsize;
209                         UFS_LOCK(ump);
210                         error = ffs_alloc(ip, lbn,
211                             ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
212                             nsize, flags, cred, &newb);
213                         if (error)
214                                 return (error);
215                         bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
216                         bp->b_blkno = fsbtodb(fs, newb);
217                         if (flags & BA_CLRBUF)
218                                 vfs_bio_clrbuf(bp);
219                         if (DOINGSOFTDEP(vp))
220                                 softdep_setup_allocdirect(ip, lbn, newb, 0,
221                                     nsize, 0, bp);
222                 }
223                 dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
224                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
225                 *bpp = bp;
226                 return (0);
227         }
228         /*
229          * Determine the number of levels of indirection.
230          */
231         pref = 0;
232         if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
233                 return (error);
234 #ifdef INVARIANTS
235         if (num < 1)
236                 panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
237 #endif
238         saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
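            /*
             * Setting TDP_INBDFLUSH keeps the bdwrite() calls below from
             * triggering a recursive flush of dirty buffers while newly
             * allocated indirect block buffers are held locked.
             */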
239         /*
240          * Fetch the first indirect block allocating if necessary.
241          */
242         --num;
243         nb = dp->di_ib[indirs[0].in_off];
244         allocib = NULL;
245         allocblk = allociblk;
246         lbns_remfree = lbns;
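            /*
             * allociblk[] and lbns[] record each block allocated below,
             * along with its logical block number, so that a partial
             * allocation can be unwound and the blocks freed on failure.
             */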
247         if (nb == 0) {
248                 UFS_LOCK(ump);
249                 pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
250                     (ufs1_daddr_t *)0);
251                 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
252                     flags, cred, &newb)) != 0) {
253                         curthread_pflags_restore(saved_inbdflush);
254                         return (error);
255                 }
256                 pref = newb + fs->fs_frag;
257                 nb = newb;
258                 *allocblk++ = nb;
259                 *lbns_remfree++ = indirs[1].in_lbn;
260                 bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
261                 bp->b_blkno = fsbtodb(fs, nb);
262                 vfs_bio_clrbuf(bp);
263                 if (DOINGSOFTDEP(vp)) {
264                         softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
265                             newb, 0, fs->fs_bsize, 0, bp);
266                         bdwrite(bp);
267                 } else {
268                         /*
269                          * Write synchronously so that indirect blocks
270                          * never point at garbage.
271                          */
272                         if (DOINGASYNC(vp))
273                                 bdwrite(bp);
274                         else if ((error = bwrite(bp)) != 0)
275                                 goto fail;
276                 }
277                 allocib = &dp->di_ib[indirs[0].in_off];
278                 *allocib = nb;
279                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
280         }
281         /*
282          * Fetch through the indirect blocks, allocating as necessary.
283          */
284 retry:
285         for (i = 1;;) {
286                 error = bread(vp,
287                     indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
288                 if (error) {
289                         brelse(bp);
290                         goto fail;
291                 }
292                 bap = (ufs1_daddr_t *)bp->b_data;
293                 nb = bap[indirs[i].in_off];
294                 if (i == num)
295                         break;
296                 i += 1;
297                 if (nb != 0) {
298                         bqrelse(bp);
299                         continue;
300                 }
301                 UFS_LOCK(ump);
302                 if (pref == 0)
303                         pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
304                             (ufs1_daddr_t *)0);
305                 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
306                     flags | IO_BUFLOCKED, cred, &newb)) != 0) {
307                         brelse(bp);
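                            /*
                             * On the first allocation failure, ask the
                             * soft updates code to flush work that may be
                             * holding free blocks, then retry once before
                             * giving up.
                             */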
308                         if (++reclaimed == 1) {
309                                 UFS_LOCK(ump);
310                                 softdep_request_cleanup(fs, vp, cred,
311                                     FLUSH_BLOCKS_WAIT);
312                                 UFS_UNLOCK(ump);
313                                 goto retry;
314                         }
315                         if (ppsratecheck(&lastfail, &curfail, 1)) {
316                                 ffs_fserr(fs, ip->i_number, "filesystem full");
317                                 uprintf("\n%s: write failed, filesystem "
318                                     "is full\n", fs->fs_fsmnt);
319                         }
320                         goto fail;
321                 }
322                 pref = newb + fs->fs_frag;
323                 nb = newb;
324                 *allocblk++ = nb;
325                 *lbns_remfree++ = indirs[i].in_lbn;
326                 nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
327                 nbp->b_blkno = fsbtodb(fs, nb);
328                 vfs_bio_clrbuf(nbp);
329                 if (DOINGSOFTDEP(vp)) {
330                         softdep_setup_allocindir_meta(nbp, ip, bp,
331                             indirs[i - 1].in_off, nb);
332                         bdwrite(nbp);
333                 } else {
334                         /*
335                          * Write synchronously so that indirect blocks
336                          * never point at garbage.
337                          */
338                         if ((error = bwrite(nbp)) != 0) {
339                                 brelse(bp);
340                                 goto fail;
341                         }
342                 }
343                 bap[indirs[i - 1].in_off] = nb;
344                 if (allocib == NULL && unwindidx < 0)
345                         unwindidx = i - 1;
346                 /*
347                  * If required, write synchronously, otherwise use
348                  * delayed write.
349                  */
350                 if (flags & IO_SYNC) {
351                         bwrite(bp);
352                 } else {
353                         if (bp->b_bufsize == fs->fs_bsize)
354                                 bp->b_flags |= B_CLUSTEROK;
355                         bdwrite(bp);
356                 }
357         }
358         /*
359          * If asked only for the indirect block, then return it.
360          */
361         if (flags & BA_METAONLY) {
362                 curthread_pflags_restore(saved_inbdflush);
363                 *bpp = bp;
364                 return (0);
365         }
366         /*
367          * Get the data block, allocating if necessary.
368          */
369         if (nb == 0) {
370                 UFS_LOCK(ump);
371                 if (pref == 0)
372                         pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
373                             &bap[0]);
374                 error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
375                     flags | IO_BUFLOCKED, cred, &newb);
376                 if (error) {
377                         brelse(bp);
378                         if (++reclaimed == 1) {
379                                 UFS_LOCK(ump);
380                                 softdep_request_cleanup(fs, vp, cred,
381                                     FLUSH_BLOCKS_WAIT);
382                                 UFS_UNLOCK(ump);
383                                 goto retry;
384                         }
385                         if (ppsratecheck(&lastfail, &curfail, 1)) {
386                                 ffs_fserr(fs, ip->i_number, "filesystem full");
387                                 uprintf("\n%s: write failed, filesystem "
388                                     "is full\n", fs->fs_fsmnt);
389                         }
390                         goto fail;
391                 }
392                 nb = newb;
393                 *allocblk++ = nb;
394                 *lbns_remfree++ = lbn;
395                 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
396                 nbp->b_blkno = fsbtodb(fs, nb);
397                 if (flags & BA_CLRBUF)
398                         vfs_bio_clrbuf(nbp);
399                 if (DOINGSOFTDEP(vp))
400                         softdep_setup_allocindir_page(ip, lbn, bp,
401                             indirs[i].in_off, nb, 0, nbp);
402                 bap[indirs[i].in_off] = nb;
403                 /*
404                  * If required, write synchronously, otherwise use
405                  * delayed write.
406                  */
407                 if (flags & IO_SYNC) {
408                         bwrite(bp);
409                 } else {
410                         if (bp->b_bufsize == fs->fs_bsize)
411                                 bp->b_flags |= B_CLUSTEROK;
412                         bdwrite(bp);
413                 }
414                 curthread_pflags_restore(saved_inbdflush);
415                 *bpp = nbp;
416                 return (0);
417         }
418         brelse(bp);
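            /*
             * If requested clear invalid portions of the buffer.  If we
             * have to do a read-before-write (typical if BA_CLRBUF is set),
             * try to do some read-ahead in the sequential case to reduce
             * the number of I/O transactions.
             */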
419         if (flags & BA_CLRBUF) {
420                 int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
421                 if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
422                         error = cluster_read(vp, ip->i_size, lbn,
423                             (int)fs->fs_bsize, NOCRED,
424                             MAXBSIZE, seqcount, gbflags, &nbp);
425                 } else {
426                         error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
427                             gbflags, &nbp);
428                 }
429                 if (error) {
430                         brelse(nbp);
431                         goto fail;
432                 }
433         } else {
434                 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
435                 nbp->b_blkno = fsbtodb(fs, nb);
436         }
437         curthread_pflags_restore(saved_inbdflush);
438         *bpp = nbp;
439         return (0);
440 fail:
441         curthread_pflags_restore(saved_inbdflush);
442         /*
443          * If we have failed to allocate any blocks, simply return the error.
444          * This is the usual case and avoids the need to fsync the file.
445          */
446         if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
447                 return (error);
448         /*
449          * If we have failed part way through block allocation, we
450          * have to deallocate any indirect blocks that we have allocated.
451          * We have to fsync the file before we start to get rid of all
452          * of its dependencies so that we do not leave them dangling.
453          * We have to sync it at the end so that the soft updates code
454          * does not find any untracked changes. Although this is really
455          * slow, running out of disk space is not expected to be a common
456          * occurrence. The error return from fsync is ignored as we already
457          * have an error to return to the user.
458          *
459          * XXX Still have to journal the free below
460          */
461         (void) ffs_syncvnode(vp, MNT_WAIT, 0);
462         for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
463              blkp < allocblk; blkp++, lbns_remfree++) {
464                 /*
465                  * We shall not leave the freed blocks on the vnode
466                  * buffer object lists.
467                  */
468                 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
469                 if (bp != NULL) {
470                         bp->b_flags |= (B_INVAL | B_RELBUF);
471                         bp->b_flags &= ~B_ASYNC;
472                         brelse(bp);
473                 }
474                 deallocated += fs->fs_bsize;
475         }
476         if (allocib != NULL) {
477                 *allocib = 0;
478         } else if (unwindidx >= 0) {
479                 int r;
480
481                 r = bread(vp, indirs[unwindidx].in_lbn, 
482                     (int)fs->fs_bsize, NOCRED, &bp);
483                 if (r) {
484                         panic("Could not unwind indirect block, error %d", r);
485                         brelse(bp);
486                 } else {
487                         bap = (ufs1_daddr_t *)bp->b_data;
488                         bap[indirs[unwindidx].in_off] = 0;
489                         if (flags & IO_SYNC) {
490                                 bwrite(bp);
491                         } else {
492                                 if (bp->b_bufsize == fs->fs_bsize)
493                                         bp->b_flags |= B_CLUSTEROK;
494                                 bdwrite(bp);
495                         }
496                 }
497         }
498         if (deallocated) {
499 #ifdef QUOTA
500                 /*
501                  * Restore user's disk quota because allocation failed.
502                  */
503                 (void) chkdq(ip, -btodb(deallocated), cred, FORCE);
504 #endif
505                 dp->di_blocks -= btodb(deallocated);
506                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
507         }
508         (void) ffs_syncvnode(vp, MNT_WAIT, 0);
509         /*
510          * After the buffers are invalidated and on-disk pointers are
511          * cleared, free the blocks.
512          */
513         for (blkp = allociblk; blkp < allocblk; blkp++) {
514                 ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
515                     ip->i_number, vp->v_type, NULL);
516         }
517         return (error);
518 }
519
520 /*
521  * Balloc defines the structure of file system storage
522  * by allocating the physical blocks on a device given
523  * the inode and the logical block number in a file.
524  * This is the allocation strategy for UFS2. Above is
525  * the allocation strategy for UFS1.
526  */
527 int
528 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
529     struct ucred *cred, int flags, struct buf **bpp)
530 {
531         struct inode *ip;
532         struct ufs2_dinode *dp;
533         ufs_lbn_t lbn, lastlbn;
534         struct fs *fs;
535         struct buf *bp, *nbp;
536         struct ufsmount *ump;
537         struct indir indirs[NIADDR + 2];
538         ufs2_daddr_t nb, newb, *bap, pref;
539         ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
540         ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
541         int deallocated, osize, nsize, num, i, error;
542         int unwindidx = -1;
543         int saved_inbdflush;
544         static struct timeval lastfail;
545         static int curfail;
546         int gbflags, reclaimed;
547
548         ip = VTOI(vp);
549         dp = ip->i_din2;
550         fs = ip->i_fs;
551         ump = ip->i_ump;
552         lbn = lblkno(fs, startoffset);
553         size = blkoff(fs, startoffset) + size;
554         reclaimed = 0;
555         if (size > fs->fs_bsize)
556                 panic("ffs_balloc_ufs2: blk too big");
557         *bpp = NULL;
558         if (lbn < 0)
559                 return (EFBIG);
560         gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
561
562         if (DOINGSOFTDEP(vp))
563                 softdep_prealloc(vp, MNT_WAIT);
564
565         /*
566          * Check for allocating external data.
567          */
568         if (flags & IO_EXT) {
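                    /*
                     * External data is the extended attribute area: at most
                     * NXADDR direct blocks recorded in di_extb[].  Its
                     * buffers are named by negative logical block numbers
                     * on the vnode and are marked BX_ALTDATA.
                     */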
569                 if (lbn >= NXADDR)
570                         return (EFBIG);
571                 /*
572                  * If the next write will extend the data into a new block,
573          * and the data is currently composed of a fragment,
574                  * this fragment has to be extended to be a full block.
575                  */
576                 lastlbn = lblkno(fs, dp->di_extsize);
577                 if (lastlbn < lbn) {
578                         nb = lastlbn;
579                         osize = sblksize(fs, dp->di_extsize, nb);
580                         if (osize < fs->fs_bsize && osize > 0) {
581                                 UFS_LOCK(ump);
582                                 error = ffs_realloccg(ip, -1 - nb,
583                                     dp->di_extb[nb],
584                                     ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
585                                     &dp->di_extb[0]), osize,
586                                     (int)fs->fs_bsize, flags, cred, &bp);
587                                 if (error)
588                                         return (error);
589                                 if (DOINGSOFTDEP(vp))
590                                         softdep_setup_allocext(ip, nb,
591                                             dbtofsb(fs, bp->b_blkno),
592                                             dp->di_extb[nb],
593                                             fs->fs_bsize, osize, bp);
594                                 dp->di_extsize = smalllblktosize(fs, nb + 1);
595                                 dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
596                                 bp->b_xflags |= BX_ALTDATA;
597                                 ip->i_flag |= IN_CHANGE;
598                                 if (flags & IO_SYNC)
599                                         bwrite(bp);
600                                 else
601                                         bawrite(bp);
602                         }
603                 }
604                 /*
605                  * All blocks are direct blocks
606                  */
607                 if (flags & BA_METAONLY)
608                         panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
609                 nb = dp->di_extb[lbn];
610                 if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
611                         error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
612                             gbflags, &bp);
613                         if (error) {
614                                 brelse(bp);
615                                 return (error);
616                         }
617                         bp->b_blkno = fsbtodb(fs, nb);
618                         bp->b_xflags |= BX_ALTDATA;
619                         *bpp = bp;
620                         return (0);
621                 }
622                 if (nb != 0) {
623                         /*
624                          * Consider need to reallocate a fragment.
625                          */
626                         osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
627                         nsize = fragroundup(fs, size);
628                         if (nsize <= osize) {
629                                 error = bread_gb(vp, -1 - lbn, osize, NOCRED,
630                                     gbflags, &bp);
631                                 if (error) {
632                                         brelse(bp);
633                                         return (error);
634                                 }
635                                 bp->b_blkno = fsbtodb(fs, nb);
636                                 bp->b_xflags |= BX_ALTDATA;
637                         } else {
638                                 UFS_LOCK(ump);
639                                 error = ffs_realloccg(ip, -1 - lbn,
640                                     dp->di_extb[lbn],
641                                     ffs_blkpref_ufs2(ip, lbn, (int)lbn,
642                                     &dp->di_extb[0]), osize, nsize, flags,
643                                     cred, &bp);
644                                 if (error)
645                                         return (error);
646                                 bp->b_xflags |= BX_ALTDATA;
647                                 if (DOINGSOFTDEP(vp))
648                                         softdep_setup_allocext(ip, lbn,
649                                             dbtofsb(fs, bp->b_blkno), nb,
650                                             nsize, osize, bp);
651                         }
652                 } else {
653                         if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
654                                 nsize = fragroundup(fs, size);
655                         else
656                                 nsize = fs->fs_bsize;
657                         UFS_LOCK(ump);
658                         error = ffs_alloc(ip, lbn,
659                            ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
660                            nsize, flags, cred, &newb);
661                         if (error)
662                                 return (error);
663                         bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
664                         bp->b_blkno = fsbtodb(fs, newb);
665                         bp->b_xflags |= BX_ALTDATA;
666                         if (flags & BA_CLRBUF)
667                                 vfs_bio_clrbuf(bp);
668                         if (DOINGSOFTDEP(vp))
669                                 softdep_setup_allocext(ip, lbn, newb, 0,
670                                     nsize, 0, bp);
671                 }
672                 dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
673                 ip->i_flag |= IN_CHANGE;
674                 *bpp = bp;
675                 return (0);
676         }
677         /*
678          * If the next write will extend the file into a new block,
679          * and the file is currently composed of a fragment,
680          * this fragment has to be extended to be a full block.
681          */
682         lastlbn = lblkno(fs, ip->i_size);
683         if (lastlbn < NDADDR && lastlbn < lbn) {
684                 nb = lastlbn;
685                 osize = blksize(fs, ip, nb);
686                 if (osize < fs->fs_bsize && osize > 0) {
687                         UFS_LOCK(ump);
688                         error = ffs_realloccg(ip, nb, dp->di_db[nb],
689                             ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
690                             &dp->di_db[0]), osize, (int)fs->fs_bsize,
691                             flags, cred, &bp);
692                         if (error)
693                                 return (error);
694                         if (DOINGSOFTDEP(vp))
695                                 softdep_setup_allocdirect(ip, nb,
696                                     dbtofsb(fs, bp->b_blkno),
697                                     dp->di_db[nb],
698                                     fs->fs_bsize, osize, bp);
699                         ip->i_size = smalllblktosize(fs, nb + 1);
700                         dp->di_size = ip->i_size;
701                         dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
702                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
703                         if (flags & IO_SYNC)
704                                 bwrite(bp);
705                         else
706                                 bawrite(bp);
707                 }
708         }
709         /*
710          * The first NDADDR blocks are direct blocks
711          */
712         if (lbn < NDADDR) {
713                 if (flags & BA_METAONLY)
714                         panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
715                 nb = dp->di_db[lbn];
716                 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
717                         error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
718                             gbflags, &bp);
719                         if (error) {
720                                 brelse(bp);
721                                 return (error);
722                         }
723                         bp->b_blkno = fsbtodb(fs, nb);
724                         *bpp = bp;
725                         return (0);
726                 }
727                 if (nb != 0) {
728                         /*
729                          * Consider need to reallocate a fragment.
730                          */
731                         osize = fragroundup(fs, blkoff(fs, ip->i_size));
732                         nsize = fragroundup(fs, size);
733                         if (nsize <= osize) {
734                                 error = bread_gb(vp, lbn, osize, NOCRED,
735                                     gbflags, &bp);
736                                 if (error) {
737                                         brelse(bp);
738                                         return (error);
739                                 }
740                                 bp->b_blkno = fsbtodb(fs, nb);
741                         } else {
742                                 UFS_LOCK(ump);
743                                 error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
744                                     ffs_blkpref_ufs2(ip, lbn, (int)lbn,
745                                     &dp->di_db[0]), osize, nsize, flags,
746                                     cred, &bp);
747                                 if (error)
748                                         return (error);
749                                 if (DOINGSOFTDEP(vp))
750                                         softdep_setup_allocdirect(ip, lbn,
751                                             dbtofsb(fs, bp->b_blkno), nb,
752                                             nsize, osize, bp);
753                         }
754                 } else {
755                         if (ip->i_size < smalllblktosize(fs, lbn + 1))
756                                 nsize = fragroundup(fs, size);
757                         else
758                                 nsize = fs->fs_bsize;
759                         UFS_LOCK(ump);
760                         error = ffs_alloc(ip, lbn,
761                             ffs_blkpref_ufs2(ip, lbn, (int)lbn,
762                                 &dp->di_db[0]), nsize, flags, cred, &newb);
763                         if (error)
764                                 return (error);
765                         bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
766                         bp->b_blkno = fsbtodb(fs, newb);
767                         if (flags & BA_CLRBUF)
768                                 vfs_bio_clrbuf(bp);
769                         if (DOINGSOFTDEP(vp))
770                                 softdep_setup_allocdirect(ip, lbn, newb, 0,
771                                     nsize, 0, bp);
772                 }
773                 dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
774                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
775                 *bpp = bp;
776                 return (0);
777         }
778         /*
779          * Determine the number of levels of indirection.
780          */
781         pref = 0;
782         if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
783                 return (error);
784 #ifdef INVARIANTS
785         if (num < 1)
786                 panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
787 #endif
788         saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
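            /*
             * Setting TDP_INBDFLUSH keeps the bdwrite() calls below from
             * triggering a recursive flush of dirty buffers while newly
             * allocated indirect block buffers are held locked.
             */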
789         /*
790          * Fetch the first indirect block allocating if necessary.
791          */
792         --num;
793         nb = dp->di_ib[indirs[0].in_off];
794         allocib = NULL;
795         allocblk = allociblk;
796         lbns_remfree = lbns;
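            /*
             * allociblk[] and lbns[] record each block allocated below,
             * along with its logical block number, so that a partial
             * allocation can be unwound and the blocks freed on failure.
             */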
797         if (nb == 0) {
798                 UFS_LOCK(ump);
799                 pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
800                     (ufs2_daddr_t *)0);
801                 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
802                     flags, cred, &newb)) != 0) {
803                         curthread_pflags_restore(saved_inbdflush);
804                         return (error);
805                 }
806                 pref = newb + fs->fs_frag;
807                 nb = newb;
808                 *allocblk++ = nb;
809                 *lbns_remfree++ = indirs[1].in_lbn;
810                 bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
811                     GB_UNMAPPED);
812                 bp->b_blkno = fsbtodb(fs, nb);
813                 vfs_bio_clrbuf(bp);
814                 if (DOINGSOFTDEP(vp)) {
815                         softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
816                             newb, 0, fs->fs_bsize, 0, bp);
817                         bdwrite(bp);
818                 } else {
819                         /*
820                          * Write synchronously so that indirect blocks
821                          * never point at garbage.
822                          */
823                         if (DOINGASYNC(vp))
824                                 bdwrite(bp);
825                         else if ((error = bwrite(bp)) != 0)
826                                 goto fail;
827                 }
828                 allocib = &dp->di_ib[indirs[0].in_off];
829                 *allocib = nb;
830                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
831         }
832         /*
833          * Fetch through the indirect blocks, allocating as necessary.
834          */
835 retry:
836         for (i = 1;;) {
837                 error = bread(vp,
838                     indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
839                 if (error) {
840                         brelse(bp);
841                         goto fail;
842                 }
843                 bap = (ufs2_daddr_t *)bp->b_data;
844                 nb = bap[indirs[i].in_off];
845                 if (i == num)
846                         break;
847                 i += 1;
848                 if (nb != 0) {
849                         bqrelse(bp);
850                         continue;
851                 }
852                 UFS_LOCK(ump);
853                 if (pref == 0)
854                         pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
855                             (ufs2_daddr_t *)0);
856                 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
857                     flags | IO_BUFLOCKED, cred, &newb)) != 0) {
858                         brelse(bp);
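                            /*
                             * On the first allocation failure, ask the
                             * soft updates code to flush work that may be
                             * holding free blocks, then retry once before
                             * giving up.
                             */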
859                         if (++reclaimed == 1) {
860                                 UFS_LOCK(ump);
861                                 softdep_request_cleanup(fs, vp, cred,
862                                     FLUSH_BLOCKS_WAIT);
863                                 UFS_UNLOCK(ump);
864                                 goto retry;
865                         }
866                         if (ppsratecheck(&lastfail, &curfail, 1)) {
867                                 ffs_fserr(fs, ip->i_number, "filesystem full");
868                                 uprintf("\n%s: write failed, filesystem "
869                                     "is full\n", fs->fs_fsmnt);
870                         }
871                         goto fail;
872                 }
873                 pref = newb + fs->fs_frag;
874                 nb = newb;
875                 *allocblk++ = nb;
876                 *lbns_remfree++ = indirs[i].in_lbn;
877                 nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
878                     GB_UNMAPPED);
879                 nbp->b_blkno = fsbtodb(fs, nb);
880                 vfs_bio_clrbuf(nbp);
881                 if (DOINGSOFTDEP(vp)) {
882                         softdep_setup_allocindir_meta(nbp, ip, bp,
883                             indirs[i - 1].in_off, nb);
884                         bdwrite(nbp);
885                 } else {
886                         /*
887                          * Write synchronously so that indirect blocks
888                          * never point at garbage.
889                          */
890                         if ((error = bwrite(nbp)) != 0) {
891                                 brelse(bp);
892                                 goto fail;
893                         }
894                 }
895                 bap[indirs[i - 1].in_off] = nb;
896                 if (allocib == NULL && unwindidx < 0)
897                         unwindidx = i - 1;
898                 /*
899                  * If required, write synchronously, otherwise use
900                  * delayed write.
901                  */
902                 if (flags & IO_SYNC) {
903                         bwrite(bp);
904                 } else {
905                         if (bp->b_bufsize == fs->fs_bsize)
906                                 bp->b_flags |= B_CLUSTEROK;
907                         bdwrite(bp);
908                 }
909         }
910         /*
911          * If asked only for the indirect block, then return it.
912          */
913         if (flags & BA_METAONLY) {
914                 curthread_pflags_restore(saved_inbdflush);
915                 *bpp = bp;
916                 return (0);
917         }
918         /*
919          * Get the data block, allocating if necessary.
920          */
921         if (nb == 0) {
922                 UFS_LOCK(ump);
923                 if (pref == 0)
924                         pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
925                             &bap[0]);
926                 error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
927                     flags | IO_BUFLOCKED, cred, &newb);
928                 if (error) {
929                         brelse(bp);
930                         if (++reclaimed == 1) {
931                                 UFS_LOCK(ump);
932                                 softdep_request_cleanup(fs, vp, cred,
933                                     FLUSH_BLOCKS_WAIT);
934                                 UFS_UNLOCK(ump);
935                                 goto retry;
936                         }
937                         if (ppsratecheck(&lastfail, &curfail, 1)) {
938                                 ffs_fserr(fs, ip->i_number, "filesystem full");
939                                 uprintf("\n%s: write failed, filesystem "
940                                     "is full\n", fs->fs_fsmnt);
941                         }
942                         goto fail;
943                 }
944                 nb = newb;
945                 *allocblk++ = nb;
946                 *lbns_remfree++ = lbn;
947                 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
948                 nbp->b_blkno = fsbtodb(fs, nb);
949                 if (flags & BA_CLRBUF)
950                         vfs_bio_clrbuf(nbp);
951                 if (DOINGSOFTDEP(vp))
952                         softdep_setup_allocindir_page(ip, lbn, bp,
953                             indirs[i].in_off, nb, 0, nbp);
954                 bap[indirs[i].in_off] = nb;
955                 /*
956                  * If required, write synchronously, otherwise use
957                  * delayed write.
958                  */
959                 if (flags & IO_SYNC) {
960                         bwrite(bp);
961                 } else {
962                         if (bp->b_bufsize == fs->fs_bsize)
963                                 bp->b_flags |= B_CLUSTEROK;
964                         bdwrite(bp);
965                 }
966                 curthread_pflags_restore(saved_inbdflush);
967                 *bpp = nbp;
968                 return (0);
969         }
970         brelse(bp);
971         /*
972          * If requested clear invalid portions of the buffer.  If we
973          * have to do a read-before-write (typical if BA_CLRBUF is set),
974          * try to do some read-ahead in the sequential case to reduce
975          * the number of I/O transactions.
976          */
977         if (flags & BA_CLRBUF) {
978                 int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
979                 if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
980                         error = cluster_read(vp, ip->i_size, lbn,
981                             (int)fs->fs_bsize, NOCRED,
982                             MAXBSIZE, seqcount, gbflags, &nbp);
983                 } else {
984                         error = bread_gb(vp, lbn, (int)fs->fs_bsize,
985                             NOCRED, gbflags, &nbp);
986                 }
987                 if (error) {
988                         brelse(nbp);
989                         goto fail;
990                 }
991         } else {
992                 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
993                 nbp->b_blkno = fsbtodb(fs, nb);
994         }
995         curthread_pflags_restore(saved_inbdflush);
996         *bpp = nbp;
997         return (0);
998 fail:
999         curthread_pflags_restore(saved_inbdflush);
1000         /*
1001          * If we have failed to allocate any blocks, simply return the error.
1002          * This is the usual case and avoids the need to fsync the file.
1003          */
1004         if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1005                 return (error);
1006         /*
1007          * If we have failed part way through block allocation, we
1008          * have to deallocate any indirect blocks that we have allocated.
1009          * We have to fsync the file before we start to get rid of all
1010          * of its dependencies so that we do not leave them dangling.
1011          * We have to sync it at the end so that the soft updates code
1012          * does not find any untracked changes. Although this is really
1013          * slow, running out of disk space is not expected to be a common
1014          * occurrence. The error return from fsync is ignored as we already
1015          * have an error to return to the user.
1016          *
1017          * XXX Still have to journal the free below
1018          */
1019         (void) ffs_syncvnode(vp, MNT_WAIT, 0);
1020         for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1021              blkp < allocblk; blkp++, lbns_remfree++) {
1022                 /*
1023                  * We shall not leave the freed blocks on the vnode
1024                  * buffer object lists.
1025                  */
1026                 bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
1027                 if (bp != NULL) {
1028                         bp->b_flags |= (B_INVAL | B_RELBUF);
1029                         bp->b_flags &= ~B_ASYNC;
1030                         brelse(bp);
1031                 }
1032                 deallocated += fs->fs_bsize;
1033         }
1034         if (allocib != NULL) {
1035                 *allocib = 0;
1036         } else if (unwindidx >= 0) {
1037                 int r;
1038
1039                 r = bread(vp, indirs[unwindidx].in_lbn, 
1040                     (int)fs->fs_bsize, NOCRED, &bp);
1041                 if (r) {
1042                         panic("Could not unwind indirect block, error %d", r);
1043                         brelse(bp);
1044                 } else {
1045                         bap = (ufs2_daddr_t *)bp->b_data;
1046                         bap[indirs[unwindidx].in_off] = 0;
1047                         if (flags & IO_SYNC) {
1048                                 bwrite(bp);
1049                         } else {
1050                                 if (bp->b_bufsize == fs->fs_bsize)
1051                                         bp->b_flags |= B_CLUSTEROK;
1052                                 bdwrite(bp);
1053                         }
1054                 }
1055         }
1056         if (deallocated) {
1057 #ifdef QUOTA
1058                 /*
1059                  * Restore user's disk quota because allocation failed.
1060                  */
1061                 (void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1062 #endif
1063                 dp->di_blocks -= btodb(deallocated);
1064                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1065         }
1066         (void) ffs_syncvnode(vp, MNT_WAIT, 0);
1067         /*
1068          * After the buffers are invalidated and on-disk pointers are
1069          * cleared, free the blocks.
1070          */
1071         for (blkp = allociblk; blkp < allocblk; blkp++) {
1072                 ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
1073                     ip->i_number, vp->v_type, NULL);
1074         }
1075         return (error);
1076 }