sys/ufs/ffs/ffs_snapshot.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
   5  *
   6  * Further information about snapshots can be obtained from:
   7  *
   8  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   9  *      1614 Oxford Street              mckusick@mckusick.com
  10  *      Berkeley, CA 94709-1608         +1-510-843-9542
  11  *      USA
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  *
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  24  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  25  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  26  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  27  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  *      @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
  36  */
  37
  38 #include <sys/cdefs.h>
  39 __FBSDID("$FreeBSD$");
  40
  41 #include "opt_quota.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/kernel.h>
  45 #include <sys/systm.h>
  46 #include <sys/conf.h>
  47 #include <sys/gsb_crc32.h>
  48 #include <sys/bio.h>
  49 #include <sys/buf.h>
  50 #include <sys/fcntl.h>
  51 #include <sys/proc.h>
  52 #include <sys/namei.h>
  53 #include <sys/sched.h>
  54 #include <sys/stat.h>
  55 #include <sys/malloc.h>
  56 #include <sys/mount.h>
  57 #include <sys/resource.h>
  58 #include <sys/resourcevar.h>
  59 #include <sys/rwlock.h>
  60 #include <sys/vnode.h>
  61
  62 #include <geom/geom.h>
  63
  64 #include <ufs/ufs/extattr.h>
  65 #include <ufs/ufs/quota.h>
  66 #include <ufs/ufs/ufsmount.h>
  67 #include <ufs/ufs/inode.h>
  68 #include <ufs/ufs/ufs_extern.h>
  69
  70 #include <ufs/ffs/fs.h>
  71 #include <ufs/ffs/ffs_extern.h>
  72
  73 #define KERNCRED thread0.td_ucred
  74
  75 #include "opt_ffs.h"
  76
  77 #ifdef NO_FFS_SNAPSHOT
  /*
   * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
   *
   * Snapshot creation is not compiled in, so the request is always
   * refused.  Both arguments are accepted only to keep the interface
   * identical to the real implementation and are ignored.
   *
   * Returns EINVAL unconditionally.
   */
  int
  ffs_snapshot(struct mount *mp, char *snapfile)
  {
          return (EINVAL);
  }
  85
  86 int
  87 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
  88         struct fs *fs;
  89         struct vnode *devvp;
  90         ufs2_daddr_t bno;
  91         long size;
  92         ino_t inum;
  93         enum vtype vtype;
  94         struct workhead *wkhd;
  95 {
  96         return (EINVAL);
  97 }
  98
  /*
   * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
   *
   * Does nothing; vp is accepted only to keep the interface identical
   * to the real implementation.
   */
  void
  ffs_snapremove(struct vnode *vp)
  {
  }
 104
 /*
  * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
  *
  * Does nothing; mp is accepted only to keep the interface identical
  * to the real implementation.
  */
 void
 ffs_snapshot_mount(struct mount *mp)
 {
 }
 110
 /*
  * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
  *
  * Does nothing; mp is accepted only to keep the interface identical
  * to the real implementation.
  */
 void
 ffs_snapshot_unmount(struct mount *mp)
 {
 }
 116
 /*
  * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
  *
  * Does nothing; ip is accepted only to keep the interface identical
  * to the real implementation.
  */
 void
 ffs_snapgone(struct inode *ip)
 {
 }
 122
 /*
  * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
  *
  * Both arguments are accepted only to keep the interface identical to
  * the real implementation and are ignored.
  *
  * Returns EINVAL unconditionally.
  */
 int
 ffs_copyonwrite(struct vnode *devvp, struct buf *bp)
 {
         return (EINVAL);
 }
 130
 /*
  * Stub used when the kernel is built with NO_FFS_SNAPSHOT.
  *
  * Does nothing; mp and waitfor are accepted only to keep the interface
  * identical to the real implementation.
  */
 void
 ffs_sync_snap(struct mount *mp, int waitfor)
 {
 }
 137
 138 #else
 /*
  * Built only when NO_FFS_SNAPSHOT is not defined: declare the
  * "ffs_snapshot" kernel feature together with its description string.
  */
 139 FEATURE(ffs_snapshot, "FFS snapshot support");
 140
 /*
  * List head of struct snapdata entries, and a mutex that MTX_SYSINIT
  * initializes under the name "snapdata free list".
  * NOTE(review): the names suggest a cache of unused snapdata structures
  * guarded by snapfree_lock -- confirm against the users of these two
  * objects, which are not visible in this part of the file.
  */
 141 LIST_HEAD(, snapdata) snapfree;
 142 static struct mtx snapfree_lock;
 143 MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
 144
 /*
  * Forward declarations of the file-local helpers.
  *
  * The *_ufs1 and *_ufs2 families are declared in parallel and differ
  * only in the block-pointer type (ufs1_daddr_t vs. ufs2_daddr_t).
  * fullacct_*, snapacct_* and mapacct_* have exactly the signature of
  * the function-pointer parameter taken by expunge_* and indiracct_*,
  * so any of them can be handed in as that callback.
  */
 145 static int cgaccount(int, struct vnode *, struct buf *, int);
 146 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
 147     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
 148     ufs_lbn_t, int), int, int);
 149 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
 150     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
 151     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
 152     ufs_lbn_t, int), int);
 153 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 154     struct fs *, ufs_lbn_t, int);
 155 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 156     struct fs *, ufs_lbn_t, int);
 157 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 158     struct fs *, ufs_lbn_t, int);
 159 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
 160     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
 161     ufs_lbn_t, int), int, int);
 162 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
 163     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
 164     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
 165     ufs_lbn_t, int), int);
 166 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 167     struct fs *, ufs_lbn_t, int);
 168 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 169     struct fs *, ufs_lbn_t, int);
 170 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 171     struct fs *, ufs_lbn_t, int);
 /* Remaining helpers that are not split by on-disk format. */
 172 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 173 static void try_free_snapdata(struct vnode *devvp);
 174 static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
 175 static int ffs_bp_snapblk(struct vnode *, struct buf *);
 176
 177 /*
 178  * To ensure the consistency of snapshots across crashes, we must
 179  * synchronously write out copied blocks before allowing the
 180  * originals to be modified. Because of the rather severe speed
 181  * penalty that this imposes, the code normally only ensures
 182  * persistence for the filesystem metadata contained within a
 183  * snapshot. Setting the following flag allows this crash
 184  * persistence to be enabled for file contents.
 185  */
 186 int dopersistence = 0;
 187
 188 #ifdef DIAGNOSTIC
 /*
  * DIAGNOSTIC kernels only: export the knobs below as read-write
  * integers under the debug sysctl node (all default to 0).
  *   dopersistence    - the crash-persistence flag defined just above.
  *   snapdebug        - when non-zero, ffs_snapshot() prints each busy
  *                      vnode it looks at while scanning the mount's
  *                      vnodes for unlinked files.
  *   collectsnapstats - when non-zero, ffs_snapshot() records the time
  *                      at which the filesystem is suspended and prints
  *                      the elapsed suspension time after resuming.
  */
 189 #include <sys/sysctl.h>
 190 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 191 static int snapdebug = 0;
 192 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 193 int collectsnapstats = 0;
 194 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 195         0, "");
 196 #endif /* DIAGNOSTIC */
 197
 198 /*
 199  * Create a snapshot file and initialize it for the filesystem.
 200  */
 201 int
 202 ffs_snapshot(mp, snapfile)
 203         struct mount *mp;
 204         char *snapfile;
 205 {
 206         ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 207         int error, cg, snaploc;
 208         int i, size, len, loc;
 209         ufs2_daddr_t blockno;
 210         uint64_t flag;
 211         char saved_nice = 0;
 212         long redo = 0, snaplistsize = 0;
 213         int32_t *lp;
 214         void *space;
 215         struct fs *copy_fs = NULL, *fs;
 216         struct thread *td = curthread;
 217         struct inode *ip, *xp;
 218         struct buf *bp, *nbp, *ibp;
 219         struct nameidata nd;
 220         struct mount *wrtmp;
 221         struct vattr vat;
 222         struct vnode *vp, *xvp, *mvp, *devvp;
 223         struct uio auio;
 224         struct iovec aiov;
 225         struct snapdata *sn;
 226         struct ufsmount *ump;
 227 #ifdef DIAGNOSTIC
 228         struct timespec starttime = {0, 0}, endtime;
 229 #endif
 230
 231         ump = VFSTOUFS(mp);
 232         fs = ump->um_fs;
 233         sn = NULL;
 234         /*
 235          * At the moment, journaled soft updates cannot support
 236          * taking snapshots.
 237          */
 238         if (MOUNTEDSUJ(mp)) {
 239                 vfs_mount_error(mp, "%s: Snapshots are not yet supported when "
 240                     "running with journaled soft updates", fs->fs_fsmnt);
 241                 return (EOPNOTSUPP);
 242         }
 243         MNT_ILOCK(mp);
 244         flag = mp->mnt_flag;
 245         MNT_IUNLOCK(mp);
 246         /*
 247          * Need to serialize access to snapshot code per filesystem.
 248          */
 249         /*
 250          * Assign a snapshot slot in the superblock.
 251          */
 252         UFS_LOCK(ump);
 253         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 254                 if (fs->fs_snapinum[snaploc] == 0)
 255                         break;
 256         UFS_UNLOCK(ump);
 257         if (snaploc == FSMAXSNAP)
 258                 return (ENOSPC);
 259         /*
 260          * Create the snapshot file.
 261          */
 262 restart:
 263         NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE,
 264             snapfile, td);
 265         if ((error = namei(&nd)) != 0)
 266                 return (error);
 267         if (nd.ni_vp != NULL) {
 268                 vput(nd.ni_vp);
 269                 error = EEXIST;
 270         }
 271         if (nd.ni_dvp->v_mount != mp)
 272                 error = EXDEV;
 273         if (error) {
 274                 NDFREE(&nd, NDF_ONLY_PNBUF);
 275                 if (nd.ni_dvp == nd.ni_vp)
 276                         vrele(nd.ni_dvp);
 277                 else
 278                         vput(nd.ni_dvp);
 279                 return (error);
 280         }
 281         VATTR_NULL(&vat);
 282         vat.va_type = VREG;
 283         vat.va_mode = S_IRUSR;
 284         vat.va_vaflags |= VA_EXCLUSIVE;
 285         if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 286                 wrtmp = NULL;
 287         if (wrtmp != mp)
 288                 panic("ffs_snapshot: mount mismatch");
 289         vfs_rel(wrtmp);
 290         if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 291                 NDFREE(&nd, NDF_ONLY_PNBUF);
 292                 vput(nd.ni_dvp);
 293                 if ((error = vn_start_write(NULL, &wrtmp,
 294                     V_XSLEEP | PCATCH)) != 0)
 295                         return (error);
 296                 goto restart;
 297         }
 298         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 299         VOP_UNLOCK(nd.ni_dvp);
 300         if (error) {
 301                 NDFREE(&nd, NDF_ONLY_PNBUF);
 302                 vn_finished_write(wrtmp);
 303                 vrele(nd.ni_dvp);
 304                 if (error == ERELOOKUP)
 305                         goto restart;
 306                 return (error);
 307         }
 308         vp = nd.ni_vp;
 309         vnode_create_vobject(nd.ni_vp, fs->fs_size, td);
 310         vp->v_vflag |= VV_SYSTEM;
 311         ip = VTOI(vp);
 312         devvp = ITODEVVP(ip);
 313         /*
 314          * Allocate and copy the last block contents so as to be able
 315          * to set size to that of the filesystem.
 316          */
 317         numblks = howmany(fs->fs_size, fs->fs_frag);
 318         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 319             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 320         if (error)
 321                 goto out;
 322         ip->i_size = lblktosize(fs, (off_t)numblks);
 323         DIP_SET(ip, i_size, ip->i_size);
 324         UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
 325         error = readblock(vp, bp, numblks - 1);
 326         bawrite(bp);
 327         if (error != 0)
 328                 goto out;
 329         /*
 330          * Preallocate critical data structures so that we can copy
 331          * them in without further allocation after we suspend all
 332          * operations on the filesystem. We would like to just release
 333          * the allocated buffers without writing them since they will
 334          * be filled in below once we are ready to go, but this upsets
 335          * the soft update code, so we go ahead and write the new buffers.
 336          *
 337          * Allocate all indirect blocks and mark all of them as not
 338          * needing to be copied.
 339          */
 340         for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 341                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 342                     fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 343                 if (error)
 344                         goto out;
 345                 bawrite(ibp);
 346         }
 347         /*
 348          * Allocate copies for the superblock and its summary information.
 349          */
 350         error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
 351             0, &nbp);
 352         if (error)
 353                 goto out;
 354         bawrite(nbp);
 355         blkno = fragstoblks(fs, fs->fs_csaddr);
 356         len = howmany(fs->fs_cssize, fs->fs_bsize);
 357         for (loc = 0; loc < len; loc++) {
 358                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 359                     fs->fs_bsize, KERNCRED, 0, &nbp);
 360                 if (error)
 361                         goto out;
 362                 bawrite(nbp);
 363         }
 364         /*
 365          * Allocate all cylinder group blocks.
 366          */
 367         for (cg = 0; cg < fs->fs_ncg; cg++) {
 368                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 369                     fs->fs_bsize, KERNCRED, 0, &nbp);
 370                 if (error)
 371                         goto out;
 372                 bawrite(nbp);
 373                 if (cg % 10 == 0) {
 374                         error = ffs_syncvnode(vp, MNT_WAIT, 0);
 375                         /* vp possibly reclaimed if unlocked */
 376                         if (error != 0)
 377                                 goto out;
 378                 }
 379         }
 380         /*
 381          * Copy all the cylinder group maps. Although the
 382          * filesystem is still active, we hope that only a few
 383          * cylinder groups will change between now and when we
 384          * suspend operations. Thus, we will be able to quickly
 385          * touch up the few cylinder groups that changed during
 386          * the suspension period.
 387          */
 388         len = howmany(fs->fs_ncg, NBBY);
 389         space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
 390         UFS_LOCK(ump);
 391         fs->fs_active = space;
 392         UFS_UNLOCK(ump);
 393         for (cg = 0; cg < fs->fs_ncg; cg++) {
 394                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 395                     fs->fs_bsize, KERNCRED, 0, &nbp);
 396                 if (error)
 397                         goto out;
 398                 error = cgaccount(cg, vp, nbp, 1);
 399                 bawrite(nbp);
 400                 if (cg % 10 == 0 && error == 0)
 401                         error = ffs_syncvnode(vp, MNT_WAIT, 0);
 402                 if (error)
 403                         goto out;
 404         }
 405         /*
 406          * Change inode to snapshot type file.
 407          */
 408         ip->i_flags |= SF_SNAPSHOT;
 409         DIP_SET(ip, i_flags, ip->i_flags);
 410         UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 411         /*
 412          * Ensure that the snapshot is completely on disk.
 413          * Since we have marked it as a snapshot it is safe to
 414          * unlock it as no process will be allowed to write to it.
 415          */
 416         if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 417                 goto out;
 418         VOP_UNLOCK(vp);
 419         /*
 420          * All allocations are done, so we can now snapshot the system.
 421          *
 422          * Rescind nice scheduling while running with the filesystem suspended.
 423          */
 424         if (td->td_proc->p_nice > 0) {
 425                 struct proc *p;
 426
 427                 p = td->td_proc;
 428                 PROC_LOCK(p);
 429                 saved_nice = p->p_nice;
 430                 sched_nice(p, 0);
 431                 PROC_UNLOCK(p);
 432         }
 433         /*
 434          * Suspend operation on filesystem.
 435          */
 436         for (;;) {
 437                 vn_finished_write(wrtmp);
 438                 if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
 439                         vn_start_write(NULL, &wrtmp, V_WAIT);
 440                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 441                         goto out;
 442                 }
 443                 if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 444                         break;
 445                 vn_start_write(NULL, &wrtmp, V_WAIT);
 446         }
 447         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 448         if (ip->i_effnlink == 0) {
 449                 error = ENOENT;         /* Snapshot file unlinked */
 450                 goto out1;
 451         }
 452 #ifdef DIAGNOSTIC
 453         if (collectsnapstats)
 454                 nanotime(&starttime);
 455 #endif
 456
 457         /* The last block might have changed.  Copy it again to be sure. */
 458         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 459             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 460         if (error != 0)
 461                 goto out1;
 462         error = readblock(vp, bp, numblks - 1);
 463         bp->b_flags |= B_VALIDSUSPWRT;
 464         bawrite(bp);
 465         if (error != 0)
 466                 goto out1;
 467         /*
 468          * First, copy all the cylinder group maps that have changed.
 469          */
 470         for (cg = 0; cg < fs->fs_ncg; cg++) {
 471                 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 472                         continue;
 473                 redo++;
 474                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 475                     fs->fs_bsize, KERNCRED, 0, &nbp);
 476                 if (error)
 477                         goto out1;
 478                 error = cgaccount(cg, vp, nbp, 2);
 479                 bawrite(nbp);
 480                 if (error)
 481                         goto out1;
 482         }
 483         /*
 484          * Grab a copy of the superblock and its summary information.
 485          * We delay writing it until the suspension is released below.
 486          */
 487         copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK);
 488         bcopy(fs, copy_fs, fs->fs_sbsize);
 489         copy_fs->fs_si = malloc(sizeof(struct fs_summary_info), M_UFSMNT,
 490             M_ZERO | M_WAITOK);
 491         if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 492                 copy_fs->fs_clean = 1;
 493         size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 494         if (fs->fs_sbsize < size)
 495                 bzero(&((char *)copy_fs)[fs->fs_sbsize],
 496                     size - fs->fs_sbsize);
 497         size = blkroundup(fs, fs->fs_cssize);
 498         if (fs->fs_contigsumsize > 0)
 499                 size += fs->fs_ncg * sizeof(int32_t);
 500         space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 501         copy_fs->fs_csp = space;
 502         bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 503         space = (char *)space + fs->fs_cssize;
 504         loc = howmany(fs->fs_cssize, fs->fs_fsize);
 505         i = fs->fs_frag - loc % fs->fs_frag;
 506         len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 507         if (len > 0) {
 508                 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 509                     len, KERNCRED, &bp)) != 0) {
 510                         brelse(bp);
 511                         free(copy_fs->fs_csp, M_UFSMNT);
 512                         free(copy_fs->fs_si, M_UFSMNT);
 513                         free(copy_fs, M_UFSMNT);
 514                         copy_fs = NULL;
 515                         goto out1;
 516                 }
 517                 bcopy(bp->b_data, space, (u_int)len);
 518                 space = (char *)space + len;
 519                 bp->b_flags |= B_INVAL | B_NOCACHE;
 520                 brelse(bp);
 521         }
 522         if (fs->fs_contigsumsize > 0) {
 523                 copy_fs->fs_maxcluster = lp = space;
 524                 for (i = 0; i < fs->fs_ncg; i++)
 525                         *lp++ = fs->fs_contigsumsize;
 526         }
 527         /*
 528          * We must check for active files that have been unlinked
 529          * (e.g., with a zero link count). We have to expunge all
 530          * trace of these files from the snapshot so that they are
 531          * not reclaimed prematurely by fsck or unnecessarily dumped.
 532          * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 533          * spec_strategy about writing on a suspended filesystem.
 534          * Note that we skip unlinked snapshot files as they will
 535          * be handled separately below.
 536          *
 537          * We also calculate the needed size for the snapshot list.
 538          */
 539         snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 540             FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 541         MNT_ILOCK(mp);
 542         mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 543         MNT_IUNLOCK(mp);
 544 loop:
 545         MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
 546                 if ((xvp->v_usecount == 0 &&
 547                      (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
 548                     xvp->v_type == VNON ||
 549                     IS_SNAPSHOT(VTOI(xvp))) {
 550                         VI_UNLOCK(xvp);
 551                         continue;
 552                 }
 553                 /*
 554                  * We can skip parent directory vnode because it must have
 555                  * this snapshot file in it.
 556                  */
 557                 if (xvp == nd.ni_dvp) {
 558                         VI_UNLOCK(xvp);
 559                         continue;
 560                 }
 561                 vholdl(xvp);
 562                 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
 563                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 564                         vdrop(xvp);
 565                         goto loop;
 566                 }
 567                 VI_LOCK(xvp);
 568                 if (xvp->v_usecount == 0 &&
 569                     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
 570                         VI_UNLOCK(xvp);
 571                         VOP_UNLOCK(xvp);
 572                         vdrop(xvp);
 573                         continue;
 574                 }
 575                 VI_UNLOCK(xvp);
 576 #ifdef DIAGNOSTIC
 577                 if (snapdebug)
 578                         vn_printf(xvp, "ffs_snapshot: busy vnode ");
 579 #endif
 580                 if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
 581                     vat.va_nlink > 0) {
 582                         VOP_UNLOCK(xvp);
 583                         vdrop(xvp);
 584                         continue;
 585                 }
 586                 xp = VTOI(xvp);
 587                 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 588                         VOP_UNLOCK(xvp);
 589                         vdrop(xvp);
 590                         continue;
 591                 }
 592                 /*
 593                  * If there is a fragment, clear it here.
 594                  */
 595                 blkno = 0;
 596                 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 597                 if (loc < UFS_NDADDR) {
 598                         len = fragroundup(fs, blkoff(fs, xp->i_size));
 599                         if (len != 0 && len < fs->fs_bsize) {
 600                                 ffs_blkfree(ump, copy_fs, vp,
 601                                     DIP(xp, i_db[loc]), len, xp->i_number,
 602                                     xvp->v_type, NULL, SINGLETON_KEY);
 603                                 blkno = DIP(xp, i_db[loc]);
 604                                 DIP_SET(xp, i_db[loc], 0);
 605                         }
 606                 }
 607                 snaplistsize += 1;
 608                 if (I_IS_UFS1(xp))
 609                         error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 610                             BLK_NOCOPY, 1);
 611                 else
 612                         error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 613                             BLK_NOCOPY, 1);
 614                 if (blkno)
 615                         DIP_SET(xp, i_db[loc], blkno);
 616                 if (!error)
 617                         error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
 618                             xp->i_mode, NULL);
 619                 VOP_UNLOCK(xvp);
 620                 vdrop(xvp);
 621                 if (error) {
 622                         free(copy_fs->fs_csp, M_UFSMNT);
 623                         free(copy_fs->fs_si, M_UFSMNT);
 624                         free(copy_fs, M_UFSMNT);
 625                         copy_fs = NULL;
 626                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 627                         goto out1;
 628                 }
 629         }
 630         /*
 631          * Erase the journal file from the snapshot.
 632          */
 633         if (fs->fs_flags & FS_SUJ) {
 634                 error = softdep_journal_lookup(mp, &xvp);
 635                 if (error) {
 636                         free(copy_fs->fs_csp, M_UFSMNT);
 637                         free(copy_fs->fs_si, M_UFSMNT);
 638                         free(copy_fs, M_UFSMNT);
 639                         copy_fs = NULL;
 640                         goto out1;
 641                 }
 642                 xp = VTOI(xvp);
 643                 if (I_IS_UFS1(xp))
 644                         error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 645                             BLK_NOCOPY, 0);
 646                 else
 647                         error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 648                             BLK_NOCOPY, 0);
 649                 vput(xvp);
 650         }
 651         /*
 652          * Acquire a lock on the snapdata structure, creating it if necessary.
 653          */
 654         sn = ffs_snapdata_acquire(devvp);
 655         /*
 656          * Change vnode to use shared snapshot lock instead of the original
 657          * private lock.
 658          */
 659         vp->v_vnlock = &sn->sn_lock;
 660         lockmgr(&vp->v_lock, LK_RELEASE, NULL);
 661         xp = TAILQ_FIRST(&sn->sn_head);
 662         /*
 663          * If this is the first snapshot on this filesystem, then we need
 664          * to allocate the space for the list of preallocated snapshot blocks.
 665          * This list will be refined below, but this preliminary one will
 666          * keep us out of deadlock until the full one is ready.
 667          */
 668         if (xp == NULL) {
 669                 snapblklist = malloc(snaplistsize * sizeof(daddr_t),
 670                     M_UFSMNT, M_WAITOK);
 671                 blkp = &snapblklist[1];
 672                 *blkp++ = lblkno(fs, fs->fs_sblockloc);
 673                 blkno = fragstoblks(fs, fs->fs_csaddr);
 674                 for (cg = 0; cg < fs->fs_ncg; cg++) {
 675                         if (fragstoblks(fs, cgtod(fs, cg) > blkno))
 676                                 break;
 677                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
 678                 }
 679                 len = howmany(fs->fs_cssize, fs->fs_bsize);
 680                 for (loc = 0; loc < len; loc++)
 681                         *blkp++ = blkno + loc;
 682                 for (; cg < fs->fs_ncg; cg++)
 683                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
 684                 snapblklist[0] = blkp - snapblklist;
 685                 VI_LOCK(devvp);
 686                 if (sn->sn_blklist != NULL)
 687                         panic("ffs_snapshot: non-empty list");
 688                 sn->sn_blklist = snapblklist;
 689                 sn->sn_listsize = blkp - snapblklist;
 690                 VI_UNLOCK(devvp);
 691         }
 692         /*
 693          * Record snapshot inode. Since this is the newest snapshot,
 694          * it must be placed at the end of the list.
 695          */
 696         VI_LOCK(devvp);
 697         fs->fs_snapinum[snaploc] = ip->i_number;
 698         if (ip->i_nextsnap.tqe_prev != 0)
 699                 panic("ffs_snapshot: %ju already on list",
 700                     (uintmax_t)ip->i_number);
 701         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 702         devvp->v_vflag |= VV_COPYONWRITE;
 703         VI_UNLOCK(devvp);
 704         ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 705 out1:
 706         KASSERT((sn != NULL && copy_fs != NULL && error == 0) ||
 707                 (sn == NULL && copy_fs == NULL && error != 0),
 708                 ("email phk@ and mckusick@"));
 709         /*
 710          * Resume operation on filesystem.
 711          */
 712         vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
 713 #ifdef DIAGNOSTIC
 714         if (collectsnapstats && starttime.tv_sec > 0) {
 715                 nanotime(&endtime);
 716                 timespecsub(&endtime, &starttime, &endtime);
 717                 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
 718                     vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
 719                     endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 720         }
 721 #endif
 722         if (copy_fs == NULL)
 723                 goto out;
 724         /*
 725          * Copy allocation information from all the snapshots in
 726          * this snapshot and then expunge them from its view.
 727          */
 728         TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
 729                 if (xp == ip)
 730                         break;
 731                 if (I_IS_UFS1(xp))
 732                         error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 733                             BLK_SNAP, 0);
 734                 else
 735                         error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 736                             BLK_SNAP, 0);
 737                 if (error == 0 && xp->i_effnlink == 0) {
 738                         error = ffs_freefile(ump,
 739                                              copy_fs,
 740                                              vp,
 741                                              xp->i_number,
 742                                              xp->i_mode, NULL);
 743                 }
 744                 if (error) {
 745                         fs->fs_snapinum[snaploc] = 0;
 746                         goto done;
 747                 }
 748         }
 749         /*
 750          * Allocate space for the full list of preallocated snapshot blocks.
 751          */
 752         snapblklist = malloc(snaplistsize * sizeof(daddr_t),
 753             M_UFSMNT, M_WAITOK);
 754         ip->i_snapblklist = &snapblklist[1];
 755         /*
 756          * Expunge the blocks used by the snapshots from the set of
 757          * blocks marked as used in the snapshot bitmaps. Also, collect
 758          * the list of allocated blocks in i_snapblklist.
 759          */
 760         if (I_IS_UFS1(ip))
 761                 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
 762                     BLK_SNAP, 0);
 763         else
 764                 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
 765                     BLK_SNAP, 0);
 766         if (error) {
 767                 fs->fs_snapinum[snaploc] = 0;
 768                 free(snapblklist, M_UFSMNT);
 769                 goto done;
 770         }
 771         if (snaplistsize < ip->i_snapblklist - snapblklist)
 772                 panic("ffs_snapshot: list too small");
 773         snaplistsize = ip->i_snapblklist - snapblklist;
 774         snapblklist[0] = snaplistsize;
 775         ip->i_snapblklist = 0;
 776         /*
 777          * Write out the list of allocated blocks to the end of the snapshot.
 778          */
 779         auio.uio_iov = &aiov;
 780         auio.uio_iovcnt = 1;
 781         aiov.iov_base = (void *)snapblklist;
 782         aiov.iov_len = snaplistsize * sizeof(daddr_t);
 783         auio.uio_resid = aiov.iov_len;
 784         auio.uio_offset = ip->i_size;
 785         auio.uio_segflg = UIO_SYSSPACE;
 786         auio.uio_rw = UIO_WRITE;
 787         auio.uio_td = td;
 788         if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 789                 fs->fs_snapinum[snaploc] = 0;
 790                 free(snapblklist, M_UFSMNT);
 791                 goto done;
 792         }
 793         /*
 794          * Write the superblock and its summary information
 795          * to the snapshot.
 796          */
 797         blkno = fragstoblks(fs, fs->fs_csaddr);
 798         len = howmany(fs->fs_cssize, fs->fs_bsize);
 799         space = copy_fs->fs_csp;
 800         for (loc = 0; loc < len; loc++) {
 801                 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 802                 if (error) {
 803                         fs->fs_snapinum[snaploc] = 0;
 804                         free(snapblklist, M_UFSMNT);
 805                         goto done;
 806                 }
 807                 bcopy(space, nbp->b_data, fs->fs_bsize);
 808                 space = (char *)space + fs->fs_bsize;
 809                 bawrite(nbp);
 810         }
 811         error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
 812             KERNCRED, &nbp);
 813         if (error) {
 814                 brelse(nbp);
 815         } else {
 816                 loc = blkoff(fs, fs->fs_sblockloc);
 817                 copy_fs->fs_fmod = 0;
 818                 copy_fs->fs_ckhash = ffs_calc_sbhash(copy_fs);
 819                 bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize);
 820                 bawrite(nbp);
 821         }
 822         /*
 823          * As this is the newest list, it is the most inclusive, so
 824          * should replace the previous list.
 825          */
 826         VI_LOCK(devvp);
 827         space = sn->sn_blklist;
 828         sn->sn_blklist = snapblklist;
 829         sn->sn_listsize = snaplistsize;
 830         VI_UNLOCK(devvp);
 831         if (space != NULL)
 832                 free(space, M_UFSMNT);
 833         /*
 834          * Preallocate all the direct blocks in the snapshot inode so
 835          * that we never have to write the inode itself to commit an
 836          * update to the contents of the snapshot. Note that once
 837          * created, the size of the snapshot will never change, so
 838          * there will never be a need to write the inode except to
 839          * update the non-integrity-critical time fields and
 840          * allocated-block count.
 841          */
 842         for (blockno = 0; blockno < UFS_NDADDR; blockno++) {
 843                 if (DIP(ip, i_db[blockno]) != 0)
 844                         continue;
 845                 error = UFS_BALLOC(vp, lblktosize(fs, blockno),
 846                     fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 847                 if (error)
 848                         break;
 849                 error = readblock(vp, bp, blockno);
 850                 bawrite(bp);
 851                 if (error != 0)
 852                         break;
 853         }
 854 done:
 855         free(copy_fs->fs_csp, M_UFSMNT);
 856         free(copy_fs->fs_si, M_UFSMNT);
 857         free(copy_fs, M_UFSMNT);
 858         copy_fs = NULL;
 859 out:
 860         NDFREE(&nd, NDF_ONLY_PNBUF);
 861         if (saved_nice > 0) {
 862                 struct proc *p;
 863
 864                 p = td->td_proc;
 865                 PROC_LOCK(p);
 866                 sched_nice(td->td_proc, saved_nice);
 867                 PROC_UNLOCK(td->td_proc);
 868         }
 869         UFS_LOCK(ump);
 870         if (fs->fs_active != 0) {
 871                 free(fs->fs_active, M_DEVBUF);
 872                 fs->fs_active = 0;
 873         }
 874         UFS_UNLOCK(ump);
 875         MNT_ILOCK(mp);
 876         mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 877         MNT_IUNLOCK(mp);
 878         if (error)
 879                 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
 880         (void) ffs_syncvnode(vp, MNT_WAIT, 0);
 881         if (error)
 882                 vput(vp);
 883         else
 884                 VOP_UNLOCK(vp);
 885         vrele(nd.ni_dvp);
 886         vn_finished_write(wrtmp);
 887         process_deferred_inactive(mp);
 888         return (error);
 889 }
 890
 891 /*
 892  * Copy a cylinder group map. All the unallocated blocks are marked
 893  * BLK_NOCOPY so that the snapshot knows that it need not copy them
 894  * if they are later written. If passno is one, then this is a first
 895  * pass, so only setting needs to be done. If passno is 2, then this
 896  * is a revision to a previous pass which must be undone as the
 897  * replacement pass is done.
 898  */
 899 static int
 900 cgaccount(cg, vp, nbp, passno)
 901         int cg;
 902         struct vnode *vp;
 903         struct buf *nbp;
 904         int passno;
 905 {
 906         struct buf *bp, *ibp;
 907         struct inode *ip;
 908         struct cg *cgp;
 909         struct fs *fs;
 910         ufs2_daddr_t base, numblks;
 911         int error, len, loc, indiroff;
 912
 913         ip = VTOI(vp);
 914         fs = ITOFS(ip);
 915         if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, 0, &bp, &cgp)) != 0)
 916                 return (error);
 917         UFS_LOCK(ITOUMP(ip));
 918         ACTIVESET(fs, cg);
 919         /*
 920          * Recomputation of summary information might not have been performed
 921          * at mount time.  Sync up summary information for current cylinder
 922          * group while data is in memory to ensure that result of background
 923          * fsck is slightly more consistent.
 924          */
 925         fs->fs_cs(fs, cg) = cgp->cg_cs;
 926         UFS_UNLOCK(ITOUMP(ip));
 927         bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 928         if (fs->fs_cgsize < fs->fs_bsize)
 929                 bzero(&nbp->b_data[fs->fs_cgsize],
 930                     fs->fs_bsize - fs->fs_cgsize);
 931         cgp = (struct cg *)nbp->b_data;
 932         bqrelse(bp);
 933         if (passno == 2)
 934                 nbp->b_flags |= B_VALIDSUSPWRT;
 935         numblks = howmany(fs->fs_size, fs->fs_frag);
 936         len = howmany(fs->fs_fpg, fs->fs_frag);
 937         base = cgbase(fs, cg) / fs->fs_frag;
 938         if (base + len >= numblks)
 939                 len = numblks - base - 1;
 940         loc = 0;
 941         if (base < UFS_NDADDR) {
 942                 for ( ; loc < UFS_NDADDR; loc++) {
 943                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 944                                 DIP_SET(ip, i_db[loc], BLK_NOCOPY);
 945                         else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 946                                 DIP_SET(ip, i_db[loc], 0);
 947                         else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 948                                 panic("ffs_snapshot: lost direct block");
 949                 }
 950         }
 951         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 952             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 953         if (error) {
 954                 goto out;
 955         }
 956         indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs);
 957         for ( ; loc < len; loc++, indiroff++) {
 958                 if (indiroff >= NINDIR(fs)) {
 959                         if (passno == 2)
 960                                 ibp->b_flags |= B_VALIDSUSPWRT;
 961                         bawrite(ibp);
 962                         error = UFS_BALLOC(vp,
 963                             lblktosize(fs, (off_t)(base + loc)),
 964                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 965                         if (error) {
 966                                 goto out;
 967                         }
 968                         indiroff = 0;
 969                 }
 970                 if (I_IS_UFS1(ip)) {
 971                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 972                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 973                                     BLK_NOCOPY;
 974                         else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 975                             [indiroff] == BLK_NOCOPY)
 976                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 977                         else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 978                             [indiroff] == BLK_NOCOPY)
 979                                 panic("ffs_snapshot: lost indirect block");
 980                         continue;
 981                 }
 982                 if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 983                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 984                 else if (passno == 2 &&
 985                     ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 986                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 987                 else if (passno == 1 &&
 988                     ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 989                         panic("ffs_snapshot: lost indirect block");
 990         }
 991         if (passno == 2)
 992                 ibp->b_flags |= B_VALIDSUSPWRT;
 993         bdwrite(ibp);
 994 out:
 995         /*
 996          * We have to calculate the crc32c here rather than just setting the
 997          * BX_CYLGRP b_xflags because the allocation of the block for the
 998          * the cylinder group map will always be a full size block (fs_bsize)
 999          * even though the cylinder group may be smaller (fs_cgsize). The
1000          * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP
1001          * flag causes it to be computed over the size of the buffer.
1002          */
1003         if ((fs->fs_metackhash & CK_CYLGRP) != 0) {
1004                 ((struct cg *)nbp->b_data)->cg_ckhash = 0;
1005                 ((struct cg *)nbp->b_data)->cg_ckhash =
1006                     calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize);
1007         }
1008         return (error);
1009 }
1010
1011 /*
1012  * Before expunging a snapshot inode, note all the
1013  * blocks that it claims with BLK_SNAP so that fsck will
1014  * be able to account for those blocks properly and so
1015  * that this snapshot knows that it need not copy them
1016  * if the other snapshot holding them is freed. This code
1017  * is reproduced once each for UFS1 and UFS2.
1018  */
1019 static int
1020 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1021         struct vnode *snapvp;
1022         struct inode *cancelip;
1023         struct fs *fs;
1024         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1025             struct fs *, ufs_lbn_t, int);
1026         int expungetype;
1027         int clearmode;
1028 {
1029         int i, error, indiroff;
1030         ufs_lbn_t lbn, rlbn;
1031         ufs2_daddr_t len, blkno, numblks, blksperindir;
1032         struct ufs1_dinode *dip;
1033         struct thread *td = curthread;
1034         struct buf *bp;
1035
1036         /*
1037          * Prepare to expunge the inode. If its inode block has not
1038          * yet been copied, then allocate and fill the copy.
1039          */
1040         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1041         blkno = 0;
1042         if (lbn < UFS_NDADDR) {
1043                 blkno = VTOI(snapvp)->i_din1->di_db[lbn];
1044         } else {
1045                 if (DOINGSOFTDEP(snapvp))
1046                         softdep_prealloc(snapvp, MNT_WAIT);
1047                 td->td_pflags |= TDP_COWINPROGRESS;
1048                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1049                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1050                 td->td_pflags &= ~TDP_COWINPROGRESS;
1051                 if (error)
1052                         return (error);
1053                 indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1054                 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
1055                 bqrelse(bp);
1056         }
1057         if (blkno != 0) {
1058                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1059                         return (error);
1060         } else {
1061                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1062                     fs->fs_bsize, KERNCRED, 0, &bp);
1063                 if (error)
1064                         return (error);
1065                 if ((error = readblock(snapvp, bp, lbn)) != 0)
1066                         return (error);
1067         }
1068         /*
1069          * Set a snapshot inode to be a zero length file, regular files
1070          * or unlinked snapshots to be completely unallocated.
1071          */
1072         dip = (struct ufs1_dinode *)bp->b_data +
1073             ino_to_fsbo(fs, cancelip->i_number);
1074         if (clearmode || cancelip->i_effnlink == 0)
1075                 dip->di_mode = 0;
1076         dip->di_size = 0;
1077         dip->di_blocks = 0;
1078         dip->di_flags &= ~SF_SNAPSHOT;
1079         bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t));
1080         bdwrite(bp);
1081         /*
1082          * Now go through and expunge all the blocks in the file
1083          * using the function requested.
1084          */
1085         numblks = howmany(cancelip->i_size, fs->fs_bsize);
1086         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
1087             &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype)))
1088                 return (error);
1089         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
1090             &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype)))
1091                 return (error);
1092         blksperindir = 1;
1093         lbn = -UFS_NDADDR;
1094         len = numblks - UFS_NDADDR;
1095         rlbn = UFS_NDADDR;
1096         for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1097                 error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
1098                     cancelip->i_din1->di_ib[i], lbn, rlbn, len,
1099                     blksperindir, fs, acctfunc, expungetype);
1100                 if (error)
1101                         return (error);
1102                 blksperindir *= NINDIR(fs);
1103                 lbn -= blksperindir + 1;
1104                 len -= blksperindir;
1105                 rlbn += blksperindir;
1106         }
1107         return (0);
1108 }
1109
1110 /*
1111  * Descend an indirect block chain for vnode cancelvp accounting for all
1112  * its indirect blocks in snapvp.
1113  */
1114 static int
1115 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1116             blksperindir, fs, acctfunc, expungetype)
1117         struct vnode *snapvp;
1118         struct vnode *cancelvp;
1119         int level;
1120         ufs1_daddr_t blkno;
1121         ufs_lbn_t lbn;
1122         ufs_lbn_t rlbn;
1123         ufs_lbn_t remblks;
1124         ufs_lbn_t blksperindir;
1125         struct fs *fs;
1126         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1127             struct fs *, ufs_lbn_t, int);
1128         int expungetype;
1129 {
1130         int error, num, i;
1131         ufs_lbn_t subblksperindir;
1132         struct indir indirs[UFS_NIADDR + 2];
1133         ufs1_daddr_t last, *bap;
1134         struct buf *bp;
1135
1136         if (blkno == 0) {
1137                 if (expungetype == BLK_NOCOPY)
1138                         return (0);
1139                 panic("indiracct_ufs1: missing indir");
1140         }
1141         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1142                 return (error);
1143         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1144                 panic("indiracct_ufs1: botched params");
1145         /*
1146          * We have to expand bread here since it will deadlock looking
1147          * up the block number for any blocks that are not in the cache.
1148          */
1149         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1150         bp->b_blkno = fsbtodb(fs, blkno);
1151         if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1152             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1153                 brelse(bp);
1154                 return (error);
1155         }
1156         /*
1157          * Account for the block pointers in this indirect block.
1158          */
1159         last = howmany(remblks, blksperindir);
1160         if (last > NINDIR(fs))
1161                 last = NINDIR(fs);
1162         bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1163         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1164         bqrelse(bp);
1165         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1166             level == 0 ? rlbn : -1, expungetype);
1167         if (error || level == 0)
1168                 goto out;
1169         /*
1170          * Account for the block pointers in each of the indirect blocks
1171          * in the levels below us.
1172          */
1173         subblksperindir = blksperindir / NINDIR(fs);
1174         for (lbn++, level--, i = 0; i < last; i++) {
1175                 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
1176                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1177                 if (error)
1178                         goto out;
1179                 rlbn += blksperindir;
1180                 lbn -= blksperindir;
1181                 remblks -= blksperindir;
1182         }
1183 out:
1184         free(bap, M_DEVBUF);
1185         return (error);
1186 }
1187
1188 /*
1189  * Do both snap accounting and map accounting.
1190  */
1191 static int
1192 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1193         struct vnode *vp;
1194         ufs1_daddr_t *oldblkp, *lastblkp;
1195         struct fs *fs;
1196         ufs_lbn_t lblkno;
1197         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
1198 {
1199         int error;
1200
1201         if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1202                 return (error);
1203         return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1204 }
1205
1206 /*
1207  * Identify a set of blocks allocated in a snapshot inode.
1208  */
1209 static int
1210 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1211         struct vnode *vp;
1212         ufs1_daddr_t *oldblkp, *lastblkp;
1213         struct fs *fs;
1214         ufs_lbn_t lblkno;
1215         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
1216 {
1217         struct inode *ip = VTOI(vp);
1218         ufs1_daddr_t blkno, *blkp;
1219         ufs_lbn_t lbn;
1220         struct buf *ibp;
1221         int error;
1222
1223         for ( ; oldblkp < lastblkp; oldblkp++) {
1224                 blkno = *oldblkp;
1225                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1226                         continue;
1227                 lbn = fragstoblks(fs, blkno);
1228                 if (lbn < UFS_NDADDR) {
1229                         blkp = &ip->i_din1->di_db[lbn];
1230                         UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1231                 } else {
1232                         error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
1233                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1234                         if (error)
1235                                 return (error);
1236                         blkp = &((ufs1_daddr_t *)(ibp->b_data))
1237                             [(lbn - UFS_NDADDR) % NINDIR(fs)];
1238                 }
1239                 /*
1240                  * If we are expunging a snapshot vnode and we
1241                  * find a block marked BLK_NOCOPY, then it is
1242                  * one that has been allocated to this snapshot after
1243                  * we took our current snapshot and can be ignored.
1244                  */
1245                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1246                         if (lbn >= UFS_NDADDR)
1247                                 brelse(ibp);
1248                 } else {
1249                         if (*blkp != 0)
1250                                 panic("snapacct_ufs1: bad block");
1251                         *blkp = expungetype;
1252                         if (lbn >= UFS_NDADDR)
1253                                 bdwrite(ibp);
1254                 }
1255         }
1256         return (0);
1257 }
1258
1259 /*
1260  * Account for a set of blocks allocated in a snapshot inode.
1261  */
1262 static int
1263 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1264         struct vnode *vp;
1265         ufs1_daddr_t *oldblkp, *lastblkp;
1266         struct fs *fs;
1267         ufs_lbn_t lblkno;
1268         int expungetype;
1269 {
1270         ufs1_daddr_t blkno;
1271         struct inode *ip;
1272         ino_t inum;
1273         int acctit;
1274
1275         ip = VTOI(vp);
1276         inum = ip->i_number;
1277         if (lblkno == -1)
1278                 acctit = 0;
1279         else
1280                 acctit = 1;
1281         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1282                 blkno = *oldblkp;
1283                 if (blkno == 0 || blkno == BLK_NOCOPY)
1284                         continue;
1285                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1286                         *ip->i_snapblklist++ = lblkno;
1287                 if (blkno == BLK_SNAP)
1288                         blkno = blkstofrags(fs, lblkno);
1289                 ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
1290                     vp->v_type, NULL, SINGLETON_KEY);
1291         }
1292         return (0);
1293 }
1294
1295 /*
1296  * Before expunging a snapshot inode, note all the
1297  * blocks that it claims with BLK_SNAP so that fsck will
1298  * be able to account for those blocks properly and so
1299  * that this snapshot knows that it need not copy them
1300  * if the other snapshot holding them is freed. This code
1301  * is reproduced once each for UFS1 and UFS2.
1302  */
1303 static int
1304 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1305         struct vnode *snapvp;
1306         struct inode *cancelip;
1307         struct fs *fs;
1308         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1309             struct fs *, ufs_lbn_t, int);
1310         int expungetype;
1311         int clearmode;
1312 {
1313         int i, error, indiroff;
1314         ufs_lbn_t lbn, rlbn;
1315         ufs2_daddr_t len, blkno, numblks, blksperindir;
1316         struct ufs2_dinode *dip;
1317         struct thread *td = curthread;
1318         struct buf *bp;
1319
1320         /*
1321          * Prepare to expunge the inode. If its inode block has not
1322          * yet been copied, then allocate and fill the copy.
1323          */
1324         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1325         blkno = 0;
1326         if (lbn < UFS_NDADDR) {
1327                 blkno = VTOI(snapvp)->i_din2->di_db[lbn];
1328         } else {
1329                 if (DOINGSOFTDEP(snapvp))
1330                         softdep_prealloc(snapvp, MNT_WAIT);
1331                 td->td_pflags |= TDP_COWINPROGRESS;
1332                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1333                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1334                 td->td_pflags &= ~TDP_COWINPROGRESS;
1335                 if (error)
1336                         return (error);
1337                 indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1338                 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
1339                 bqrelse(bp);
1340         }
1341         if (blkno != 0) {
1342                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1343                         return (error);
1344         } else {
1345                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1346                     fs->fs_bsize, KERNCRED, 0, &bp);
1347                 if (error)
1348                         return (error);
1349                 if ((error = readblock(snapvp, bp, lbn)) != 0)
1350                         return (error);
1351         }
1352         /*
1353          * Set a snapshot inode to be a zero length file, regular files
1354          * to be completely unallocated.
1355          */
1356         dip = (struct ufs2_dinode *)bp->b_data +
1357             ino_to_fsbo(fs, cancelip->i_number);
1358         dip->di_size = 0;
1359         dip->di_blocks = 0;
1360         dip->di_flags &= ~SF_SNAPSHOT;
1361         bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t));
1362         if (clearmode || cancelip->i_effnlink == 0)
1363                 dip->di_mode = 0;
1364         else
1365                 ffs_update_dinode_ckhash(fs, dip);
1366         bdwrite(bp);
1367         /*
1368          * Now go through and expunge all the blocks in the file
1369          * using the function requested.
1370          */
1371         numblks = howmany(cancelip->i_size, fs->fs_bsize);
1372         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
1373             &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype)))
1374                 return (error);
1375         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
1376             &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype)))
1377                 return (error);
1378         blksperindir = 1;
1379         lbn = -UFS_NDADDR;
1380         len = numblks - UFS_NDADDR;
1381         rlbn = UFS_NDADDR;
1382         for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1383                 error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
1384                     cancelip->i_din2->di_ib[i], lbn, rlbn, len,
1385                     blksperindir, fs, acctfunc, expungetype);
1386                 if (error)
1387                         return (error);
1388                 blksperindir *= NINDIR(fs);
1389                 lbn -= blksperindir + 1;
1390                 len -= blksperindir;
1391                 rlbn += blksperindir;
1392         }
1393         return (0);
1394 }
1395
1396 /*
1397  * Descend an indirect block chain for vnode cancelvp accounting for all
1398  * its indirect blocks in snapvp.
1399  */
1400 static int
1401 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1402             blksperindir, fs, acctfunc, expungetype)
1403         struct vnode *snapvp;
1404         struct vnode *cancelvp;
1405         int level;
1406         ufs2_daddr_t blkno;
1407         ufs_lbn_t lbn;
1408         ufs_lbn_t rlbn;
1409         ufs_lbn_t remblks;
1410         ufs_lbn_t blksperindir;
1411         struct fs *fs;
1412         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1413             struct fs *, ufs_lbn_t, int);
1414         int expungetype;
1415 {
1416         int error, num, i;
1417         ufs_lbn_t subblksperindir;
1418         struct indir indirs[UFS_NIADDR + 2];
1419         ufs2_daddr_t last, *bap;
1420         struct buf *bp;
1421
1422         if (blkno == 0) {
1423                 if (expungetype == BLK_NOCOPY)
1424                         return (0);
1425                 panic("indiracct_ufs2: missing indir");
1426         }
1427         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1428                 return (error);
1429         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1430                 panic("indiracct_ufs2: botched params");
1431         /*
1432          * We have to expand bread here since it will deadlock looking
1433          * up the block number for any blocks that are not in the cache.
1434          */
1435         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1436         bp->b_blkno = fsbtodb(fs, blkno);
1437         if ((bp->b_flags & B_CACHE) == 0 &&
1438             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1439                 brelse(bp);
1440                 return (error);
1441         }
1442         /*
1443          * Account for the block pointers in this indirect block.
1444          */
1445         last = howmany(remblks, blksperindir);
1446         if (last > NINDIR(fs))
1447                 last = NINDIR(fs);
1448         bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1449         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1450         bqrelse(bp);
1451         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1452             level == 0 ? rlbn : -1, expungetype);
1453         if (error || level == 0)
1454                 goto out;
1455         /*
1456          * Account for the block pointers in each of the indirect blocks
1457          * in the levels below us.
1458          */
1459         subblksperindir = blksperindir / NINDIR(fs);
1460         for (lbn++, level--, i = 0; i < last; i++) {
1461                 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
1462                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1463                 if (error)
1464                         goto out;
1465                 rlbn += blksperindir;
1466                 lbn -= blksperindir;
1467                 remblks -= blksperindir;
1468         }
1469 out:
1470         free(bap, M_DEVBUF);
1471         return (error);
1472 }
1473
1474 /*
1475  * Do both snap accounting and map accounting.
1476  */
1477 static int
1478 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1479         struct vnode *vp;
1480         ufs2_daddr_t *oldblkp, *lastblkp;
1481         struct fs *fs;
1482         ufs_lbn_t lblkno;
1483         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
1484 {
1485         int error;
1486
1487         if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1488                 return (error);
1489         return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1490 }
1491
1492 /*
1493  * Identify a set of blocks allocated in a snapshot inode.
1494  */
1495 static int
1496 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1497         struct vnode *vp;
1498         ufs2_daddr_t *oldblkp, *lastblkp;
1499         struct fs *fs;
1500         ufs_lbn_t lblkno;
1501         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
1502 {
1503         struct inode *ip = VTOI(vp);
1504         ufs2_daddr_t blkno, *blkp;
1505         ufs_lbn_t lbn;
1506         struct buf *ibp;
1507         int error;
1508
1509         for ( ; oldblkp < lastblkp; oldblkp++) {
1510                 blkno = *oldblkp;
1511                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1512                         continue;
1513                 lbn = fragstoblks(fs, blkno);
1514                 if (lbn < UFS_NDADDR) {
1515                         blkp = &ip->i_din2->di_db[lbn];
1516                         UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1517                 } else {
1518                         error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
1519                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1520                         if (error)
1521                                 return (error);
1522                         blkp = &((ufs2_daddr_t *)(ibp->b_data))
1523                             [(lbn - UFS_NDADDR) % NINDIR(fs)];
1524                 }
1525                 /*
1526                  * If we are expunging a snapshot vnode and we
1527                  * find a block marked BLK_NOCOPY, then it is
1528                  * one that has been allocated to this snapshot after
1529                  * we took our current snapshot and can be ignored.
1530                  */
1531                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1532                         if (lbn >= UFS_NDADDR)
1533                                 brelse(ibp);
1534                 } else {
1535                         if (*blkp != 0)
1536                                 panic("snapacct_ufs2: bad block");
1537                         *blkp = expungetype;
1538                         if (lbn >= UFS_NDADDR)
1539                                 bdwrite(ibp);
1540                 }
1541         }
1542         return (0);
1543 }
1544
1545 /*
1546  * Account for a set of blocks allocated in a snapshot inode.
1547  */
1548 static int
1549 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1550         struct vnode *vp;
1551         ufs2_daddr_t *oldblkp, *lastblkp;
1552         struct fs *fs;
1553         ufs_lbn_t lblkno;
1554         int expungetype;
1555 {
1556         ufs2_daddr_t blkno;
1557         struct inode *ip;
1558         ino_t inum;
1559         int acctit;
1560
1561         ip = VTOI(vp);
1562         inum = ip->i_number;
1563         if (lblkno == -1)
1564                 acctit = 0;
1565         else
1566                 acctit = 1;
1567         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1568                 blkno = *oldblkp;
1569                 if (blkno == 0 || blkno == BLK_NOCOPY)
1570                         continue;
1571                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1572                         *ip->i_snapblklist++ = lblkno;
1573                 if (blkno == BLK_SNAP)
1574                         blkno = blkstofrags(fs, lblkno);
1575                 ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
1576                     vp->v_type, NULL, SINGLETON_KEY);
1577         }
1578         return (0);
1579 }
1580
1581 /*
1582  * Decrement extra reference on snapshot when last name is removed.
1583  * It will not be freed until the last open reference goes away.
1584  */
1585 void
1586 ffs_snapgone(ip)
1587         struct inode *ip;
1588 {
1589         struct inode *xp;
1590         struct fs *fs;
1591         int snaploc;
1592         struct snapdata *sn;
1593         struct ufsmount *ump;
1594
1595         /*
1596          * Find snapshot in incore list.
1597          */
1598         xp = NULL;
1599         sn = ITODEVVP(ip)->v_rdev->si_snapdata;
1600         if (sn != NULL)
1601                 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
1602                         if (xp == ip)
1603                                 break;
1604         if (xp != NULL)
1605                 vrele(ITOV(ip));
1606 #ifdef DIAGNOSTIC
1607         else if (snapdebug)
1608                 printf("ffs_snapgone: lost snapshot vnode %ju\n",
1609                     (uintmax_t)ip->i_number);
1610 #endif
1611         /*
1612          * Delete snapshot inode from superblock. Keep list dense.
1613          */
1614         ump = ITOUMP(ip);
1615         fs = ump->um_fs;
1616         UFS_LOCK(ump);
1617         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1618                 if (fs->fs_snapinum[snaploc] == ip->i_number)
1619                         break;
1620         if (snaploc < FSMAXSNAP) {
1621                 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1622                         if (fs->fs_snapinum[snaploc] == 0)
1623                                 break;
1624                         fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1625                 }
1626                 fs->fs_snapinum[snaploc - 1] = 0;
1627         }
1628         UFS_UNLOCK(ump);
1629 }
1630
1631 /*
1632  * Prepare a snapshot file for being removed.
      *
      * vp is the vnode of the snapshot file.  In order, the function
      *  1. unlinks the inode from the incore snapshot list of its device
      *     vnode (if it is still linked) and points vp->v_vnlock back at
      *     the vnode's own v_lock instead of the shared snapshot lock;
      *  2. walks the direct pointers and the blocks of pointers returned
      *     by UFS_BALLOC(..., BA_METAONLY, ...), zeroing BLK_NOCOPY and
      *     BLK_SNAP markers and offering claimed blocks to
      *     ffs_snapblkfree();
      *  3. clears SF_SNAPSHOT, marks the inode changed, syncs the vnode
      *     when DOINGSOFTDEP(vp) is true and, under QUOTA, re-enables
      *     quota accounting for the file.
      *
      * Nothing is returned; a pointer block that cannot be obtained is
      * simply skipped.
1633  */
1634 void
1635 ffs_snapremove(vp)
1636         struct vnode *vp;
1637 {
1638         struct inode *ip;
1639         struct vnode *devvp;
1640         struct buf *ibp;
1641         struct fs *fs;
1642         ufs2_daddr_t numblks, blkno, dblk;
1643         int error, i, last, loc;
1644         struct snapdata *sn;
1645
1646         ip = VTOI(vp);
1647         fs = ITOFS(ip);
1648         devvp = ITODEVVP(ip);
1649         /*
1650          * If active, delete from incore list (this snapshot may
1651          * already have been in the process of being deleted, so
1652          * would not have been active).
1653          *
1654          * Clear copy-on-write flag if last snapshot.
1655          */
1656         VI_LOCK(devvp);
1657         if (ip->i_nextsnap.tqe_prev != 0) {
1658                 sn = devvp->v_rdev->si_snapdata;
1659                 TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
1660                 ip->i_nextsnap.tqe_prev = 0;
1661                 VI_UNLOCK(devvp);
                     /*
                      * Take the private v_lock once, plus once more for
                      * each recursion level recorded on the shared lock,
                      * before pointing v_vnlock back at it.
                      */
1662                 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1663                 for (i = 0; i < sn->sn_lock.lk_recurse; i++)
1664                         lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1665                 KASSERT(vp->v_vnlock == &sn->sn_lock,
1666                         ("ffs_snapremove: lost lock mutation"));
1667                 vp->v_vnlock = &vp->v_lock;
                     /*
                      * Drop every hold on the shared snapshot lock.
                      * NOTE(review): the interlock taken here is not
                      * released in this function; presumably
                      * try_free_snapdata() drops it -- confirm.
                      */
1668                 VI_LOCK(devvp);
1669                 while (sn->sn_lock.lk_recurse > 0)
1670                         lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1671                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1672                 try_free_snapdata(devvp);
1673         } else
1674                 VI_UNLOCK(devvp);
1675         /*
1676          * Clear all BLK_NOCOPY fields. Pass any block claims to other
1677          * snapshots that want them (see ffs_snapblkfree below).
              * The scan starts at direct pointer 1; pointer 0 is not
              * examined.
1678          */
1679         for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
1680                 dblk = DIP(ip, i_db[blkno]);
1681                 if (dblk == 0)
1682                         continue;
1683                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1684                         DIP_SET(ip, i_db[blkno], 0);
1685                 else if ((dblk == blkstofrags(fs, blkno) &&
1686                      ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
1687                      ip->i_number, vp->v_type, NULL))) {
                             /*
                              * A claimed block (address equal to its own
                              * logical block number) was taken over by
                              * ffs_snapblkfree(): uncharge it and forget it.
                              */
1688                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
1689                             btodb(fs->fs_bsize));
1690                         DIP_SET(ip, i_db[blkno], 0);
1691                 }
1692         }
             /*
              * Same treatment for the pointers beyond the direct ones,
              * NINDIR(fs) logical blocks per iteration.
              */
1693         numblks = howmany(ip->i_size, fs->fs_bsize);
1694         for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1695                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
1696                     fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
                     /* Best effort: skip a pointer block we cannot get. */
1697                 if (error)
1698                         continue;
                     /* Scan at most NINDIR(fs) entries, fewer near fs_size. */
1699                 if (fs->fs_size - blkno > NINDIR(fs))
1700                         last = NINDIR(fs);
1701                 else
1702                         last = fs->fs_size - blkno;
                     /*
                      * NOTE(review): the claimed-block tests below compare
                      * dblk with blkstofrags(fs, blkno) for every loc, i.e.
                      * with the first logical block of this iteration and
                      * not blkno + loc -- confirm this is intended.
                      */
1703                 for (loc = 0; loc < last; loc++) {
1704                         if (I_IS_UFS1(ip)) {
1705                                 dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
1706                                 if (dblk == 0)
1707                                         continue;
1708                                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1709                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1710                                 else if ((dblk == blkstofrags(fs, blkno) &&
1711                                      ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
1712                                      fs->fs_bsize, ip->i_number, vp->v_type,
1713                                      NULL))) {
1714                                         ip->i_din1->di_blocks -=
1715                                             btodb(fs->fs_bsize);
1716                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1717                                 }
1718                                 continue;
1719                         }
                             /* UFS2 layout: same logic with 64-bit pointers. */
1720                         dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
1721                         if (dblk == 0)
1722                                 continue;
1723                         if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1724                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1725                         else if ((dblk == blkstofrags(fs, blkno) &&
1726                              ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
1727                              fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
1728                                 ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
1729                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1730                         }
1731                 }
                     /* Hand the pointer block buffer to bawrite(). */
1732                 bawrite(ibp);
1733         }
1734         /*
1735          * Clear snapshot flag and drop reference.
1736          */
1737         ip->i_flags &= ~SF_SNAPSHOT;
1738         DIP_SET(ip, i_flags, ip->i_flags);
1739         UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1740         /*
1741          * The dirtied indirects must be written out before
1742          * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
1743          * may find indirect pointers using the magic BLK_* values.
1744          */
1745         if (DOINGSOFTDEP(vp))
1746                 ffs_syncvnode(vp, MNT_WAIT, 0);
1747 #ifdef QUOTA
1748         /*
1749          * Reenable disk quotas for ex-snapshot file.
1750          */
1751         if (!getinoquota(ip))
1752                 (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
1753 #endif
1754 }
1755
1756 /*
1757  * Notification that a block is being freed. Return zero if the free
1758  * should be allowed to proceed. Return non-zero if the snapshot file
1759  * wants to claim the block. The block will be claimed if it is an
1760  * uncopied part of one of the snapshots. It will be freed if it is
1761  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1762  * If a fragment is being freed, then all snapshots that care about
1763  * it must make a copy since a snapshot file can only claim full sized
1764  * blocks. Note that if more than one snapshot file maps the block,
1765  * we can pick one at random to claim it. Since none of the snapshots
1766  * can change, we are assured that they will all see the same unmodified
1767  * image. When deleting a snapshot file (see ffs_snapremove above), we
1768  * must push any of these claimed blocks to one of the other snapshots
1769  * that maps it. These claimed blocks are easily identified as they will
1770  * have a block number equal to their logical block number within the
1771  * snapshot. A copied block can never have this property because they
1772  * must always have been allocated from a BLK_NOCOPY location.
      *
      * Parameters:
      *   fs     - filesystem the block belongs to.
      *   devvp  - device vnode; its v_rdev->si_snapdata holds the snapshot
      *            list and the shared snapshot lock.
      *   bno    - address of the block being freed (turned into a logical
      *            block number with fragstoblks()).
      *   size   - number of bytes being freed; a claim is only made when
      *            it equals fs->fs_bsize, otherwise the data is copied.
      *   inum   - inode number the block is freed from (used only in the
      *            DIAGNOSTIC messages).
      *   vtype  - vnode type of that inode; VDIR (or the dopersistence
      *            flag) triggers ffs_syncvnode() on snapshots with
      *            i_effnlink > 0 after a copy.
      *   wkhd   - optional softdep work list: appended to the claiming
      *            inode or pointer block on a claim, passed to
      *            softdep_freework() when an error is returned.
      *
      * Returns 0 when the free may proceed (no snapshot data, no snapshot
      * wanted the block, or every interested snapshot copied it), 1 when
      * a snapshot claimed the block, and otherwise the error from
      * UFS_BALLOC() or readblock().
1773  */
1774 int
1775 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
1776         struct fs *fs;
1777         struct vnode *devvp;
1778         ufs2_daddr_t bno;
1779         long size;
1780         ino_t inum;
1781         enum vtype vtype;
1782         struct workhead *wkhd;
1783 {
1784         struct buf *ibp, *cbp, *savedcbp = NULL;
1785         struct thread *td = curthread;
1786         struct inode *ip;
1787         struct vnode *vp = NULL;
1788         ufs_lbn_t lbn;
1789         ufs2_daddr_t blkno;
1790         int indiroff = 0, error = 0, claimedblk = 0;
1791         struct snapdata *sn;
1792
1793         lbn = fragstoblks(fs, bno);
1794 retry:
1795         VI_LOCK(devvp);
1796         sn = devvp->v_rdev->si_snapdata;
1797         if (sn == NULL) {
1798                 VI_UNLOCK(devvp);
1799                 return (0);
1800         }
             /*
              * If the lock request fails, start over and re-read
              * si_snapdata under the interlock.
              */
1801         if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1802             VI_MTX(devvp)) != 0)
1803                 goto retry;
             /* Visit every snapshot on the device's list. */
1804         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
1805                 vp = ITOV(ip);
1806                 if (DOINGSOFTDEP(vp))
1807                         softdep_prealloc(vp, MNT_WAIT);
1808                 /*
1809                  * Lookup block being written.
1810                  */
1811                 if (lbn < UFS_NDADDR) {
1812                         blkno = DIP(ip, i_db[lbn]);
1813                 } else {
                             /*
                              * TDP_COWINPROGRESS is set on the thread only
                              * around the UFS_BALLOC() call.
                              */
1814                         td->td_pflags |= TDP_COWINPROGRESS;
1815                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1816                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1817                         td->td_pflags &= ~TDP_COWINPROGRESS;
1818                         if (error)
1819                                 break;
1820                         indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1821                         if (I_IS_UFS1(ip))
1822                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1823                         else
1824                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1825                 }
1826                 /*
1827                  * Check to see if block needs to be copied.
1828                  */
1829                 if (blkno == 0) {
1830                         /*
1831                          * A block that we map is being freed. If it has not
1832                          * been claimed yet, we will claim or copy it (below).
1833                          */
1834                         claimedblk = 1;
1835                 } else if (blkno == BLK_SNAP) {
1836                         /*
1837                          * No previous snapshot claimed the block,
1838                          * so it will be freed and become a BLK_NOCOPY
1839                          * (don't care) for us.
1840                          */
1841                         if (claimedblk)
1842                                 panic("snapblkfree: inconsistent block type");
1843                         if (lbn < UFS_NDADDR) {
1844                                 DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
1845                                 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1846                         } else if (I_IS_UFS1(ip)) {
1847                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
1848                                     BLK_NOCOPY;
1849                                 bdwrite(ibp);
1850                         } else {
1851                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
1852                                     BLK_NOCOPY;
1853                                 bdwrite(ibp);
1854                         }
1855                         continue;
1856                 } else /* BLK_NOCOPY or default */ {
1857                         /*
1858                          * If the snapshot has already copied the block
1859                          * (default), or does not care about the block,
1860                          * it is not needed.
1861                          */
1862                         if (lbn >= UFS_NDADDR)
1863                                 bqrelse(ibp);
1864                         continue;
1865                 }
1866                 /*
1867                  * If this is a full size block, we will just grab it
1868                  * and assign it to the snapshot inode. Otherwise we
1869                  * will proceed to copy it. See explanation for this
1870                  * routine as to why only a single snapshot needs to
1871                  * claim this block.
1872                  */
1873                 if (size == fs->fs_bsize) {
1874 #ifdef DIAGNOSTIC
1875                         if (snapdebug)
1876                                 printf("%s %ju lbn %jd from inum %ju\n",
1877                                     "Grabonremove: snapino",
1878                                     (uintmax_t)ip->i_number,
1879                                     (intmax_t)lbn, (uintmax_t)inum);
1880 #endif
1881                         /*
1882                          * If journaling is tracking this write we must add
1883                          * the work to the inode or indirect being written.
1884                          */
1885                         if (wkhd != NULL) {
1886                                 if (lbn < UFS_NDADDR)
1887                                         softdep_inode_append(ip,
1888                                             curthread->td_ucred, wkhd);
1889                                 else
1890                                         softdep_buf_append(ibp, wkhd);
1891                         }
1892                         if (lbn < UFS_NDADDR) {
1893                                 DIP_SET(ip, i_db[lbn], bno);
1894                         } else if (I_IS_UFS1(ip)) {
1895                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
1896                                 bdwrite(ibp);
1897                         } else {
1898                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
1899                                 bdwrite(ibp);
1900                         }
1901                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
1902                         UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
                             /*
                              * Claimed: release the lock through vp->v_vnlock
                              * and report the claim.  NOTE(review): presumably
                              * v_vnlock is &sn->sn_lock for every snapshot on
                              * the list (ffs_snapshot_mount() sets it so) --
                              * confirm.
                              */
1903                         lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1904                         return (1);
1905                 }
                     /* Partial block: the pointer block is no longer needed. */
1906                 if (lbn >= UFS_NDADDR)
1907                         bqrelse(ibp);
1908                 /*
1909                  * Allocate the block into which to do the copy. Note that this
1910                  * allocation will never require any additional allocations for
1911                  * the snapshot inode.
1912                  */
1913                 td->td_pflags |= TDP_COWINPROGRESS;
1914                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1915                     fs->fs_bsize, KERNCRED, 0, &cbp);
1916                 td->td_pflags &= ~TDP_COWINPROGRESS;
1917                 if (error)
1918                         break;
1919 #ifdef DIAGNOSTIC
1920                 if (snapdebug)
1921                         printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
1922                             "Copyonremove: snapino ", (uintmax_t)ip->i_number,
1923                             (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
1924                             (intmax_t)cbp->b_blkno);
1925 #endif
1926                 /*
1927                  * If we have already read the old block contents, then
1928                  * simply copy them to the new block. Note that we need
1929                  * to synchronously write snapshots that have not been
1930                  * unlinked, and hence will be visible after a crash,
1931                  * to ensure their integrity. At a minimum we ensure the
1932                  * integrity of the filesystem metadata, but use the
1933                  * dopersistence sysctl-setable flag to decide on the
1934                  * persistence needed for file content data.
1935                  */
1936                 if (savedcbp != NULL) {
1937                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1938                         bawrite(cbp);
1939                         if ((vtype == VDIR || dopersistence) &&
1940                             ip->i_effnlink > 0)
1941                                 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1942                         continue;
1943                 }
1944                 /*
1945                  * Otherwise, read the old block contents into the buffer.
                      * On a read error the copy is zero-filled, written out
                      * and the scan stops with that error.
1946                  */
1947                 if ((error = readblock(vp, cbp, lbn)) != 0) {
1948                         bzero(cbp->b_data, fs->fs_bsize);
1949                         bawrite(cbp);
1950                         if ((vtype == VDIR || dopersistence) &&
1951                             ip->i_effnlink > 0)
1952                                 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1953                         break;
1954                 }
                     /*
                      * Keep the buffer with the old contents: later snapshots
                      * copy from it and it is written after the loop.
                      */
1955                 savedcbp = cbp;
1956         }
1957         /*
1958          * Note that we need to synchronously write snapshots that
1959          * have not been unlinked, and hence will be visible after
1960          * a crash, to ensure their integrity. At a minimum we
1961          * ensure the integrity of the filesystem metadata, but
1962          * use the dopersistence sysctl-setable flag to decide on
1963          * the persistence needed for file content data.
1964          */
1965         if (savedcbp) {
1966                 vp = savedcbp->b_vp;
1967                 bawrite(savedcbp);
1968                 if ((vtype == VDIR || dopersistence) &&
1969                     VTOI(vp)->i_effnlink > 0)
1970                         (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1971         }
1972         /*
1973          * If we have been unable to allocate a block in which to do
1974          * the copy, then return non-zero so that the fragment will
1975          * not be freed. Although space will be lost, the snapshot
1976          * will stay consistent.
1977          */
1978         if (error != 0 && wkhd != NULL)
1979                 softdep_freework(wkhd);
1980         lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1981         return (error);
1982 }
1983
1984 /*
1985  * Associate snapshot files when mounting.
      *
      * For every inode number in fs_snapinum[] the vnode is fetched and
      * checked; entries that are not regular files, are not snapshots or
      * use the old format are dropped from the superblock list.  Each
      * remaining vnode is switched to the shared snapshot lock, linked
      * onto the device's snapshot list and flagged VV_SYSTEM.  Finally the
      * block hints list is read from the last snapshot accepted and
      * installed, and VV_COPYONWRITE is set on the device vnode.
      *
      * Failures are reported with printf() only; nothing is returned.
1986  */
1987 void
1988 ffs_snapshot_mount(mp)
1989         struct mount *mp;
1990 {
1991         struct ufsmount *ump = VFSTOUFS(mp);
1992         struct vnode *devvp = ump->um_devvp;
1993         struct fs *fs = ump->um_fs;
1994         struct thread *td = curthread;
1995         struct snapdata *sn;
1996         struct vnode *vp;
1997         struct vnode *lastvp;
1998         struct inode *ip;
1999         struct uio auio;
2000         struct iovec aiov;
2001         void *snapblklist;
2002         char *reason;
2003         daddr_t snaplistsize;
2004         int error, snaploc, loc;
2005
2006         /*
2007          * XXX The following needs to be set before ffs_truncate or
2008          * VOP_READ can be called.
2009          */
2010         mp->mnt_stat.f_iosize = fs->fs_bsize;
2011         /*
2012          * Process each snapshot listed in the superblock.
2013          */
2014         vp = NULL;
2015         lastvp = NULL;
2016         sn = NULL;
2017         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
2018                 if (fs->fs_snapinum[snaploc] == 0)
2019                         break;
                     /* On failure the entry stays in fs_snapinum[]. */
2020                 if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
2021                     LK_EXCLUSIVE, &vp)) != 0){
2022                         printf("ffs_snapshot_mount: vget failed %d\n", error);
2023                         continue;
2024                 }
2025                 ip = VTOI(vp);
                     /*
                      * Validate the inode; reason is left non-NULL when the
                      * entry must be dropped.  An old format snapshot is
                      * also truncated to zero length and synced.
                      */
2026                 if (vp->v_type != VREG) {
2027                         reason = "non-file snapshot";
2028                 } else if (!IS_SNAPSHOT(ip)) {
2029                         reason = "non-snapshot";
2030                 } else if (ip->i_size ==
2031                     lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
2032                         reason = "old format snapshot";
2033                         (void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
2034                         (void)ffs_syncvnode(vp, MNT_WAIT, 0);
2035                 } else {
2036                         reason = NULL;
2037                 }
2038                 if (reason != NULL) {
2039                         printf("ffs_snapshot_mount: %s inode %d\n",
2040                             reason, fs->fs_snapinum[snaploc]);
2041                         vput(vp);
2042                         vp = NULL;
                             /*
                              * Close the gap in fs_snapinum[] and step
                              * snaploc back so the entry that moved into
                              * this slot is examined next.
                              */
2043                         for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
2044                                 if (fs->fs_snapinum[loc] == 0)
2045                                         break;
2046                                 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
2047                         }
2048                         fs->fs_snapinum[loc - 1] = 0;
2049                         snaploc--;
2050                         continue;
2051                 }
2052                 /*
2053                  * Acquire a lock on the snapdata structure, creating it if
2054                  * necessary.
2055                  */
2056                 sn = ffs_snapdata_acquire(devvp);
2057                 /*
2058                  * Change vnode to use shared snapshot lock instead of the
2059                  * original private lock.
2060                  */
2061                 vp->v_vnlock = &sn->sn_lock;
2062                 lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2063                 /*
2064                  * Link it onto the active snapshot list.
2065                  */
2066                 VI_LOCK(devvp);
2067                 if (ip->i_nextsnap.tqe_prev != 0)
2068                         panic("ffs_snapshot_mount: %ju already on list",
2069                             (uintmax_t)ip->i_number);
2070                 else
2071                         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
2072                 vp->v_vflag |= VV_SYSTEM;
2073                 VI_UNLOCK(devvp);
2074                 VOP_UNLOCK(vp);
2075                 lastvp = vp;
2076         }
             /* From here on vp is the last snapshot accepted above. */
2077         vp = lastvp;
2078         /*
2079          * No usable snapshots found.
2080          */
2081         if (sn == NULL || vp == NULL)
2082                 return;
2083         /*
2084          * Allocate the space for the block hints list. We always want to
2085          * use the list from the newest snapshot.
              *
              * The first read fetches only the entry count (one daddr_t)
              * stored at offset lblktosize(fs, howmany(fs_size, fs_frag)).
2086          */
2087         auio.uio_iov = &aiov;
2088         auio.uio_iovcnt = 1;
2089         aiov.iov_base = (void *)&snaplistsize;
2090         aiov.iov_len = sizeof(snaplistsize);
2091         auio.uio_resid = aiov.iov_len;
2092         auio.uio_offset =
2093             lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
2094         auio.uio_segflg = UIO_SYSSPACE;
2095         auio.uio_rw = UIO_READ;
2096         auio.uio_td = td;
2097         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2098         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2099                 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
2100                 VOP_UNLOCK(vp);
2101                 return;
2102         }
             /*
              * NOTE(review): snaplistsize comes straight from the snapshot
              * file and sizes the allocation without a range check, and
              * uio_resid is not examined after either read -- confirm that
              * callers can rely on the on-disk value.
              */
2103         snapblklist = malloc(snaplistsize * sizeof(daddr_t),
2104             M_UFSMNT, M_WAITOK);
2105         auio.uio_iovcnt = 1;
2106         aiov.iov_base = snapblklist;
2107         aiov.iov_len = snaplistsize * sizeof (daddr_t);
2108         auio.uio_resid = aiov.iov_len;
             /*
              * Step the offset back by the size of the count; presumably
              * the first VOP_READ() advanced it, so the list read starts
              * at the count itself -- confirm.
              */
2109         auio.uio_offset -= sizeof(snaplistsize);
2110         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2111                 printf("ffs_snapshot_mount: read_2 failed %d\n", error);
2112                 VOP_UNLOCK(vp);
2113                 free(snapblklist, M_UFSMNT);
2114                 return;
2115         }
2116         VOP_UNLOCK(vp);
             /* Publish the list and enable copy-on-write on the device. */
2117         VI_LOCK(devvp);
2118         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
2119         sn->sn_listsize = snaplistsize;
2120         sn->sn_blklist = (daddr_t *)snapblklist;
2121         devvp->v_vflag |= VV_COPYONWRITE;
2122         VI_UNLOCK(devvp);
2123 }
2124
2125 /*
2126  * Disassociate snapshot files when unmounting.
      *
      * Each inode is taken off the head of the device's snapshot list and
      * its vnode is pointed back at its private v_lock; a reference is
      * dropped with vrele() for snapshots whose i_effnlink is still
      * positive.  try_free_snapdata() is called once the list is empty or
      * there is no snapshot data.
2127  */
2128 void
2129 ffs_snapshot_unmount(mp)
2130         struct mount *mp;
2131 {
2132         struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
2133         struct snapdata *sn;
2134         struct inode *xp;
2135         struct vnode *vp;
2136
2137         VI_LOCK(devvp);
2138         sn = devvp->v_rdev->si_snapdata;
2139         while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
2140                 vp = ITOV(xp);
2141                 TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
2142                 xp->i_nextsnap.tqe_prev = 0;
                     /*
                      * NOTE(review): LK_INTERLOCK hands VI_MTX(devvp) to
                      * lockmgr(); presumably it is released there, which is
                      * why VI_LOCK() is taken again at the bottom of the
                      * loop -- confirm.
                      */
2143                 lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
2144                     VI_MTX(devvp));
                     /*
                      * With both locks held, switch the vnode back to its
                      * private lock, then release both.
                      */
2145                 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
2146                 KASSERT(vp->v_vnlock == &sn->sn_lock,
2147                 ("ffs_snapshot_unmount: lost lock mutation"));
2148                 vp->v_vnlock = &vp->v_lock;
2149                 lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2150                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2151                 if (xp->i_effnlink > 0)
2152                         vrele(vp);
                     /* Re-read si_snapdata under the interlock each pass. */
2153                 VI_LOCK(devvp);
2154                 sn = devvp->v_rdev->si_snapdata;
2155         }
             /*
              * The devvp interlock is held here on every path.
              * NOTE(review): it is not released in this function;
              * presumably try_free_snapdata() drops it -- confirm.
              */
2156         try_free_snapdata(devvp);
2157         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
2158 }
2159
2160 /*
2161  * Check the buffer block to be belong to device buffer that shall be
2162  * locked after snaplk. devvp shall be locked on entry, and will be
2163  * leaved locked upon exit.
2164  */
2165 static int
2166 ffs_bp_snapblk(devvp, bp)
2167         struct vnode *devvp;
2168         struct buf *bp;
2169 {
2170         struct snapdata *sn;
2171         struct fs *fs;
2172         ufs2_daddr_t lbn, *snapblklist;
2173         int lower, upper, mid;
2174
2175         ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
2176         KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
2177         sn = devvp->v_rdev->si_snapdata;
2178         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
2179                 return (0);
2180         fs = ITOFS(TAILQ_FIRST(&sn->sn_head));
2181         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2182         snapblklist = sn->sn_blklist;
2183         upper = sn->sn_listsize - 1;
2184         lower = 1;
2185         while (lower <= upper) {
2186                 mid = (lower + upper) / 2;
2187                 if (snapblklist[mid] == lbn)
2188                         break;
2189                 if (snapblklist[mid] < lbn)
2190                         lower = mid + 1;
2191                 else
2192                         upper = mid - 1;
2193         }
2194         if (lower <= upper)
2195                 return (1);
2196         return (0);
2197 }
2198
2199 void
2200 ffs_bdflush(bo, bp)
2201         struct bufobj *bo;
2202         struct buf *bp;
2203 {
2204         struct thread *td;
2205         struct vnode *vp, *devvp;
2206         struct buf *nbp;
2207         int bp_bdskip;
2208
2209         if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
2210                 return;
2211
2212         td = curthread;
2213         vp = bp->b_vp;
2214         devvp = bo2vnode(bo);
2215         KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
2216
2217         VI_LOCK(devvp);
2218         bp_bdskip = ffs_bp_snapblk(devvp, bp);
2219         if (bp_bdskip)
2220                 bdwriteskip++;
2221         VI_UNLOCK(devvp);
2222         if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
2223                 (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
2224                 altbufferflushes++;
2225         } else {
2226                 BO_LOCK(bo);
2227                 /*
2228                  * Try to find a buffer to flush.
2229                  */
2230                 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
2231                         if ((nbp->b_vflags & BV_BKGRDINPROG) ||
2232                             BUF_LOCK(nbp,
2233                                      LK_EXCLUSIVE | LK_NOWAIT, NULL))
2234                                 continue;
2235                         if (bp == nbp)
2236                                 panic("bdwrite: found ourselves");
2237                         BO_UNLOCK(bo);
2238                         /*
2239                          * Don't countdeps with the bo lock
2240                          * held.
2241                          */
2242                         if (buf_countdeps(nbp, 0)) {
2243                                 BO_LOCK(bo);
2244                                 BUF_UNLOCK(nbp);
2245                                 continue;
2246                         }
2247                         if (bp_bdskip) {
2248                                 VI_LOCK(devvp);
2249                                 if (!ffs_bp_snapblk(vp, nbp)) {
2250                                         VI_UNLOCK(devvp);
2251                                         BO_LOCK(bo);
2252                                         BUF_UNLOCK(nbp);
2253                                         continue;
2254                                 }
2255                                 VI_UNLOCK(devvp);
2256                         }
2257                         if (nbp->b_flags & B_CLUSTEROK) {
2258                                 vfs_bio_awrite(nbp);
2259                         } else {
2260                                 bremfree(nbp);
2261                                 bawrite(nbp);
2262                         }
2263                         dirtybufferflushes++;
2264                         break;
2265                 }
2266                 if (nbp == NULL)
2267                         BO_UNLOCK(bo);
2268         }
2269 }
2270
2271 /*
2272  * Check for need to copy block that is about to be written,
2273  * copying the block if necessary.
2274  */
2275 int
2276 ffs_copyonwrite(devvp, bp)
2277         struct vnode *devvp;
2278         struct buf *bp;
2279 {
2280         struct snapdata *sn;
2281         struct buf *ibp, *cbp, *savedcbp = NULL;
2282         struct thread *td = curthread;
2283         struct fs *fs;
2284         struct inode *ip;
2285         struct vnode *vp = NULL;
2286         ufs2_daddr_t lbn, blkno, *snapblklist;
2287         int lower, upper, mid, indiroff, error = 0;
2288         int launched_async_io, prev_norunningbuf;
2289         long saved_runningbufspace;
2290
2291         if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
2292                 return (0);             /* Update on a snapshot file */
2293         if (td->td_pflags & TDP_COWINPROGRESS)
2294                 panic("ffs_copyonwrite: recursive call");
2295         /*
2296          * First check to see if it is in the preallocated list.
2297          * By doing this check we avoid several potential deadlocks.
2298          */
2299         VI_LOCK(devvp);
2300         sn = devvp->v_rdev->si_snapdata;
2301         if (sn == NULL ||
2302             TAILQ_EMPTY(&sn->sn_head)) {
2303                 VI_UNLOCK(devvp);
2304                 return (0);             /* No snapshot */
2305         }
2306         ip = TAILQ_FIRST(&sn->sn_head);
2307         fs = ITOFS(ip);
2308         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2309         snapblklist = sn->sn_blklist;
2310         upper = sn->sn_listsize - 1;
2311         lower = 1;
2312         while (lower <= upper) {
2313                 mid = (lower + upper) / 2;
2314                 if (snapblklist[mid] == lbn)
2315                         break;
2316                 if (snapblklist[mid] < lbn)
2317                         lower = mid + 1;
2318                 else
2319                         upper = mid - 1;
2320         }
2321         if (lower <= upper) {
2322                 VI_UNLOCK(devvp);
2323                 return (0);
2324         }
2325         launched_async_io = 0;
2326         prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
2327         /*
2328          * Since I/O on bp isn't yet in progress and it may be blocked
2329          * for a long time waiting on snaplk, back it out of
2330          * runningbufspace, possibly waking other threads waiting for space.
2331          */
2332         saved_runningbufspace = bp->b_runningbufspace;
2333         if (saved_runningbufspace != 0)
2334                 runningbufwakeup(bp);
2335         /*
2336          * Not in the precomputed list, so check the snapshots.
2337          */
2338         while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2339             VI_MTX(devvp)) != 0) {
2340                 VI_LOCK(devvp);
2341                 sn = devvp->v_rdev->si_snapdata;
2342                 if (sn == NULL ||
2343                     TAILQ_EMPTY(&sn->sn_head)) {
2344                         VI_UNLOCK(devvp);
2345                         if (saved_runningbufspace != 0) {
2346                                 bp->b_runningbufspace = saved_runningbufspace;
2347                                 atomic_add_long(&runningbufspace,
2348                                                bp->b_runningbufspace);
2349                         }
2350                         return (0);             /* Snapshot gone */
2351                 }
2352         }
2353         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2354                 vp = ITOV(ip);
2355                 if (DOINGSOFTDEP(vp))
2356                         softdep_prealloc(vp, MNT_WAIT);
2357                 /*
2358                  * We ensure that everything of our own that needs to be
2359                  * copied will be done at the time that ffs_snapshot is
2360                  * called. Thus we can skip the check here which can
2361                  * deadlock in doing the lookup in UFS_BALLOC.
2362                  */
2363                 if (bp->b_vp == vp)
2364                         continue;
2365                 /*
2366                  * Check to see if block needs to be copied. We do not have
2367                  * to hold the snapshot lock while doing this lookup as it
2368                  * will never require any additional allocations for the
2369                  * snapshot inode.
2370                  */
2371                 if (lbn < UFS_NDADDR) {
2372                         blkno = DIP(ip, i_db[lbn]);
2373                 } else {
2374                         td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2375                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2376                            fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
2377                         td->td_pflags &= ~TDP_COWINPROGRESS;
2378                         if (error)
2379                                 break;
2380                         indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
2381                         if (I_IS_UFS1(ip))
2382                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
2383                         else
2384                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
2385                         bqrelse(ibp);
2386                 }
2387 #ifdef INVARIANTS
2388                 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
2389                         panic("ffs_copyonwrite: bad copy block");
2390 #endif
2391                 if (blkno != 0)
2392                         continue;
2393                 /*
2394                  * Allocate the block into which to do the copy. Since
2395                  * multiple processes may all try to copy the same block,
2396                  * we have to recheck our need to do a copy if we sleep
2397                  * waiting for the lock.
2398                  *
2399                  * Because all snapshots on a filesystem share a single
2400                  * lock, we ensure that we will never be in competition
2401                  * with another process to allocate a block.
2402                  */
2403                 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2404                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2405                     fs->fs_bsize, KERNCRED, 0, &cbp);
2406                 td->td_pflags &= ~TDP_COWINPROGRESS;
2407                 if (error)
2408                         break;
2409 #ifdef DIAGNOSTIC
2410                 if (snapdebug) {
2411                         printf("Copyonwrite: snapino %ju lbn %jd for ",
2412                             (uintmax_t)ip->i_number, (intmax_t)lbn);
2413                         if (bp->b_vp == devvp)
2414                                 printf("fs metadata");
2415                         else
2416                                 printf("inum %ju",
2417                                     (uintmax_t)VTOI(bp->b_vp)->i_number);
2418                         printf(" lblkno %jd to blkno %jd\n",
2419                             (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
2420                 }
2421 #endif
2422                 /*
2423                  * If we have already read the old block contents, then
2424                  * simply copy them to the new block. Note that we need
2425                  * to synchronously write snapshots that have not been
2426                  * unlinked, and hence will be visible after a crash,
2427                  * to ensure their integrity. At a minimum we ensure the
2428                  * integrity of the filesystem metadata, but use the
2429                  * dopersistence sysctl-setable flag to decide on the
2430                  * persistence needed for file content data.
2431                  */
2432                 if (savedcbp != NULL) {
2433                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
2434                         bawrite(cbp);
2435                         if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2436                             dopersistence) && ip->i_effnlink > 0)
2437                                 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2438                         else
2439                                 launched_async_io = 1;
2440                         continue;
2441                 }
2442                 /*
2443                  * Otherwise, read the old block contents into the buffer.
2444                  */
2445                 if ((error = readblock(vp, cbp, lbn)) != 0) {
2446                         bzero(cbp->b_data, fs->fs_bsize);
2447                         bawrite(cbp);
2448                         if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2449                             dopersistence) && ip->i_effnlink > 0)
2450                                 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2451                         else
2452                                 launched_async_io = 1;
2453                         break;
2454                 }
2455                 savedcbp = cbp;
2456         }
2457         /*
2458          * Note that we need to synchronously write snapshots that
2459          * have not been unlinked, and hence will be visible after
2460          * a crash, to ensure their integrity. At a minimum we
2461          * ensure the integrity of the filesystem metadata, but
2462          * use the dopersistence sysctl-setable flag to decide on
2463          * the persistence needed for file content data.
2464          */
2465         if (savedcbp) {
2466                 vp = savedcbp->b_vp;
2467                 bawrite(savedcbp);
2468                 if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2469                     dopersistence) && VTOI(vp)->i_effnlink > 0)
2470                         (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2471                 else
2472                         launched_async_io = 1;
2473         }
2474         lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
2475         td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
2476                 prev_norunningbuf;
2477         if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
2478                 waitrunningbufspace();
2479         /*
2480          * I/O on bp will now be started, so count it in runningbufspace.
2481          */
2482         if (saved_runningbufspace != 0) {
2483                 bp->b_runningbufspace = saved_runningbufspace;
2484                 atomic_add_long(&runningbufspace, bp->b_runningbufspace);
2485         }
2486         return (error);
2487 }
2488
2489 /*
2490  * sync snapshots to force freework records waiting on snapshots to claim
2491  * blocks to free.
2492  */
2493 void
2494 ffs_sync_snap(mp, waitfor)
2495         struct mount *mp;
2496         int waitfor;
2497 {
2498         struct snapdata *sn;
2499         struct vnode *devvp;
2500         struct vnode *vp;
2501         struct inode *ip;
2502
2503         devvp = VFSTOUFS(mp)->um_devvp;
2504         if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
2505                 return;
2506         for (;;) {
2507                 VI_LOCK(devvp);
2508                 sn = devvp->v_rdev->si_snapdata;
2509                 if (sn == NULL) {
2510                         VI_UNLOCK(devvp);
2511                         return;
2512                 }
2513                 if (lockmgr(&sn->sn_lock,
2514                     LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2515                     VI_MTX(devvp)) == 0)
2516                         break;
2517         }
2518         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2519                 vp = ITOV(ip);
2520                 ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
2521         }
2522         lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2523 }
2524
2525 /*
2526  * Read the specified block into the given buffer.
2527  * Much of this boiler-plate comes from bwrite().
2528  */
2529 static int
2530 readblock(vp, bp, lbn)
2531         struct vnode *vp;
2532         struct buf *bp;
2533         ufs2_daddr_t lbn;
2534 {
2535         struct inode *ip;
2536         struct bio *bip;
2537         struct fs *fs;
2538
2539         ip = VTOI(vp);
2540         fs = ITOFS(ip);
2541
2542         bip = g_alloc_bio();
2543         bip->bio_cmd = BIO_READ;
2544         bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn)));
2545         bip->bio_data = bp->b_data;
2546         bip->bio_length = bp->b_bcount;
2547         bip->bio_done = NULL;
2548
2549         g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private);
2550         bp->b_error = biowait(bip, "snaprdb");
2551         g_destroy_bio(bip);
2552         return (bp->b_error);
2553 }
2554
2555 #endif
2556
2557 /*
2558  * Process file deletes that were deferred by ufs_inactive() due to
2559  * the file system being suspended. Transfer IN_LAZYACCESS into
2560  * IN_MODIFIED for vnodes that were accessed during suspension.
2561  */
2562 void
2563 process_deferred_inactive(struct mount *mp)
2564 {
2565         struct vnode *vp, *mvp;
2566         struct inode *ip;
2567         int error;
2568
2569         (void) vn_start_secondary_write(NULL, &mp, V_WAIT);
2570  loop:
2571         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2572                 /*
2573                  * IN_LAZYACCESS is checked here without holding any
2574                  * vnode lock, but this flag is set only while holding
2575                  * vnode interlock.
2576                  */
2577                 if (vp->v_type == VNON ||
2578                     ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
2579                     ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
2580                         VI_UNLOCK(vp);
2581                         continue;
2582                 }
2583                 vholdl(vp);
2584                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2585                 if (error != 0) {
2586                         vdrop(vp);
2587                         if (error == ENOENT)
2588                                 continue;       /* vnode recycled */
2589                         MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2590                         goto loop;
2591                 }
2592                 ip = VTOI(vp);
2593                 if ((ip->i_flag & IN_LAZYACCESS) != 0) {
2594                         ip->i_flag &= ~IN_LAZYACCESS;
2595                         UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
2596                 }
2597                 VI_LOCK(vp);
2598                 vinactive(vp);
2599                 VI_UNLOCK(vp);
2600                 VOP_UNLOCK(vp);
2601                 vdrop(vp);
2602         }
2603         vn_finished_secondary_write(mp);
2604 }
2605
2606 #ifndef NO_FFS_SNAPSHOT
2607
2608 static struct snapdata *
2609 ffs_snapdata_alloc(void)
2610 {
2611         struct snapdata *sn;
2612
2613         /*
2614          * Fetch a snapdata from the free list if there is one available.
2615          */
2616         mtx_lock(&snapfree_lock);
2617         sn = LIST_FIRST(&snapfree);
2618         if (sn != NULL)
2619                 LIST_REMOVE(sn, sn_link);
2620         mtx_unlock(&snapfree_lock);
2621         if (sn != NULL)
2622                 return (sn);
2623         /*
2624          * If there were no free snapdatas allocate one.
2625          */
2626         sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
2627         TAILQ_INIT(&sn->sn_head);
2628         lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
2629             LK_CANRECURSE | LK_NOSHARE);
2630         return (sn);
2631 }
2632
2633 /*
2634  * The snapdata is never freed because we can not be certain that
2635  * there are no threads sleeping on the snap lock.  Persisting
2636  * them permanently avoids costly synchronization in ffs_lock().
2637  */
2638 static void
2639 ffs_snapdata_free(struct snapdata *sn)
2640 {
2641         mtx_lock(&snapfree_lock);
2642         LIST_INSERT_HEAD(&snapfree, sn, sn_link);
2643         mtx_unlock(&snapfree_lock);
2644 }
2645
2646 /* Try to free snapdata associated with devvp */
2647 static void
2648 try_free_snapdata(struct vnode *devvp)
2649 {
2650         struct snapdata *sn;
2651         ufs2_daddr_t *snapblklist;
2652
2653         ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
2654         sn = devvp->v_rdev->si_snapdata;
2655
2656         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
2657             (devvp->v_vflag & VV_COPYONWRITE) == 0) {
2658                 VI_UNLOCK(devvp);
2659                 return;
2660         }
2661
2662         devvp->v_rdev->si_snapdata = NULL;
2663         devvp->v_vflag &= ~VV_COPYONWRITE;
2664         lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
2665         snapblklist = sn->sn_blklist;
2666         sn->sn_blklist = NULL;
2667         sn->sn_listsize = 0;
2668         lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2669         if (snapblklist != NULL)
2670                 free(snapblklist, M_UFSMNT);
2671         ffs_snapdata_free(sn);
2672 }
2673
2674 static struct snapdata *
2675 ffs_snapdata_acquire(struct vnode *devvp)
2676 {
2677         struct snapdata *nsn, *sn;
2678         int error;
2679
2680         /*
2681          * Allocate a free snapdata.  This is done before acquiring the
2682          * devvp lock to avoid allocation while the devvp interlock is
2683          * held.
2684          */
2685         nsn = ffs_snapdata_alloc();
2686
2687         for (;;) {
2688                 VI_LOCK(devvp);
2689                 sn = devvp->v_rdev->si_snapdata;
2690                 if (sn == NULL) {
2691                         /*
2692                          * This is the first snapshot on this
2693                          * filesystem and we use our pre-allocated
2694                          * snapdata.  Publish sn with the sn_lock
2695                          * owned by us, to avoid the race.
2696                          */
2697                         error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE |
2698                             LK_NOWAIT, NULL);
2699                         if (error != 0)
2700                                 panic("leaked sn, lockmgr error %d", error);
2701                         sn = devvp->v_rdev->si_snapdata = nsn;
2702                         VI_UNLOCK(devvp);
2703                         nsn = NULL;
2704                         break;
2705                 }
2706
2707                 /*
2708                  * There is a snapshots which already exists on this
2709                  * filesystem, grab a reference to the common lock.
2710                  */
2711                 error = lockmgr(&sn->sn_lock, LK_INTERLOCK |
2712                     LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp));
2713                 if (error == 0)
2714                         break;
2715         }
2716
2717         /*
2718          * Free any unused snapdata.
2719          */
2720         if (nsn != NULL)
2721                 ffs_snapdata_free(nsn);
2722
2723         return (sn);
2724 }
2725
2726 #endif