sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright 1998, 2000 Marshall Kirk McKusick.
   5  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
   6  * All rights reserved.
   7  *
   8  * The soft updates code is derived from the appendix of a University
   9  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  10  * "Soft Updates: A Solution to the Metadata Update Problem in File
  11  * Systems", CSE-TR-254-95, August 1995).
  12  *
  13  * Further information about soft updates can be obtained from:
  14  *
  15  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  16  *      1614 Oxford Street              mckusick@mckusick.com
  17  *      Berkeley, CA 94709-1608         +1-510-843-9542
  18  *      USA
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  * 1. Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  * 2. Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in the
  28  *    documentation and/or other materials provided with the distribution.
  29  *
  30  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  31  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  33  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  34  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  35  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  36  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  37  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  38  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  39  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  40  *
  41  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  42  */
  43
  44 #include <sys/cdefs.h>
  45 __FBSDID("$FreeBSD$");
  46
  47 #include "opt_ffs.h"
  48 #include "opt_quota.h"
  49 #include "opt_ddb.h"
  50
  51 #include <sys/param.h>
  52 #include <sys/kernel.h>
  53 #include <sys/systm.h>
  54 #include <sys/bio.h>
  55 #include <sys/buf.h>
  56 #include <sys/kdb.h>
  57 #include <sys/kthread.h>
  58 #include <sys/ktr.h>
  59 #include <sys/limits.h>
  60 #include <sys/lock.h>
  61 #include <sys/malloc.h>
  62 #include <sys/mount.h>
  63 #include <sys/mutex.h>
  64 #include <sys/namei.h>
  65 #include <sys/priv.h>
  66 #include <sys/proc.h>
  67 #include <sys/racct.h>
  68 #include <sys/rwlock.h>
  69 #include <sys/stat.h>
  70 #include <sys/sysctl.h>
  71 #include <sys/syslog.h>
  72 #include <sys/vnode.h>
  73 #include <sys/conf.h>
  74
  75 #include <ufs/ufs/dir.h>
  76 #include <ufs/ufs/extattr.h>
  77 #include <ufs/ufs/quota.h>
  78 #include <ufs/ufs/inode.h>
  79 #include <ufs/ufs/ufsmount.h>
  80 #include <ufs/ffs/fs.h>
  81 #include <ufs/ffs/softdep.h>
  82 #include <ufs/ffs/ffs_extern.h>
  83 #include <ufs/ufs/ufs_extern.h>
  84
  85 #include <vm/vm.h>
  86 #include <vm/vm_extern.h>
  87 #include <vm/vm_object.h>
  88
  89 #include <geom/geom.h>
  90
  91 #include <ddb/ddb.h>
  92
/*
 * KTR_SUJ is 0 as shipped.  Per the original note it is meant to be
 * redefined to KTR_SPARE.
 * NOTE(review): presumably this is the KTR trace class for the journal
 * tracing calls later in the file, so 0 would leave them disabled --
 * confirm against the call sites.
 */
#define KTR_SUJ 0       /* Define to KTR_SPARE. */
  94
  95 #ifndef SOFTUPDATES
  96
  97 int
  98 softdep_flushfiles(oldmnt, flags, td)
  99         struct mount *oldmnt;
 100         int flags;
 101         struct thread *td;
 102 {
 103
 104         panic("softdep_flushfiles called");
 105 }
 106
 107 int
 108 softdep_mount(devvp, mp, fs, cred)
 109         struct vnode *devvp;
 110         struct mount *mp;
 111         struct fs *fs;
 112         struct ucred *cred;
 113 {
 114
 115         return (0);
 116 }
 117
 118 void
 119 softdep_initialize()
 120 {
 121
 122         return;
 123 }
 124
 125 void
 126 softdep_uninitialize()
 127 {
 128
 129         return;
 130 }
 131
 132 void
 133 softdep_unmount(mp)
 134         struct mount *mp;
 135 {
 136
 137         panic("softdep_unmount called");
 138 }
 139
 140 void
 141 softdep_setup_sbupdate(ump, fs, bp)
 142         struct ufsmount *ump;
 143         struct fs *fs;
 144         struct buf *bp;
 145 {
 146
 147         panic("softdep_setup_sbupdate called");
 148 }
 149
 150 void
 151 softdep_setup_inomapdep(bp, ip, newinum, mode)
 152         struct buf *bp;
 153         struct inode *ip;
 154         ino_t newinum;
 155         int mode;
 156 {
 157
 158         panic("softdep_setup_inomapdep called");
 159 }
 160
 161 void
 162 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 163         struct buf *bp;
 164         struct mount *mp;
 165         ufs2_daddr_t newblkno;
 166         int frags;
 167         int oldfrags;
 168 {
 169
 170         panic("softdep_setup_blkmapdep called");
 171 }
 172
 173 void
 174 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 175         struct inode *ip;
 176         ufs_lbn_t lbn;
 177         ufs2_daddr_t newblkno;
 178         ufs2_daddr_t oldblkno;
 179         long newsize;
 180         long oldsize;
 181         struct buf *bp;
 182 {
 183
 184         panic("softdep_setup_allocdirect called");
 185 }
 186
 187 void
 188 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 189         struct inode *ip;
 190         ufs_lbn_t lbn;
 191         ufs2_daddr_t newblkno;
 192         ufs2_daddr_t oldblkno;
 193         long newsize;
 194         long oldsize;
 195         struct buf *bp;
 196 {
 197
 198         panic("softdep_setup_allocext called");
 199 }
 200
 201 void
 202 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 203         struct inode *ip;
 204         ufs_lbn_t lbn;
 205         struct buf *bp;
 206         int ptrno;
 207         ufs2_daddr_t newblkno;
 208         ufs2_daddr_t oldblkno;
 209         struct buf *nbp;
 210 {
 211
 212         panic("softdep_setup_allocindir_page called");
 213 }
 214
 215 void
 216 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 217         struct buf *nbp;
 218         struct inode *ip;
 219         struct buf *bp;
 220         int ptrno;
 221         ufs2_daddr_t newblkno;
 222 {
 223
 224         panic("softdep_setup_allocindir_meta called");
 225 }
 226
 227 void
 228 softdep_journal_freeblocks(ip, cred, length, flags)
 229         struct inode *ip;
 230         struct ucred *cred;
 231         off_t length;
 232         int flags;
 233 {
 234
 235         panic("softdep_journal_freeblocks called");
 236 }
 237
 238 void
 239 softdep_journal_fsync(ip)
 240         struct inode *ip;
 241 {
 242
 243         panic("softdep_journal_fsync called");
 244 }
 245
 246 void
 247 softdep_setup_freeblocks(ip, length, flags)
 248         struct inode *ip;
 249         off_t length;
 250         int flags;
 251 {
 252
 253         panic("softdep_setup_freeblocks called");
 254 }
 255
 256 void
 257 softdep_freefile(pvp, ino, mode)
 258                 struct vnode *pvp;
 259                 ino_t ino;
 260                 int mode;
 261 {
 262
 263         panic("softdep_freefile called");
 264 }
 265
 266 int
 267 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 268         struct buf *bp;
 269         struct inode *dp;
 270         off_t diroffset;
 271         ino_t newinum;
 272         struct buf *newdirbp;
 273         int isnewblk;
 274 {
 275
 276         panic("softdep_setup_directory_add called");
 277 }
 278
 279 void
 280 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 281         struct buf *bp;
 282         struct inode *dp;
 283         caddr_t base;
 284         caddr_t oldloc;
 285         caddr_t newloc;
 286         int entrysize;
 287 {
 288
 289         panic("softdep_change_directoryentry_offset called");
 290 }
 291
 292 void
 293 softdep_setup_remove(bp, dp, ip, isrmdir)
 294         struct buf *bp;
 295         struct inode *dp;
 296         struct inode *ip;
 297         int isrmdir;
 298 {
 299
 300         panic("softdep_setup_remove called");
 301 }
 302
 303 void
 304 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 305         struct buf *bp;
 306         struct inode *dp;
 307         struct inode *ip;
 308         ino_t newinum;
 309         int isrmdir;
 310 {
 311
 312         panic("softdep_setup_directory_change called");
 313 }
 314
 315 void
 316 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
 317         struct mount *mp;
 318         struct buf *bp;
 319         ufs2_daddr_t blkno;
 320         int frags;
 321         struct workhead *wkhd;
 322 {
 323
 324         panic("%s called", __FUNCTION__);
 325 }
 326
 327 void
 328 softdep_setup_inofree(mp, bp, ino, wkhd)
 329         struct mount *mp;
 330         struct buf *bp;
 331         ino_t ino;
 332         struct workhead *wkhd;
 333 {
 334
 335         panic("%s called", __FUNCTION__);
 336 }
 337
 338 void
 339 softdep_setup_unlink(dp, ip)
 340         struct inode *dp;
 341         struct inode *ip;
 342 {
 343
 344         panic("%s called", __FUNCTION__);
 345 }
 346
 347 void
 348 softdep_setup_link(dp, ip)
 349         struct inode *dp;
 350         struct inode *ip;
 351 {
 352
 353         panic("%s called", __FUNCTION__);
 354 }
 355
 356 void
 357 softdep_revert_link(dp, ip)
 358         struct inode *dp;
 359         struct inode *ip;
 360 {
 361
 362         panic("%s called", __FUNCTION__);
 363 }
 364
 365 void
 366 softdep_setup_rmdir(dp, ip)
 367         struct inode *dp;
 368         struct inode *ip;
 369 {
 370
 371         panic("%s called", __FUNCTION__);
 372 }
 373
 374 void
 375 softdep_revert_rmdir(dp, ip)
 376         struct inode *dp;
 377         struct inode *ip;
 378 {
 379
 380         panic("%s called", __FUNCTION__);
 381 }
 382
 383 void
 384 softdep_setup_create(dp, ip)
 385         struct inode *dp;
 386         struct inode *ip;
 387 {
 388
 389         panic("%s called", __FUNCTION__);
 390 }
 391
 392 void
 393 softdep_revert_create(dp, ip)
 394         struct inode *dp;
 395         struct inode *ip;
 396 {
 397
 398         panic("%s called", __FUNCTION__);
 399 }
 400
 401 void
 402 softdep_setup_mkdir(dp, ip)
 403         struct inode *dp;
 404         struct inode *ip;
 405 {
 406
 407         panic("%s called", __FUNCTION__);
 408 }
 409
 410 void
 411 softdep_revert_mkdir(dp, ip)
 412         struct inode *dp;
 413         struct inode *ip;
 414 {
 415
 416         panic("%s called", __FUNCTION__);
 417 }
 418
 419 void
 420 softdep_setup_dotdot_link(dp, ip)
 421         struct inode *dp;
 422         struct inode *ip;
 423 {
 424
 425         panic("%s called", __FUNCTION__);
 426 }
 427
 428 int
 429 softdep_prealloc(vp, waitok)
 430         struct vnode *vp;
 431         int waitok;
 432 {
 433
 434         panic("%s called", __FUNCTION__);
 435 }
 436
 437 int
 438 softdep_journal_lookup(mp, vpp)
 439         struct mount *mp;
 440         struct vnode **vpp;
 441 {
 442
 443         return (ENOENT);
 444 }
 445
 446 void
 447 softdep_change_linkcnt(ip)
 448         struct inode *ip;
 449 {
 450
 451         panic("softdep_change_linkcnt called");
 452 }
 453
 454 void
 455 softdep_load_inodeblock(ip)
 456         struct inode *ip;
 457 {
 458
 459         panic("softdep_load_inodeblock called");
 460 }
 461
 462 void
 463 softdep_update_inodeblock(ip, bp, waitfor)
 464         struct inode *ip;
 465         struct buf *bp;
 466         int waitfor;
 467 {
 468
 469         panic("softdep_update_inodeblock called");
 470 }
 471
 472 int
 473 softdep_fsync(vp)
 474         struct vnode *vp;       /* the "in_core" copy of the inode */
 475 {
 476
 477         return (0);
 478 }
 479
 480 void
 481 softdep_fsync_mountdev(vp)
 482         struct vnode *vp;
 483 {
 484
 485         return;
 486 }
 487
 488 int
 489 softdep_flushworklist(oldmnt, countp, td)
 490         struct mount *oldmnt;
 491         int *countp;
 492         struct thread *td;
 493 {
 494
 495         *countp = 0;
 496         return (0);
 497 }
 498
 499 int
 500 softdep_sync_metadata(struct vnode *vp)
 501 {
 502
 503         panic("softdep_sync_metadata called");
 504 }
 505
 506 int
 507 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
 508 {
 509
 510         panic("softdep_sync_buf called");
 511 }
 512
 513 int
 514 softdep_slowdown(vp)
 515         struct vnode *vp;
 516 {
 517
 518         panic("softdep_slowdown called");
 519 }
 520
 521 int
 522 softdep_request_cleanup(fs, vp, cred, resource)
 523         struct fs *fs;
 524         struct vnode *vp;
 525         struct ucred *cred;
 526         int resource;
 527 {
 528
 529         return (0);
 530 }
 531
 532 int
 533 softdep_check_suspend(struct mount *mp,
 534                       struct vnode *devvp,
 535                       int softdep_depcnt,
 536                       int softdep_accdepcnt,
 537                       int secondary_writes,
 538                       int secondary_accwrites)
 539 {
 540         struct bufobj *bo;
 541         int error;
 542
 543         (void) softdep_depcnt,
 544         (void) softdep_accdepcnt;
 545
 546         bo = &devvp->v_bufobj;
 547         ASSERT_BO_WLOCKED(bo);
 548
 549         MNT_ILOCK(mp);
 550         while (mp->mnt_secondary_writes != 0) {
 551                 BO_UNLOCK(bo);
 552                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 553                     (PUSER - 1) | PDROP, "secwr", 0);
 554                 BO_LOCK(bo);
 555                 MNT_ILOCK(mp);
 556         }
 557
 558         /*
 559          * Reasons for needing more work before suspend:
 560          * - Dirty buffers on devvp.
 561          * - Secondary writes occurred after start of vnode sync loop
 562          */
 563         error = 0;
 564         if (bo->bo_numoutput > 0 ||
 565             bo->bo_dirty.bv_cnt > 0 ||
 566             secondary_writes != 0 ||
 567             mp->mnt_secondary_writes != 0 ||
 568             secondary_accwrites != mp->mnt_secondary_accwrites)
 569                 error = EAGAIN;
 570         BO_UNLOCK(bo);
 571         return (error);
 572 }
 573
 574 void
 575 softdep_get_depcounts(struct mount *mp,
 576                       int *softdepactivep,
 577                       int *softdepactiveaccp)
 578 {
 579         (void) mp;
 580         *softdepactivep = 0;
 581         *softdepactiveaccp = 0;
 582 }
 583
 584 void
 585 softdep_buf_append(bp, wkhd)
 586         struct buf *bp;
 587         struct workhead *wkhd;
 588 {
 589
 590         panic("softdep_buf_appendwork called");
 591 }
 592
 593 void
 594 softdep_inode_append(ip, cred, wkhd)
 595         struct inode *ip;
 596         struct ucred *cred;
 597         struct workhead *wkhd;
 598 {
 599
 600         panic("softdep_inode_appendwork called");
 601 }
 602
 603 void
 604 softdep_freework(wkhd)
 605         struct workhead *wkhd;
 606 {
 607
 608         panic("softdep_freework called");
 609 }
 610
 611 #else
 612
/* Register the "softupdates" feature with its description string. */
FEATURE(softupdates, "FFS soft-updates support");

/*
 * Sysctl nodes for soft-updates statistics: a "softdep" node under _debug
 * with four children (total, highuse, current, write).  The per-type
 * leaves are attached to these children by SOFTDEP_TYPE().
 */
static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
    "soft updates stats");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
    "total dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
    "high use dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
    "current dependencies allocated");
static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
    "current dependencies written");
 625
/*
 * Per-type counters, each sized D_LAST + 1 and indexed by a D_* workitem
 * type.  SOFTDEP_TYPE() exports element [D_<type>] of each array as a
 * read-only sysctl.
 * NOTE(review): the code that updates these counters is outside this
 * view.
 */
unsigned long dep_current[D_LAST + 1];
unsigned long dep_highuse[D_LAST + 1];
unsigned long dep_total[D_LAST + 1];
unsigned long dep_write[D_LAST + 1];
 630
 631 #define SOFTDEP_TYPE(type, str, long)                                   \
 632     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
 633     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
 634         &dep_total[D_ ## type], 0, "");                                 \
 635     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
 636         &dep_current[D_ ## type], 0, "");                               \
 637     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
 638         &dep_highuse[D_ ## type], 0, "");                               \
 639     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
 640         &dep_write[D_ ## type], 0, "");
 641
/*
 * One SOFTDEP_TYPE() per dependency structure.  Each line defines the
 * malloc type M_<TYPE> and the sysctl leaves for D_<TYPE>; the string is
 * the malloc type's long description.  The order here mirrors the
 * memtype[] table below.
 */
SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
    "Block or frag allocated from cyl group map");
SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
 670
/*
 * Malloc types that are declared directly rather than through
 * SOFTDEP_TYPE(), so they get no per-type sysctl leaves.  M_SENTINEL is
 * the last entry of the memtype[] table below.
 */
static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");

static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");

/*
 * Allocation flags for soft-updates structures.
 * NOTE(review): the allocation call sites are outside this view.
 */
#define M_SOFTDEP_FLAGS (M_WAITOK)
 678
/*
 * translate from workitem type to memory type
 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
 *
 * Slot 0 is NULL, so the first real type sits at index 1.  Entries are
 * positional: adding or reordering a D_* type requires the same change
 * here.
 * NOTE(review): the D_* values are defined outside this view; confirm
 * that they start at 1 and are contiguous through the sentinel.
 */
static struct malloc_type *memtype[] = {
        NULL,
        M_PAGEDEP,
        M_INODEDEP,
        M_BMSAFEMAP,
        M_NEWBLK,
        M_ALLOCDIRECT,
        M_INDIRDEP,
        M_ALLOCINDIR,
        M_FREEFRAG,
        M_FREEBLKS,
        M_FREEFILE,
        M_DIRADD,
        M_MKDIR,
        M_DIRREM,
        M_NEWDIRBLK,
        M_FREEWORK,
        M_FREEDEP,
        M_JADDREF,
        M_JREMREF,
        M_JMVREF,
        M_JNEWBLK,
        M_JFREEBLK,
        M_JFREEFRAG,
        M_JSEG,
        M_JSEGDEP,
        M_SBDEP,
        M_JTRUNC,
        M_JFSYNC,
        M_SENTINEL
};
 714
/*
 * Map a D_* workitem type to its malloc type.  No range check is done;
 * the argument must be a valid index into memtype[].
 */
#define DtoM(type) (memtype[type])

/*
 * Names of malloc types.
 *
 * Yields the short description of the malloc type for a workitem type in
 * [D_FIRST, D_LAST], or "???" for anything outside that range.  The
 * argument is evaluated more than once, so it must not have side effects.
 */
#define TYPENAME(type)  \
        ((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
        memtype[type]->ks_shortdesc : "???")
 723 /*
 724  * End system adaptation definitions.
 725  */
 726
/*
 * Byte offsets of the dotdot_ino and dot_ino members within
 * struct dirtemplate.
 */
#define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
#define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
 729
 730 /*
 731  * Internal function prototypes.
 732  */
 733 static  void check_clear_deps(struct mount *);
 734 static  void softdep_error(char *, int);
 735 static  int softdep_process_worklist(struct mount *, int);
 736 static  int softdep_waitidle(struct mount *, int);
 737 static  void drain_output(struct vnode *);
 738 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
 739 static  int check_inodedep_free(struct inodedep *);
 740 static  void clear_remove(struct mount *);
 741 static  void clear_inodedeps(struct mount *);
 742 static  void unlinked_inodedep(struct mount *, struct inodedep *);
 743 static  void clear_unlinked_inodedep(struct inodedep *);
 744 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 745 static  int flush_pagedep_deps(struct vnode *, struct mount *,
 746             struct diraddhd *);
 747 static  int free_pagedep(struct pagedep *);
 748 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 749 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 750 static  int flush_deplist(struct allocdirectlst *, int, int *);
 751 static  int sync_cgs(struct mount *, int);
 752 static  int handle_written_filepage(struct pagedep *, struct buf *, int);
 753 static  int handle_written_sbdep(struct sbdep *, struct buf *);
 754 static  void initiate_write_sbdep(struct sbdep *);
 755 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 756 static  int handle_written_indirdep(struct indirdep *, struct buf *,
 757             struct buf**, int);
 758 static  int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 759 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 760             uint8_t *);
 761 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 762 static  void handle_written_jaddref(struct jaddref *);
 763 static  void handle_written_jremref(struct jremref *);
 764 static  void handle_written_jseg(struct jseg *, struct buf *);
 765 static  void handle_written_jnewblk(struct jnewblk *);
 766 static  void handle_written_jblkdep(struct jblkdep *);
 767 static  void handle_written_jfreefrag(struct jfreefrag *);
 768 static  void complete_jseg(struct jseg *);
 769 static  void complete_jsegs(struct jseg *);
 770 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 771 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 772 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
 773 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
 774 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
 775 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
 776 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
 777 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
 778 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
 779 static  inline void inoref_write(struct inoref *, struct jseg *,
 780             struct jrefrec *);
 781 static  void handle_allocdirect_partdone(struct allocdirect *,
 782             struct workhead *);
 783 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
 784             struct workhead *);
 785 static  void indirdep_complete(struct indirdep *);
 786 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
 787 static  void indirblk_insert(struct freework *);
 788 static  void indirblk_remove(struct freework *);
 789 static  void handle_allocindir_partdone(struct allocindir *);
 790 static  void initiate_write_filepage(struct pagedep *, struct buf *);
 791 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
 792 static  void handle_written_mkdir(struct mkdir *, int);
 793 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
 794             uint8_t *);
 795 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 796 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 797 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 798 static  void handle_workitem_freefile(struct freefile *);
 799 static  int handle_workitem_remove(struct dirrem *, int);
 800 static  struct dirrem *newdirrem(struct buf *, struct inode *,
 801             struct inode *, int, struct dirrem **);
 802 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
 803             struct buf *);
 804 static  void cancel_indirdep(struct indirdep *, struct buf *,
 805             struct freeblks *);
 806 static  void free_indirdep(struct indirdep *);
 807 static  void free_diradd(struct diradd *, struct workhead *);
 808 static  void merge_diradd(struct inodedep *, struct diradd *);
 809 static  void complete_diradd(struct diradd *);
 810 static  struct diradd *diradd_lookup(struct pagedep *, int);
 811 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
 812             struct jremref *);
 813 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
 814             struct jremref *);
 815 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
 816             struct jremref *, struct jremref *);
 817 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
 818             struct jremref *);
 819 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
 820             struct freeblks *, int);
 821 static  int setup_trunc_indir(struct freeblks *, struct inode *,
 822             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
 823 static  void complete_trunc_indir(struct freework *);
 824 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
 825             int);
 826 static  void complete_mkdir(struct mkdir *);
 827 static  void free_newdirblk(struct newdirblk *);
 828 static  void free_jremref(struct jremref *);
 829 static  void free_jaddref(struct jaddref *);
 830 static  void free_jsegdep(struct jsegdep *);
 831 static  void free_jsegs(struct jblocks *);
 832 static  void rele_jseg(struct jseg *);
 833 static  void free_jseg(struct jseg *, struct jblocks *);
 834 static  void free_jnewblk(struct jnewblk *);
 835 static  void free_jblkdep(struct jblkdep *);
 836 static  void free_jfreefrag(struct jfreefrag *);
 837 static  void free_freedep(struct freedep *);
 838 static  void journal_jremref(struct dirrem *, struct jremref *,
 839             struct inodedep *);
 840 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
 841 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
 842             struct workhead *);
 843 static  void cancel_jfreefrag(struct jfreefrag *);
 844 static  inline void setup_freedirect(struct freeblks *, struct inode *,
 845             int, int);
 846 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 847 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
 848             ufs_lbn_t, int);
 849 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
 850 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
 851 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 852 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 853 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
 854 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
 855             int, int);
 856 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
 857 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
 858 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
 859 static  void newblk_freefrag(struct newblk*);
 860 static  void free_newblk(struct newblk *);
 861 static  void cancel_allocdirect(struct allocdirectlst *,
 862             struct allocdirect *, struct freeblks *);
 863 static  int check_inode_unwritten(struct inodedep *);
 864 static  int free_inodedep(struct inodedep *);
 865 static  void freework_freeblock(struct freework *, u_long);
 866 static  void freework_enqueue(struct freework *);
 867 static  int handle_workitem_freeblocks(struct freeblks *, int);
 868 static  int handle_complete_freeblocks(struct freeblks *, int);
 869 static  void handle_workitem_indirblk(struct freework *);
 870 static  void handle_written_freework(struct freework *);
 871 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 872 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
 873             struct workhead *);
 874 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
 875             struct inodedep *, struct allocindir *, ufs_lbn_t);
 876 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 877             ufs2_daddr_t, ufs_lbn_t);
 878 static  void handle_workitem_freefrag(struct freefrag *);
 879 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
 880             ufs_lbn_t, u_long);
 881 static  void allocdirect_merge(struct allocdirectlst *,
 882             struct allocdirect *, struct allocdirect *);
 883 static  struct freefrag *allocindir_merge(struct allocindir *,
 884             struct allocindir *);
 885 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
 886             struct bmsafemap **);
 887 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
 888             int cg, struct bmsafemap *);
 889 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
 890             struct newblk **);
 891 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
 892 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
 893             struct inodedep **);
 894 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 895 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
 896             int, struct pagedep **);
 897 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 898             struct pagedep **);
 899 static  void pause_timer(void *);
 900 static  int request_cleanup(struct mount *, int);
 901 static  int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
 902 static  void schedule_cleanup(struct mount *);
 903 static void softdep_ast_cleanup_proc(struct thread *);
 904 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
 905 static  int process_worklist_item(struct mount *, int, int);
 906 static  void process_removes(struct vnode *);
 907 static  void process_truncates(struct vnode *);
 908 static  void jwork_move(struct workhead *, struct workhead *);
 909 static  void jwork_insert(struct workhead *, struct jsegdep *);
 910 static  void add_to_worklist(struct worklist *, int);
 911 static  void wake_worklist(struct worklist *);
 912 static  void wait_worklist(struct worklist *, char *);
 913 static  void remove_from_worklist(struct worklist *);
 914 static  void softdep_flush(void *);
 915 static  void softdep_flushjournal(struct mount *);
 916 static  int softdep_speedup(struct ufsmount *);
 917 static  void worklist_speedup(struct mount *);
 918 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
 919 static  void journal_unmount(struct ufsmount *);
 920 static  int journal_space(struct ufsmount *, int);
 921 static  void journal_suspend(struct ufsmount *);
 922 static  int journal_unsuspend(struct ufsmount *ump);
 923 static  void softdep_prelink(struct vnode *, struct vnode *);
 924 static  void add_to_journal(struct worklist *);
 925 static  void remove_from_journal(struct worklist *);
 926 static  bool softdep_excess_items(struct ufsmount *, int);
 927 static  void softdep_process_journal(struct mount *, struct worklist *, int);
 928 static  struct jremref *newjremref(struct dirrem *, struct inode *,
 929             struct inode *ip, off_t, nlink_t);
 930 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
 931             uint16_t);
 932 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
 933             uint16_t);
 934 static  inline struct jsegdep *inoref_jseg(struct inoref *);
 935 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
 936 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
 937             ufs2_daddr_t, int);
 938 static  void adjust_newfreework(struct freeblks *, int);
 939 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
 940 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
 941 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 942 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
 943             ufs2_daddr_t, long, ufs_lbn_t);
 944 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
 945             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
 946 static  int jwait(struct worklist *, int);
 947 static  struct inodedep *inodedep_lookup_ip(struct inode *);
 948 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
 949 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
 950 static  void handle_jwork(struct workhead *);
 951 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
 952             struct mkdir **);
 953 static  struct jblocks *jblocks_create(void);
 954 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
 955 static  void jblocks_free(struct jblocks *, struct mount *, int);
 956 static  void jblocks_destroy(struct jblocks *);
 957 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
 958
 959 /*
 960  * Exported softdep operations.
 961  */
 962 static  void softdep_disk_io_initiation(struct buf *);
 963 static  void softdep_disk_write_complete(struct buf *);
 964 static  void softdep_deallocate_dependencies(struct buf *);
 965 static  int softdep_count_dependencies(struct buf *bp, int);
 966
 967 /*
 968  * Global lock over all of soft updates.
 969  */
 970 static struct mtx lk;
 971 MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
 972
 973 #define ACQUIRE_GBLLOCK(lk)     mtx_lock(lk)
 974 #define FREE_GBLLOCK(lk)        mtx_unlock(lk)
 975 #define GBLLOCK_OWNED(lk)       mtx_assert((lk), MA_OWNED)
 976
 977 /*
 978  * Per-filesystem soft-updates locking.
 979  */
 980 #define LOCK_PTR(ump)           (&(ump)->um_softdep->sd_fslock)
 981 #define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock(&(ump)->um_softdep->sd_fslock)
 982 #define ACQUIRE_LOCK(ump)       rw_wlock(&(ump)->um_softdep->sd_fslock)
 983 #define FREE_LOCK(ump)          rw_wunlock(&(ump)->um_softdep->sd_fslock)
 984 #define LOCK_OWNED(ump)         rw_assert(&(ump)->um_softdep->sd_fslock, \
 985                                     RA_WLOCKED)
 986
 987 #define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
 988 #define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
 989
 990 /*
 991  * Worklist queue management.
 992  * These routines require that the lock be held.
 993  */
 994 #ifndef /* NOT */ INVARIANTS
 995 #define WORKLIST_INSERT(head, item) do {        \
 996         (item)->wk_state |= ONWORKLIST;         \
 997         LIST_INSERT_HEAD(head, item, wk_list);  \
 998 } while (0)
 999 #define WORKLIST_REMOVE(item) do {              \
1000         (item)->wk_state &= ~ONWORKLIST;        \
1001         LIST_REMOVE(item, wk_list);             \
1002 } while (0)
1003 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
1004 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
1005
1006 #else /* INVARIANTS */
/*
 * Checked variants.  The "locked" argument tells the helper whether to
 * assert that the per-filesystem lock is held; the calling function and
 * line are passed along so they can be recorded in the work item.
 */
static	void worklist_insert(struct workhead *head, struct worklist *item,
	    int locked, const char *func, int line);
static	void worklist_remove(struct worklist *item, int locked,
	    const char *func, int line);

#define WORKLIST_INSERT(head, item)				\
	worklist_insert((head), (item), 1, __func__, __LINE__)
#define WORKLIST_INSERT_UNLOCKED(head, item)			\
	worklist_insert((head), (item), 0, __func__, __LINE__)
#define WORKLIST_REMOVE(item)					\
	worklist_remove((item), 1, __func__, __LINE__)
#define WORKLIST_REMOVE_UNLOCKED(item)				\
	worklist_remove((item), 0, __func__, __LINE__)

1019
1020 static void
1021 worklist_insert(head, item, locked, func, line)
1022         struct workhead *head;
1023         struct worklist *item;
1024         int locked;
1025         const char *func;
1026         int line;
1027 {
1028
1029         if (locked)
1030                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1031         if (item->wk_state & ONWORKLIST)
1032                 panic("worklist_insert: %p %s(0x%X) already on list, "
1033                     "added in function %s at line %d",
1034                     item, TYPENAME(item->wk_type), item->wk_state,
1035                     item->wk_func, item->wk_line);
1036         item->wk_state |= ONWORKLIST;
1037         item->wk_func = func;
1038         item->wk_line = line;
1039         LIST_INSERT_HEAD(head, item, wk_list);
1040 }
1041
1042 static void
1043 worklist_remove(item, locked, func, line)
1044         struct worklist *item;
1045         int locked;
1046         const char *func;
1047         int line;
1048 {
1049
1050         if (locked)
1051                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1052         if ((item->wk_state & ONWORKLIST) == 0)
1053                 panic("worklist_remove: %p %s(0x%X) not on list, "
1054                     "removed in function %s at line %d",
1055                     item, TYPENAME(item->wk_type), item->wk_state,
1056                     item->wk_func, item->wk_line);
1057         item->wk_state &= ~ONWORKLIST;
1058         item->wk_func = func;
1059         item->wk_line = line;
1060         LIST_REMOVE(item, wk_list);
1061 }
1062 #endif /* INVARIANTS */
1063
1064 /*
1065  * Merge two jsegdeps keeping only the oldest one as newer references
1066  * can't be discarded until after older references.
1067  */
1068 static inline struct jsegdep *
1069 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1070 {
1071         struct jsegdep *swp;
1072
1073         if (two == NULL)
1074                 return (one);
1075
1076         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1077                 swp = one;
1078                 one = two;
1079                 two = swp;
1080         }
1081         WORKLIST_REMOVE(&two->jd_list);
1082         free_jsegdep(two);
1083
1084         return (one);
1085 }
1086
1087 /*
1088  * If two freedeps are compatible free one to reduce list size.
1089  */
1090 static inline struct freedep *
1091 freedep_merge(struct freedep *one, struct freedep *two)
1092 {
1093         if (two == NULL)
1094                 return (one);
1095
1096         if (one->fd_freework == two->fd_freework) {
1097                 WORKLIST_REMOVE(&two->fd_list);
1098                 free_freedep(two);
1099         }
1100         return (one);
1101 }
1102
1103 /*
1104  * Move journal work from one list to another.  Duplicate freedeps and
1105  * jsegdeps are coalesced to keep the lists as small as possible.
1106  */
1107 static void
1108 jwork_move(dst, src)
1109         struct workhead *dst;
1110         struct workhead *src;
1111 {
1112         struct freedep *freedep;
1113         struct jsegdep *jsegdep;
1114         struct worklist *wkn;
1115         struct worklist *wk;
1116
1117         KASSERT(dst != src,
1118             ("jwork_move: dst == src"));
1119         freedep = NULL;
1120         jsegdep = NULL;
1121         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1122                 if (wk->wk_type == D_JSEGDEP)
1123                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1124                 else if (wk->wk_type == D_FREEDEP)
1125                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1126         }
1127
1128         while ((wk = LIST_FIRST(src)) != NULL) {
1129                 WORKLIST_REMOVE(wk);
1130                 WORKLIST_INSERT(dst, wk);
1131                 if (wk->wk_type == D_JSEGDEP) {
1132                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1133                         continue;
1134                 }
1135                 if (wk->wk_type == D_FREEDEP)
1136                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1137         }
1138 }
1139
1140 static void
1141 jwork_insert(dst, jsegdep)
1142         struct workhead *dst;
1143         struct jsegdep *jsegdep;
1144 {
1145         struct jsegdep *jsegdepn;
1146         struct worklist *wk;
1147
1148         LIST_FOREACH(wk, dst, wk_list)
1149                 if (wk->wk_type == D_JSEGDEP)
1150                         break;
1151         if (wk == NULL) {
1152                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1153                 return;
1154         }
1155         jsegdepn = WK_JSEGDEP(wk);
1156         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1157                 WORKLIST_REMOVE(wk);
1158                 free_jsegdep(jsegdepn);
1159                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1160         } else
1161                 free_jsegdep(jsegdep);
1162 }
1163
1164 /*
1165  * Routines for tracking and managing workitems.
1166  */
static	void workitem_alloc(struct worklist *item, int type,
	    struct mount *mp);
static	void workitem_free(struct worklist *item, int type);
static	void workitem_reassign(struct worklist *item, int newtype);

/*
 * Convenience wrappers that accept a pointer to any dependency
 * structure and cast it to the struct worklist the helpers expect.
 */
#define WORKITEM_FREE(item, type)				\
	workitem_free((struct worklist *)(item), (type))
#define WORKITEM_REASSIGN(item, type)				\
	workitem_reassign((struct worklist *)(item), (type))
1175
1176 static void
1177 workitem_free(item, type)
1178         struct worklist *item;
1179         int type;
1180 {
1181         struct ufsmount *ump;
1182
1183 #ifdef INVARIANTS
1184         if (item->wk_state & ONWORKLIST)
1185                 panic("workitem_free: %s(0x%X) still on list, "
1186                     "added in function %s at line %d",
1187                     TYPENAME(item->wk_type), item->wk_state,
1188                     item->wk_func, item->wk_line);
1189         if (item->wk_type != type && type != D_NEWBLK)
1190                 panic("workitem_free: type mismatch %s != %s",
1191                     TYPENAME(item->wk_type), TYPENAME(type));
1192 #endif
1193         if (item->wk_state & IOWAITING)
1194                 wakeup(item);
1195         ump = VFSTOUFS(item->wk_mp);
1196         LOCK_OWNED(ump);
1197         KASSERT(ump->softdep_deps > 0,
1198             ("workitem_free: %s: softdep_deps going negative",
1199             ump->um_fs->fs_fsmnt));
1200         if (--ump->softdep_deps == 0 && ump->softdep_req)
1201                 wakeup(&ump->softdep_deps);
1202         KASSERT(dep_current[item->wk_type] > 0,
1203             ("workitem_free: %s: dep_current[%s] going negative",
1204             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1205         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1206             ("workitem_free: %s: softdep_curdeps[%s] going negative",
1207             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1208         atomic_subtract_long(&dep_current[item->wk_type], 1);
1209         ump->softdep_curdeps[item->wk_type] -= 1;
1210         free(item, DtoM(type));
1211 }
1212
1213 static void
1214 workitem_alloc(item, type, mp)
1215         struct worklist *item;
1216         int type;
1217         struct mount *mp;
1218 {
1219         struct ufsmount *ump;
1220
1221         item->wk_type = type;
1222         item->wk_mp = mp;
1223         item->wk_state = 0;
1224
1225         ump = VFSTOUFS(mp);
1226         ACQUIRE_GBLLOCK(&lk);
1227         dep_current[type]++;
1228         if (dep_current[type] > dep_highuse[type])
1229                 dep_highuse[type] = dep_current[type];
1230         dep_total[type]++;
1231         FREE_GBLLOCK(&lk);
1232         ACQUIRE_LOCK(ump);
1233         ump->softdep_curdeps[type] += 1;
1234         ump->softdep_deps++;
1235         ump->softdep_accdeps++;
1236         FREE_LOCK(ump);
1237 }
1238
1239 static void
1240 workitem_reassign(item, newtype)
1241         struct worklist *item;
1242         int newtype;
1243 {
1244         struct ufsmount *ump;
1245
1246         ump = VFSTOUFS(item->wk_mp);
1247         LOCK_OWNED(ump);
1248         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1249             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1250             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1251         ump->softdep_curdeps[item->wk_type] -= 1;
1252         ump->softdep_curdeps[newtype] += 1;
1253         KASSERT(dep_current[item->wk_type] > 0,
1254             ("workitem_reassign: %s: dep_current[%s] going negative",
1255             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1256         ACQUIRE_GBLLOCK(&lk);
1257         dep_current[newtype]++;
1258         dep_current[item->wk_type]--;
1259         if (dep_current[newtype] > dep_highuse[newtype])
1260                 dep_highuse[newtype] = dep_current[newtype];
1261         dep_total[newtype]++;
1262         FREE_GBLLOCK(&lk);
1263         item->wk_type = newtype;
1264 }
1265
1266 /*
1267  * Workitem queue management
1268  */
1269 static int max_softdeps;        /* maximum number of structs before slowdown */
1270 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
1271 static int proc_waiting;        /* tracks whether we have a timeout posted */
1272 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
1273 static struct callout softdep_callout;
1274 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
1275 static int req_clear_remove;    /* syncer process flush some freeblks */
1276 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1277
1278 /*
1279  * runtime statistics
1280  */
/* Flusher threads and worklist activity. */
static int stat_flush_threads;	/* softdep flushing threads running */
static int stat_worklist_push;	/* worklist cleanups */

/* Resource limits: approached (push) and enforced (hit). */
static int stat_blk_limit_push;	/* block limit neared */
static int stat_ino_limit_push;	/* inode limit neared */
static int stat_blk_limit_hit;	/* block slowdown imposed */
static int stat_ino_limit_hit;	/* inode slowdown imposed */
static int stat_sync_limit_hit;	/* synchronous slowdowns imposed */

/* Buffers redirtied, by reason. */
static int stat_indir_blk_ptrs;	/* indir ptrs not written */
static int stat_inode_bitmap;	/* inode bitmap not written */
static int stat_direct_blk_ptrs; /* direct ptrs not written */
static int stat_dir_entry;	/* dir entry cannot write */
static int stat_jaddref;	/* ino bitmap can not write */
static int stat_jnewblk;	/* blk bitmap can not write */

/* Journal thresholds and time blocked in jwait(). */
static int stat_journal_min;	/* times the min threshold was hit */
static int stat_journal_low;	/* times the low threshold was hit */
static int stat_journal_wait;	/* times blocked in jwait() */
static int stat_jwait_filepage;	/* ... of which for filepage */
static int stat_jwait_freeblks;	/* ... of which for freeblks */
static int stat_jwait_inode;	/* ... of which for inodes */
static int stat_jwait_newblk;	/* ... of which for newblks */

/* Cleanup requests. */
static int stat_cleanup_high_delay;	/* maximum cleanup delay, in ticks */
static int stat_cleanup_blkrequests;	/* block cleanup requests */
static int stat_cleanup_inorequests;	/* inode cleanup requests */
static int stat_cleanup_retries;	/* cleanups that needed to flush */
static int stat_cleanup_failures;	/* cleanup requests that failed */

static int stat_emptyjblocks;	/* potentially empty journal blocks */
1307
1308 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1309     &max_softdeps, 0, "");
1310 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1311     &tickdelay, 0, "");
1312 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1313     &stat_flush_threads, 0, "");
1314 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1315     &stat_worklist_push, 0,"");
1316 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1317     &stat_blk_limit_push, 0,"");
1318 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1319     &stat_ino_limit_push, 0,"");
1320 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1321     &stat_blk_limit_hit, 0, "");
1322 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1323     &stat_ino_limit_hit, 0, "");
1324 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1325     &stat_sync_limit_hit, 0, "");
1326 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1327     &stat_indir_blk_ptrs, 0, "");
1328 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1329     &stat_inode_bitmap, 0, "");
1330 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1331     &stat_direct_blk_ptrs, 0, "");
1332 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1333     &stat_dir_entry, 0, "");
1334 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1335     &stat_jaddref, 0, "");
1336 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1337     &stat_jnewblk, 0, "");
1338 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1339     &stat_journal_low, 0, "");
1340 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1341     &stat_journal_min, 0, "");
1342 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1343     &stat_journal_wait, 0, "");
1344 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1345     &stat_jwait_filepage, 0, "");
1346 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1347     &stat_jwait_freeblks, 0, "");
1348 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1349     &stat_jwait_inode, 0, "");
1350 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1351     &stat_jwait_newblk, 0, "");
1352 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1353     &stat_cleanup_blkrequests, 0, "");
1354 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1355     &stat_cleanup_inorequests, 0, "");
1356 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1357     &stat_cleanup_high_delay, 0, "");
1358 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1359     &stat_cleanup_retries, 0, "");
1360 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1361     &stat_cleanup_failures, 0, "");
1362 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1363     &softdep_flushcache, 0, "");
1364 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1365     &stat_emptyjblocks, 0, "");
1366
1367 SYSCTL_DECL(_vfs_ffs);
1368
1369 /* Whether to recompute the summary at mount time */
1370 static int compute_summary_at_mount = 0;
1371 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1372            &compute_summary_at_mount, 0, "Recompute summary at mount");
1373 static int print_threads = 0;
1374 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1375     &print_threads, 0, "Notify flusher thread start/stop");
1376
1377 /* List of all filesystems mounted with soft updates */
1378 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1379
1380 /*
1381  * This function cleans the worklist for a filesystem.
1382  * Each filesystem running with soft dependencies gets its own
1383  * thread to run in this function. The thread is started up in
1384  * softdep_mount and shutdown in softdep_unmount. They show up
1385  * as part of the kernel "bufdaemon" process whose process
1386  * entry is available in bufdaemonproc.
1387  */
1388 static int searchfailed;
1389 extern struct proc *bufdaemonproc;
1390 static void
1391 softdep_flush(addr)
1392         void *addr;
1393 {
1394         struct mount *mp;
1395         struct thread *td;
1396         struct ufsmount *ump;
1397
1398         td = curthread;
1399         td->td_pflags |= TDP_NORUNNINGBUF;
1400         mp = (struct mount *)addr;
1401         ump = VFSTOUFS(mp);
1402         atomic_add_int(&stat_flush_threads, 1);
1403         ACQUIRE_LOCK(ump);
1404         ump->softdep_flags &= ~FLUSH_STARTING;
1405         wakeup(&ump->softdep_flushtd);
1406         FREE_LOCK(ump);
1407         if (print_threads) {
1408                 if (stat_flush_threads == 1)
1409                         printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1410                             bufdaemonproc->p_pid);
1411                 printf("Start thread %s\n", td->td_name);
1412         }
1413         for (;;) {
1414                 while (softdep_process_worklist(mp, 0) > 0 ||
1415                     (MOUNTEDSUJ(mp) &&
1416                     VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1417                         kthread_suspend_check();
1418                 ACQUIRE_LOCK(ump);
1419                 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1420                         msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1421                             "sdflush", hz / 2);
1422                 ump->softdep_flags &= ~FLUSH_CLEANUP;
1423                 /*
1424                  * Check to see if we are done and need to exit.
1425                  */
1426                 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1427                         FREE_LOCK(ump);
1428                         continue;
1429                 }
1430                 ump->softdep_flags &= ~FLUSH_EXIT;
1431                 FREE_LOCK(ump);
1432                 wakeup(&ump->softdep_flags);
1433                 if (print_threads)
1434                         printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1435                 atomic_subtract_int(&stat_flush_threads, 1);
1436                 kthread_exit();
1437                 panic("kthread_exit failed\n");
1438         }
1439 }
1440
1441 static void
1442 worklist_speedup(mp)
1443         struct mount *mp;
1444 {
1445         struct ufsmount *ump;
1446
1447         ump = VFSTOUFS(mp);
1448         LOCK_OWNED(ump);
1449         if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1450                 ump->softdep_flags |= FLUSH_CLEANUP;
1451         wakeup(&ump->softdep_flushtd);
1452 }
1453
1454 static int
1455 softdep_speedup(ump)
1456         struct ufsmount *ump;
1457 {
1458         struct ufsmount *altump;
1459         struct mount_softdeps *sdp;
1460
1461         LOCK_OWNED(ump);
1462         worklist_speedup(ump->um_mountp);
1463         bd_speedup();
1464         /*
1465          * If we have global shortages, then we need other
1466          * filesystems to help with the cleanup. Here we wakeup a
1467          * flusher thread for a filesystem that is over its fair
1468          * share of resources.
1469          */
1470         if (req_clear_inodedeps || req_clear_remove) {
1471                 ACQUIRE_GBLLOCK(&lk);
1472                 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1473                         if ((altump = sdp->sd_ump) == ump)
1474                                 continue;
1475                         if (((req_clear_inodedeps &&
1476                             altump->softdep_curdeps[D_INODEDEP] >
1477                             max_softdeps / stat_flush_threads) ||
1478                             (req_clear_remove &&
1479                             altump->softdep_curdeps[D_DIRREM] >
1480                             (max_softdeps / 2) / stat_flush_threads)) &&
1481                             TRY_ACQUIRE_LOCK(altump))
1482                                 break;
1483                 }
1484                 if (sdp == NULL) {
1485                         searchfailed++;
1486                         FREE_GBLLOCK(&lk);
1487                 } else {
1488                         /*
1489                          * Move to the end of the list so we pick a
1490                          * different one on out next try.
1491                          */
1492                         TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1493                         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1494                         FREE_GBLLOCK(&lk);
1495                         if ((altump->softdep_flags &
1496                             (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1497                                 altump->softdep_flags |= FLUSH_CLEANUP;
1498                         altump->um_softdep->sd_cleanups++;
1499                         wakeup(&altump->softdep_flushtd);
1500                         FREE_LOCK(altump);
1501                 }
1502         }
1503         return (speedup_syncer());
1504 }
1505
/*
 * Add an item to the work queue.  Items are appended to the end of
 * the queue unless WK_HEAD is given, in which case they are placed
 * at the front.
 * This routine requires that the lock be held.
 * This is the only routine that adds items to the list.
 * The following routine is the only one that removes items
 * and does so in order from first to last.
 */
1513
1514 #define WK_HEAD         0x0001  /* Add to HEAD. */
1515 #define WK_NODELAY      0x0002  /* Process immediately. */
1516
1517 static void
1518 add_to_worklist(wk, flags)
1519         struct worklist *wk;
1520         int flags;
1521 {
1522         struct ufsmount *ump;
1523
1524         ump = VFSTOUFS(wk->wk_mp);
1525         LOCK_OWNED(ump);
1526         if (wk->wk_state & ONWORKLIST)
1527                 panic("add_to_worklist: %s(0x%X) already on list",
1528                     TYPENAME(wk->wk_type), wk->wk_state);
1529         wk->wk_state |= ONWORKLIST;
1530         if (ump->softdep_on_worklist == 0) {
1531                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1532                 ump->softdep_worklist_tail = wk;
1533         } else if (flags & WK_HEAD) {
1534                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1535         } else {
1536                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1537                 ump->softdep_worklist_tail = wk;
1538         }
1539         ump->softdep_on_worklist += 1;
1540         if (flags & WK_NODELAY)
1541                 worklist_speedup(wk->wk_mp);
1542 }
1543
1544 /*
1545  * Remove the item to be processed. If we are removing the last
1546  * item on the list, we need to recalculate the tail pointer.
1547  */
1548 static void
1549 remove_from_worklist(wk)
1550         struct worklist *wk;
1551 {
1552         struct ufsmount *ump;
1553
1554         ump = VFSTOUFS(wk->wk_mp);
1555         if (ump->softdep_worklist_tail == wk)
1556                 ump->softdep_worklist_tail =
1557                     (struct worklist *)wk->wk_list.le_prev;
1558         WORKLIST_REMOVE(wk);
1559         ump->softdep_on_worklist -= 1;
1560 }
1561
1562 static void
1563 wake_worklist(wk)
1564         struct worklist *wk;
1565 {
1566         if (wk->wk_state & IOWAITING) {
1567                 wk->wk_state &= ~IOWAITING;
1568                 wakeup(wk);
1569         }
1570 }
1571
1572 static void
1573 wait_worklist(wk, wmesg)
1574         struct worklist *wk;
1575         char *wmesg;
1576 {
1577         struct ufsmount *ump;
1578
1579         ump = VFSTOUFS(wk->wk_mp);
1580         wk->wk_state |= IOWAITING;
1581         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1582 }
1583
1584 /*
1585  * Process that runs once per second to handle items in the background queue.
1586  *
1587  * Note that we ensure that everything is done in the order in which they
1588  * appear in the queue. The code below depends on this property to ensure
1589  * that blocks of a file are freed before the inode itself is freed. This
1590  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1591  * until all the old ones have been purged from the dependency lists.
1592  */
1593 static int
1594 softdep_process_worklist(mp, full)
1595         struct mount *mp;
1596         int full;
1597 {
1598         int cnt, matchcnt;
1599         struct ufsmount *ump;
1600         long starttime;
1601
1602         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1603         if (MOUNTEDSOFTDEP(mp) == 0)
1604                 return (0);
1605         matchcnt = 0;
1606         ump = VFSTOUFS(mp);
1607         ACQUIRE_LOCK(ump);
1608         starttime = time_second;
1609         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1610         check_clear_deps(mp);
1611         while (ump->softdep_on_worklist > 0) {
1612                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1613                         break;
1614                 else
1615                         matchcnt += cnt;
1616                 check_clear_deps(mp);
1617                 /*
1618                  * We do not generally want to stop for buffer space, but if
1619                  * we are really being a buffer hog, we will stop and wait.
1620                  */
1621                 if (should_yield()) {
1622                         FREE_LOCK(ump);
1623                         kern_yield(PRI_USER);
1624                         bwillwrite();
1625                         ACQUIRE_LOCK(ump);
1626                 }
1627                 /*
1628                  * Never allow processing to run for more than one
1629                  * second. This gives the syncer thread the opportunity
1630                  * to pause if appropriate.
1631                  */
1632                 if (!full && starttime != time_second)
1633                         break;
1634         }
1635         if (full == 0)
1636                 journal_unsuspend(ump);
1637         FREE_LOCK(ump);
1638         return (matchcnt);
1639 }
1640
1641 /*
1642  * Process all removes associated with a vnode if we are running out of
1643  * journal space.  Any other process which attempts to flush these will
1644  * be unable as we have the vnodes locked.
1645  */
1646 static void
1647 process_removes(vp)
1648         struct vnode *vp;
1649 {
1650         struct inodedep *inodedep;
1651         struct dirrem *dirrem;
1652         struct ufsmount *ump;
1653         struct mount *mp;
1654         ino_t inum;
1655
1656         mp = vp->v_mount;
1657         ump = VFSTOUFS(mp);
1658         LOCK_OWNED(ump);
1659         inum = VTOI(vp)->i_number;
1660         for (;;) {
1661 top:
1662                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1663                         return;
1664                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1665                         /*
1666                          * If another thread is trying to lock this vnode
1667                          * it will fail but we must wait for it to do so
1668                          * before we can proceed.
1669                          */
1670                         if (dirrem->dm_state & INPROGRESS) {
1671                                 wait_worklist(&dirrem->dm_list, "pwrwait");
1672                                 goto top;
1673                         }
1674                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1675                             (COMPLETE | ONWORKLIST))
1676                                 break;
1677                 }
1678                 if (dirrem == NULL)
1679                         return;
1680                 remove_from_worklist(&dirrem->dm_list);
1681                 FREE_LOCK(ump);
1682                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1683                         panic("process_removes: suspended filesystem");
1684                 handle_workitem_remove(dirrem, 0);
1685                 vn_finished_secondary_write(mp);
1686                 ACQUIRE_LOCK(ump);
1687         }
1688 }
1689
1690 /*
1691  * Process all truncations associated with a vnode if we are running out
1692  * of journal space.  This is called when the vnode lock is already held
1693  * and no other process can clear the truncation.  This function returns
1694  * a value greater than zero if it did any work.
1695  */
1696 static void
1697 process_truncates(vp)
1698         struct vnode *vp;
1699 {
1700         struct inodedep *inodedep;
1701         struct freeblks *freeblks;
1702         struct ufsmount *ump;
1703         struct mount *mp;
1704         ino_t inum;
1705         int cgwait;
1706
1707         mp = vp->v_mount;
1708         ump = VFSTOUFS(mp);
1709         LOCK_OWNED(ump);
1710         inum = VTOI(vp)->i_number;
1711         for (;;) {
1712                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1713                         return;
1714                 cgwait = 0;
1715                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1716                         /* Journal entries not yet written.  */
1717                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1718                                 jwait(&LIST_FIRST(
1719                                     &freeblks->fb_jblkdephd)->jb_list,
1720                                     MNT_WAIT);
1721                                 break;
1722                         }
1723                         /* Another thread is executing this item. */
1724                         if (freeblks->fb_state & INPROGRESS) {
1725                                 wait_worklist(&freeblks->fb_list, "ptrwait");
1726                                 break;
1727                         }
1728                         /* Freeblks is waiting on a inode write. */
1729                         if ((freeblks->fb_state & COMPLETE) == 0) {
1730                                 FREE_LOCK(ump);
1731                                 ffs_update(vp, 1);
1732                                 ACQUIRE_LOCK(ump);
1733                                 break;
1734                         }
1735                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1736                             (ALLCOMPLETE | ONWORKLIST)) {
1737                                 remove_from_worklist(&freeblks->fb_list);
1738                                 freeblks->fb_state |= INPROGRESS;
1739                                 FREE_LOCK(ump);
1740                                 if (vn_start_secondary_write(NULL, &mp,
1741                                     V_NOWAIT))
1742                                         panic("process_truncates: "
1743                                             "suspended filesystem");
1744                                 handle_workitem_freeblocks(freeblks, 0);
1745                                 vn_finished_secondary_write(mp);
1746                                 ACQUIRE_LOCK(ump);
1747                                 break;
1748                         }
1749                         if (freeblks->fb_cgwait)
1750                                 cgwait++;
1751                 }
1752                 if (cgwait) {
1753                         FREE_LOCK(ump);
1754                         sync_cgs(mp, MNT_WAIT);
1755                         ffs_sync_snap(mp, MNT_WAIT);
1756                         ACQUIRE_LOCK(ump);
1757                         continue;
1758                 }
1759                 if (freeblks == NULL)
1760                         break;
1761         }
1762         return;
1763 }
1764
/*
 * Process items from the pending worklist of a mount point.
 *
 * mp      mount point whose worklist is processed; must not be NULL.
 * target  processing stops once this many items have completed
 *         successfully.  A value that the running count never reaches
 *         (such as 0) lets the loop continue until no item remains
 *         after the sentinel.
 * flags   passed through to the dirrem and freeblks handlers.
 *
 * Returns the number of items whose handler reported success, or -1 if
 * the calling thread has TDP_COWINPROGRESS set, in which case nothing is
 * touched.
 *
 * The per-filesystem softdep lock must be held on entry and is held on
 * return.  It is dropped while each handler runs.
 */
static int
process_worklist_item(mp, target, flags)
        struct mount *mp;
        int target;
        int flags;
{
        struct worklist sentinel;
        struct worklist *wk;
        struct ufsmount *ump;
        int matchcnt;
        int error;

        KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
        /*
         * If we are being called because of a process doing a
         * copy-on-write, then it is not safe to write as we may
         * recurse into the copy-on-write routine.
         */
        if (curthread->td_pflags & TDP_COWINPROGRESS)
                return (-1);
        PHOLD(curproc); /* Don't let the stack go away. */
        ump = VFSTOUFS(mp);
        LOCK_OWNED(ump);
        matchcnt = 0;
        /*
         * The sentinel lives on this function's stack (hence the PHOLD
         * above) and marks our position in the list: the item to work
         * on is always whatever follows it.
         */
        sentinel.wk_mp = NULL;
        sentinel.wk_type = D_SENTINEL;
        LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
        for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
            wk = LIST_NEXT(&sentinel, wk_list)) {
                /*
                 * The next entry is itself a sentinel (presumably left
                 * by a concurrent invocation of this function): step
                 * our own sentinel past it and look again.
                 */
                if (wk->wk_type == D_SENTINEL) {
                        LIST_REMOVE(&sentinel, wk_list);
                        LIST_INSERT_AFTER(wk, &sentinel, wk_list);
                        continue;
                }
                if (wk->wk_state & INPROGRESS)
                        panic("process_worklist_item: %p already in progress.",
                            wk);
                /* Claim the item before the lock is released. */
                wk->wk_state |= INPROGRESS;
                remove_from_worklist(wk);
                FREE_LOCK(ump);
                if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
                        panic("process_worklist_item: suspended filesystem");
                switch (wk->wk_type) {
                case D_DIRREM:
                        /* removal of a directory entry */
                        error = handle_workitem_remove(WK_DIRREM(wk), flags);
                        break;

                case D_FREEBLKS:
                        /* releasing blocks and/or fragments from a file */
                        error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
                            flags);
                        break;

                case D_FREEFRAG:
                        /* releasing a fragment when replaced as a file grows */
                        handle_workitem_freefrag(WK_FREEFRAG(wk));
                        error = 0;
                        break;

                case D_FREEFILE:
                        /* releasing an inode when its link count drops to 0 */
                        handle_workitem_freefile(WK_FREEFILE(wk));
                        error = 0;
                        break;

                default:
                        panic("%s_process_worklist: Unknown type %s",
                            "softdep", TYPENAME(wk->wk_type));
                        /* NOTREACHED */
                }
                vn_finished_secondary_write(mp);
                ACQUIRE_LOCK(ump);
                /* Success: count it and stop once the target is reached. */
                if (error == 0) {
                        if (++matchcnt == target)
                                break;
                        continue;
                }
                /*
                 * We have to retry the worklist item later.  Wake up any
                 * waiters who may be able to complete it immediately and
                 * add the item back to the head so we don't try to execute
                 * it again.
                 */
                wk->wk_state &= ~INPROGRESS;
                wake_worklist(wk);
                add_to_worklist(wk, WK_HEAD);
        }
        /* Sentinel could've become the tail from remove_from_worklist. */
        if (ump->softdep_worklist_tail == &sentinel)
                ump->softdep_worklist_tail =
                    (struct worklist *)sentinel.wk_list.le_prev;
        LIST_REMOVE(&sentinel, wk_list);
        PRELE(curproc);
        return (matchcnt);
}
1864
1865 /*
1866  * Move dependencies from one buffer to another.
1867  */
1868 int
1869 softdep_move_dependencies(oldbp, newbp)
1870         struct buf *oldbp;
1871         struct buf *newbp;
1872 {
1873         struct worklist *wk, *wktail;
1874         struct ufsmount *ump;
1875         int dirty;
1876
1877         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1878                 return (0);
1879         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1880             ("softdep_move_dependencies called on non-softdep filesystem"));
1881         dirty = 0;
1882         wktail = NULL;
1883         ump = VFSTOUFS(wk->wk_mp);
1884         ACQUIRE_LOCK(ump);
1885         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1886                 LIST_REMOVE(wk, wk_list);
1887                 if (wk->wk_type == D_BMSAFEMAP &&
1888                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1889                         dirty = 1;
1890                 if (wktail == NULL)
1891                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1892                 else
1893                         LIST_INSERT_AFTER(wktail, wk, wk_list);
1894                 wktail = wk;
1895         }
1896         FREE_LOCK(ump);
1897
1898         return (dirty);
1899 }
1900
1901 /*
1902  * Purge the work list of all items associated with a particular mount point.
1903  */
1904 int
1905 softdep_flushworklist(oldmnt, countp, td)
1906         struct mount *oldmnt;
1907         int *countp;
1908         struct thread *td;
1909 {
1910         struct vnode *devvp;
1911         struct ufsmount *ump;
1912         int count, error;
1913
1914         /*
1915          * Alternately flush the block device associated with the mount
1916          * point and process any dependencies that the flushing
1917          * creates. We continue until no more worklist dependencies
1918          * are found.
1919          */
1920         *countp = 0;
1921         error = 0;
1922         ump = VFSTOUFS(oldmnt);
1923         devvp = ump->um_devvp;
1924         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1925                 *countp += count;
1926                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1927                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1928                 VOP_UNLOCK(devvp, 0);
1929                 if (error != 0)
1930                         break;
1931         }
1932         return (error);
1933 }
1934
1935 #define SU_WAITIDLE_RETRIES     20
1936 static int
1937 softdep_waitidle(struct mount *mp, int flags __unused)
1938 {
1939         struct ufsmount *ump;
1940         struct vnode *devvp;
1941         struct thread *td;
1942         int error, i;
1943
1944         ump = VFSTOUFS(mp);
1945         devvp = ump->um_devvp;
1946         td = curthread;
1947         error = 0;
1948         ACQUIRE_LOCK(ump);
1949         for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1950                 ump->softdep_req = 1;
1951                 KASSERT((flags & FORCECLOSE) == 0 ||
1952                     ump->softdep_on_worklist == 0,
1953                     ("softdep_waitidle: work added after flush"));
1954                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1955                     "softdeps", 10 * hz);
1956                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1957                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1958                 VOP_UNLOCK(devvp, 0);
1959                 ACQUIRE_LOCK(ump);
1960                 if (error != 0)
1961                         break;
1962         }
1963         ump->softdep_req = 0;
1964         if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1965                 error = EBUSY;
1966                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1967                     mp);
1968         }
1969         FREE_LOCK(ump);
1970         return (error);
1971 }
1972
1973 /*
1974  * Flush all vnodes and worklist items associated with a specified mount point.
1975  */
1976 int
1977 softdep_flushfiles(oldmnt, flags, td)
1978         struct mount *oldmnt;
1979         int flags;
1980         struct thread *td;
1981 {
1982 #ifdef QUOTA
1983         struct ufsmount *ump;
1984         int i;
1985 #endif
1986         int error, early, depcount, loopcnt, retry_flush_count, retry;
1987         int morework;
1988
1989         KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1990             ("softdep_flushfiles called on non-softdep filesystem"));
1991         loopcnt = 10;
1992         retry_flush_count = 3;
1993 retry_flush:
1994         error = 0;
1995
1996         /*
1997          * Alternately flush the vnodes associated with the mount
1998          * point and process any dependencies that the flushing
1999          * creates. In theory, this loop can happen at most twice,
2000          * but we give it a few extra just to be sure.
2001          */
2002         for (; loopcnt > 0; loopcnt--) {
2003                 /*
2004                  * Do another flush in case any vnodes were brought in
2005                  * as part of the cleanup operations.
2006                  */
2007                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
2008                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
2009                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
2010                         break;
2011                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2012                     depcount == 0)
2013                         break;
2014         }
2015         /*
2016          * If we are unmounting then it is an error to fail. If we
2017          * are simply trying to downgrade to read-only, then filesystem
2018          * activity can keep us busy forever, so we just fail with EBUSY.
2019          */
2020         if (loopcnt == 0) {
2021                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2022                         panic("softdep_flushfiles: looping");
2023                 error = EBUSY;
2024         }
2025         if (!error)
2026                 error = softdep_waitidle(oldmnt, flags);
2027         if (!error) {
2028                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2029                         retry = 0;
2030                         MNT_ILOCK(oldmnt);
2031                         KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2032                             ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2033                         morework = oldmnt->mnt_nvnodelistsize > 0;
2034 #ifdef QUOTA
2035                         ump = VFSTOUFS(oldmnt);
2036                         UFS_LOCK(ump);
2037                         for (i = 0; i < MAXQUOTAS; i++) {
2038                                 if (ump->um_quotas[i] != NULLVP)
2039                                         morework = 1;
2040                         }
2041                         UFS_UNLOCK(ump);
2042 #endif
2043                         if (morework) {
2044                                 if (--retry_flush_count > 0) {
2045                                         retry = 1;
2046                                         loopcnt = 3;
2047                                 } else
2048                                         error = EBUSY;
2049                         }
2050                         MNT_IUNLOCK(oldmnt);
2051                         if (retry)
2052                                 goto retry_flush;
2053                 }
2054         }
2055         return (error);
2056 }
2057
2058 /*
2059  * Structure hashing.
2060  *
2061  * There are four types of structures that can be looked up:
2062  *      1) pagedep structures identified by mount point, inode number,
2063  *         and logical block.
2064  *      2) inodedep structures identified by mount point and inode number.
2065  *      3) newblk structures identified by mount point and
2066  *         physical block number.
2067  *      4) bmsafemap structures identified by mount point and
2068  *         cylinder group number.
2069  *
2070  * The "pagedep" and "inodedep" dependency structures are hashed
2071  * separately from the file blocks and inodes to which they correspond.
2072  * This separation helps when the in-memory copy of an inode or
2073  * file block must be replaced. It also obviates the need to access
2074  * an inode or file page when simply updating (or de-allocating)
2075  * dependency structures. Lookup of newblk structures is needed to
2076  * find newly allocated blocks when trying to associate them with
2077  * their allocdirect or allocindir structure.
2078  *
2079  * The lookup routines optionally create and hash a new instance when
2080  * an existing entry is not found. The bmsafemap lookup routine always
2081  * allocates a new structure if an existing one is not found.
2082  */
/*
 * Flag bit tested in the "flags" argument of pagedep_lookup(),
 * inodedep_lookup() and newblk_lookup() (and of newblk_find()).
 */
#define DEPALLOC        0x0001  /* allocate structure if lookup fails */
2084
/*
 * Structures and routines associated with pagedep caching.
 */
/*
 * Yields a pointer to the pagedep hash chain for (inum, lbn).  The sum
 * of the two is ANDed with pagedep_hash_size, i.e. that field is applied
 * as a bit mask rather than as an element count.
 */
#define PAGEDEP_HASH(ump, inum, lbn) \
        (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2090
2091 static int
2092 pagedep_find(pagedephd, ino, lbn, pagedeppp)
2093         struct pagedep_hashhead *pagedephd;
2094         ino_t ino;
2095         ufs_lbn_t lbn;
2096         struct pagedep **pagedeppp;
2097 {
2098         struct pagedep *pagedep;
2099
2100         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2101                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2102                         *pagedeppp = pagedep;
2103                         return (1);
2104                 }
2105         }
2106         *pagedeppp = NULL;
2107         return (0);
2108 }
2109 /*
2110  * Look up a pagedep. Return 1 if found, 0 otherwise.
2111  * If not found, allocate if DEPALLOC flag is passed.
2112  * Found or allocated entry is returned in pagedeppp.
2113  */
2114 static int
2115 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2116         struct mount *mp;
2117         struct buf *bp;
2118         ino_t ino;
2119         ufs_lbn_t lbn;
2120         int flags;
2121         struct pagedep **pagedeppp;
2122 {
2123         struct pagedep *pagedep;
2124         struct pagedep_hashhead *pagedephd;
2125         struct worklist *wk;
2126         struct ufsmount *ump;
2127         int ret;
2128         int i;
2129
2130         ump = VFSTOUFS(mp);
2131         LOCK_OWNED(ump);
2132         if (bp) {
2133                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2134                         if (wk->wk_type == D_PAGEDEP) {
2135                                 *pagedeppp = WK_PAGEDEP(wk);
2136                                 return (1);
2137                         }
2138                 }
2139         }
2140         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2141         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2142         if (ret) {
2143                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2144                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2145                 return (1);
2146         }
2147         if ((flags & DEPALLOC) == 0)
2148                 return (0);
2149         FREE_LOCK(ump);
2150         pagedep = malloc(sizeof(struct pagedep),
2151             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2152         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2153         ACQUIRE_LOCK(ump);
2154         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2155         if (*pagedeppp) {
2156                 /*
2157                  * This should never happen since we only create pagedeps
2158                  * with the vnode lock held.  Could be an assert.
2159                  */
2160                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2161                 return (ret);
2162         }
2163         pagedep->pd_ino = ino;
2164         pagedep->pd_lbn = lbn;
2165         LIST_INIT(&pagedep->pd_dirremhd);
2166         LIST_INIT(&pagedep->pd_pendinghd);
2167         for (i = 0; i < DAHASHSZ; i++)
2168                 LIST_INIT(&pagedep->pd_diraddhd[i]);
2169         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2170         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2171         *pagedeppp = pagedep;
2172         return (0);
2173 }
2174
/*
 * Structures and routines associated with inodedep caching.
 */
/*
 * Yields a pointer to the inodedep hash chain for inode number inum.
 * The inode number is ANDed with inodedep_hash_size, i.e. that field is
 * applied as a bit mask rather than as an element count.
 */
#define INODEDEP_HASH(ump, inum) \
      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2180
2181 static int
2182 inodedep_find(inodedephd, inum, inodedeppp)
2183         struct inodedep_hashhead *inodedephd;
2184         ino_t inum;
2185         struct inodedep **inodedeppp;
2186 {
2187         struct inodedep *inodedep;
2188
2189         LIST_FOREACH(inodedep, inodedephd, id_hash)
2190                 if (inum == inodedep->id_ino)
2191                         break;
2192         if (inodedep) {
2193                 *inodedeppp = inodedep;
2194                 return (1);
2195         }
2196         *inodedeppp = NULL;
2197
2198         return (0);
2199 }
2200 /*
2201  * Look up an inodedep. Return 1 if found, 0 if not found.
2202  * If not found, allocate if DEPALLOC flag is passed.
2203  * Found or allocated entry is returned in inodedeppp.
2204  */
2205 static int
2206 inodedep_lookup(mp, inum, flags, inodedeppp)
2207         struct mount *mp;
2208         ino_t inum;
2209         int flags;
2210         struct inodedep **inodedeppp;
2211 {
2212         struct inodedep *inodedep;
2213         struct inodedep_hashhead *inodedephd;
2214         struct ufsmount *ump;
2215         struct fs *fs;
2216
2217         ump = VFSTOUFS(mp);
2218         LOCK_OWNED(ump);
2219         fs = ump->um_fs;
2220         inodedephd = INODEDEP_HASH(ump, inum);
2221
2222         if (inodedep_find(inodedephd, inum, inodedeppp))
2223                 return (1);
2224         if ((flags & DEPALLOC) == 0)
2225                 return (0);
2226         /*
2227          * If the system is over its limit and our filesystem is
2228          * responsible for more than our share of that usage and
2229          * we are not in a rush, request some inodedep cleanup.
2230          */
2231         if (softdep_excess_items(ump, D_INODEDEP))
2232                 schedule_cleanup(mp);
2233         else
2234                 FREE_LOCK(ump);
2235         inodedep = malloc(sizeof(struct inodedep),
2236                 M_INODEDEP, M_SOFTDEP_FLAGS);
2237         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2238         ACQUIRE_LOCK(ump);
2239         if (inodedep_find(inodedephd, inum, inodedeppp)) {
2240                 WORKITEM_FREE(inodedep, D_INODEDEP);
2241                 return (1);
2242         }
2243         inodedep->id_fs = fs;
2244         inodedep->id_ino = inum;
2245         inodedep->id_state = ALLCOMPLETE;
2246         inodedep->id_nlinkdelta = 0;
2247         inodedep->id_savedino1 = NULL;
2248         inodedep->id_savedsize = -1;
2249         inodedep->id_savedextsize = -1;
2250         inodedep->id_savednlink = -1;
2251         inodedep->id_bmsafemap = NULL;
2252         inodedep->id_mkdiradd = NULL;
2253         LIST_INIT(&inodedep->id_dirremhd);
2254         LIST_INIT(&inodedep->id_pendinghd);
2255         LIST_INIT(&inodedep->id_inowait);
2256         LIST_INIT(&inodedep->id_bufwait);
2257         TAILQ_INIT(&inodedep->id_inoreflst);
2258         TAILQ_INIT(&inodedep->id_inoupdt);
2259         TAILQ_INIT(&inodedep->id_newinoupdt);
2260         TAILQ_INIT(&inodedep->id_extupdt);
2261         TAILQ_INIT(&inodedep->id_newextupdt);
2262         TAILQ_INIT(&inodedep->id_freeblklst);
2263         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2264         *inodedeppp = inodedep;
2265         return (0);
2266 }
2267
/*
 * Structures and routines associated with newblk caching.
 */
/*
 * Yields a pointer to the newblk hash chain for a key.  The macro
 * parameter is named inum, but newblk_lookup() passes the block number
 * (newblkno) as the key.  The key is ANDed with newblk_hash_size, i.e.
 * that field is applied as a bit mask rather than as an element count.
 */
#define NEWBLK_HASH(ump, inum) \
        (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2273
2274 static int
2275 newblk_find(newblkhd, newblkno, flags, newblkpp)
2276         struct newblk_hashhead *newblkhd;
2277         ufs2_daddr_t newblkno;
2278         int flags;
2279         struct newblk **newblkpp;
2280 {
2281         struct newblk *newblk;
2282
2283         LIST_FOREACH(newblk, newblkhd, nb_hash) {
2284                 if (newblkno != newblk->nb_newblkno)
2285                         continue;
2286                 /*
2287                  * If we're creating a new dependency don't match those that
2288                  * have already been converted to allocdirects.  This is for
2289                  * a frag extend.
2290                  */
2291                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2292                         continue;
2293                 break;
2294         }
2295         if (newblk) {
2296                 *newblkpp = newblk;
2297                 return (1);
2298         }
2299         *newblkpp = NULL;
2300         return (0);
2301 }
2302
2303 /*
2304  * Look up a newblk. Return 1 if found, 0 if not found.
2305  * If not found, allocate if DEPALLOC flag is passed.
2306  * Found or allocated entry is returned in newblkpp.
2307  */
2308 static int
2309 newblk_lookup(mp, newblkno, flags, newblkpp)
2310         struct mount *mp;
2311         ufs2_daddr_t newblkno;
2312         int flags;
2313         struct newblk **newblkpp;
2314 {
2315         struct newblk *newblk;
2316         struct newblk_hashhead *newblkhd;
2317         struct ufsmount *ump;
2318
2319         ump = VFSTOUFS(mp);
2320         LOCK_OWNED(ump);
2321         newblkhd = NEWBLK_HASH(ump, newblkno);
2322         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2323                 return (1);
2324         if ((flags & DEPALLOC) == 0)
2325                 return (0);
2326         if (softdep_excess_items(ump, D_NEWBLK) ||
2327             softdep_excess_items(ump, D_ALLOCDIRECT) ||
2328             softdep_excess_items(ump, D_ALLOCINDIR))
2329                 schedule_cleanup(mp);
2330         else
2331                 FREE_LOCK(ump);
2332         newblk = malloc(sizeof(union allblk), M_NEWBLK,
2333             M_SOFTDEP_FLAGS | M_ZERO);
2334         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2335         ACQUIRE_LOCK(ump);
2336         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2337                 WORKITEM_FREE(newblk, D_NEWBLK);
2338                 return (1);
2339         }
2340         newblk->nb_freefrag = NULL;
2341         LIST_INIT(&newblk->nb_indirdeps);
2342         LIST_INIT(&newblk->nb_newdirblk);
2343         LIST_INIT(&newblk->nb_jwork);
2344         newblk->nb_state = ATTACHED;
2345         newblk->nb_newblkno = newblkno;
2346         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2347         *newblkpp = newblk;
2348         return (0);
2349 }
2350
/*
 * Structures and routines associated with freed indirect block caching.
 */
/*
 * Yields a pointer to the indirect-block hash chain for blkno.  The
 * block number is ANDed with indir_hash_size, i.e. that field is applied
 * as a bit mask rather than as an element count.
 */
#define INDIR_HASH(ump, blkno) \
        (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2356
2357 /*
2358  * Lookup an indirect block in the indir hash table.  The freework is
2359  * removed and potentially freed.  The caller must do a blocking journal
2360  * write before writing to the blkno.
2361  */
2362 static int
2363 indirblk_lookup(mp, blkno)
2364         struct mount *mp;
2365         ufs2_daddr_t blkno;
2366 {
2367         struct freework *freework;
2368         struct indir_hashhead *wkhd;
2369         struct ufsmount *ump;
2370
2371         ump = VFSTOUFS(mp);
2372         wkhd = INDIR_HASH(ump, blkno);
2373         TAILQ_FOREACH(freework, wkhd, fw_next) {
2374                 if (freework->fw_blkno != blkno)
2375                         continue;
2376                 indirblk_remove(freework);
2377                 return (1);
2378         }
2379         return (0);
2380 }
2381
2382 /*
2383  * Insert an indirect block represented by freework into the indirblk
2384  * hash table so that it may prevent the block from being re-used prior
2385  * to the journal being written.
2386  */
2387 static void
2388 indirblk_insert(freework)
2389         struct freework *freework;
2390 {
2391         struct jblocks *jblocks;
2392         struct jseg *jseg;
2393         struct ufsmount *ump;
2394
2395         ump = VFSTOUFS(freework->fw_list.wk_mp);
2396         jblocks = ump->softdep_jblocks;
2397         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2398         if (jseg == NULL)
2399                 return;
2400
2401         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2402         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2403             fw_next);
2404         freework->fw_state &= ~DEPCOMPLETE;
2405 }
2406
2407 static void
2408 indirblk_remove(freework)
2409         struct freework *freework;
2410 {
2411         struct ufsmount *ump;
2412
2413         ump = VFSTOUFS(freework->fw_list.wk_mp);
2414         LIST_REMOVE(freework, fw_segs);
2415         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2416         freework->fw_state |= DEPCOMPLETE;
2417         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2418                 WORKITEM_FREE(freework, D_FREEWORK);
2419 }
2420
2421 /*
2422  * Executed during filesystem system initialization before
2423  * mounting any filesystems.
2424  */
2425 void
2426 softdep_initialize()
2427 {
2428
2429         TAILQ_INIT(&softdepmounts);
2430 #ifdef __LP64__
2431         max_softdeps = desiredvnodes * 4;
2432 #else
2433         max_softdeps = desiredvnodes * 2;
2434 #endif
2435
2436         /* initialise bioops hack */
2437         bioops.io_start = softdep_disk_io_initiation;
2438         bioops.io_complete = softdep_disk_write_complete;
2439         bioops.io_deallocate = softdep_deallocate_dependencies;
2440         bioops.io_countdeps = softdep_count_dependencies;
2441         softdep_ast_cleanup = softdep_ast_cleanup_proc;
2442
2443         /* Initialize the callout with an mtx. */
2444         callout_init_mtx(&softdep_callout, &lk, 0);
2445 }
2446
/*
 * Executed after all filesystems have been unmounted during
 * filesystem module unload.
 *
 * Clears every hook that softdep_initialize() installed and then drains
 * the softdep callout.
 */
void
softdep_uninitialize()
{

        /* clear bioops hack */
        bioops.io_start = NULL;
        bioops.io_complete = NULL;
        bioops.io_deallocate = NULL;
        bioops.io_countdeps = NULL;
        softdep_ast_cleanup = NULL;

        /* Drained only after the hooks above have been cleared. */
        callout_drain(&softdep_callout);
}
2464
2465 /*
2466  * Called at mount time to notify the dependency code that a
2467  * filesystem wishes to use it.
2468  */
2469 int
2470 softdep_mount(devvp, mp, fs, cred)
2471         struct vnode *devvp;
2472         struct mount *mp;
2473         struct fs *fs;
2474         struct ucred *cred;
2475 {
2476         struct csum_total cstotal;
2477         struct mount_softdeps *sdp;
2478         struct ufsmount *ump;
2479         struct cg *cgp;
2480         struct buf *bp;
2481         u_int cyl, i;
2482         int error;
2483
2484         sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2485             M_WAITOK | M_ZERO);
2486         MNT_ILOCK(mp);
2487         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2488         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2489                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2490                         MNTK_SOFTDEP | MNTK_NOASYNC;
2491         }
2492         ump = VFSTOUFS(mp);
2493         ump->um_softdep = sdp;
2494         MNT_IUNLOCK(mp);
2495         rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2496         sdp->sd_ump = ump;
2497         LIST_INIT(&ump->softdep_workitem_pending);
2498         LIST_INIT(&ump->softdep_journal_pending);
2499         TAILQ_INIT(&ump->softdep_unlinked);
2500         LIST_INIT(&ump->softdep_dirtycg);
2501         ump->softdep_worklist_tail = NULL;
2502         ump->softdep_on_worklist = 0;
2503         ump->softdep_deps = 0;
2504         LIST_INIT(&ump->softdep_mkdirlisthd);
2505         ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2506             &ump->pagedep_hash_size);
2507         ump->pagedep_nextclean = 0;
2508         ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2509             &ump->inodedep_hash_size);
2510         ump->inodedep_nextclean = 0;
2511         ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2512             &ump->newblk_hash_size);
2513         ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2514             &ump->bmsafemap_hash_size);
2515         i = 1 << (ffs(desiredvnodes / 10) - 1);
2516         ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2517             M_FREEWORK, M_WAITOK);
2518         ump->indir_hash_size = i - 1;
2519         for (i = 0; i <= ump->indir_hash_size; i++)
2520                 TAILQ_INIT(&ump->indir_hashtbl[i]);
2521         ACQUIRE_GBLLOCK(&lk);
2522         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2523         FREE_GBLLOCK(&lk);
2524         if ((fs->fs_flags & FS_SUJ) &&
2525             (error = journal_mount(mp, fs, cred)) != 0) {
2526                 printf("Failed to start journal: %d\n", error);
2527                 softdep_unmount(mp);
2528                 return (error);
2529         }
2530         /*
2531          * Start our flushing thread in the bufdaemon process.
2532          */
2533         ACQUIRE_LOCK(ump);
2534         ump->softdep_flags |= FLUSH_STARTING;
2535         FREE_LOCK(ump);
2536         kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2537             &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2538             mp->mnt_stat.f_mntonname);
2539         ACQUIRE_LOCK(ump);
2540         while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2541                 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2542                     hz / 2);
2543         }
2544         FREE_LOCK(ump);
2545         /*
2546          * When doing soft updates, the counters in the
2547          * superblock may have gotten out of sync. Recomputation
2548          * can take a long time and can be deferred for background
2549          * fsck.  However, the old behavior of scanning the cylinder
2550          * groups and recalculating them at mount time is available
2551          * by setting vfs.ffs.compute_summary_at_mount to one.
2552          */
2553         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2554                 return (0);
2555         bzero(&cstotal, sizeof cstotal);
2556         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2557                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2558                     fs->fs_cgsize, cred, &bp)) != 0) {
2559                         brelse(bp);
2560                         softdep_unmount(mp);
2561                         return (error);
2562                 }
2563                 cgp = (struct cg *)bp->b_data;
2564                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2565                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2566                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2567                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2568                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
2569                 brelse(bp);
2570         }
2571 #ifdef INVARIANTS
2572         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2573                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2574 #endif
2575         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2576         return (0);
2577 }
2578
2579 void
2580 softdep_unmount(mp)
2581         struct mount *mp;
2582 {
2583         struct ufsmount *ump;
2584 #ifdef INVARIANTS
2585         int i;
2586 #endif
2587
2588         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2589             ("softdep_unmount called on non-softdep filesystem"));
2590         ump = VFSTOUFS(mp);
2591         MNT_ILOCK(mp);
2592         mp->mnt_flag &= ~MNT_SOFTDEP;
2593         if (MOUNTEDSUJ(mp) == 0) {
2594                 MNT_IUNLOCK(mp);
2595         } else {
2596                 mp->mnt_flag &= ~MNT_SUJ;
2597                 MNT_IUNLOCK(mp);
2598                 journal_unmount(ump);
2599         }
2600         /*
2601          * Shut down our flushing thread. Check for NULL is if
2602          * softdep_mount errors out before the thread has been created.
2603          */
2604         if (ump->softdep_flushtd != NULL) {
2605                 ACQUIRE_LOCK(ump);
2606                 ump->softdep_flags |= FLUSH_EXIT;
2607                 wakeup(&ump->softdep_flushtd);
2608                 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2609                     "sdwait", 0);
2610                 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2611                     ("Thread shutdown failed"));
2612         }
2613         /*
2614          * Free up our resources.
2615          */
2616         ACQUIRE_GBLLOCK(&lk);
2617         TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2618         FREE_GBLLOCK(&lk);
2619         rw_destroy(LOCK_PTR(ump));
2620         hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2621         hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2622         hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2623         hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2624             ump->bmsafemap_hash_size);
2625         free(ump->indir_hashtbl, M_FREEWORK);
2626 #ifdef INVARIANTS
2627         for (i = 0; i <= D_LAST; i++)
2628                 KASSERT(ump->softdep_curdeps[i] == 0,
2629                     ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2630                     TYPENAME(i), ump->softdep_curdeps[i]));
2631 #endif
2632         free(ump->um_softdep, M_MOUNTDATA);
2633 }
2634
2635 static struct jblocks *
2636 jblocks_create(void)
2637 {
2638         struct jblocks *jblocks;
2639
2640         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2641         TAILQ_INIT(&jblocks->jb_segs);
2642         jblocks->jb_avail = 10;
2643         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2644             M_JBLOCKS, M_WAITOK | M_ZERO);
2645
2646         return (jblocks);
2647 }
2648
2649 static ufs2_daddr_t
2650 jblocks_alloc(jblocks, bytes, actual)
2651         struct jblocks *jblocks;
2652         int bytes;
2653         int *actual;
2654 {
2655         ufs2_daddr_t daddr;
2656         struct jextent *jext;
2657         int freecnt;
2658         int blocks;
2659
2660         blocks = bytes / DEV_BSIZE;
2661         jext = &jblocks->jb_extent[jblocks->jb_head];
2662         freecnt = jext->je_blocks - jblocks->jb_off;
2663         if (freecnt == 0) {
2664                 jblocks->jb_off = 0;
2665                 if (++jblocks->jb_head > jblocks->jb_used)
2666                         jblocks->jb_head = 0;
2667                 jext = &jblocks->jb_extent[jblocks->jb_head];
2668                 freecnt = jext->je_blocks;
2669         }
2670         if (freecnt > blocks)
2671                 freecnt = blocks;
2672         *actual = freecnt * DEV_BSIZE;
2673         daddr = jext->je_daddr + jblocks->jb_off;
2674         jblocks->jb_off += freecnt;
2675         jblocks->jb_free -= freecnt;
2676
2677         return (daddr);
2678 }
2679
2680 static void
2681 jblocks_free(jblocks, mp, bytes)
2682         struct jblocks *jblocks;
2683         struct mount *mp;
2684         int bytes;
2685 {
2686
2687         LOCK_OWNED(VFSTOUFS(mp));
2688         jblocks->jb_free += bytes / DEV_BSIZE;
2689         if (jblocks->jb_suspended)
2690                 worklist_speedup(mp);
2691         wakeup(jblocks);
2692 }
2693
2694 static void
2695 jblocks_destroy(jblocks)
2696         struct jblocks *jblocks;
2697 {
2698
2699         if (jblocks->jb_extent)
2700                 free(jblocks->jb_extent, M_JBLOCKS);
2701         free(jblocks, M_JBLOCKS);
2702 }
2703
2704 static void
2705 jblocks_add(jblocks, daddr, blocks)
2706         struct jblocks *jblocks;
2707         ufs2_daddr_t daddr;
2708         int blocks;
2709 {
2710         struct jextent *jext;
2711
2712         jblocks->jb_blocks += blocks;
2713         jblocks->jb_free += blocks;
2714         jext = &jblocks->jb_extent[jblocks->jb_used];
2715         /* Adding the first block. */
2716         if (jext->je_daddr == 0) {
2717                 jext->je_daddr = daddr;
2718                 jext->je_blocks = blocks;
2719                 return;
2720         }
2721         /* Extending the last extent. */
2722         if (jext->je_daddr + jext->je_blocks == daddr) {
2723                 jext->je_blocks += blocks;
2724                 return;
2725         }
2726         /* Adding a new extent. */
2727         if (++jblocks->jb_used == jblocks->jb_avail) {
2728                 jblocks->jb_avail *= 2;
2729                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2730                     M_JBLOCKS, M_WAITOK | M_ZERO);
2731                 memcpy(jext, jblocks->jb_extent,
2732                     sizeof(struct jextent) * jblocks->jb_used);
2733                 free(jblocks->jb_extent, M_JBLOCKS);
2734                 jblocks->jb_extent = jext;
2735         }
2736         jext = &jblocks->jb_extent[jblocks->jb_used];
2737         jext->je_daddr = daddr;
2738         jext->je_blocks = blocks;
2739         return;
2740 }
2741
2742 int
2743 softdep_journal_lookup(mp, vpp)
2744         struct mount *mp;
2745         struct vnode **vpp;
2746 {
2747         struct componentname cnp;
2748         struct vnode *dvp;
2749         ino_t sujournal;
2750         int error;
2751
2752         error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2753         if (error)
2754                 return (error);
2755         bzero(&cnp, sizeof(cnp));
2756         cnp.cn_nameiop = LOOKUP;
2757         cnp.cn_flags = ISLASTCN;
2758         cnp.cn_thread = curthread;
2759         cnp.cn_cred = curthread->td_ucred;
2760         cnp.cn_pnbuf = SUJ_FILE;
2761         cnp.cn_nameptr = SUJ_FILE;
2762         cnp.cn_namelen = strlen(SUJ_FILE);
2763         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2764         vput(dvp);
2765         if (error != 0)
2766                 return (error);
2767         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2768         return (error);
2769 }
2770
2771 /*
2772  * Open and verify the journal file.
2773  */
2774 static int
2775 journal_mount(mp, fs, cred)
2776         struct mount *mp;
2777         struct fs *fs;
2778         struct ucred *cred;
2779 {
2780         struct jblocks *jblocks;
2781         struct ufsmount *ump;
2782         struct vnode *vp;
2783         struct inode *ip;
2784         ufs2_daddr_t blkno;
2785         int bcount;
2786         int error;
2787         int i;
2788
2789         ump = VFSTOUFS(mp);
2790         ump->softdep_journal_tail = NULL;
2791         ump->softdep_on_journal = 0;
2792         ump->softdep_accdeps = 0;
2793         ump->softdep_req = 0;
2794         ump->softdep_jblocks = NULL;
2795         error = softdep_journal_lookup(mp, &vp);
2796         if (error != 0) {
2797                 printf("Failed to find journal.  Use tunefs to create one\n");
2798                 return (error);
2799         }
2800         ip = VTOI(vp);
2801         if (ip->i_size < SUJ_MIN) {
2802                 error = ENOSPC;
2803                 goto out;
2804         }
2805         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
2806         jblocks = jblocks_create();
2807         for (i = 0; i < bcount; i++) {
2808                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2809                 if (error)
2810                         break;
2811                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2812         }
2813         if (error) {
2814                 jblocks_destroy(jblocks);
2815                 goto out;
2816         }
2817         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
2818         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2819         ump->softdep_jblocks = jblocks;
2820 out:
2821         if (error == 0) {
2822                 MNT_ILOCK(mp);
2823                 mp->mnt_flag |= MNT_SUJ;
2824                 mp->mnt_flag &= ~MNT_SOFTDEP;
2825                 MNT_IUNLOCK(mp);
2826                 /*
2827                  * Only validate the journal contents if the
2828                  * filesystem is clean, otherwise we write the logs
2829                  * but they'll never be used.  If the filesystem was
2830                  * still dirty when we mounted it the journal is
2831                  * invalid and a new journal can only be valid if it
2832                  * starts from a clean mount.
2833                  */
2834                 if (fs->fs_clean) {
2835                         DIP_SET(ip, i_modrev, fs->fs_mtime);
2836                         ip->i_flags |= IN_MODIFIED;
2837                         ffs_update(vp, 1);
2838                 }
2839         }
2840         vput(vp);
2841         return (error);
2842 }
2843
2844 static void
2845 journal_unmount(ump)
2846         struct ufsmount *ump;
2847 {
2848
2849         if (ump->softdep_jblocks)
2850                 jblocks_destroy(ump->softdep_jblocks);
2851         ump->softdep_jblocks = NULL;
2852 }
2853
2854 /*
2855  * Called when a journal record is ready to be written.  Space is allocated
2856  * and the journal entry is created when the journal is flushed to stable
2857  * store.
2858  */
2859 static void
2860 add_to_journal(wk)
2861         struct worklist *wk;
2862 {
2863         struct ufsmount *ump;
2864
2865         ump = VFSTOUFS(wk->wk_mp);
2866         LOCK_OWNED(ump);
2867         if (wk->wk_state & ONWORKLIST)
2868                 panic("add_to_journal: %s(0x%X) already on list",
2869                     TYPENAME(wk->wk_type), wk->wk_state);
2870         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2871         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2872                 ump->softdep_jblocks->jb_age = ticks;
2873                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2874         } else
2875                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2876         ump->softdep_journal_tail = wk;
2877         ump->softdep_on_journal += 1;
2878 }
2879
2880 /*
2881  * Remove an arbitrary item for the journal worklist maintain the tail
2882  * pointer.  This happens when a new operation obviates the need to
2883  * journal an old operation.
2884  */
2885 static void
2886 remove_from_journal(wk)
2887         struct worklist *wk;
2888 {
2889         struct ufsmount *ump;
2890
2891         ump = VFSTOUFS(wk->wk_mp);
2892         LOCK_OWNED(ump);
2893 #ifdef INVARIANTS
2894         {
2895                 struct worklist *wkn;
2896
2897                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2898                         if (wkn == wk)
2899                                 break;
2900                 if (wkn == NULL)
2901                         panic("remove_from_journal: %p is not in journal", wk);
2902         }
2903 #endif
2904         /*
2905          * We emulate a TAILQ to save space in most structures which do not
2906          * require TAILQ semantics.  Here we must update the tail position
2907          * when removing the tail which is not the final entry. This works
2908          * only if the worklist linkage are at the beginning of the structure.
2909          */
2910         if (ump->softdep_journal_tail == wk)
2911                 ump->softdep_journal_tail =
2912                     (struct worklist *)wk->wk_list.le_prev;
2913         WORKLIST_REMOVE(wk);
2914         ump->softdep_on_journal -= 1;
2915 }
2916
2917 /*
2918  * Check for journal space as well as dependency limits so the prelink
2919  * code can throttle both journaled and non-journaled filesystems.
2920  * Threshold is 0 for low and 1 for min.
2921  */
2922 static int
2923 journal_space(ump, thresh)
2924         struct ufsmount *ump;
2925         int thresh;
2926 {
2927         struct jblocks *jblocks;
2928         int limit, avail;
2929
2930         jblocks = ump->softdep_jblocks;
2931         if (jblocks == NULL)
2932                 return (1);
2933         /*
2934          * We use a tighter restriction here to prevent request_cleanup()
2935          * running in threads from running into locks we currently hold.
2936          * We have to be over the limit and our filesystem has to be
2937          * responsible for more than our share of that usage.
2938          */
2939         limit = (max_softdeps / 10) * 9;
2940         if (dep_current[D_INODEDEP] > limit &&
2941             ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2942                 return (0);
2943         if (thresh)
2944                 thresh = jblocks->jb_min;
2945         else
2946                 thresh = jblocks->jb_low;
2947         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2948         avail = jblocks->jb_free - avail;
2949
2950         return (avail > thresh);
2951 }
2952
2953 static void
2954 journal_suspend(ump)
2955         struct ufsmount *ump;
2956 {
2957         struct jblocks *jblocks;
2958         struct mount *mp;
2959
2960         mp = UFSTOVFS(ump);
2961         jblocks = ump->softdep_jblocks;
2962         MNT_ILOCK(mp);
2963         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2964                 stat_journal_min++;
2965                 mp->mnt_kern_flag |= MNTK_SUSPEND;
2966                 mp->mnt_susp_owner = ump->softdep_flushtd;
2967         }
2968         jblocks->jb_suspended = 1;
2969         MNT_IUNLOCK(mp);
2970 }
2971
2972 static int
2973 journal_unsuspend(struct ufsmount *ump)
2974 {
2975         struct jblocks *jblocks;
2976         struct mount *mp;
2977
2978         mp = UFSTOVFS(ump);
2979         jblocks = ump->softdep_jblocks;
2980
2981         if (jblocks != NULL && jblocks->jb_suspended &&
2982             journal_space(ump, jblocks->jb_min)) {
2983                 jblocks->jb_suspended = 0;
2984                 FREE_LOCK(ump);
2985                 mp->mnt_susp_owner = curthread;
2986                 vfs_write_resume(mp, 0);
2987                 ACQUIRE_LOCK(ump);
2988                 return (1);
2989         }
2990         return (0);
2991 }
2992
2993 /*
2994  * Called before any allocation function to be certain that there is
2995  * sufficient space in the journal prior to creating any new records.
2996  * Since in the case of block allocation we may have multiple locked
2997  * buffers at the time of the actual allocation we can not block
2998  * when the journal records are created.  Doing so would create a deadlock
2999  * if any of these buffers needed to be flushed to reclaim space.  Instead
3000  * we require a sufficiently large amount of available space such that
3001  * each thread in the system could have passed this allocation check and
3002  * still have sufficient free space.  With 20% of a minimum journal size
3003  * of 1MB we have 6553 records available.
3004  */
3005 int
3006 softdep_prealloc(vp, waitok)
3007         struct vnode *vp;
3008         int waitok;
3009 {
3010         struct ufsmount *ump;
3011
3012         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
3013             ("softdep_prealloc called on non-softdep filesystem"));
3014         /*
3015          * Nothing to do if we are not running journaled soft updates.
3016          * If we currently hold the snapshot lock, we must avoid
3017          * handling other resources that could cause deadlock.  Do not
3018          * touch quotas vnode since it is typically recursed with
3019          * other vnode locks held.
3020          */
3021         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3022             (vp->v_vflag & VV_SYSTEM) != 0)
3023                 return (0);
3024         ump = VFSTOUFS(vp->v_mount);
3025         ACQUIRE_LOCK(ump);
3026         if (journal_space(ump, 0)) {
3027                 FREE_LOCK(ump);
3028                 return (0);
3029         }
3030         stat_journal_low++;
3031         FREE_LOCK(ump);
3032         if (waitok == MNT_NOWAIT)
3033                 return (ENOSPC);
3034         /*
3035          * Attempt to sync this vnode once to flush any journal
3036          * work attached to it.
3037          */
3038         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3039                 ffs_syncvnode(vp, waitok, 0);
3040         ACQUIRE_LOCK(ump);
3041         process_removes(vp);
3042         process_truncates(vp);
3043         if (journal_space(ump, 0) == 0) {
3044                 softdep_speedup(ump);
3045                 if (journal_space(ump, 1) == 0)
3046                         journal_suspend(ump);
3047         }
3048         FREE_LOCK(ump);
3049
3050         return (0);
3051 }
3052
3053 /*
3054  * Before adjusting a link count on a vnode verify that we have sufficient
3055  * journal space.  If not, process operations that depend on the currently
3056  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3057  * and softdep flush threads can not acquire these locks to reclaim space.
3058  */
3059 static void
3060 softdep_prelink(dvp, vp)
3061         struct vnode *dvp;
3062         struct vnode *vp;
3063 {
3064         struct ufsmount *ump;
3065
3066         ump = VFSTOUFS(dvp->v_mount);
3067         LOCK_OWNED(ump);
3068         /*
3069          * Nothing to do if we have sufficient journal space.
3070          * If we currently hold the snapshot lock, we must avoid
3071          * handling other resources that could cause deadlock.
3072          */
3073         if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3074                 return;
3075         stat_journal_low++;
3076         FREE_LOCK(ump);
3077         if (vp)
3078                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
3079         ffs_syncvnode(dvp, MNT_WAIT, 0);
3080         ACQUIRE_LOCK(ump);
3081         /* Process vp before dvp as it may create .. removes. */
3082         if (vp) {
3083                 process_removes(vp);
3084                 process_truncates(vp);
3085         }
3086         process_removes(dvp);
3087         process_truncates(dvp);
3088         softdep_speedup(ump);
3089         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3090         if (journal_space(ump, 0) == 0) {
3091                 softdep_speedup(ump);
3092                 if (journal_space(ump, 1) == 0)
3093                         journal_suspend(ump);
3094         }
3095 }
3096
3097 static void
3098 jseg_write(ump, jseg, data)
3099         struct ufsmount *ump;
3100         struct jseg *jseg;
3101         uint8_t *data;
3102 {
3103         struct jsegrec *rec;
3104
3105         rec = (struct jsegrec *)data;
3106         rec->jsr_seq = jseg->js_seq;
3107         rec->jsr_oldest = jseg->js_oldseq;
3108         rec->jsr_cnt = jseg->js_cnt;
3109         rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3110         rec->jsr_crc = 0;
3111         rec->jsr_time = ump->um_fs->fs_mtime;
3112 }
3113
3114 static inline void
3115 inoref_write(inoref, jseg, rec)
3116         struct inoref *inoref;
3117         struct jseg *jseg;
3118         struct jrefrec *rec;
3119 {
3120
3121         inoref->if_jsegdep->jd_seg = jseg;
3122         rec->jr_ino = inoref->if_ino;
3123         rec->jr_parent = inoref->if_parent;
3124         rec->jr_nlink = inoref->if_nlink;
3125         rec->jr_mode = inoref->if_mode;
3126         rec->jr_diroff = inoref->if_diroff;
3127 }
3128
3129 static void
3130 jaddref_write(jaddref, jseg, data)
3131         struct jaddref *jaddref;
3132         struct jseg *jseg;
3133         uint8_t *data;
3134 {
3135         struct jrefrec *rec;
3136
3137         rec = (struct jrefrec *)data;
3138         rec->jr_op = JOP_ADDREF;
3139         inoref_write(&jaddref->ja_ref, jseg, rec);
3140 }
3141
3142 static void
3143 jremref_write(jremref, jseg, data)
3144         struct jremref *jremref;
3145         struct jseg *jseg;
3146         uint8_t *data;
3147 {
3148         struct jrefrec *rec;
3149
3150         rec = (struct jrefrec *)data;
3151         rec->jr_op = JOP_REMREF;
3152         inoref_write(&jremref->jr_ref, jseg, rec);
3153 }
3154
3155 static void
3156 jmvref_write(jmvref, jseg, data)
3157         struct jmvref *jmvref;
3158         struct jseg *jseg;
3159         uint8_t *data;
3160 {
3161         struct jmvrec *rec;
3162
3163         rec = (struct jmvrec *)data;
3164         rec->jm_op = JOP_MVREF;
3165         rec->jm_ino = jmvref->jm_ino;
3166         rec->jm_parent = jmvref->jm_parent;
3167         rec->jm_oldoff = jmvref->jm_oldoff;
3168         rec->jm_newoff = jmvref->jm_newoff;
3169 }
3170
3171 static void
3172 jnewblk_write(jnewblk, jseg, data)
3173         struct jnewblk *jnewblk;
3174         struct jseg *jseg;
3175         uint8_t *data;
3176 {
3177         struct jblkrec *rec;
3178
3179         jnewblk->jn_jsegdep->jd_seg = jseg;
3180         rec = (struct jblkrec *)data;
3181         rec->jb_op = JOP_NEWBLK;
3182         rec->jb_ino = jnewblk->jn_ino;
3183         rec->jb_blkno = jnewblk->jn_blkno;
3184         rec->jb_lbn = jnewblk->jn_lbn;
3185         rec->jb_frags = jnewblk->jn_frags;
3186         rec->jb_oldfrags = jnewblk->jn_oldfrags;
3187 }
3188
3189 static void
3190 jfreeblk_write(jfreeblk, jseg, data)
3191         struct jfreeblk *jfreeblk;
3192         struct jseg *jseg;
3193         uint8_t *data;
3194 {
3195         struct jblkrec *rec;
3196
3197         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3198         rec = (struct jblkrec *)data;
3199         rec->jb_op = JOP_FREEBLK;
3200         rec->jb_ino = jfreeblk->jf_ino;
3201         rec->jb_blkno = jfreeblk->jf_blkno;
3202         rec->jb_lbn = jfreeblk->jf_lbn;
3203         rec->jb_frags = jfreeblk->jf_frags;
3204         rec->jb_oldfrags = 0;
3205 }
3206
3207 static void
3208 jfreefrag_write(jfreefrag, jseg, data)
3209         struct jfreefrag *jfreefrag;
3210         struct jseg *jseg;
3211         uint8_t *data;
3212 {
3213         struct jblkrec *rec;
3214
3215         jfreefrag->fr_jsegdep->jd_seg = jseg;
3216         rec = (struct jblkrec *)data;
3217         rec->jb_op = JOP_FREEBLK;
3218         rec->jb_ino = jfreefrag->fr_ino;
3219         rec->jb_blkno = jfreefrag->fr_blkno;
3220         rec->jb_lbn = jfreefrag->fr_lbn;
3221         rec->jb_frags = jfreefrag->fr_frags;
3222         rec->jb_oldfrags = 0;
3223 }
3224
3225 static void
3226 jtrunc_write(jtrunc, jseg, data)
3227         struct jtrunc *jtrunc;
3228         struct jseg *jseg;
3229         uint8_t *data;
3230 {
3231         struct jtrncrec *rec;
3232
3233         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3234         rec = (struct jtrncrec *)data;
3235         rec->jt_op = JOP_TRUNC;
3236         rec->jt_ino = jtrunc->jt_ino;
3237         rec->jt_size = jtrunc->jt_size;
3238         rec->jt_extsize = jtrunc->jt_extsize;
3239 }
3240
3241 static void
3242 jfsync_write(jfsync, jseg, data)
3243         struct jfsync *jfsync;
3244         struct jseg *jseg;
3245         uint8_t *data;
3246 {
3247         struct jtrncrec *rec;
3248
3249         rec = (struct jtrncrec *)data;
3250         rec->jt_op = JOP_SYNC;
3251         rec->jt_ino = jfsync->jfs_ino;
3252         rec->jt_size = jfsync->jfs_size;
3253         rec->jt_extsize = jfsync->jfs_extsize;
3254 }
3255
3256 static void
3257 softdep_flushjournal(mp)
3258         struct mount *mp;
3259 {
3260         struct jblocks *jblocks;
3261         struct ufsmount *ump;
3262
3263         if (MOUNTEDSUJ(mp) == 0)
3264                 return;
3265         ump = VFSTOUFS(mp);
3266         jblocks = ump->softdep_jblocks;
3267         ACQUIRE_LOCK(ump);
3268         while (ump->softdep_on_journal) {
3269                 jblocks->jb_needseg = 1;
3270                 softdep_process_journal(mp, NULL, MNT_WAIT);
3271         }
3272         FREE_LOCK(ump);
3273 }
3274
3275 static void softdep_synchronize_completed(struct bio *);
3276 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3277
3278 static void
3279 softdep_synchronize_completed(bp)
3280         struct bio *bp;
3281 {
3282         struct jseg *oldest;
3283         struct jseg *jseg;
3284         struct ufsmount *ump;
3285
3286         /*
3287          * caller1 marks the last segment written before we issued the
3288          * synchronize cache.
3289          */
3290         jseg = bp->bio_caller1;
3291         if (jseg == NULL) {
3292                 g_destroy_bio(bp);
3293                 return;
3294         }
3295         ump = VFSTOUFS(jseg->js_list.wk_mp);
3296         ACQUIRE_LOCK(ump);
3297         oldest = NULL;
3298         /*
3299          * Mark all the journal entries waiting on the synchronize cache
3300          * as completed so they may continue on.
3301          */
3302         while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3303                 jseg->js_state |= COMPLETE;
3304                 oldest = jseg;
3305                 jseg = TAILQ_PREV(jseg, jseglst, js_next);
3306         }
3307         /*
3308          * Restart deferred journal entry processing from the oldest
3309          * completed jseg.
3310          */
3311         if (oldest)
3312                 complete_jsegs(oldest);
3313
3314         FREE_LOCK(ump);
3315         g_destroy_bio(bp);
3316 }
3317
3318 /*
3319  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3320  * barriers.  The journal must be written prior to any blocks that depend
3321  * on it and the journal can not be released until the blocks have be
3322  * written.  This code handles both barriers simultaneously.
3323  */
3324 static void
3325 softdep_synchronize(bp, ump, caller1)
3326         struct bio *bp;
3327         struct ufsmount *ump;
3328         void *caller1;
3329 {
3330
3331         bp->bio_cmd = BIO_FLUSH;
3332         bp->bio_flags |= BIO_ORDERED;
3333         bp->bio_data = NULL;
3334         bp->bio_offset = ump->um_cp->provider->mediasize;
3335         bp->bio_length = 0;
3336         bp->bio_done = softdep_synchronize_completed;
3337         bp->bio_caller1 = caller1;
3338         g_io_request(bp,
3339             (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3340 }
3341
3342 /*
3343  * Flush some journal records to disk.  The per-mount lock must be held.
3344  */
3345 static void
3346 softdep_process_journal(mp, needwk, flags)
3347         struct mount *mp;
3348         struct worklist *needwk;
3349         int flags;
3350 {
3351         struct jblocks *jblocks;
3352         struct ufsmount *ump;
3353         struct worklist *wk;
3354         struct jseg *jseg;
3355         struct buf *bp;
3356         struct bio *bio;
3357         uint8_t *data;
3358         struct fs *fs;
3359         int shouldflush;
3360         int segwritten;
3361         int jrecmin;    /* Minimum records per block. */
3362         int jrecmax;    /* Maximum records per block. */
3363         int size;
3364         int cnt;
3365         int off;
3366         int devbsize;
3367
3368         if (MOUNTEDSUJ(mp) == 0)
3369                 return;
3370         shouldflush = softdep_flushcache;       /* Read once for the call. */
3371         bio = NULL;
3372         jseg = NULL;
3373         ump = VFSTOUFS(mp);
3374         LOCK_OWNED(ump);
3375         fs = ump->um_fs;
3376         jblocks = ump->softdep_jblocks;
3377         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3378         /*
3379          * We write anywhere between a disk block and fs block.  The upper
3380          * bound is picked to prevent buffer cache fragmentation and limit
3381          * processing time per I/O.
3382          */
3383         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3384         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3385         segwritten = 0;
3386         for (;;) {
3387                 cnt = ump->softdep_on_journal;
3388                 /*
3389                  * Criteria for writing a segment:
3390                  * 1) We have a full block.
3391                  * 2) We're called from jwait() and haven't found the
3392                  *    journal item yet.
3393                  * 3) Always write if needseg is set.
3394                  * 4) If we are called from process_worklist and have
3395                  *    not yet written anything we write a partial block
3396                  *    to enforce a 1 second maximum latency on journal
3397                  *    entries.
3398                  */
3399                 if (cnt < (jrecmax - 1) && needwk == NULL &&
3400                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3401                         break;
3402                 cnt++;  /* NOTE(review): cnt is reloaded below before use. */
3403                 /*
3404                  * Verify some free journal space.  softdep_prealloc() should
3405                  * guarantee that we don't run out so this is indicative of
3406                  * a problem with the flow control.  Try to recover
3407                  * gracefully in any event.
3408                  */
3409                 while (jblocks->jb_free == 0) {
3410                         if (flags != MNT_WAIT)
3411                                 break;
3412                         printf("softdep: Out of journal space!\n");
3413                         softdep_speedup(ump);
3414                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3415                 }
3416                 FREE_LOCK(ump);         /* Allocate with the lock dropped. */
3417                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3418                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
3419                 LIST_INIT(&jseg->js_entries);
3420                 LIST_INIT(&jseg->js_indirs);
3421                 jseg->js_state = ATTACHED;
3422                 if (shouldflush == 0)
3423                         jseg->js_state |= COMPLETE;
3424                 else if (bio == NULL)
3425                         bio = g_alloc_bio();
3426                 jseg->js_jblocks = jblocks;
3427                 bp = geteblk(fs->fs_bsize, 0);
3428                 ACQUIRE_LOCK(ump);
3429                 /*
3430                  * If there was a race while we were allocating the block
3431                  * and jseg, the entry we care about was likely written.
3432                  * We bail out in both the WAIT and NOWAIT case and assume
3433                  * the caller will loop if the entry it cares about is
3434                  * not written.
3435                  */
3436                 cnt = ump->softdep_on_journal;
3437                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3438                         bp->b_flags |= B_INVAL | B_NOCACHE;
3439                         WORKITEM_FREE(jseg, D_JSEG);
3440                         FREE_LOCK(ump);
3441                         brelse(bp);
3442                         ACQUIRE_LOCK(ump);
3443                         break;
3444                 }
3445                 /*
3446                  * Calculate the disk block size required for the available
3447                  * records rounded to the min size.
3448                  */
3449                 if (cnt == 0)
3450                         size = devbsize;
3451                 else if (cnt < jrecmax)
3452                         size = howmany(cnt, jrecmin) * devbsize;
3453                 else
3454                         size = fs->fs_bsize;
3455                 /*
3456                  * Allocate a disk block for this journal data and account
3457                  * for truncation of the requested size if enough contiguous
3458                  * space was not available.
3459                  */
3460                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3461                 bp->b_lblkno = bp->b_blkno;
3462                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
3463                 bp->b_bcount = size;
3464                 bp->b_flags &= ~B_INVAL;
3465                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3466                 /*
3467                  * Initialize our jseg with cnt records.  Assign the next
3468                  * sequence number to it and link it in-order.
3469                  */
3470                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
3471                 jseg->js_buf = bp;
3472                 jseg->js_cnt = cnt;
3473                 jseg->js_refs = cnt + 1;        /* Self ref. */
3474                 jseg->js_size = size;
3475                 jseg->js_seq = jblocks->jb_nextseq++;
3476                 if (jblocks->jb_oldestseg == NULL)
3477                         jblocks->jb_oldestseg = jseg;
3478                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3479                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3480                 if (jblocks->jb_writeseg == NULL)
3481                         jblocks->jb_writeseg = jseg;
3482                 /*
3483                  * Start filling in records from the pending list.
3484                  */
3485                 data = bp->b_data;
3486                 off = 0;
3487
3488                 /*
3489                  * Always put a header on the first block.
3490                  * XXX As with below, there might not be a chance to get
3491                  * into the loop.  Ensure that something valid is written.
3492                  */
3493                 jseg_write(ump, jseg, data);
3494                 off += JREC_SIZE;
3495                 data = bp->b_data + off;
3496
3497                 /*
3498                  * XXX Something is wrong here.  There's no work to do,
3499                  * but we need to perform an I/O and allow it to complete
3500                  * anyways.
3501                  */
3502                 if (LIST_EMPTY(&ump->softdep_journal_pending))
3503                         stat_emptyjblocks++;
3504
3505                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3506                     != NULL) {
3507                         if (cnt == 0)
3508                                 break;
3509                         /* Place a segment header on every device block. */
3510                         if ((off % devbsize) == 0) {
3511                                 jseg_write(ump, jseg, data);
3512                                 off += JREC_SIZE;
3513                                 data = bp->b_data + off;
3514                         }
3515                         if (wk == needwk)
3516                                 needwk = NULL;  /* Found the awaited item. */
3517                         remove_from_journal(wk);
3518                         wk->wk_state |= INPROGRESS;
3519                         WORKLIST_INSERT(&jseg->js_entries, wk);
3520                         switch (wk->wk_type) {
3521                         case D_JADDREF:
3522                                 jaddref_write(WK_JADDREF(wk), jseg, data);
3523                                 break;
3524                         case D_JREMREF:
3525                                 jremref_write(WK_JREMREF(wk), jseg, data);
3526                                 break;
3527                         case D_JMVREF:
3528                                 jmvref_write(WK_JMVREF(wk), jseg, data);
3529                                 break;
3530                         case D_JNEWBLK:
3531                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3532                                 break;
3533                         case D_JFREEBLK:
3534                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3535                                 break;
3536                         case D_JFREEFRAG:
3537                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3538                                 break;
3539                         case D_JTRUNC:
3540                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
3541                                 break;
3542                         case D_JFSYNC:
3543                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
3544                                 break;
3545                         default:
3546                                 panic("process_journal: Unknown type %s",
3547                                     TYPENAME(wk->wk_type));
3548                                 /* NOTREACHED */
3549                         }
3550                         off += JREC_SIZE;
3551                         data = bp->b_data + off;
3552                         cnt--;
3553                 }
3554
3555                 /* Clear any remaining space so we don't leak kernel data */
3556                 if (size > off)
3557                         bzero(data, size - off);
3558
3559                 /*
3560                  * Write this one buffer and continue.
3561                  */
3562                 segwritten = 1;
3563                 jblocks->jb_needseg = 0;
3564                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3565                 FREE_LOCK(ump);         /* The write is issued unlocked. */
3566                 pbgetvp(ump->um_devvp, bp);
3567                 /*
3568                  * We only do the blocking wait once we find the journal
3569                  * entry we're looking for.
3570                  */
3571                 if (needwk == NULL && flags == MNT_WAIT)
3572                         bwrite(bp);
3573                 else
3574                         bawrite(bp);
3575                 ACQUIRE_LOCK(ump);
3576         }
3577         /*
3578          * If we wrote a segment issue a synchronize cache so the journal
3579          * is reflected on disk before the data is written.  Since reclaiming
3580          * journal space also requires writing a journal record this
3581          * process also enforces a barrier before reclamation.
3582          */
3583         if (segwritten && shouldflush) {
3584                 softdep_synchronize(bio, ump,
3585                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
3586         } else if (bio)
3587                 g_destroy_bio(bio);     /* Allocated but never issued. */
3588         /*
3589          * If we've suspended the filesystem because we ran out of journal
3590          * space either try to sync it here to make some progress or
3591          * unsuspend it if we already have.
3592          */
3593         if (flags == 0 && jblocks->jb_suspended) {
3594                 if (journal_unsuspend(ump))
3595                         return;
3596                 FREE_LOCK(ump);
3597                 VFS_SYNC(mp, MNT_NOWAIT);
3598                 ffs_sbupdate(ump, MNT_WAIT, 0);
3599                 ACQUIRE_LOCK(ump);
3600         }
3601 }
3602
3603 /*
3604  * Run completion processing for one jseg.  Each entry is taken off
3605  * js_entries, switched from INPROGRESS to COMPLETE and passed to the
3606  * handler for its type.  The self reference is released at the end.
3607  */
3608 static void
3609 complete_jseg(jseg)
3610         struct jseg *jseg;
3611 {
3612         struct jmvref *jmvref;
3613         struct worklist *wk;
3614 #ifdef INVARIANTS
3615         int i = 0;
3616 #endif
3617
3618         for (wk = LIST_FIRST(&jseg->js_entries); wk != NULL;
3619             wk = LIST_FIRST(&jseg->js_entries)) {
3620                 WORKLIST_REMOVE(wk);
3621                 wk->wk_state = (wk->wk_state & ~INPROGRESS) | COMPLETE;
3622                 KASSERT(i++ < jseg->js_cnt,
3623                     ("handle_written_jseg: overflow %d >= %d",
3624                     i - 1, jseg->js_cnt));
3625                 switch (wk->wk_type) {
3626                 case D_JFSYNC:
3627                         rele_jseg(jseg);        /* No jsegdep. */
3628                         WORKITEM_FREE(wk, D_JFSYNC);
3629                         break;
3630                 case D_JMVREF:
3631                         rele_jseg(jseg);        /* No jsegdep. */
3632                         jmvref = WK_JMVREF(wk);
3633                         LIST_REMOVE(jmvref, jm_deps);
3634                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3635                                 free_pagedep(jmvref->jm_pagedep);
3636                         WORKITEM_FREE(jmvref, D_JMVREF);
3637                         break;
3638                 case D_JADDREF:
3639                         handle_written_jaddref(WK_JADDREF(wk));
3640                         break;
3641                 case D_JREMREF:
3642                         handle_written_jremref(WK_JREMREF(wk));
3643                         break;
3644                 case D_JNEWBLK:
3645                         handle_written_jnewblk(WK_JNEWBLK(wk));
3646                         break;
3647                 case D_JFREEFRAG:
3648                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
3649                         break;
3650                 case D_JFREEBLK:
3651                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3652                         break;
3653                 case D_JTRUNC:
3654                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3655                         break;
3656                 default:
3657                         panic("handle_written_jseg: Unknown type %s",
3658                             TYPENAME(wk->wk_type));
3659                         /* NOTREACHED */
3660                 }
3661         }
3662         /* Drop the self reference; the jseg may now be freed. */
3663         rele_jseg(jseg);
3664 }
3665
3666 /*
3667  * Run completion processing on every jseg that is ready, strictly in
3668  * order starting from jb_writeseg.  A jseg is ready once its state has
3669  * all of the ALLCOMPLETE bits set.
3670  */
3671 static void
3672 complete_jsegs(jseg)
3673         struct jseg *jseg;
3674 {
3675         struct jblocks *jblocks;
3676         struct jseg *next;
3677
3678         jblocks = jseg->js_jblocks;
3679         /*
3680          * Completions are never taken out of order.  Unless this is the
3681          * segment recorded in jb_writeseg, leave it for a later call.
3682          */
3683         if (jblocks->jb_writeseg != jseg)
3684                 return;
3685         /* Walk forward over consecutive finished segments. */
3686         for (; jseg != NULL; jseg = next) {
3687                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
3688                         break;
3689                 jblocks->jb_oldestwrseq = jseg->js_oldseq;
3690                 next = TAILQ_NEXT(jseg, js_next);
3691                 complete_jseg(jseg);
3692         }
3693         jblocks->jb_writeseg = jseg;
3694         /* jb_oldestwrseq may have advanced; try to free jsegs. */
3695         free_jsegs(jblocks);
3696 }
3698
3699 /*
3700  * The buffer carrying this jseg has been written.  Flag the segment
3701  * DEPCOMPLETE, discard the buffer and try to run final completions.
3702  */
3703 static void
3704 handle_written_jseg(jseg, bp)
3705         struct jseg *jseg;
3706         struct buf *bp;
3707 {
3708
3709         if (jseg->js_refs == 0)
3710                 panic("handle_written_jseg: No self-reference on %p", jseg);
3711         /*
3712          * This buffer is never needed again; set the flags that cause
3713          * it to be discarded and hand it to pbrelvp().
3714          */
3715         bp->b_flags |= B_INVAL | B_NOCACHE;
3716         pbrelvp(bp);
3717         jseg->js_state |= DEPCOMPLETE;
3718         complete_jsegs(jseg);
3719 }
3720
3721 /* Detach and return the jsegdep held by an inoref, leaving NULL behind. */
3722 static inline struct jsegdep *
3723 inoref_jseg(inoref)
3724         struct inoref *inoref;
3725 {
3726         struct jsegdep *taken;
3727
3728         taken = inoref->if_jsegdep;
3729         inoref->if_jsegdep = NULL;
3730         return (taken);
3731 }
3732
3733 /*
3734  * The journal record for a jremref has reached stable store.  Unlink the
3735  * jremref from its inodedep and dirrem, move its jsegdep to the dirrem's
3736  * jwork list and add the dirrem to the worklist when it is ready.
3737  */
3738 static void
3739 handle_written_jremref(jremref)
3740         struct jremref *jremref;
3741 {
3742         struct dirrem *dirrem;
3743         struct inodedep *inodedep;
3744         struct jsegdep *jsegdep;
3745         int found;
3746
3747         /* Take the jsegdep away from the jremref's inoref. */
3748         jsegdep = inoref_jseg(&jremref->jr_ref);
3749         /* Unlink from the inodedep's inoref list. */
3750         found = inodedep_lookup(jremref->jr_list.wk_mp,
3751             jremref->jr_ref.if_ino, 0, &inodedep);
3752         if (found == 0)
3753                 panic("handle_written_jremref: Lost inodedep");
3754         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3755         /* The dirrem inherits the jsegdep; queue it once it is ready. */
3756         dirrem = jremref->jr_dirrem;
3757         jremref->jr_dirrem = NULL;
3758         LIST_REMOVE(jremref, jr_deps);
3759         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3760         jwork_insert(&dirrem->dm_jwork, jsegdep);
3761         if ((dirrem->dm_state & COMPLETE) != 0 &&
3762             LIST_EMPTY(&dirrem->dm_jremrefhd))
3763                 add_to_worklist(&dirrem->dm_list, 0);
3764         free_jremref(jremref);
3765 }
3768
3769 /*
3770  * Called once a jaddref has made it to stable store.  The dependency is
3771  * marked complete and any dependent structures are added to the inode
3772  * bufwait list to be completed as soon as it is written.  If the jaddref
3773  * is flagged NEWBLOCK the inodedep is put on the sm_inodedephd list of its
3774  * bmsafemap.  The jsegdep is passed to the diradd and the jaddref freed.
3775  */
3776 static void
3777 handle_written_jaddref(jaddref)
3778         struct jaddref *jaddref;
3779 {
3780         struct jsegdep *jsegdep;
3781         struct inodedep *inodedep;
3782         struct diradd *diradd;
3783         struct mkdir *mkdir;
3784
3785         /* Grab the jsegdep. */
3786         jsegdep = inoref_jseg(&jaddref->ja_ref);
3787         mkdir = NULL;
3788         diradd = NULL;
3789         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3790             0, &inodedep) == 0)
3791                 panic("handle_written_jaddref: Lost inodedep.");
3792         if (jaddref->ja_diradd == NULL)
3793                 panic("handle_written_jaddref: No dependency");
3794         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3795                 diradd = jaddref->ja_diradd;
3796                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3797         } else if (jaddref->ja_state & MKDIR_PARENT) {
3798                 mkdir = jaddref->ja_mkdir;
3799                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3800         } else if (jaddref->ja_state & MKDIR_BODY)
3801                 mkdir = jaddref->ja_mkdir;      /* Not put on id_bufwait. */
3802         else
3803                 panic("handle_written_jaddref: Unknown dependency %p",
3804                     jaddref->ja_diradd);
3805         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
3806         /*
3807          * Remove us from the inode list.
3808          */
3809         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3810         /*
3811          * The mkdir may be waiting on the jaddref to clear before freeing.
3812          */
3813         if (mkdir) {
3814                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3815                     ("handle_written_jaddref: Incorrect type for mkdir %s",
3816                     TYPENAME(mkdir->md_list.wk_type)));
3817                 mkdir->md_jaddref = NULL;
3818                 diradd = mkdir->md_diradd;      /* Gets the jsegdep below. */
3819                 mkdir->md_state |= DEPCOMPLETE;
3820                 complete_mkdir(mkdir);
3821         }
3822         jwork_insert(&diradd->da_jwork, jsegdep);
3823         if (jaddref->ja_state & NEWBLOCK) {
3824                 inodedep->id_state |= ONDEPLIST;
3825                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3826                     inodedep, id_deps);
3827         }
3828         free_jaddref(jaddref);
3829 }
3830
3831 /*
3832  * Called once the journal record for a jnewblk has been written.  The
3833  * jsegdep moves to the jwork list of whatever the jnewblk depends on: a
3834  * newblk (which also goes on its bmsafemap's sm_newblkhd unless it is
3835  * GOINGAWAY), a freefrag, or the freeblks of a freework.  Any other
3836  * dependency type panics, reporting the type's number and name.
3837  */
3838 static void
3839 handle_written_jnewblk(jnewblk)
3840         struct jnewblk *jnewblk;
3841 {
3842         struct bmsafemap *bmsafemap;
3843         struct freefrag *freefrag;
3844         struct freework *freework;
3845         struct jsegdep *jsegdep;
3846         struct newblk *newblk;
3847
3848         /* Grab the jsegdep. */
3849         jsegdep = jnewblk->jn_jsegdep;
3850         jnewblk->jn_jsegdep = NULL;
3851         if (jnewblk->jn_dep == NULL)
3852                 panic("handle_written_jnewblk: No dependency for the segdep.");
3853         switch (jnewblk->jn_dep->wk_type) {
3854         case D_NEWBLK:
3855         case D_ALLOCDIRECT:
3856         case D_ALLOCINDIR:
3857                 /*
3858                  * Add the written block to the bmsafemap so it can
3859                  * be notified when the bitmap is on disk.
3860                  */
3861                 newblk = WK_NEWBLK(jnewblk->jn_dep);
3862                 newblk->nb_jnewblk = NULL;
3863                 if ((newblk->nb_state & GOINGAWAY) == 0) {
3864                         bmsafemap = newblk->nb_bmsafemap;
3865                         newblk->nb_state |= ONDEPLIST;
3866                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3867                             nb_deps);
3868                 }
3869                 jwork_insert(&newblk->nb_jwork, jsegdep);
3870                 break;
3871         case D_FREEFRAG:
3872                 /*
3873                  * A newblock being removed by a freefrag when replaced by
3874                  * frag extension.
3875                  */
3876                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3877                 freefrag->ff_jdep = NULL;
3878                 jwork_insert(&freefrag->ff_jwork, jsegdep);
3879                 break;
3880         case D_FREEWORK:
3881                 /* A direct block was removed by truncate. */
3882                 freework = WK_FREEWORK(jnewblk->jn_dep);
3883                 freework->fw_jnewblk = NULL;
3884                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3885                 break;
3886         default:
3887                 /* Report the type name too, as other panics in this file do. */
3888                 panic("handle_written_jnewblk: Unknown type %d (%s).",
3889                     jnewblk->jn_dep->wk_type,
3890                     TYPENAME(jnewblk->jn_dep->wk_type));
3891         }
3892         jnewblk->jn_dep = NULL;
3893         free_jnewblk(jnewblk);
3894 }
3895
3896 /*
3897  * Cancel a jfreefrag that is no longer needed (e.g. its allocation was
3898  * never committed): free it and leave its freefrag marked DEPCOMPLETE.
3899  */
3900 static void
3901 cancel_jfreefrag(jfreefrag)
3902         struct jfreefrag *jfreefrag;
3903 {
3904         struct freefrag *owner;
3905         struct jsegdep *jsegdep;
3906
3907         jsegdep = jfreefrag->fr_jsegdep;
3908         if (jsegdep != NULL) {
3909                 free_jsegdep(jsegdep);
3910                 jfreefrag->fr_jsegdep = NULL;
3911         }
3912         owner = jfreefrag->fr_freefrag;
3913         jfreefrag->fr_freefrag = NULL;
3914         free_jfreefrag(jfreefrag);
3915         owner->ff_state |= DEPCOMPLETE;
3916         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", owner->ff_blkno);
3917 }
3918
3919 /*
3920  * Release a jfreefrag that has already been detached from its freefrag.
3921  */
3922 static void
3923 free_jfreefrag(jfreefrag)
3924         struct jfreefrag *jfreefrag;
3925 {
3926
3927         if ((jfreefrag->fr_state & INPROGRESS) != 0)
3928                 WORKLIST_REMOVE(&jfreefrag->fr_list);
3929         else if ((jfreefrag->fr_state & ONWORKLIST) != 0)
3930                 remove_from_journal(&jfreefrag->fr_list);
3931         if (jfreefrag->fr_freefrag != NULL)
3932                 panic("free_jfreefrag:  Still attached to a freefrag.");
3933         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3934 }
3935
3936 /*
3937  * The journal write for a jfreefrag is done.  Pass its jsegdep to the
3938  * freefrag, queue the freefrag if it is now ALLCOMPLETE, and free us.
3939  */
3940 static void
3941 handle_written_jfreefrag(jfreefrag)
3942         struct jfreefrag *jfreefrag;
3943 {
3944         struct freefrag *freefrag;
3945         struct jsegdep *jsegdep;
3946
3947         jsegdep = jfreefrag->fr_jsegdep;
3948         jfreefrag->fr_jsegdep = NULL;
3949         freefrag = jfreefrag->fr_freefrag;
3950         if (freefrag == NULL)
3951                 panic("handle_written_jfreefrag: No freefrag.");
3952         /* Clear fr_freefrag and ff_jdep before the jfreefrag is freed. */
3953         jfreefrag->fr_freefrag = NULL;
3954         freefrag->ff_jdep = NULL;
3955         freefrag->ff_state |= DEPCOMPLETE;
3956         jwork_insert(&freefrag->ff_jwork, jsegdep);
3957         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3958                 add_to_worklist(&freefrag->ff_list, 0);
3959         free_jfreefrag(jfreefrag);
3960 }
3961
3962 /*
3963  * The journal write for a jblkdep is done.  Unlink it from its jb_deps
3964  * list, move its jsegdep to the freeblks jwork list and, if no jblkdep
3965  * remains and the freeblks is ALLCOMPLETE, add the freeblks to the
3966  * worklist.  The jblkdep itself is then freed.
3967  */
3968 static void
3969 handle_written_jblkdep(jblkdep)
3970         struct jblkdep *jblkdep;
3971 {
3972         struct jsegdep *jsegdep;
3973         struct freeblks *freeblks;
3974
3975         freeblks = jblkdep->jb_freeblks;
3976         /* Detach the jsegdep and unlink ourselves from the freeblks. */
3977         jsegdep = jblkdep->jb_jsegdep;
3978         jblkdep->jb_jsegdep = NULL;
3979         LIST_REMOVE(jblkdep, jb_deps);
3980         jwork_insert(&freeblks->fb_jwork, jsegdep);
3981         /*
3982          * Once every jblkdep has been journaled the freeblks is queued.
3983          */
3984         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
3985             LIST_EMPTY(&freeblks->fb_jblkdephd))
3986                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3987         free_jblkdep(jblkdep);
3988 }
3990
3991 /* Allocate a jsegdep for wk's mount point; it has no jseg assigned yet. */
3992 static struct jsegdep *
3993 newjsegdep(struct worklist *wk)
3994 {
3995         struct jsegdep *fresh;
3996
3997         fresh = malloc(sizeof(*fresh), M_JSEGDEP, M_SOFTDEP_FLAGS);
3998         workitem_alloc(&fresh->jd_list, D_JSEGDEP, wk->wk_mp);
3999         fresh->jd_seg = NULL;
4000         return (fresh);
4001 }
4002
4003 /* Allocate a jmvref recording ino, its parent dp and the two offsets. */
4004 static struct jmvref *
4005 newjmvref(dp, ino, oldoff, newoff)
4006         struct inode *dp;
4007         ino_t ino;
4008         off_t oldoff;
4009         off_t newoff;
4010 {
4011         struct jmvref *moved;
4012
4013         moved = malloc(sizeof(*moved), M_JMVREF, M_SOFTDEP_FLAGS);
4014         workitem_alloc(&moved->jm_list, D_JMVREF, ITOVFS(dp));
4015         moved->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4016         moved->jm_ino = ino;
4017         moved->jm_parent = dp->i_number;
4018         moved->jm_newoff = newoff;
4019         moved->jm_oldoff = oldoff;
4020         return (moved);
4021 }
4022
4023 /*
4024  * Allocate a jremref describing the removal of ip from directory dp at
4025  * offset diroff.  The new entry is marked ATTACHED and remembers dirrem;
4026  * its inoref is filled in by newinoref().  Linking it into other
4027  * structures and adding it to the journal is left to the caller.
4028  */
4029 static struct jremref *
4030 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4031     off_t diroff, nlink_t nlink)
4032 {
4033         struct jremref *removed;
4034
4035         removed = malloc(sizeof(*removed), M_JREMREF, M_SOFTDEP_FLAGS);
4036         workitem_alloc(&removed->jr_list, D_JREMREF, ITOVFS(dp));
4037         removed->jr_dirrem = dirrem;
4038         removed->jr_state = ATTACHED;
4039         newinoref(&removed->jr_ref, ip->i_number, dp->i_number, diroff,
4040             nlink, ip->i_mode);
4041         return (removed);
4042 }
4048
4049 /* Fill in an inoref and give it a freshly allocated jsegdep. */
4050 static inline void
4051 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4052     nlink_t nlink, uint16_t mode)
4053 {
4054         inoref->if_ino = ino;
4055         inoref->if_parent = parent;
4056         inoref->if_diroff = diroff;
4057         inoref->if_nlink = nlink;
4058         inoref->if_mode = mode;
4059         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4060 }
4061
4062 /*
4063  * Allocate a jaddref tracking the addition of ino to directory dp at
4064  * diroff; the offset may not be known until later.  The caller adds the
4065  * entry to the journal.  nlink should be the link count prior to the
4066  * addition and mode is only required to have the correct FMT.
4067  */
4068 static struct jaddref *
4069 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4070     uint16_t mode)
4071 {
4072         struct jaddref *added;
4073
4074         added = malloc(sizeof(*added), M_JADDREF, M_SOFTDEP_FLAGS);
4075         workitem_alloc(&added->ja_list, D_JADDREF, ITOVFS(dp));
4076         added->ja_mkdir = NULL;
4077         added->ja_state = ATTACHED;
4078         newinoref(&added->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4079         return (added);
4080 }
4083
4084 /*
4085  * Allocate a freedep bound to freework, on the same mount point.  No
4086  * reference count is adjusted here; that is left to the caller, to be
4087  * done with the lock held.
4088  */
4089 static struct freedep *
4090 newfreedep(struct freework *freework)
4091 {
4092         struct freedep *dep;
4093
4094         dep = malloc(sizeof(*dep), M_FREEDEP, M_SOFTDEP_FLAGS);
4095         workitem_alloc(&dep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4096         dep->fd_freework = freework;
4097         return (dep);
4098 }
4101
4102 /*
4103  * Free a freedep; enqueue its freework when the last reference goes.
4104  */
4105 static void
4106 free_freedep(freedep)
4107         struct freedep *freedep;
4108 {
4109         struct freework *freework;
4110
4111         freework = freedep->fd_freework;
4112         freework->fw_freeblks->fb_cgwait--;
4113         freework->fw_ref--;
4114         if (freework->fw_ref == 0)
4115                 freework_enqueue(freework);
4116         WORKITEM_FREE(freedep, D_FREEDEP);
4117 }
4118
4119 /*
4120  * Allocate a freework.  A top level one (parent == NULL) is also linked
4121  * onto the freeblks, taking the per-filesystem lock to do so.
4122  */
4123 static struct freework *
4124 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4125         struct ufsmount *ump;
4126         struct freeblks *freeblks;
4127         struct freework *parent;
4128         ufs_lbn_t lbn;
4129         ufs2_daddr_t nb;
4130         int frags;
4131         int off;
4132         int journal;
4133 {
4134         struct freework *freework;
4135
4136         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4137         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4138         freework->fw_state = ATTACHED;
4139         freework->fw_freeblks = freeblks;
4140         freework->fw_parent = parent;
4141         freework->fw_jnewblk = NULL;
4142         freework->fw_indir = NULL;
4143         freework->fw_lbn = lbn;
4144         freework->fw_blkno = nb;
4145         freework->fw_frags = frags;
4146         freework->fw_off = off;
4147         freework->fw_start = freework->fw_off;
4148         if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -UFS_NXADDR)
4149                 freework->fw_ref = 0;
4150         else
4151                 freework->fw_ref = NINDIR(ump->um_fs) + 1;
4152         if (journal)
4153                 newjfreeblk(freeblks, lbn, nb, frags);
4154         if (parent != NULL)
4155                 return (freework);
4156         ACQUIRE_LOCK(ump);
4157         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4158         freeblks->fb_ref++;
4159         FREE_LOCK(ump);
4160         return (freework);
4161 }
4162
4163 /*
4164  * Eliminate a jfreeblk for a block that does not need journaling.
4165  */
4166 static void
4167 cancel_jfreeblk(freeblks, blkno)
4168         struct freeblks *freeblks;
4169         ufs2_daddr_t blkno;
4170 {
4171         struct jfreeblk *jfreeblk;
4172         struct jblkdep *jblkdep;
4173
4174         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4175                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4176                         continue;
4177                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4178                 if (jfreeblk->jf_blkno == blkno)
4179                         break;
4180         }
4181         if (jblkdep == NULL)
4182                 return;
4183         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4184         free_jsegdep(jblkdep->jb_jsegdep);
4185         LIST_REMOVE(jblkdep, jb_deps);
4186         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4187 }
4188
4189 /*
4190  * Allocate a new jfreeblk to journal top level block pointer when truncating
4191  * a file.  The caller must add this to the worklist when the per-filesystem
4192  * lock is held.
4193  */
4194 static struct jfreeblk *
4195 newjfreeblk(freeblks, lbn, blkno, frags)
4196         struct freeblks *freeblks;
4197         ufs_lbn_t lbn;
4198         ufs2_daddr_t blkno;
4199         int frags;
4200 {
4201         struct jfreeblk *jfreeblk;
4202
4203         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4204         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4205             freeblks->fb_list.wk_mp);
4206         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4207         jfreeblk->jf_dep.jb_freeblks = freeblks;
4208         jfreeblk->jf_ino = freeblks->fb_inum;
4209         jfreeblk->jf_lbn = lbn;
4210         jfreeblk->jf_blkno = blkno;
4211         jfreeblk->jf_frags = frags;
4212         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4213
4214         return (jfreeblk);
4215 }
4216
4217 /*
4218  * The journal is only prepared to handle full-size block numbers, so we
4219  * have to adjust the record to reflect the change to a full-size block.
4220  * For example, suppose we have a block made up of fragments 8-15 and
4221  * want to free its last two fragments. We are given a request that says:
4222  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4223  * where frags are the number of fragments to free and oldfrags are the
4224  * number of fragments to keep. To block align it, we have to change it to
4225  * have a valid full-size blkno, so it becomes:
4226  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4227  */
4228 static void
4229 adjust_newfreework(freeblks, frag_offset)
4230         struct freeblks *freeblks;
4231         int frag_offset;
4232 {
4233         struct jfreeblk *jfreeblk;
4234
4235         KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4236             LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4237             ("adjust_newfreework: Missing freeblks dependency"));
4238
4239         jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4240         jfreeblk->jf_blkno -= frag_offset;
4241         jfreeblk->jf_frags += frag_offset;
4242 }
4243
4244 /*
4245  * Allocate a new jtrunc to track a partial truncation.
4246  */
4247 static struct jtrunc *
4248 newjtrunc(freeblks, size, extsize)
4249         struct freeblks *freeblks;
4250         off_t size;
4251         int extsize;
4252 {
4253         struct jtrunc *jtrunc;
4254
4255         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4256         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4257             freeblks->fb_list.wk_mp);
4258         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4259         jtrunc->jt_dep.jb_freeblks = freeblks;
4260         jtrunc->jt_ino = freeblks->fb_inum;
4261         jtrunc->jt_size = size;
4262         jtrunc->jt_extsize = extsize;
4263         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4264
4265         return (jtrunc);
4266 }
4267
4268 /*
4269  * If we're canceling a new bitmap we have to search for another ref
4270  * to move into the bmsafemap dep.  This might be better expressed
4271  * with another structure.
4272  */
4273 static void
4274 move_newblock_dep(jaddref, inodedep)
4275         struct jaddref *jaddref;
4276         struct inodedep *inodedep;
4277 {
4278         struct inoref *inoref;
4279         struct jaddref *jaddrefn;
4280
4281         jaddrefn = NULL;
4282         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4283             inoref = TAILQ_NEXT(inoref, if_deps)) {
4284                 if ((jaddref->ja_state & NEWBLOCK) &&
4285                     inoref->if_list.wk_type == D_JADDREF) {
4286                         jaddrefn = (struct jaddref *)inoref;
4287                         break;
4288                 }
4289         }
4290         if (jaddrefn == NULL)
4291                 return;
4292         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4293         jaddrefn->ja_state |= jaddref->ja_state &
4294             (ATTACHED | UNDONE | NEWBLOCK);
4295         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4296         jaddref->ja_state |= ATTACHED;
4297         LIST_REMOVE(jaddref, ja_bmdeps);
4298         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4299             ja_bmdeps);
4300 }
4301
4302 /*
4303  * Cancel a jaddref either before it has been written or while it is being
4304  * written.  This happens when a link is removed before the add reaches
4305  * the disk.  The jaddref dependency is kept linked into the bmsafemap
4306  * and inode to prevent the link count or bitmap from reaching the disk
4307  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4308  * required.
4309  *
4310  * Returns 1 if the canceled addref requires journaling of the remove and
4311  * 0 otherwise.
4312  */
4313 static int
4314 cancel_jaddref(jaddref, inodedep, wkhd)
4315         struct jaddref *jaddref;
4316         struct inodedep *inodedep;
4317         struct workhead *wkhd;
4318 {
4319         struct inoref *inoref;
4320         struct jsegdep *jsegdep;
4321         int needsj;
4322
4323         KASSERT((jaddref->ja_state & COMPLETE) == 0,
4324             ("cancel_jaddref: Canceling complete jaddref"));
4325         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4326                 needsj = 1;
4327         else
4328                 needsj = 0;
4329         if (inodedep == NULL)
4330                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4331                     0, &inodedep) == 0)
4332                         panic("cancel_jaddref: Lost inodedep");
4333         /*
4334          * We must adjust the nlink of any reference operation that follows
4335          * us so that it is consistent with the in-memory reference.  This
4336          * ensures that inode nlink rollbacks always have the correct link.
4337          */
4338         if (needsj == 0) {
4339                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4340                     inoref = TAILQ_NEXT(inoref, if_deps)) {
4341                         if (inoref->if_state & GOINGAWAY)
4342                                 break;
4343                         inoref->if_nlink--;
4344                 }
4345         }
4346         jsegdep = inoref_jseg(&jaddref->ja_ref);
4347         if (jaddref->ja_state & NEWBLOCK)
4348                 move_newblock_dep(jaddref, inodedep);
4349         wake_worklist(&jaddref->ja_list);
4350         jaddref->ja_mkdir = NULL;
4351         if (jaddref->ja_state & INPROGRESS) {
4352                 jaddref->ja_state &= ~INPROGRESS;
4353                 WORKLIST_REMOVE(&jaddref->ja_list);
4354                 jwork_insert(wkhd, jsegdep);
4355         } else {
4356                 free_jsegdep(jsegdep);
4357                 if (jaddref->ja_state & DEPCOMPLETE)
4358                         remove_from_journal(&jaddref->ja_list);
4359         }
4360         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4361         /*
4362          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4363          * can arrange for them to be freed with the bitmap.  Otherwise we
4364          * no longer need this addref attached to the inoreflst and it
4365          * will incorrectly adjust nlink if we leave it.
4366          */
4367         if ((jaddref->ja_state & NEWBLOCK) == 0) {
4368                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4369                     if_deps);
4370                 jaddref->ja_state |= COMPLETE;
4371                 free_jaddref(jaddref);
4372                 return (needsj);
4373         }
4374         /*
4375          * Leave the head of the list for jsegdeps for fast merging.
4376          */
4377         if (LIST_FIRST(wkhd) != NULL) {
4378                 jaddref->ja_state |= ONWORKLIST;
4379                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4380         } else
4381                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4382
4383         return (needsj);
4384 }
4385
4386 /*
4387  * Attempt to free a jaddref structure when some work completes.  This
4388  * should only succeed once the entry is written and all dependencies have
4389  * been notified.
4390  */
4391 static void
4392 free_jaddref(jaddref)
4393         struct jaddref *jaddref;
4394 {
4395
4396         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4397                 return;
4398         if (jaddref->ja_ref.if_jsegdep)
4399                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4400                     jaddref, jaddref->ja_state);
4401         if (jaddref->ja_state & NEWBLOCK)
4402                 LIST_REMOVE(jaddref, ja_bmdeps);
4403         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4404                 panic("free_jaddref: Bad state %p(0x%X)",
4405                     jaddref, jaddref->ja_state);
4406         if (jaddref->ja_mkdir != NULL)
4407                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4408         WORKITEM_FREE(jaddref, D_JADDREF);
4409 }
4410
4411 /*
4412  * Free a jremref structure once it has been written or discarded.
4413  */
4414 static void
4415 free_jremref(jremref)
4416         struct jremref *jremref;
4417 {
4418
4419         if (jremref->jr_ref.if_jsegdep)
4420                 free_jsegdep(jremref->jr_ref.if_jsegdep);
4421         if (jremref->jr_state & INPROGRESS)
4422                 panic("free_jremref: IO still pending");
4423         WORKITEM_FREE(jremref, D_JREMREF);
4424 }
4425
4426 /*
4427  * Free a jnewblk structure.
4428  */
4429 static void
4430 free_jnewblk(jnewblk)
4431         struct jnewblk *jnewblk;
4432 {
4433
4434         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4435                 return;
4436         LIST_REMOVE(jnewblk, jn_deps);
4437         if (jnewblk->jn_dep != NULL)
4438                 panic("free_jnewblk: Dependency still attached.");
4439         WORKITEM_FREE(jnewblk, D_JNEWBLK);
4440 }
4441
4442 /*
4443  * Cancel a jnewblk which has been been made redundant by frag extension.
4444  */
4445 static void
4446 cancel_jnewblk(jnewblk, wkhd)
4447         struct jnewblk *jnewblk;
4448         struct workhead *wkhd;
4449 {
4450         struct jsegdep *jsegdep;
4451
4452         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4453         jsegdep = jnewblk->jn_jsegdep;
4454         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4455                 panic("cancel_jnewblk: Invalid state");
4456         jnewblk->jn_jsegdep  = NULL;
4457         jnewblk->jn_dep = NULL;
4458         jnewblk->jn_state |= GOINGAWAY;
4459         if (jnewblk->jn_state & INPROGRESS) {
4460                 jnewblk->jn_state &= ~INPROGRESS;
4461                 WORKLIST_REMOVE(&jnewblk->jn_list);
4462                 jwork_insert(wkhd, jsegdep);
4463         } else {
4464                 free_jsegdep(jsegdep);
4465                 remove_from_journal(&jnewblk->jn_list);
4466         }
4467         wake_worklist(&jnewblk->jn_list);
4468         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4469 }
4470
4471 static void
4472 free_jblkdep(jblkdep)
4473         struct jblkdep *jblkdep;
4474 {
4475
4476         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4477                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
4478         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4479                 WORKITEM_FREE(jblkdep, D_JTRUNC);
4480         else
4481                 panic("free_jblkdep: Unexpected type %s",
4482                     TYPENAME(jblkdep->jb_list.wk_type));
4483 }
4484
4485 /*
4486  * Free a single jseg once it is no longer referenced in memory or on
4487  * disk.  Reclaim journal blocks and dependencies waiting for the segment
4488  * to disappear.
4489  */
4490 static void
4491 free_jseg(jseg, jblocks)
4492         struct jseg *jseg;
4493         struct jblocks *jblocks;
4494 {
4495         struct freework *freework;
4496
4497         /*
4498          * Free freework structures that were lingering to indicate freed
4499          * indirect blocks that forced journal write ordering on reallocate.
4500          */
4501         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4502                 indirblk_remove(freework);
4503         if (jblocks->jb_oldestseg == jseg)
4504                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4505         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4506         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4507         KASSERT(LIST_EMPTY(&jseg->js_entries),
4508             ("free_jseg: Freed jseg has valid entries."));
4509         WORKITEM_FREE(jseg, D_JSEG);
4510 }
4511
4512 /*
4513  * Free all jsegs that meet the criteria for being reclaimed and update
4514  * oldestseg.
4515  */
4516 static void
4517 free_jsegs(jblocks)
4518         struct jblocks *jblocks;
4519 {
4520         struct jseg *jseg;
4521
4522         /*
4523          * Free only those jsegs which have none allocated before them to
4524          * preserve the journal space ordering.
4525          */
4526         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4527                 /*
4528                  * Only reclaim space when nothing depends on this journal
4529                  * set and another set has written that it is no longer
4530                  * valid.
4531                  */
4532                 if (jseg->js_refs != 0) {
4533                         jblocks->jb_oldestseg = jseg;
4534                         return;
4535                 }
4536                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4537                         break;
4538                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
4539                         break;
4540                 /*
4541                  * We can free jsegs that didn't write entries when
4542                  * oldestwrseq == js_seq.
4543                  */
4544                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4545                     jseg->js_cnt != 0)
4546                         break;
4547                 free_jseg(jseg, jblocks);
4548         }
4549         /*
4550          * If we exited the loop above we still must discover the
4551          * oldest valid segment.
4552          */
4553         if (jseg)
4554                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4555                      jseg = TAILQ_NEXT(jseg, js_next))
4556                         if (jseg->js_refs != 0)
4557                                 break;
4558         jblocks->jb_oldestseg = jseg;
4559         /*
4560          * The journal has no valid records but some jsegs may still be
4561          * waiting on oldestwrseq to advance.  We force a small record
4562          * out to permit these lingering records to be reclaimed.
4563          */
4564         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4565                 jblocks->jb_needseg = 1;
4566 }
4567
4568 /*
4569  * Release one reference to a jseg and free it if the count reaches 0.  This
4570  * should eventually reclaim journal space as well.
4571  */
4572 static void
4573 rele_jseg(jseg)
4574         struct jseg *jseg;
4575 {
4576
4577         KASSERT(jseg->js_refs > 0,
4578             ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4579         if (--jseg->js_refs != 0)
4580                 return;
4581         free_jsegs(jseg->js_jblocks);
4582 }
4583
4584 /*
4585  * Release a jsegdep and decrement the jseg count.
4586  */
4587 static void
4588 free_jsegdep(jsegdep)
4589         struct jsegdep *jsegdep;
4590 {
4591
4592         if (jsegdep->jd_seg)
4593                 rele_jseg(jsegdep->jd_seg);
4594         WORKITEM_FREE(jsegdep, D_JSEGDEP);
4595 }
4596
4597 /*
4598  * Wait for a journal item to make it to disk.  Initiate journal processing
4599  * if required.
4600  */
4601 static int
4602 jwait(wk, waitfor)
4603         struct worklist *wk;
4604         int waitfor;
4605 {
4606
4607         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4608         /*
4609          * Blocking journal waits cause slow synchronous behavior.  Record
4610          * stats on the frequency of these blocking operations.
4611          */
4612         if (waitfor == MNT_WAIT) {
4613                 stat_journal_wait++;
4614                 switch (wk->wk_type) {
4615                 case D_JREMREF:
4616                 case D_JMVREF:
4617                         stat_jwait_filepage++;
4618                         break;
4619                 case D_JTRUNC:
4620                 case D_JFREEBLK:
4621                         stat_jwait_freeblks++;
4622                         break;
4623                 case D_JNEWBLK:
4624                         stat_jwait_newblk++;
4625                         break;
4626                 case D_JADDREF:
4627                         stat_jwait_inode++;
4628                         break;
4629                 default:
4630                         break;
4631                 }
4632         }
4633         /*
4634          * If IO has not started we process the journal.  We can't mark the
4635          * worklist item as IOWAITING because we drop the lock while
4636          * processing the journal and the worklist entry may be freed after
4637          * this point.  The caller may call back in and re-issue the request.
4638          */
4639         if ((wk->wk_state & INPROGRESS) == 0) {
4640                 softdep_process_journal(wk->wk_mp, wk, waitfor);
4641                 if (waitfor != MNT_WAIT)
4642                         return (EBUSY);
4643                 return (0);
4644         }
4645         if (waitfor != MNT_WAIT)
4646                 return (EBUSY);
4647         wait_worklist(wk, "jwait");
4648         return (0);
4649 }
4650
4651 /*
4652  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4653  * appropriate.  This is a convenience function to reduce duplicate code
4654  * for the setup and revert functions below.
4655  */
4656 static struct inodedep *
4657 inodedep_lookup_ip(ip)
4658         struct inode *ip;
4659 {
4660         struct inodedep *inodedep;
4661
4662         KASSERT(ip->i_nlink >= ip->i_effnlink,
4663             ("inodedep_lookup_ip: bad delta"));
4664         (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4665             &inodedep);
4666         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4667         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4668
4669         return (inodedep);
4670 }
4671
4672 /*
4673  * Called prior to creating a new inode and linking it to a directory.  The
4674  * jaddref structure must already be allocated by softdep_setup_inomapdep
4675  * and it is discovered here so we can initialize the mode and update
4676  * nlinkdelta.
4677  */
4678 void
4679 softdep_setup_create(dp, ip)
4680         struct inode *dp;
4681         struct inode *ip;
4682 {
4683         struct inodedep *inodedep;
4684         struct jaddref *jaddref;
4685         struct vnode *dvp;
4686
4687         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4688             ("softdep_setup_create called on non-softdep filesystem"));
4689         KASSERT(ip->i_nlink == 1,
4690             ("softdep_setup_create: Invalid link count."));
4691         dvp = ITOV(dp);
4692         ACQUIRE_LOCK(ITOUMP(dp));
4693         inodedep = inodedep_lookup_ip(ip);
4694         if (DOINGSUJ(dvp)) {
4695                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4696                     inoreflst);
4697                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4698                     ("softdep_setup_create: No addref structure present."));
4699         }
4700         softdep_prelink(dvp, NULL);
4701         FREE_LOCK(ITOUMP(dp));
4702 }
4703
4704 /*
4705  * Create a jaddref structure to track the addition of a DOTDOT link when
4706  * we are reparenting an inode as part of a rename.  This jaddref will be
4707  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4708  * non-journaling softdep.
4709  */
4710 void
4711 softdep_setup_dotdot_link(dp, ip)
4712         struct inode *dp;
4713         struct inode *ip;
4714 {
4715         struct inodedep *inodedep;
4716         struct jaddref *jaddref;
4717         struct vnode *dvp;
4718
4719         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4720             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4721         dvp = ITOV(dp);
4722         jaddref = NULL;
4723         /*
4724          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4725          * is used as a normal link would be.
4726          */
4727         if (DOINGSUJ(dvp))
4728                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4729                     dp->i_effnlink - 1, dp->i_mode);
4730         ACQUIRE_LOCK(ITOUMP(dp));
4731         inodedep = inodedep_lookup_ip(dp);
4732         if (jaddref)
4733                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4734                     if_deps);
4735         softdep_prelink(dvp, ITOV(ip));
4736         FREE_LOCK(ITOUMP(dp));
4737 }
4738
4739 /*
4740  * Create a jaddref structure to track a new link to an inode.  The directory
4741  * offset is not known until softdep_setup_directory_add or
4742  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4743  * softdep.
4744  */
4745 void
4746 softdep_setup_link(dp, ip)
4747         struct inode *dp;
4748         struct inode *ip;
4749 {
4750         struct inodedep *inodedep;
4751         struct jaddref *jaddref;
4752         struct vnode *dvp;
4753
4754         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4755             ("softdep_setup_link called on non-softdep filesystem"));
4756         dvp = ITOV(dp);
4757         jaddref = NULL;
4758         if (DOINGSUJ(dvp))
4759                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4760                     ip->i_mode);
4761         ACQUIRE_LOCK(ITOUMP(dp));
4762         inodedep = inodedep_lookup_ip(ip);
4763         if (jaddref)
4764                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4765                     if_deps);
4766         softdep_prelink(dvp, ITOV(ip));
4767         FREE_LOCK(ITOUMP(dp));
4768 }
4769
4770 /*
4771  * Called to create the jaddref structures to track . and .. references as
4772  * well as lookup and further initialize the incomplete jaddref created
4773  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4774  * nlinkdelta for non-journaling softdep.
4775  */
4776 void
4777 softdep_setup_mkdir(dp, ip)
4778         struct inode *dp;
4779         struct inode *ip;
4780 {
4781         struct inodedep *inodedep;
4782         struct jaddref *dotdotaddref;
4783         struct jaddref *dotaddref;
4784         struct jaddref *jaddref;
4785         struct vnode *dvp;
4786
4787         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4788             ("softdep_setup_mkdir called on non-softdep filesystem"));
4789         dvp = ITOV(dp);
4790         dotaddref = dotdotaddref = NULL;
4791         if (DOINGSUJ(dvp)) {
4792                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4793                     ip->i_mode);
4794                 dotaddref->ja_state |= MKDIR_BODY;
4795                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4796                     dp->i_effnlink - 1, dp->i_mode);
4797                 dotdotaddref->ja_state |= MKDIR_PARENT;
4798         }
4799         ACQUIRE_LOCK(ITOUMP(dp));
4800         inodedep = inodedep_lookup_ip(ip);
4801         if (DOINGSUJ(dvp)) {
4802                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4803                     inoreflst);
4804                 KASSERT(jaddref != NULL,
4805                     ("softdep_setup_mkdir: No addref structure present."));
4806                 KASSERT(jaddref->ja_parent == dp->i_number,
4807                     ("softdep_setup_mkdir: bad parent %ju",
4808                     (uintmax_t)jaddref->ja_parent));
4809                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4810                     if_deps);
4811         }
4812         inodedep = inodedep_lookup_ip(dp);
4813         if (DOINGSUJ(dvp))
4814                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4815                     &dotdotaddref->ja_ref, if_deps);
4816         softdep_prelink(ITOV(dp), NULL);
4817         FREE_LOCK(ITOUMP(dp));
4818 }
4819
/*
 * Called prior to unlinking a directory.  The nlinkdelta of the directory
 * being removed and of its parent are refreshed under the per-filesystem
 * lock.
 */
void
softdep_setup_rmdir(struct inode *dp, struct inode *ip)
{
        struct ufsmount *ump;
        struct vnode *dvp;

        KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
            ("softdep_setup_rmdir called on non-softdep filesystem"));
        ump = ITOUMP(dp);
        dvp = ITOV(dp);
        ACQUIRE_LOCK(ump);
        /* Only the nlinkdelta side effect of the lookups is wanted. */
        (void)inodedep_lookup_ip(ip);
        (void)inodedep_lookup_ip(dp);
        softdep_prelink(dvp, ITOV(ip));
        FREE_LOCK(ump);
}
4840
/*
 * Called prior to an unlink.  The nlinkdelta of the inode being unlinked
 * and of its parent directory are refreshed under the per-filesystem
 * lock.
 */
void
softdep_setup_unlink(struct inode *dp, struct inode *ip)
{
        struct ufsmount *ump;
        struct vnode *dvp;

        KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
            ("softdep_setup_unlink called on non-softdep filesystem"));
        ump = ITOUMP(dp);
        dvp = ITOV(dp);
        ACQUIRE_LOCK(ump);
        /* Only the nlinkdelta side effect of the lookups is wanted. */
        (void)inodedep_lookup_ip(ip);
        (void)inodedep_lookup_ip(dp);
        softdep_prelink(dvp, ITOV(ip));
        FREE_LOCK(ump);
}
4861
4862 /*
4863  * Called to release the journal structures created by a failed non-directory
4864  * creation.  Adjusts nlinkdelta for non-journaling softdep.
4865  */
4866 void
4867 softdep_revert_create(dp, ip)
4868         struct inode *dp;
4869         struct inode *ip;
4870 {
4871         struct inodedep *inodedep;
4872         struct jaddref *jaddref;
4873         struct vnode *dvp;
4874
4875         KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
4876             ("softdep_revert_create called on non-softdep filesystem"));
4877         dvp = ITOV(dp);
4878         ACQUIRE_LOCK(ITOUMP(dp));
4879         inodedep = inodedep_lookup_ip(ip);
4880         if (DOINGSUJ(dvp)) {
4881                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4882                     inoreflst);
4883                 KASSERT(jaddref->ja_parent == dp->i_number,
4884                     ("softdep_revert_create: addref parent mismatch"));
4885                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4886         }
4887         FREE_LOCK(ITOUMP(dp));
4888 }
4889
4890 /*
4891  * Called to release the journal structures created by a failed link
4892  * addition.  Adjusts nlinkdelta for non-journaling softdep.
4893  */
4894 void
4895 softdep_revert_link(dp, ip)
4896         struct inode *dp;
4897         struct inode *ip;
4898 {
4899         struct inodedep *inodedep;
4900         struct jaddref *jaddref;
4901         struct vnode *dvp;
4902
4903         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4904             ("softdep_revert_link called on non-softdep filesystem"));
4905         dvp = ITOV(dp);
4906         ACQUIRE_LOCK(ITOUMP(dp));
4907         inodedep = inodedep_lookup_ip(ip);
4908         if (DOINGSUJ(dvp)) {
4909                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4910                     inoreflst);
4911                 KASSERT(jaddref->ja_parent == dp->i_number,
4912                     ("softdep_revert_link: addref parent mismatch"));
4913                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4914         }
4915         FREE_LOCK(ITOUMP(dp));
4916 }
4917
4918 /*
4919  * Called to release the journal structures created by a failed mkdir
4920  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4921  */
4922 void
4923 softdep_revert_mkdir(dp, ip)
4924         struct inode *dp;
4925         struct inode *ip;
4926 {
4927         struct inodedep *inodedep;
4928         struct jaddref *jaddref;
4929         struct jaddref *dotaddref;
4930         struct vnode *dvp;
4931
4932         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4933             ("softdep_revert_mkdir called on non-softdep filesystem"));
4934         dvp = ITOV(dp);
4935
4936         ACQUIRE_LOCK(ITOUMP(dp));
4937         inodedep = inodedep_lookup_ip(dp);
4938         if (DOINGSUJ(dvp)) {
4939                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4940                     inoreflst);
4941                 KASSERT(jaddref->ja_parent == ip->i_number,
4942                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4943                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4944         }
4945         inodedep = inodedep_lookup_ip(ip);
4946         if (DOINGSUJ(dvp)) {
4947                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4948                     inoreflst);
4949                 KASSERT(jaddref->ja_parent == dp->i_number,
4950                     ("softdep_revert_mkdir: addref parent mismatch"));
4951                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4952                     inoreflst, if_deps);
4953                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4954                 KASSERT(dotaddref->ja_parent == ip->i_number,
4955                     ("softdep_revert_mkdir: dot addref parent mismatch"));
4956                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4957         }
4958         FREE_LOCK(ITOUMP(dp));
4959 }
4960
/*
 * Called to correct nlinkdelta after a failed rmdir.  Nothing journal
 * related is touched here; the routine only looks up the inodedeps of
 * the removed directory (ip) and then of its parent (dp) while holding
 * the softdep lock, discarding both results.
 */
void
softdep_revert_rmdir(dp, ip)
	struct inode *dp;
	struct inode *ip;
{
	struct ufsmount *ump;

	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
	    ("softdep_revert_rmdir called on non-softdep filesystem"));
	ump = ITOUMP(dp);

	ACQUIRE_LOCK(ump);
	/* Only the side effects of the lookups are wanted. */
	(void)inodedep_lookup_ip(ip);
	(void)inodedep_lookup_ip(dp);
	FREE_LOCK(ump);
}
4977
4978 /*
4979  * Protecting the freemaps (or bitmaps).
4980  *
4981  * To eliminate the need to execute fsck before mounting a filesystem
4982  * after a power failure, one must (conservatively) guarantee that the
4983  * on-disk copy of the bitmaps never indicates that a live inode or block is
4984  * free.  So, when a block or inode is allocated, the bitmap should be
4985  * updated (on disk) before any new pointers.  When a block or inode is
4986  * freed, the bitmap should not be updated until all pointers have been
4987  * reset.  The latter dependency is handled by the delayed de-allocation
4988  * approach described below for block and inode de-allocation.  The former
4989  * dependency is handled by calling the following procedure when a block or
4990  * inode is allocated. When an inode is allocated an "inodedep" is created
4991  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4992  * Each "inodedep" is also inserted into the hash indexing structure so
4993  * that any additional link additions can be made dependent on the inode
4994  * allocation.
4995  *
4996  * The ufs filesystem maintains a number of free block counts (e.g., per
4997  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4998  * in addition to the bitmaps.  These counts are used to improve efficiency
4999  * during allocation and therefore must be consistent with the bitmaps.
5000  * There is no convenient way to guarantee post-crash consistency of these
5001  * counts with simple update ordering, for two main reasons: (1) The counts
5002  * and bitmaps for a single cylinder group block are not in the same disk
5003  * sector.  If a disk write is interrupted (e.g., by power failure), one may
5004  * be written and the other not.  (2) Some of the counts are located in the
5005  * superblock rather than the cylinder group block. So, we focus our soft
5006  * updates implementation on protecting the bitmaps. When mounting a
5007  * filesystem, we recompute the auxiliary counts from the bitmaps.
5008  */
5009
5010 /*
5011  * Called just after updating the cylinder group block to allocate an inode.
5012  */
5013 void
5014 softdep_setup_inomapdep(bp, ip, newinum, mode)
5015         struct buf *bp;         /* buffer for cylgroup block with inode map */
5016         struct inode *ip;       /* inode related to allocation */
5017         ino_t newinum;          /* new inode number being allocated */
5018         int mode;
5019 {
5020         struct inodedep *inodedep;
5021         struct bmsafemap *bmsafemap;
5022         struct jaddref *jaddref;
5023         struct mount *mp;
5024         struct fs *fs;
5025
5026         mp = ITOVFS(ip);
5027         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5028             ("softdep_setup_inomapdep called on non-softdep filesystem"));
5029         fs = VFSTOUFS(mp)->um_fs;
5030         jaddref = NULL;
5031
5032         /*
5033          * Allocate the journal reference add structure so that the bitmap
5034          * can be dependent on it.
5035          */
5036         if (MOUNTEDSUJ(mp)) {
5037                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
5038                 jaddref->ja_state |= NEWBLOCK;
5039         }
5040
5041         /*
5042          * Create a dependency for the newly allocated inode.
5043          * Panic if it already exists as something is seriously wrong.
5044          * Otherwise add it to the dependency list for the buffer holding
5045          * the cylinder group map from which it was allocated.
5046          *
5047          * We have to preallocate a bmsafemap entry in case it is needed
5048          * in bmsafemap_lookup since once we allocate the inodedep, we
5049          * have to finish initializing it before we can FREE_LOCK().
5050          * By preallocating, we avoid FREE_LOCK() while doing a malloc
5051          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5052          * creating the inodedep as it can be freed during the time
5053          * that we FREE_LOCK() while allocating the inodedep. We must
5054          * call workitem_alloc() before entering the locked section as
5055          * it also acquires the lock and we must avoid trying doing so
5056          * recursively.
5057          */
5058         bmsafemap = malloc(sizeof(struct bmsafemap),
5059             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5060         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5061         ACQUIRE_LOCK(ITOUMP(ip));
5062         if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5063                 panic("softdep_setup_inomapdep: dependency %p for new"
5064                     "inode already exists", inodedep);
5065         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5066         if (jaddref) {
5067                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5068                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5069                     if_deps);
5070         } else {
5071                 inodedep->id_state |= ONDEPLIST;
5072                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5073         }
5074         inodedep->id_bmsafemap = bmsafemap;
5075         inodedep->id_state &= ~DEPCOMPLETE;
5076         FREE_LOCK(ITOUMP(ip));
5077 }
5078
5079 /*
5080  * Called just after updating the cylinder group block to
5081  * allocate block or fragment.
5082  */
5083 void
5084 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5085         struct buf *bp;         /* buffer for cylgroup block with block map */
5086         struct mount *mp;       /* filesystem doing allocation */
5087         ufs2_daddr_t newblkno;  /* number of newly allocated block */
5088         int frags;              /* Number of fragments. */
5089         int oldfrags;           /* Previous number of fragments for extend. */
5090 {
5091         struct newblk *newblk;
5092         struct bmsafemap *bmsafemap;
5093         struct jnewblk *jnewblk;
5094         struct ufsmount *ump;
5095         struct fs *fs;
5096
5097         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5098             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5099         ump = VFSTOUFS(mp);
5100         fs = ump->um_fs;
5101         jnewblk = NULL;
5102         /*
5103          * Create a dependency for the newly allocated block.
5104          * Add it to the dependency list for the buffer holding
5105          * the cylinder group map from which it was allocated.
5106          */
5107         if (MOUNTEDSUJ(mp)) {
5108                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5109                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5110                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5111                 jnewblk->jn_state = ATTACHED;
5112                 jnewblk->jn_blkno = newblkno;
5113                 jnewblk->jn_frags = frags;
5114                 jnewblk->jn_oldfrags = oldfrags;
5115 #ifdef INVARIANTS
5116                 {
5117                         struct cg *cgp;
5118                         uint8_t *blksfree;
5119                         long bno;
5120                         int i;
5121
5122                         cgp = (struct cg *)bp->b_data;
5123                         blksfree = cg_blksfree(cgp);
5124                         bno = dtogd(fs, jnewblk->jn_blkno);
5125                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5126                             i++) {
5127                                 if (isset(blksfree, bno + i))
5128                                         panic("softdep_setup_blkmapdep: "
5129                                             "free fragment %d from %d-%d "
5130                                             "state 0x%X dep %p", i,
5131                                             jnewblk->jn_oldfrags,
5132                                             jnewblk->jn_frags,
5133                                             jnewblk->jn_state,
5134                                             jnewblk->jn_dep);
5135                         }
5136                 }
5137 #endif
5138         }
5139
5140         CTR3(KTR_SUJ,
5141             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5142             newblkno, frags, oldfrags);
5143         ACQUIRE_LOCK(ump);
5144         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5145                 panic("softdep_setup_blkmapdep: found block");
5146         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5147             dtog(fs, newblkno), NULL);
5148         if (jnewblk) {
5149                 jnewblk->jn_dep = (struct worklist *)newblk;
5150                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5151         } else {
5152                 newblk->nb_state |= ONDEPLIST;
5153                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5154         }
5155         newblk->nb_bmsafemap = bmsafemap;
5156         newblk->nb_jnewblk = jnewblk;
5157         FREE_LOCK(ump);
5158 }
5159
/*
 * Yield a pointer to the bmsafemap hash chain for cylinder group cg.
 * The cg number is ANDed with bmsafemap_hash_size, i.e. that field is
 * used as a bit mask when picking the slot in bmsafemap_hashtbl.
 */
#define	BMSAFEMAP_HASH(ump, cg)						\
	((ump)->bmsafemap_hashtbl + ((cg) & (ump)->bmsafemap_hash_size))
5162
5163 static int
5164 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5165         struct bmsafemap_hashhead *bmsafemaphd;
5166         int cg;
5167         struct bmsafemap **bmsafemapp;
5168 {
5169         struct bmsafemap *bmsafemap;
5170
5171         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5172                 if (bmsafemap->sm_cg == cg)
5173                         break;
5174         if (bmsafemap) {
5175                 *bmsafemapp = bmsafemap;
5176                 return (1);
5177         }
5178         *bmsafemapp = NULL;
5179
5180         return (0);
5181 }
5182
5183 /*
5184  * Find the bmsafemap associated with a cylinder group buffer.
5185  * If none exists, create one. The buffer must be locked when
5186  * this routine is called and this routine must be called with
5187  * the softdep lock held. To avoid giving up the lock while
5188  * allocating a new bmsafemap, a preallocated bmsafemap may be
5189  * provided. If it is provided but not needed, it is freed.
5190  */
5191 static struct bmsafemap *
5192 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5193         struct mount *mp;
5194         struct buf *bp;
5195         int cg;
5196         struct bmsafemap *newbmsafemap;
5197 {
5198         struct bmsafemap_hashhead *bmsafemaphd;
5199         struct bmsafemap *bmsafemap, *collision;
5200         struct worklist *wk;
5201         struct ufsmount *ump;
5202
5203         ump = VFSTOUFS(mp);
5204         LOCK_OWNED(ump);
5205         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5206         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5207                 if (wk->wk_type == D_BMSAFEMAP) {
5208                         if (newbmsafemap)
5209                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5210                         return (WK_BMSAFEMAP(wk));
5211                 }
5212         }
5213         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5214         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5215                 if (newbmsafemap)
5216                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5217                 return (bmsafemap);
5218         }
5219         if (newbmsafemap) {
5220                 bmsafemap = newbmsafemap;
5221         } else {
5222                 FREE_LOCK(ump);
5223                 bmsafemap = malloc(sizeof(struct bmsafemap),
5224                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5225                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5226                 ACQUIRE_LOCK(ump);
5227         }
5228         bmsafemap->sm_buf = bp;
5229         LIST_INIT(&bmsafemap->sm_inodedephd);
5230         LIST_INIT(&bmsafemap->sm_inodedepwr);
5231         LIST_INIT(&bmsafemap->sm_newblkhd);
5232         LIST_INIT(&bmsafemap->sm_newblkwr);
5233         LIST_INIT(&bmsafemap->sm_jaddrefhd);
5234         LIST_INIT(&bmsafemap->sm_jnewblkhd);
5235         LIST_INIT(&bmsafemap->sm_freehd);
5236         LIST_INIT(&bmsafemap->sm_freewr);
5237         if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5238                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5239                 return (collision);
5240         }
5241         bmsafemap->sm_cg = cg;
5242         LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5243         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5244         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5245         return (bmsafemap);
5246 }
5247
5248 /*
5249  * Direct block allocation dependencies.
5250  *
5251  * When a new block is allocated, the corresponding disk locations must be
5252  * initialized (with zeros or new data) before the on-disk inode points to
5253  * them.  Also, the freemap from which the block was allocated must be
5254  * updated (on disk) before the inode's pointer. These two dependencies are
5255  * independent of each other and are needed for all file blocks and indirect
5256  * blocks that are pointed to directly by the inode.  Just before the
5257  * "in-core" version of the inode is updated with a newly allocated block
5258  * number, a procedure (below) is called to setup allocation dependency
5259  * structures.  These structures are removed when the corresponding
5260  * dependencies are satisfied or when the block allocation becomes obsolete
5261  * (i.e., the file is deleted, the block is de-allocated, or the block is a
5262  * fragment that gets upgraded).  All of these cases are handled in
5263  * procedures described later.
5264  *
5265  * When a file extension causes a fragment to be upgraded, either to a larger
5266  * fragment or to a full block, the on-disk location may change (if the
5267  * previous fragment could not simply be extended). In this case, the old
5268  * fragment must be de-allocated, but not until after the inode's pointer has
5269  * been updated. In most cases, this is handled by later procedures, which
5270  * will construct a "freefrag" structure to be added to the workitem queue
5271  * when the inode update is complete (or obsolete).  The main exception to
5272  * this is when an allocation occurs while a pending allocation dependency
5273  * (for the same block pointer) remains.  This case is handled in the main
5274  * allocation dependency setup procedure by immediately freeing the
5275  * unreferenced fragments.
5276  */
5277 void
5278 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5279         struct inode *ip;       /* inode to which block is being added */
5280         ufs_lbn_t off;          /* block pointer within inode */
5281         ufs2_daddr_t newblkno;  /* disk block number being added */
5282         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
5283         long newsize;           /* size of new block */
5284         long oldsize;           /* size of new block */
5285         struct buf *bp;         /* bp for allocated block */
5286 {
5287         struct allocdirect *adp, *oldadp;
5288         struct allocdirectlst *adphead;
5289         struct freefrag *freefrag;
5290         struct inodedep *inodedep;
5291         struct pagedep *pagedep;
5292         struct jnewblk *jnewblk;
5293         struct newblk *newblk;
5294         struct mount *mp;
5295         ufs_lbn_t lbn;
5296
5297         lbn = bp->b_lblkno;
5298         mp = ITOVFS(ip);
5299         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5300             ("softdep_setup_allocdirect called on non-softdep filesystem"));
5301         if (oldblkno && oldblkno != newblkno)
5302                 /*
5303                  * The usual case is that a smaller fragment that
5304                  * was just allocated has been replaced with a bigger
5305                  * fragment or a full-size block. If it is marked as
5306                  * B_DELWRI, the current contents have not been written
5307                  * to disk. It is possible that the block was written
5308                  * earlier, but very uncommon. If the block has never
5309                  * been written, there is no need to send a BIO_DELETE
5310                  * for it when it is freed. The gain from avoiding the
5311                  * TRIMs for the common case of unwritten blocks far
5312                  * exceeds the cost of the write amplification for the
5313                  * uncommon case of failing to send a TRIM for a block
5314                  * that had been written.
5315                  */
5316                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5317                     (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5318         else
5319                 freefrag = NULL;
5320
5321         CTR6(KTR_SUJ,
5322             "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5323             "off %jd newsize %ld oldsize %d",
5324             ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5325         ACQUIRE_LOCK(ITOUMP(ip));
5326         if (off >= UFS_NDADDR) {
5327                 if (lbn > 0)
5328                         panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5329                             lbn, off);
5330                 /* allocating an indirect block */
5331                 if (oldblkno != 0)
5332                         panic("softdep_setup_allocdirect: non-zero indir");
5333         } else {
5334                 if (off != lbn)
5335                         panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5336                             lbn, off);
5337                 /*
5338                  * Allocating a direct block.
5339                  *
5340                  * If we are allocating a directory block, then we must
5341                  * allocate an associated pagedep to track additions and
5342                  * deletions.
5343                  */
5344                 if ((ip->i_mode & IFMT) == IFDIR)
5345                         pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5346                             &pagedep);
5347         }
5348         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5349                 panic("softdep_setup_allocdirect: lost block");
5350         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5351             ("softdep_setup_allocdirect: newblk already initialized"));
5352         /*
5353          * Convert the newblk to an allocdirect.
5354          */
5355         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5356         adp = (struct allocdirect *)newblk;
5357         newblk->nb_freefrag = freefrag;
5358         adp->ad_offset = off;
5359         adp->ad_oldblkno = oldblkno;
5360         adp->ad_newsize = newsize;
5361         adp->ad_oldsize = oldsize;
5362
5363         /*
5364          * Finish initializing the journal.
5365          */
5366         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5367                 jnewblk->jn_ino = ip->i_number;
5368                 jnewblk->jn_lbn = lbn;
5369                 add_to_journal(&jnewblk->jn_list);
5370         }
5371         if (freefrag && freefrag->ff_jdep != NULL &&
5372             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5373                 add_to_journal(freefrag->ff_jdep);
5374         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5375         adp->ad_inodedep = inodedep;
5376
5377         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5378         /*
5379          * The list of allocdirects must be kept in sorted and ascending
5380          * order so that the rollback routines can quickly determine the
5381          * first uncommitted block (the size of the file stored on disk
5382          * ends at the end of the lowest committed fragment, or if there
5383          * are no fragments, at the end of the highest committed block).
5384          * Since files generally grow, the typical case is that the new
5385          * block is to be added at the end of the list. We speed this
5386          * special case by checking against the last allocdirect in the
5387          * list before laboriously traversing the list looking for the
5388          * insertion point.
5389          */
5390         adphead = &inodedep->id_newinoupdt;
5391         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5392         if (oldadp == NULL || oldadp->ad_offset <= off) {
5393                 /* insert at end of list */
5394                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5395                 if (oldadp != NULL && oldadp->ad_offset == off)
5396                         allocdirect_merge(adphead, adp, oldadp);
5397                 FREE_LOCK(ITOUMP(ip));
5398                 return;
5399         }
5400         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5401                 if (oldadp->ad_offset >= off)
5402                         break;
5403         }
5404         if (oldadp == NULL)
5405                 panic("softdep_setup_allocdirect: lost entry");
5406         /* insert in middle of list */
5407         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5408         if (oldadp->ad_offset == off)
5409                 allocdirect_merge(adphead, adp, oldadp);
5410
5411         FREE_LOCK(ITOUMP(ip));
5412 }
5413
5414 /*
5415  * Merge a newer and older journal record to be stored either in a
5416  * newblock or freefrag.  This handles aggregating journal records for
5417  * fragment allocation into a second record as well as replacing a
5418  * journal free with an aborted journal allocation.  A segment for the
5419  * oldest record will be placed on wkhd if it has been written.  If not
5420  * the segment for the newer record will suffice.
5421  */
5422 static struct worklist *
5423 jnewblk_merge(new, old, wkhd)
5424         struct worklist *new;
5425         struct worklist *old;
5426         struct workhead *wkhd;
5427 {
5428         struct jnewblk *njnewblk;
5429         struct jnewblk *jnewblk;
5430
5431         /* Handle NULLs to simplify callers. */
5432         if (new == NULL)
5433                 return (old);
5434         if (old == NULL)
5435                 return (new);
5436         /* Replace a jfreefrag with a jnewblk. */
5437         if (new->wk_type == D_JFREEFRAG) {
5438                 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5439                         panic("jnewblk_merge: blkno mismatch: %p, %p",
5440                             old, new);
5441                 cancel_jfreefrag(WK_JFREEFRAG(new));
5442                 return (old);
5443         }
5444         if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5445                 panic("jnewblk_merge: Bad type: old %d new %d\n",
5446                     old->wk_type, new->wk_type);
5447         /*
5448          * Handle merging of two jnewblk records that describe
5449          * different sets of fragments in the same block.
5450          */
5451         jnewblk = WK_JNEWBLK(old);
5452         njnewblk = WK_JNEWBLK(new);
5453         if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5454                 panic("jnewblk_merge: Merging disparate blocks.");
5455         /*
5456          * The record may be rolled back in the cg.
5457          */
5458         if (jnewblk->jn_state & UNDONE) {
5459                 jnewblk->jn_state &= ~UNDONE;
5460                 njnewblk->jn_state |= UNDONE;
5461                 njnewblk->jn_state &= ~ATTACHED;
5462         }
5463         /*
5464          * We modify the newer addref and free the older so that if neither
5465          * has been written the most up-to-date copy will be on disk.  If
5466          * both have been written but rolled back we only temporarily need
5467          * one of them to fix the bits when the cg write completes.
5468          */
5469         jnewblk->jn_state |= ATTACHED | COMPLETE;
5470         njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5471         cancel_jnewblk(jnewblk, wkhd);
5472         WORKLIST_REMOVE(&jnewblk->jn_list);
5473         free_jnewblk(jnewblk);
5474         return (new);
5475 }
5476
5477 /*
5478  * Replace an old allocdirect dependency with a newer one.
5479  */
5480 static void
5481 allocdirect_merge(adphead, newadp, oldadp)
5482         struct allocdirectlst *adphead; /* head of list holding allocdirects */
5483         struct allocdirect *newadp;     /* allocdirect being added */
5484         struct allocdirect *oldadp;     /* existing allocdirect being checked */
5485 {
5486         struct worklist *wk;
5487         struct freefrag *freefrag;
5488
5489         freefrag = NULL;
5490         LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5491         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5492             newadp->ad_oldsize != oldadp->ad_newsize ||
5493             newadp->ad_offset >= UFS_NDADDR)
5494                 panic("%s %jd != new %jd || old size %ld != new %ld",
5495                     "allocdirect_merge: old blkno",
5496                     (intmax_t)newadp->ad_oldblkno,
5497                     (intmax_t)oldadp->ad_newblkno,
5498                     newadp->ad_oldsize, oldadp->ad_newsize);
5499         newadp->ad_oldblkno = oldadp->ad_oldblkno;
5500         newadp->ad_oldsize = oldadp->ad_oldsize;
5501         /*
5502          * If the old dependency had a fragment to free or had never
5503          * previously had a block allocated, then the new dependency
5504          * can immediately post its freefrag and adopt the old freefrag.
5505          * This action is done by swapping the freefrag dependencies.
5506          * The new dependency gains the old one's freefrag, and the
5507          * old one gets the new one and then immediately puts it on
5508          * the worklist when it is freed by free_newblk. It is
5509          * not possible to do this swap when the old dependency had a
5510          * non-zero size but no previous fragment to free. This condition
5511          * arises when the new block is an extension of the old block.
5512          * Here, the first part of the fragment allocated to the new
5513          * dependency is part of the block currently claimed on disk by
5514          * the old dependency, so cannot legitimately be freed until the
5515          * conditions for the new dependency are fulfilled.
5516          */
5517         freefrag = newadp->ad_freefrag;
5518         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5519                 newadp->ad_freefrag = oldadp->ad_freefrag;
5520                 oldadp->ad_freefrag = freefrag;
5521         }
5522         /*
5523          * If we are tracking a new directory-block allocation,
5524          * move it from the old allocdirect to the new allocdirect.
5525          */
5526         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5527                 WORKLIST_REMOVE(wk);
5528                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5529                         panic("allocdirect_merge: extra newdirblk");
5530                 WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5531         }
5532         TAILQ_REMOVE(adphead, oldadp, ad_next);
5533         /*
5534          * We need to move any journal dependencies over to the freefrag
5535          * that releases this block if it exists.  Otherwise we are
5536          * extending an existing block and we'll wait until that is
5537          * complete to release the journal space and extend the
5538          * new journal to cover this old space as well.
5539          */
5540         if (freefrag == NULL) {
5541                 if (oldadp->ad_newblkno != newadp->ad_newblkno)
5542                         panic("allocdirect_merge: %jd != %jd",
5543                             oldadp->ad_newblkno, newadp->ad_newblkno);
5544                 newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5545                     jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5546                     &oldadp->ad_block.nb_jnewblk->jn_list,
5547                     &newadp->ad_block.nb_jwork);
5548                 oldadp->ad_block.nb_jnewblk = NULL;
5549                 cancel_newblk(&oldadp->ad_block, NULL,
5550                     &newadp->ad_block.nb_jwork);
5551         } else {
5552                 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5553                     &freefrag->ff_list, &freefrag->ff_jwork);
5554                 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5555                     &freefrag->ff_jwork);
5556         }
5557         free_newblk(&oldadp->ad_block);
5558 }
5559
5560 /*
5561  * Allocate a jfreefrag structure to journal a single block free.
5562  */
5563 static struct jfreefrag *
5564 newjfreefrag(freefrag, ip, blkno, size, lbn)
5565         struct freefrag *freefrag;
5566         struct inode *ip;
5567         ufs2_daddr_t blkno;
5568         long size;
5569         ufs_lbn_t lbn;
5570 {
5571         struct jfreefrag *jfreefrag;
5572         struct fs *fs;
5573
5574         fs = ITOFS(ip);
5575         jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5576             M_SOFTDEP_FLAGS);
5577         workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5578         jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5579         jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5580         jfreefrag->fr_ino = ip->i_number;
5581         jfreefrag->fr_lbn = lbn;
5582         jfreefrag->fr_blkno = blkno;
5583         jfreefrag->fr_frags = numfrags(fs, size);
5584         jfreefrag->fr_freefrag = freefrag;
5585
5586         return (jfreefrag);
5587 }
5588
5589 /*
5590  * Allocate a new freefrag structure.
5591  */
5592 static struct freefrag *
5593 newfreefrag(ip, blkno, size, lbn, key)
5594         struct inode *ip;
5595         ufs2_daddr_t blkno;
5596         long size;
5597         ufs_lbn_t lbn;
5598         u_long key;
5599 {
5600         struct freefrag *freefrag;
5601         struct ufsmount *ump;
5602         struct fs *fs;
5603
5604         CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5605             ip->i_number, blkno, size, lbn);
5606         ump = ITOUMP(ip);
5607         fs = ump->um_fs;
5608         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5609                 panic("newfreefrag: frag size");
5610         freefrag = malloc(sizeof(struct freefrag),
5611             M_FREEFRAG, M_SOFTDEP_FLAGS);
5612         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5613         freefrag->ff_state = ATTACHED;
5614         LIST_INIT(&freefrag->ff_jwork);
5615         freefrag->ff_inum = ip->i_number;
5616         freefrag->ff_vtype = ITOV(ip)->v_type;
5617         freefrag->ff_blkno = blkno;
5618         freefrag->ff_fragsize = size;
5619         freefrag->ff_key = key;
5620
5621         if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5622                 freefrag->ff_jdep = (struct worklist *)
5623                     newjfreefrag(freefrag, ip, blkno, size, lbn);
5624         } else {
5625                 freefrag->ff_state |= DEPCOMPLETE;
5626                 freefrag->ff_jdep = NULL;
5627         }
5628
5629         return (freefrag);
5630 }
5631
5632 /*
5633  * This workitem de-allocates fragments that were replaced during
5634  * file block allocation.
5635  */
5636 static void
5637 handle_workitem_freefrag(freefrag)
5638         struct freefrag *freefrag;
5639 {
5640         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5641         struct workhead wkhd;
5642
5643         CTR3(KTR_SUJ,
5644             "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5645             freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5646         /*
5647          * It would be illegal to add new completion items to the
5648          * freefrag after it was schedule to be done so it must be
5649          * safe to modify the list head here.
5650          */
5651         LIST_INIT(&wkhd);
5652         ACQUIRE_LOCK(ump);
5653         LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5654         /*
5655          * If the journal has not been written we must cancel it here.
5656          */
5657         if (freefrag->ff_jdep) {
5658                 if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5659                         panic("handle_workitem_freefrag: Unexpected type %d\n",
5660                             freefrag->ff_jdep->wk_type);
5661                 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5662         }
5663         FREE_LOCK(ump);
5664         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5665            freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5666            &wkhd, freefrag->ff_key);
5667         ACQUIRE_LOCK(ump);
5668         WORKITEM_FREE(freefrag, D_FREEFRAG);
5669         FREE_LOCK(ump);
5670 }
5671
5672 /*
5673  * Set up a dependency structure for an external attributes data block.
5674  * This routine follows much of the structure of softdep_setup_allocdirect.
5675  * See the description of softdep_setup_allocdirect above for details.
5676  */
5677 void
5678 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5679         struct inode *ip;
5680         ufs_lbn_t off;
5681         ufs2_daddr_t newblkno;
5682         ufs2_daddr_t oldblkno;
5683         long newsize;
5684         long oldsize;
5685         struct buf *bp;
5686 {
5687         struct allocdirect *adp, *oldadp;
5688         struct allocdirectlst *adphead;
5689         struct freefrag *freefrag;
5690         struct inodedep *inodedep;
5691         struct jnewblk *jnewblk;
5692         struct newblk *newblk;
5693         struct mount *mp;
5694         struct ufsmount *ump;
5695         ufs_lbn_t lbn;
5696
5697         mp = ITOVFS(ip);
5698         ump = VFSTOUFS(mp);
5699         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5700             ("softdep_setup_allocext called on non-softdep filesystem"));
5701         KASSERT(off < UFS_NXADDR,
5702             ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
5703
5704         lbn = bp->b_lblkno;
5705         if (oldblkno && oldblkno != newblkno)
5706                 /*
5707                  * The usual case is that a smaller fragment that
5708                  * was just allocated has been replaced with a bigger
5709                  * fragment or a full-size block. If it is marked as
5710                  * B_DELWRI, the current contents have not been written
5711                  * to disk. It is possible that the block was written
5712                  * earlier, but very uncommon. If the block has never
5713                  * been written, there is no need to send a BIO_DELETE
5714                  * for it when it is freed. The gain from avoiding the
5715                  * TRIMs for the common case of unwritten blocks far
5716                  * exceeds the cost of the write amplification for the
5717                  * uncommon case of failing to send a TRIM for a block
5718                  * that had been written.
5719                  */
5720                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5721                     (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5722         else
5723                 freefrag = NULL;
5724
5725         ACQUIRE_LOCK(ump);
5726         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5727                 panic("softdep_setup_allocext: lost block");
5728         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5729             ("softdep_setup_allocext: newblk already initialized"));
5730         /*
5731          * Convert the newblk to an allocdirect.
5732          */
5733         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5734         adp = (struct allocdirect *)newblk;
5735         newblk->nb_freefrag = freefrag;
5736         adp->ad_offset = off;
5737         adp->ad_oldblkno = oldblkno;
5738         adp->ad_newsize = newsize;
5739         adp->ad_oldsize = oldsize;
5740         adp->ad_state |=  EXTDATA;
5741
5742         /*
5743          * Finish initializing the journal.
5744          */
5745         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5746                 jnewblk->jn_ino = ip->i_number;
5747                 jnewblk->jn_lbn = lbn;
5748                 add_to_journal(&jnewblk->jn_list);
5749         }
5750         if (freefrag && freefrag->ff_jdep != NULL &&
5751             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5752                 add_to_journal(freefrag->ff_jdep);
5753         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5754         adp->ad_inodedep = inodedep;
5755
5756         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5757         /*
5758          * The list of allocdirects must be kept in sorted and ascending
5759          * order so that the rollback routines can quickly determine the
5760          * first uncommitted block (the size of the file stored on disk
5761          * ends at the end of the lowest committed fragment, or if there
5762          * are no fragments, at the end of the highest committed block).
5763          * Since files generally grow, the typical case is that the new
5764          * block is to be added at the end of the list. We speed this
5765          * special case by checking against the last allocdirect in the
5766          * list before laboriously traversing the list looking for the
5767          * insertion point.
5768          */
5769         adphead = &inodedep->id_newextupdt;
5770         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5771         if (oldadp == NULL || oldadp->ad_offset <= off) {
5772                 /* insert at end of list */
5773                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5774                 if (oldadp != NULL && oldadp->ad_offset == off)
5775                         allocdirect_merge(adphead, adp, oldadp);
5776                 FREE_LOCK(ump);
5777                 return;
5778         }
5779         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5780                 if (oldadp->ad_offset >= off)
5781                         break;
5782         }
5783         if (oldadp == NULL)
5784                 panic("softdep_setup_allocext: lost entry");
5785         /* insert in middle of list */
5786         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5787         if (oldadp->ad_offset == off)
5788                 allocdirect_merge(adphead, adp, oldadp);
5789         FREE_LOCK(ump);
5790 }
5791
5792 /*
5793  * Indirect block allocation dependencies.
5794  *
5795  * The same dependencies that exist for a direct block also exist when
5796  * a new block is allocated and pointed to by an entry in a block of
5797  * indirect pointers. The undo/redo states described above are also
5798  * used here. Because an indirect block contains many pointers that
5799  * may have dependencies, a second copy of the entire in-memory indirect
5800  * block is kept. The buffer cache copy is always completely up-to-date.
5801  * The second copy, which is used only as a source for disk writes,
5802  * contains only the safe pointers (i.e., those that have no remaining
5803  * update dependencies). The second copy is freed when all pointers
5804  * are safe. The cache is not allowed to replace indirect blocks with
5805  * pending update dependencies. If a buffer containing an indirect
5806  * block with dependencies is written, these routines will mark it
5807  * dirty again. It can only be successfully written once all the
5808  * dependencies are removed. The ffs_fsync routine in conjunction with
5809  * softdep_sync_metadata work together to get all the dependencies
5810  * removed so that a file can be successfully written to disk. Three
5811  * procedures are used when setting up indirect block pointer
5812  * dependencies. The division is necessary because of the organization
5813  * of the "balloc" routine and because of the distinction between file
5814  * pages and file metadata blocks.
5815  */
5816
5817 /*
5818  * Allocate a new allocindir structure.
5819  */
5820 static struct allocindir *
5821 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5822         struct inode *ip;       /* inode for file being extended */
5823         int ptrno;              /* offset of pointer in indirect block */
5824         ufs2_daddr_t newblkno;  /* disk block number being added */
5825         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5826         ufs_lbn_t lbn;
5827 {
5828         struct newblk *newblk;
5829         struct allocindir *aip;
5830         struct freefrag *freefrag;
5831         struct jnewblk *jnewblk;
5832
5833         if (oldblkno)
5834                 freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
5835                     SINGLETON_KEY);
5836         else
5837                 freefrag = NULL;
5838         ACQUIRE_LOCK(ITOUMP(ip));
5839         if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
5840                 panic("new_allocindir: lost block");
5841         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5842             ("newallocindir: newblk already initialized"));
5843         WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5844         newblk->nb_freefrag = freefrag;
5845         aip = (struct allocindir *)newblk;
5846         aip->ai_offset = ptrno;
5847         aip->ai_oldblkno = oldblkno;
5848         aip->ai_lbn = lbn;
5849         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5850                 jnewblk->jn_ino = ip->i_number;
5851                 jnewblk->jn_lbn = lbn;
5852                 add_to_journal(&jnewblk->jn_list);
5853         }
5854         if (freefrag && freefrag->ff_jdep != NULL &&
5855             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5856                 add_to_journal(freefrag->ff_jdep);
5857         return (aip);
5858 }
5859
5860 /*
5861  * Called just before setting an indirect block pointer
5862  * to a newly allocated file page.
5863  */
5864 void
5865 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5866         struct inode *ip;       /* inode for file being extended */
5867         ufs_lbn_t lbn;          /* allocated block number within file */
5868         struct buf *bp;         /* buffer with indirect blk referencing page */
5869         int ptrno;              /* offset of pointer in indirect block */
5870         ufs2_daddr_t newblkno;  /* disk block number being added */
5871         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5872         struct buf *nbp;        /* buffer holding allocated page */
5873 {
5874         struct inodedep *inodedep;
5875         struct freefrag *freefrag;
5876         struct allocindir *aip;
5877         struct pagedep *pagedep;
5878         struct mount *mp;
5879         struct ufsmount *ump;
5880
5881         mp = ITOVFS(ip);
5882         ump = VFSTOUFS(mp);
5883         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5884             ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5885         KASSERT(lbn == nbp->b_lblkno,
5886             ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5887             lbn, bp->b_lblkno));
5888         CTR4(KTR_SUJ,
5889             "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5890             "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5891         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5892         aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5893         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5894         /*
5895          * If we are allocating a directory page, then we must
5896          * allocate an associated pagedep to track additions and
5897          * deletions.
5898          */
5899         if ((ip->i_mode & IFMT) == IFDIR)
5900                 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5901         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5902         freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5903         FREE_LOCK(ump);
5904         if (freefrag)
5905                 handle_workitem_freefrag(freefrag);
5906 }
5907
5908 /*
5909  * Called just before setting an indirect block pointer to a
5910  * newly allocated indirect block.
5911  */
5912 void
5913 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5914         struct buf *nbp;        /* newly allocated indirect block */
5915         struct inode *ip;       /* inode for file being extended */
5916         struct buf *bp;         /* indirect block referencing allocated block */
5917         int ptrno;              /* offset of pointer in indirect block */
5918         ufs2_daddr_t newblkno;  /* disk block number being added */
5919 {
5920         struct inodedep *inodedep;
5921         struct allocindir *aip;
5922         struct ufsmount *ump;
5923         ufs_lbn_t lbn;
5924
5925         ump = ITOUMP(ip);
5926         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
5927             ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5928         CTR3(KTR_SUJ,
5929             "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5930             ip->i_number, newblkno, ptrno);
5931         lbn = nbp->b_lblkno;
5932         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5933         aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5934         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
5935         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5936         if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5937                 panic("softdep_setup_allocindir_meta: Block already existed");
5938         FREE_LOCK(ump);
5939 }
5940
5941 static void
5942 indirdep_complete(indirdep)
5943         struct indirdep *indirdep;
5944 {
5945         struct allocindir *aip;
5946
5947         LIST_REMOVE(indirdep, ir_next);
5948         indirdep->ir_state |= DEPCOMPLETE;
5949
5950         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5951                 LIST_REMOVE(aip, ai_next);
5952                 free_newblk(&aip->ai_block);
5953         }
5954         /*
5955          * If this indirdep is not attached to a buf it was simply waiting
5956          * on completion to clear completehd.  free_indirdep() asserts
5957          * that nothing is dangling.
5958          */
5959         if ((indirdep->ir_state & ONWORKLIST) == 0)
5960                 free_indirdep(indirdep);
5961 }
5962
5963 static struct indirdep *
5964 indirdep_lookup(mp, ip, bp)
5965         struct mount *mp;
5966         struct inode *ip;
5967         struct buf *bp;
5968 {
5969         struct indirdep *indirdep, *newindirdep;
5970         struct newblk *newblk;
5971         struct ufsmount *ump;
5972         struct worklist *wk;
5973         struct fs *fs;
5974         ufs2_daddr_t blkno;
5975
5976         ump = VFSTOUFS(mp);
5977         LOCK_OWNED(ump);
5978         indirdep = NULL;
5979         newindirdep = NULL;
5980         fs = ump->um_fs;
5981         for (;;) {
5982                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5983                         if (wk->wk_type != D_INDIRDEP)
5984                                 continue;
5985                         indirdep = WK_INDIRDEP(wk);
5986                         break;
5987                 }
5988                 /* Found on the buffer worklist, no new structure to free. */
5989                 if (indirdep != NULL && newindirdep == NULL)
5990                         return (indirdep);
5991                 if (indirdep != NULL && newindirdep != NULL)
5992                         panic("indirdep_lookup: simultaneous create");
5993                 /* None found on the buffer and a new structure is ready. */
5994                 if (indirdep == NULL && newindirdep != NULL)
5995                         break;
5996                 /* None found and no new structure available. */
5997                 FREE_LOCK(ump);
5998                 newindirdep = malloc(sizeof(struct indirdep),
5999                     M_INDIRDEP, M_SOFTDEP_FLAGS);
6000                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
6001                 newindirdep->ir_state = ATTACHED;
6002                 if (I_IS_UFS1(ip))
6003                         newindirdep->ir_state |= UFS1FMT;
6004                 TAILQ_INIT(&newindirdep->ir_trunc);
6005                 newindirdep->ir_saveddata = NULL;
6006                 LIST_INIT(&newindirdep->ir_deplisthd);
6007                 LIST_INIT(&newindirdep->ir_donehd);
6008                 LIST_INIT(&newindirdep->ir_writehd);
6009                 LIST_INIT(&newindirdep->ir_completehd);
6010                 if (bp->b_blkno == bp->b_lblkno) {
6011                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
6012                             NULL, NULL);
6013                         bp->b_blkno = blkno;
6014                 }
6015                 newindirdep->ir_freeblks = NULL;
6016                 newindirdep->ir_savebp =
6017                     getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
6018                 newindirdep->ir_bp = bp;
6019                 BUF_KERNPROC(newindirdep->ir_savebp);
6020                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
6021                 ACQUIRE_LOCK(ump);
6022         }
6023         indirdep = newindirdep;
6024         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
6025         /*
6026          * If the block is not yet allocated we don't set DEPCOMPLETE so
6027          * that we don't free dependencies until the pointers are valid.
6028          * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
6029          * than using the hash.
6030          */
6031         if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
6032                 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
6033         else
6034                 indirdep->ir_state |= DEPCOMPLETE;
6035         return (indirdep);
6036 }
6037
6038 /*
6039  * Called to finish the allocation of the "aip" allocated
6040  * by one of the two routines above.
6041  */
6042 static struct freefrag *
6043 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
6044         struct buf *bp;         /* in-memory copy of the indirect block */
6045         struct inode *ip;       /* inode for file being extended */
6046         struct inodedep *inodedep; /* Inodedep for ip */
6047         struct allocindir *aip; /* allocindir allocated by the above routines */
6048         ufs_lbn_t lbn;          /* Logical block number for this block. */
6049 {
6050         struct fs *fs;
6051         struct indirdep *indirdep;
6052         struct allocindir *oldaip;
6053         struct freefrag *freefrag;
6054         struct mount *mp;
6055         struct ufsmount *ump;
6056
6057         mp = ITOVFS(ip);
6058         ump = VFSTOUFS(mp);
6059         LOCK_OWNED(ump);
6060         fs = ump->um_fs;
6061         if (bp->b_lblkno >= 0)
6062                 panic("setup_allocindir_phase2: not indir blk");
6063         KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6064             ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6065         indirdep = indirdep_lookup(mp, ip, bp);
6066         KASSERT(indirdep->ir_savebp != NULL,
6067             ("setup_allocindir_phase2 NULL ir_savebp"));
6068         aip->ai_indirdep = indirdep;
6069         /*
6070          * Check for an unwritten dependency for this indirect offset.  If
6071          * there is, merge the old dependency into the new one.  This happens
6072          * as a result of reallocblk only.
6073          */
6074         freefrag = NULL;
6075         if (aip->ai_oldblkno != 0) {
6076                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6077                         if (oldaip->ai_offset == aip->ai_offset) {
6078                                 freefrag = allocindir_merge(aip, oldaip);
6079                                 goto done;
6080                         }
6081                 }
6082                 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6083                         if (oldaip->ai_offset == aip->ai_offset) {
6084                                 freefrag = allocindir_merge(aip, oldaip);
6085                                 goto done;
6086                         }
6087                 }
6088         }
6089 done:
6090         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6091         return (freefrag);
6092 }
6093
6094 /*
6095  * Merge two allocindirs which refer to the same block.  Move newblock
6096  * dependencies and setup the freefrags appropriately.
6097  */
6098 static struct freefrag *
6099 allocindir_merge(aip, oldaip)
6100         struct allocindir *aip;
6101         struct allocindir *oldaip;
6102 {
6103         struct freefrag *freefrag;
6104         struct worklist *wk;
6105
6106         if (oldaip->ai_newblkno != aip->ai_oldblkno)
6107                 panic("allocindir_merge: blkno");
6108         aip->ai_oldblkno = oldaip->ai_oldblkno;
6109         freefrag = aip->ai_freefrag;
6110         aip->ai_freefrag = oldaip->ai_freefrag;
6111         oldaip->ai_freefrag = NULL;
6112         KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6113         /*
6114          * If we are tracking a new directory-block allocation,
6115          * move it from the old allocindir to the new allocindir.
6116          */
6117         if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6118                 WORKLIST_REMOVE(wk);
6119                 if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6120                         panic("allocindir_merge: extra newdirblk");
6121                 WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6122         }
6123         /*
6124          * We can skip journaling for this freefrag and just complete
6125          * any pending journal work for the allocindir that is being
6126          * removed after the freefrag completes.
6127          */
6128         if (freefrag->ff_jdep)
6129                 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6130         LIST_REMOVE(oldaip, ai_next);
6131         freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6132             &freefrag->ff_list, &freefrag->ff_jwork);
6133         free_newblk(&oldaip->ai_block);
6134
6135         return (freefrag);
6136 }
6137
6138 static inline void
6139 setup_freedirect(freeblks, ip, i, needj)
6140         struct freeblks *freeblks;
6141         struct inode *ip;
6142         int i;
6143         int needj;
6144 {
6145         struct ufsmount *ump;
6146         ufs2_daddr_t blkno;
6147         int frags;
6148
6149         blkno = DIP(ip, i_db[i]);
6150         if (blkno == 0)
6151                 return;
6152         DIP_SET(ip, i_db[i], 0);
6153         ump = ITOUMP(ip);
6154         frags = sblksize(ump->um_fs, ip->i_size, i);
6155         frags = numfrags(ump->um_fs, frags);
6156         newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
6157 }
6158
6159 static inline void
6160 setup_freeext(freeblks, ip, i, needj)
6161         struct freeblks *freeblks;
6162         struct inode *ip;
6163         int i;
6164         int needj;
6165 {
6166         struct ufsmount *ump;
6167         ufs2_daddr_t blkno;
6168         int frags;
6169
6170         blkno = ip->i_din2->di_extb[i];
6171         if (blkno == 0)
6172                 return;
6173         ip->i_din2->di_extb[i] = 0;
6174         ump = ITOUMP(ip);
6175         frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
6176         frags = numfrags(ump->um_fs, frags);
6177         newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6178 }
6179
6180 static inline void
6181 setup_freeindir(freeblks, ip, i, lbn, needj)
6182         struct freeblks *freeblks;
6183         struct inode *ip;
6184         int i;
6185         ufs_lbn_t lbn;
6186         int needj;
6187 {
6188         struct ufsmount *ump;
6189         ufs2_daddr_t blkno;
6190
6191         blkno = DIP(ip, i_ib[i]);
6192         if (blkno == 0)
6193                 return;
6194         DIP_SET(ip, i_ib[i], 0);
6195         ump = ITOUMP(ip);
6196         newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
6197             0, needj);
6198 }
6199
6200 static inline struct freeblks *
6201 newfreeblks(mp, ip)
6202         struct mount *mp;
6203         struct inode *ip;
6204 {
6205         struct freeblks *freeblks;
6206
6207         freeblks = malloc(sizeof(struct freeblks),
6208                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6209         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6210         LIST_INIT(&freeblks->fb_jblkdephd);
6211         LIST_INIT(&freeblks->fb_jwork);
6212         freeblks->fb_ref = 0;
6213         freeblks->fb_cgwait = 0;
6214         freeblks->fb_state = ATTACHED;
6215         freeblks->fb_uid = ip->i_uid;
6216         freeblks->fb_inum = ip->i_number;
6217         freeblks->fb_vtype = ITOV(ip)->v_type;
6218         freeblks->fb_modrev = DIP(ip, i_modrev);
6219         freeblks->fb_devvp = ITODEVVP(ip);
6220         freeblks->fb_chkcnt = 0;
6221         freeblks->fb_len = 0;
6222
6223         return (freeblks);
6224 }
6225
6226 static void
6227 trunc_indirdep(indirdep, freeblks, bp, off)
6228         struct indirdep *indirdep;
6229         struct freeblks *freeblks;
6230         struct buf *bp;
6231         int off;
6232 {
6233         struct allocindir *aip, *aipn;
6234
6235         /*
6236          * The first set of allocindirs won't be in savedbp.
6237          */
6238         LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6239                 if (aip->ai_offset > off)
6240                         cancel_allocindir(aip, bp, freeblks, 1);
6241         LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6242                 if (aip->ai_offset > off)
6243                         cancel_allocindir(aip, bp, freeblks, 1);
6244         /*
6245          * These will exist in savedbp.
6246          */
6247         LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6248                 if (aip->ai_offset > off)
6249                         cancel_allocindir(aip, NULL, freeblks, 0);
6250         LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6251                 if (aip->ai_offset > off)
6252                         cancel_allocindir(aip, NULL, freeblks, 0);
6253 }
6254
6255 /*
6256  * Follow the chain of indirects down to lastlbn creating a freework
6257  * structure for each.  This will be used to start indir_trunc() at
6258  * the right offset and create the journal records for the parrtial
6259  * truncation.  A second step will handle the truncated dependencies.
6260  */
6261 static int
6262 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6263         struct freeblks *freeblks;
6264         struct inode *ip;
6265         ufs_lbn_t lbn;
6266         ufs_lbn_t lastlbn;
6267         ufs2_daddr_t blkno;
6268 {
6269         struct indirdep *indirdep;
6270         struct indirdep *indirn;
6271         struct freework *freework;
6272         struct newblk *newblk;
6273         struct mount *mp;
6274         struct ufsmount *ump;
6275         struct buf *bp;
6276         uint8_t *start;
6277         uint8_t *end;
6278         ufs_lbn_t lbnadd;
6279         int level;
6280         int error;
6281         int off;
6282
6283
6284         freework = NULL;
6285         if (blkno == 0)
6286                 return (0);
6287         mp = freeblks->fb_list.wk_mp;
6288         ump = VFSTOUFS(mp);
6289         bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6290         if ((bp->b_flags & B_CACHE) == 0) {
6291                 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6292                 bp->b_iocmd = BIO_READ;
6293                 bp->b_flags &= ~B_INVAL;
6294                 bp->b_ioflags &= ~BIO_ERROR;
6295                 vfs_busy_pages(bp, 0);
6296                 bp->b_iooffset = dbtob(bp->b_blkno);
6297                 bstrategy(bp);
6298 #ifdef RACCT
6299                 if (racct_enable) {
6300                         PROC_LOCK(curproc);
6301                         racct_add_buf(curproc, bp, 0);
6302                         PROC_UNLOCK(curproc);
6303                 }
6304 #endif /* RACCT */
6305                 curthread->td_ru.ru_inblock++;
6306                 error = bufwait(bp);
6307                 if (error) {
6308                         brelse(bp);
6309                         return (error);
6310                 }
6311         }
6312         level = lbn_level(lbn);
6313         lbnadd = lbn_offset(ump->um_fs, level);
6314         /*
6315          * Compute the offset of the last block we want to keep.  Store
6316          * in the freework the first block we want to completely free.
6317          */
6318         off = (lastlbn - -(lbn + level)) / lbnadd;
6319         if (off + 1 == NINDIR(ump->um_fs))
6320                 goto nowork;
6321         freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
6322         /*
6323          * Link the freework into the indirdep.  This will prevent any new
6324          * allocations from proceeding until we are finished with the
6325          * truncate and the block is written.
6326          */
6327         ACQUIRE_LOCK(ump);
6328         indirdep = indirdep_lookup(mp, ip, bp);
6329         if (indirdep->ir_freeblks)
6330                 panic("setup_trunc_indir: indirdep already truncated.");
6331         TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6332         freework->fw_indir = indirdep;
6333         /*
6334          * Cancel any allocindirs that will not make it to disk.
6335          * We have to do this for all copies of the indirdep that
6336          * live on this newblk.
6337          */
6338         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6339                 if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
6340                     &newblk) == 0)
6341                         panic("setup_trunc_indir: lost block");
6342                 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6343                         trunc_indirdep(indirn, freeblks, bp, off);
6344         } else
6345                 trunc_indirdep(indirdep, freeblks, bp, off);
6346         FREE_LOCK(ump);
6347         /*
6348          * Creation is protected by the buf lock. The saveddata is only
6349          * needed if a full truncation follows a partial truncation but it
6350          * is difficult to allocate in that case so we fetch it anyway.
6351          */
6352         if (indirdep->ir_saveddata == NULL)
6353                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6354                     M_SOFTDEP_FLAGS);
6355 nowork:
6356         /* Fetch the blkno of the child and the zero start offset. */
6357         if (I_IS_UFS1(ip)) {
6358                 blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6359                 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6360         } else {
6361                 blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6362                 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6363         }
6364         if (freework) {
6365                 /* Zero the truncated pointers. */
6366                 end = bp->b_data + bp->b_bcount;
6367                 bzero(start, end - start);
6368                 bdwrite(bp);
6369         } else
6370                 bqrelse(bp);
6371         if (level == 0)
6372                 return (0);
6373         lbn++; /* adjust level */
6374         lbn -= (off * lbnadd);
6375         return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6376 }
6377
6378 /*
6379  * Complete the partial truncation of an indirect block setup by
6380  * setup_trunc_indir().  This zeros the truncated pointers in the saved
6381  * copy and writes them to disk before the freeblks is allowed to complete.
6382  */
6383 static void
6384 complete_trunc_indir(freework)
6385         struct freework *freework;
6386 {
6387         struct freework *fwn;
6388         struct indirdep *indirdep;
6389         struct ufsmount *ump;
6390         struct buf *bp;
6391         uintptr_t start;
6392         int count;
6393
6394         ump = VFSTOUFS(freework->fw_list.wk_mp);
6395         LOCK_OWNED(ump);
6396         indirdep = freework->fw_indir;
6397         for (;;) {
6398                 bp = indirdep->ir_bp;
6399                 /* See if the block was discarded. */
6400                 if (bp == NULL)
6401                         break;
6402                 /* Inline part of getdirtybuf().  We dont want bremfree. */
6403                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6404                         break;
6405                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6406                     LOCK_PTR(ump)) == 0)
6407                         BUF_UNLOCK(bp);
6408                 ACQUIRE_LOCK(ump);
6409         }
6410         freework->fw_state |= DEPCOMPLETE;
6411         TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6412         /*
6413          * Zero the pointers in the saved copy.
6414          */
6415         if (indirdep->ir_state & UFS1FMT)
6416                 start = sizeof(ufs1_daddr_t);
6417         else
6418                 start = sizeof(ufs2_daddr_t);
6419         start *= freework->fw_start;
6420         count = indirdep->ir_savebp->b_bcount - start;
6421         start += (uintptr_t)indirdep->ir_savebp->b_data;
6422         bzero((char *)start, count);
6423         /*
6424          * We need to start the next truncation in the list if it has not
6425          * been started yet.
6426          */
6427         fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6428         if (fwn != NULL) {
6429                 if (fwn->fw_freeblks == indirdep->ir_freeblks)
6430                         TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6431                 if ((fwn->fw_state & ONWORKLIST) == 0)
6432                         freework_enqueue(fwn);
6433         }
6434         /*
6435          * If bp is NULL the block was fully truncated, restore
6436          * the saved block list otherwise free it if it is no
6437          * longer needed.
6438          */
6439         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6440                 if (bp == NULL)
6441                         bcopy(indirdep->ir_saveddata,
6442                             indirdep->ir_savebp->b_data,
6443                             indirdep->ir_savebp->b_bcount);
6444                 free(indirdep->ir_saveddata, M_INDIRDEP);
6445                 indirdep->ir_saveddata = NULL;
6446         }
6447         /*
6448          * When bp is NULL there is a full truncation pending.  We
6449          * must wait for this full truncation to be journaled before
6450          * we can release this freework because the disk pointers will
6451          * never be written as zero.
6452          */
6453         if (bp == NULL)  {
6454                 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6455                         handle_written_freework(freework);
6456                 else
6457                         WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6458                            &freework->fw_list);
6459         } else {
6460                 /* Complete when the real copy is written. */
6461                 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6462                 BUF_UNLOCK(bp);
6463         }
6464 }
6465
6466 /*
6467  * Calculate the number of blocks we are going to release where datablocks
6468  * is the current total and length is the new file size.
6469  */
6470 static ufs2_daddr_t
6471 blkcount(fs, datablocks, length)
6472         struct fs *fs;
6473         ufs2_daddr_t datablocks;
6474         off_t length;
6475 {
6476         off_t totblks, numblks;
6477
6478         totblks = 0;
6479         numblks = howmany(length, fs->fs_bsize);
6480         if (numblks <= UFS_NDADDR) {
6481                 totblks = howmany(length, fs->fs_fsize);
6482                 goto out;
6483         }
6484         totblks = blkstofrags(fs, numblks);
6485         numblks -= UFS_NDADDR;
6486         /*
6487          * Count all single, then double, then triple indirects required.
6488          * Subtracting one indirects worth of blocks for each pass
6489          * acknowledges one of each pointed to by the inode.
6490          */
6491         for (;;) {
6492                 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6493                 numblks -= NINDIR(fs);
6494                 if (numblks <= 0)
6495                         break;
6496                 numblks = howmany(numblks, NINDIR(fs));
6497         }
6498 out:
6499         totblks = fsbtodb(fs, totblks);
6500         /*
6501          * Handle sparse files.  We can't reclaim more blocks than the inode
6502          * references.  We will correct it later in handle_complete_freeblks()
6503          * when we know the real count.
6504          */
6505         if (totblks > datablocks)
6506                 return (0);
6507         return (datablocks - totblks);
6508 }
6509
6510 /*
6511  * Handle freeblocks for journaled softupdate filesystems.
6512  *
6513  * Contrary to normal softupdates, we must preserve the block pointers in
6514  * indirects until their subordinates are free.  This is to avoid journaling
6515  * every block that is freed which may consume more space than the journal
6516  * itself.  The recovery program will see the free block journals at the
6517  * base of the truncated area and traverse them to reclaim space.  The
6518  * pointers in the inode may be cleared immediately after the journal
6519  * records are written because each direct and indirect pointer in the
6520  * inode is recorded in a journal.  This permits full truncation to proceed
6521  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6522  *
6523  * The algorithm is as follows:
6524  * 1) Traverse the in-memory state and create journal entries to release
6525  *    the relevant blocks and full indirect trees.
6526  * 2) Traverse the indirect block chain adding partial truncation freework
6527  *    records to indirects in the path to lastlbn.  The freework will
6528  *    prevent new allocation dependencies from being satisfied in this
6529  *    indirect until the truncation completes.
6530  * 3) Read and lock the inode block, performing an update with the new size
6531  *    and pointers.  This prevents truncated data from becoming valid on
6532  *    disk through step 4.
6533  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6534  *    eliminate journal work for those records that do not require it.
6535  * 5) Schedule the journal records to be written followed by the inode block.
6536  * 6) Allocate any necessary frags for the end of file.
6537  * 7) Zero any partially truncated blocks.
6538  *
6539  * From this truncation proceeds asynchronously using the freework and
6540  * indir_trunc machinery.  The file will not be extended again into a
6541  * partially truncated indirect block until all work is completed but
6542  * the normal dependency mechanism ensures that it is rolled back/forward
6543  * as appropriate.  Further truncation may occur without delay and is
6544  * serialized in indir_trunc().
6545  */
6546 void
6547 softdep_journal_freeblocks(ip, cred, length, flags)
6548         struct inode *ip;       /* The inode whose length is to be reduced */
6549         struct ucred *cred;
6550         off_t length;           /* The new length for the file */
6551         int flags;              /* IO_EXT and/or IO_NORMAL */
6552 {
6553         struct freeblks *freeblks, *fbn;
6554         struct worklist *wk, *wkn;
6555         struct inodedep *inodedep;
6556         struct jblkdep *jblkdep;
6557         struct allocdirect *adp, *adpn;
6558         struct ufsmount *ump;
6559         struct fs *fs;
6560         struct buf *bp;
6561         struct vnode *vp;
6562         struct mount *mp;
6563         ufs2_daddr_t extblocks, datablocks;
6564         ufs_lbn_t tmpval, lbn, lastlbn;
6565         int frags, lastoff, iboff, allocblock, needj, error, i;
6566
6567         ump = ITOUMP(ip);
6568         mp = UFSTOVFS(ump);
6569         fs = ump->um_fs;
6570         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6571             ("softdep_journal_freeblocks called on non-softdep filesystem"));
6572         vp = ITOV(ip);
6573         needj = 1;
6574         iboff = -1;
6575         allocblock = 0;
6576         extblocks = 0;
6577         datablocks = 0;
6578         frags = 0;
6579         freeblks = newfreeblks(mp, ip);
6580         ACQUIRE_LOCK(ump);
6581         /*
6582          * If we're truncating a removed file that will never be written
6583          * we don't need to journal the block frees.  The canceled journals
6584          * for the allocations will suffice.
6585          */
6586         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6587         if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6588             length == 0)
6589                 needj = 0;
6590         CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6591             ip->i_number, length, needj);
6592         FREE_LOCK(ump);
6593         /*
6594          * Calculate the lbn that we are truncating to.  This results in -1
6595          * if we're truncating the 0 bytes.  So it is the last lbn we want
6596          * to keep, not the first lbn we want to truncate.
6597          */
6598         lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6599         lastoff = blkoff(fs, length);
6600         /*
6601          * Compute frags we are keeping in lastlbn.  0 means all.
6602          */
6603         if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
6604                 frags = fragroundup(fs, lastoff);
6605                 /* adp offset of last valid allocdirect. */
6606                 iboff = lastlbn;
6607         } else if (lastlbn > 0)
6608                 iboff = UFS_NDADDR;
6609         if (fs->fs_magic == FS_UFS2_MAGIC)
6610                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6611         /*
6612          * Handle normal data blocks and indirects.  This section saves
6613          * values used after the inode update to complete frag and indirect
6614          * truncation.
6615          */
6616         if ((flags & IO_NORMAL) != 0) {
6617                 /*
6618                  * Handle truncation of whole direct and indirect blocks.
6619                  */
6620                 for (i = iboff + 1; i < UFS_NDADDR; i++)
6621                         setup_freedirect(freeblks, ip, i, needj);
6622                 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6623                     i < UFS_NIADDR;
6624                     i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6625                         /* Release a whole indirect tree. */
6626                         if (lbn > lastlbn) {
6627                                 setup_freeindir(freeblks, ip, i, -lbn -i,
6628                                     needj);
6629                                 continue;
6630                         }
6631                         iboff = i + UFS_NDADDR;
6632                         /*
6633                          * Traverse partially truncated indirect tree.
6634                          */
6635                         if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6636                                 setup_trunc_indir(freeblks, ip, -lbn - i,
6637                                     lastlbn, DIP(ip, i_ib[i]));
6638                 }
6639                 /*
6640                  * Handle partial truncation to a frag boundary.
6641                  */
6642                 if (frags) {
6643                         ufs2_daddr_t blkno;
6644                         long oldfrags;
6645
6646                         oldfrags = blksize(fs, ip, lastlbn);
6647                         blkno = DIP(ip, i_db[lastlbn]);
6648                         if (blkno && oldfrags != frags) {
6649                                 oldfrags -= frags;
6650                                 oldfrags = numfrags(fs, oldfrags);
6651                                 blkno += numfrags(fs, frags);
6652                                 newfreework(ump, freeblks, NULL, lastlbn,
6653                                     blkno, oldfrags, 0, needj);
6654                                 if (needj)
6655                                         adjust_newfreework(freeblks,
6656                                             numfrags(fs, frags));
6657                         } else if (blkno == 0)
6658                                 allocblock = 1;
6659                 }
6660                 /*
6661                  * Add a journal record for partial truncate if we are
6662                  * handling indirect blocks.  Non-indirects need no extra
6663                  * journaling.
6664                  */
6665                 if (length != 0 && lastlbn >= UFS_NDADDR) {
6666                         ip->i_flag |= IN_TRUNCATED;
6667                         newjtrunc(freeblks, length, 0);
6668                 }
6669                 ip->i_size = length;
6670                 DIP_SET(ip, i_size, ip->i_size);
6671                 datablocks = DIP(ip, i_blocks) - extblocks;
6672                 if (length != 0)
6673                         datablocks = blkcount(fs, datablocks, length);
6674                 freeblks->fb_len = length;
6675         }
6676         if ((flags & IO_EXT) != 0) {
6677                 for (i = 0; i < UFS_NXADDR; i++)
6678                         setup_freeext(freeblks, ip, i, needj);
6679                 ip->i_din2->di_extsize = 0;
6680                 datablocks += extblocks;
6681         }
6682 #ifdef QUOTA
6683         /* Reference the quotas in case the block count is wrong in the end. */
6684         quotaref(vp, freeblks->fb_quota);
6685         (void) chkdq(ip, -datablocks, NOCRED, FORCE);
6686 #endif
6687         freeblks->fb_chkcnt = -datablocks;
6688         UFS_LOCK(ump);
6689         fs->fs_pendingblocks += datablocks;
6690         UFS_UNLOCK(ump);
6691         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6692         /*
6693          * Handle truncation of incomplete alloc direct dependencies.  We
6694          * hold the inode block locked to prevent incomplete dependencies
6695          * from reaching the disk while we are eliminating those that
6696          * have been truncated.  This is a partially inlined ffs_update().
6697          */
6698         ufs_itimes(vp);
6699         ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6700         error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6701             (int)fs->fs_bsize, cred, &bp);
6702         if (error) {
6703                 brelse(bp);
6704                 softdep_error("softdep_journal_freeblocks", error);
6705                 return;
6706         }
6707         if (bp->b_bufsize == fs->fs_bsize)
6708                 bp->b_flags |= B_CLUSTEROK;
6709         softdep_update_inodeblock(ip, bp, 0);
6710         if (ump->um_fstype == UFS1) {
6711                 *((struct ufs1_dinode *)bp->b_data +
6712                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6713         } else {
6714                 ffs_update_dinode_ckhash(fs, ip->i_din2);
6715                 *((struct ufs2_dinode *)bp->b_data +
6716                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6717         }
6718         ACQUIRE_LOCK(ump);
6719         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6720         if ((inodedep->id_state & IOSTARTED) != 0)
6721                 panic("softdep_setup_freeblocks: inode busy");
6722         /*
6723          * Add the freeblks structure to the list of operations that
6724          * must await the zero'ed inode being written to disk. If we
6725          * still have a bitmap dependency (needj), then the inode
6726          * has never been written to disk, so we can process the
6727          * freeblks below once we have deleted the dependencies.
6728          */
6729         if (needj)
6730                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6731         else
6732                 freeblks->fb_state |= COMPLETE;
6733         if ((flags & IO_NORMAL) != 0) {
6734                 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6735                         if (adp->ad_offset > iboff)
6736                                 cancel_allocdirect(&inodedep->id_inoupdt, adp,
6737                                     freeblks);
6738                         /*
6739                          * Truncate the allocdirect.  We could eliminate
6740                          * or modify journal records as well.
6741                          */
6742                         else if (adp->ad_offset == iboff && frags)
6743                                 adp->ad_newsize = frags;
6744                 }
6745         }
6746         if ((flags & IO_EXT) != 0)
6747                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6748                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6749                             freeblks);
6750         /*
6751          * Scan the bufwait list for newblock dependencies that will never
6752          * make it to disk.
6753          */
6754         LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6755                 if (wk->wk_type != D_ALLOCDIRECT)
6756                         continue;
6757                 adp = WK_ALLOCDIRECT(wk);
6758                 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6759                     ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6760                         cancel_jfreeblk(freeblks, adp->ad_newblkno);
6761                         cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6762                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6763                 }
6764         }
6765         /*
6766          * Add journal work.
6767          */
6768         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6769                 add_to_journal(&jblkdep->jb_list);
6770         FREE_LOCK(ump);
6771         bdwrite(bp);
6772         /*
6773          * Truncate dependency structures beyond length.
6774          */
6775         trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6776         /*
6777          * This is only set when we need to allocate a fragment because
6778          * none existed at the end of a frag-sized file.  It handles only
6779          * allocating a new, zero filled block.
6780          */
6781         if (allocblock) {
6782                 ip->i_size = length - lastoff;
6783                 DIP_SET(ip, i_size, ip->i_size);
6784                 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6785                 if (error != 0) {
6786                         softdep_error("softdep_journal_freeblks", error);
6787                         return;
6788                 }
6789                 ip->i_size = length;
6790                 DIP_SET(ip, i_size, length);
6791                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
6792                 allocbuf(bp, frags);
6793                 ffs_update(vp, 0);
6794                 bawrite(bp);
6795         } else if (lastoff != 0 && vp->v_type != VDIR) {
6796                 int size;
6797
6798                 /*
6799                  * Zero the end of a truncated frag or block.
6800                  */
6801                 size = sblksize(fs, length, lastlbn);
6802                 error = bread(vp, lastlbn, size, cred, &bp);
6803                 if (error) {
6804                         softdep_error("softdep_journal_freeblks", error);
6805                         return;
6806                 }
6807                 bzero((char *)bp->b_data + lastoff, size - lastoff);
6808                 bawrite(bp);
6809
6810         }
6811         ACQUIRE_LOCK(ump);
6812         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6813         TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6814         freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6815         /*
6816          * We zero earlier truncations so they don't erroneously
6817          * update i_blocks.
6818          */
6819         if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6820                 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6821                         fbn->fb_len = 0;
6822         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6823             LIST_EMPTY(&freeblks->fb_jblkdephd))
6824                 freeblks->fb_state |= INPROGRESS;
6825         else
6826                 freeblks = NULL;
6827         FREE_LOCK(ump);
6828         if (freeblks)
6829                 handle_workitem_freeblocks(freeblks, 0);
6830         trunc_pages(ip, length, extblocks, flags);
6831
6832 }
6833
6834 /*
6835  * Flush a JOP_SYNC to the journal.
6836  */
6837 void
6838 softdep_journal_fsync(ip)
6839         struct inode *ip;
6840 {
6841         struct jfsync *jfsync;
6842         struct ufsmount *ump;
6843
6844         ump = ITOUMP(ip);
6845         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6846             ("softdep_journal_fsync called on non-softdep filesystem"));
6847         if ((ip->i_flag & IN_TRUNCATED) == 0)
6848                 return;
6849         ip->i_flag &= ~IN_TRUNCATED;
6850         jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6851         workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
6852         jfsync->jfs_size = ip->i_size;
6853         jfsync->jfs_ino = ip->i_number;
6854         ACQUIRE_LOCK(ump);
6855         add_to_journal(&jfsync->jfs_list);
6856         jwait(&jfsync->jfs_list, MNT_WAIT);
6857         FREE_LOCK(ump);
6858 }
6859
6860 /*
6861  * Block de-allocation dependencies.
6862  *
6863  * When blocks are de-allocated, the on-disk pointers must be nullified before
6864  * the blocks are made available for use by other files.  (The true
6865  * requirement is that old pointers must be nullified before new on-disk
6866  * pointers are set.  We chose this slightly more stringent requirement to
6867  * reduce complexity.) Our implementation handles this dependency by updating
6868  * the inode (or indirect block) appropriately but delaying the actual block
6869  * de-allocation (i.e., freemap and free space count manipulation) until
6870  * after the updated versions reach stable storage.  After the disk is
6871  * updated, the blocks can be safely de-allocated whenever it is convenient.
6872  * This implementation handles only the common case of reducing a file's
6873  * length to zero. Other cases are handled by the conventional synchronous
6874  * write approach.
6875  *
6876  * The ffs implementation with which we worked double-checks
6877  * the state of the block pointers and file size as it reduces
6878  * a file's length.  Some of this code is replicated here in our
6879  * soft updates implementation.  The freeblks->fb_chkcnt field is
6880  * used to transfer a part of this information to the procedure
6881  * that eventually de-allocates the blocks.
6882  *
6883  * This routine should be called from the routine that shortens
6884  * a file's length, before the inode's size or block pointers
6885  * are modified. It will save the block pointer information for
6886  * later release and zero the inode so that the calling routine
6887  * can release it.
6888  */
6889 void
6890 softdep_setup_freeblocks(ip, length, flags)
6891         struct inode *ip;       /* The inode whose length is to be reduced */
6892         off_t length;           /* The new length for the file */
6893         int flags;              /* IO_EXT and/or IO_NORMAL */
6894 {
6895         struct ufs1_dinode *dp1;
6896         struct ufs2_dinode *dp2;
6897         struct freeblks *freeblks;
6898         struct inodedep *inodedep;
6899         struct allocdirect *adp;
6900         struct ufsmount *ump;
6901         struct buf *bp;
6902         struct fs *fs;
6903         ufs2_daddr_t extblocks, datablocks;
6904         struct mount *mp;
6905         int i, delay, error;
6906         ufs_lbn_t tmpval;
6907         ufs_lbn_t lbn;
6908
6909         ump = ITOUMP(ip);
6910         mp = UFSTOVFS(ump);
6911         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6912             ("softdep_setup_freeblocks called on non-softdep filesystem"));
6913         CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6914             ip->i_number, length);
6915         KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6916         fs = ump->um_fs;
6917         if ((error = bread(ump->um_devvp,
6918             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6919             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6920                 brelse(bp);
6921                 softdep_error("softdep_setup_freeblocks", error);
6922                 return;
6923         }
6924         freeblks = newfreeblks(mp, ip);
6925         extblocks = 0;
6926         datablocks = 0;
6927         if (fs->fs_magic == FS_UFS2_MAGIC)
6928                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6929         if ((flags & IO_NORMAL) != 0) {
6930                 for (i = 0; i < UFS_NDADDR; i++)
6931                         setup_freedirect(freeblks, ip, i, 0);
6932                 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6933                     i < UFS_NIADDR;
6934                     i++, lbn += tmpval, tmpval *= NINDIR(fs))
6935                         setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6936                 ip->i_size = 0;
6937                 DIP_SET(ip, i_size, 0);
6938                 datablocks = DIP(ip, i_blocks) - extblocks;
6939         }
6940         if ((flags & IO_EXT) != 0) {
6941                 for (i = 0; i < UFS_NXADDR; i++)
6942                         setup_freeext(freeblks, ip, i, 0);
6943                 ip->i_din2->di_extsize = 0;
6944                 datablocks += extblocks;
6945         }
6946 #ifdef QUOTA
6947         /* Reference the quotas in case the block count is wrong in the end. */
6948         quotaref(ITOV(ip), freeblks->fb_quota);
6949         (void) chkdq(ip, -datablocks, NOCRED, FORCE);
6950 #endif
6951         freeblks->fb_chkcnt = -datablocks;
6952         UFS_LOCK(ump);
6953         fs->fs_pendingblocks += datablocks;
6954         UFS_UNLOCK(ump);
6955         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6956         /*
6957          * Push the zero'ed inode to its disk buffer so that we are free
6958          * to delete its dependencies below. Once the dependencies are gone
6959          * the buffer can be safely released.
6960          */
6961         if (ump->um_fstype == UFS1) {
6962                 dp1 = ((struct ufs1_dinode *)bp->b_data +
6963                     ino_to_fsbo(fs, ip->i_number));
6964                 ip->i_din1->di_freelink = dp1->di_freelink;
6965                 *dp1 = *ip->i_din1;
6966         } else {
6967                 dp2 = ((struct ufs2_dinode *)bp->b_data +
6968                     ino_to_fsbo(fs, ip->i_number));
6969                 ip->i_din2->di_freelink = dp2->di_freelink;
6970                 ffs_update_dinode_ckhash(fs, ip->i_din2);
6971                 *dp2 = *ip->i_din2;
6972         }
6973         /*
6974          * Find and eliminate any inode dependencies.
6975          */
6976         ACQUIRE_LOCK(ump);
6977         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6978         if ((inodedep->id_state & IOSTARTED) != 0)
6979                 panic("softdep_setup_freeblocks: inode busy");
6980         /*
6981          * Add the freeblks structure to the list of operations that
6982          * must await the zero'ed inode being written to disk. If we
6983          * still have a bitmap dependency (delay == 0), then the inode
6984          * has never been written to disk, so we can process the
6985          * freeblks below once we have deleted the dependencies.
6986          */
6987         delay = (inodedep->id_state & DEPCOMPLETE);
6988         if (delay)
6989                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6990         else
6991                 freeblks->fb_state |= COMPLETE;
6992         /*
6993          * Because the file length has been truncated to zero, any
6994          * pending block allocation dependency structures associated
6995          * with this inode are obsolete and can simply be de-allocated.
6996          * We must first merge the two dependency lists to get rid of
6997          * any duplicate freefrag structures, then purge the merged list.
6998          * If we still have a bitmap dependency, then the inode has never
6999          * been written to disk, so we can free any fragments without delay.
7000          */
7001         if (flags & IO_NORMAL) {
7002                 merge_inode_lists(&inodedep->id_newinoupdt,
7003                     &inodedep->id_inoupdt);
7004                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
7005                         cancel_allocdirect(&inodedep->id_inoupdt, adp,
7006                             freeblks);
7007         }
7008         if (flags & IO_EXT) {
7009                 merge_inode_lists(&inodedep->id_newextupdt,
7010                     &inodedep->id_extupdt);
7011                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
7012                         cancel_allocdirect(&inodedep->id_extupdt, adp,
7013                             freeblks);
7014         }
7015         FREE_LOCK(ump);
7016         bdwrite(bp);
7017         trunc_dependencies(ip, freeblks, -1, 0, flags);
7018         ACQUIRE_LOCK(ump);
7019         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
7020                 (void) free_inodedep(inodedep);
7021         freeblks->fb_state |= DEPCOMPLETE;
7022         /*
7023          * If the inode with zeroed block pointers is now on disk
7024          * we can start freeing blocks.
7025          */
7026         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
7027                 freeblks->fb_state |= INPROGRESS;
7028         else
7029                 freeblks = NULL;
7030         FREE_LOCK(ump);
7031         if (freeblks)
7032                 handle_workitem_freeblocks(freeblks, 0);
7033         trunc_pages(ip, length, extblocks, flags);
7034 }
7035
7036 /*
7037  * Eliminate pages from the page cache that back parts of this inode and
7038  * adjust the vnode pager's idea of our size.  This prevents stale data
7039  * from hanging around in the page cache.
7040  */
7041 static void
7042 trunc_pages(ip, length, extblocks, flags)
7043         struct inode *ip;
7044         off_t length;
7045         ufs2_daddr_t extblocks;
7046         int flags;
7047 {
7048         struct vnode *vp;
7049         struct fs *fs;
7050         ufs_lbn_t lbn;
7051         off_t end, extend;
7052
7053         vp = ITOV(ip);
7054         fs = ITOFS(ip);
7055         extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
7056         if ((flags & IO_EXT) != 0)
7057                 vn_pages_remove(vp, extend, 0);
7058         if ((flags & IO_NORMAL) == 0)
7059                 return;
7060         BO_LOCK(&vp->v_bufobj);
7061         drain_output(vp);
7062         BO_UNLOCK(&vp->v_bufobj);
7063         /*
7064          * The vnode pager eliminates file pages we eliminate indirects
7065          * below.
7066          */
7067         vnode_pager_setsize(vp, length);
7068         /*
7069          * Calculate the end based on the last indirect we want to keep.  If
7070          * the block extends into indirects we can just use the negative of
7071          * its lbn.  Doubles and triples exist at lower numbers so we must
7072          * be careful not to remove those, if they exist.  double and triple
7073          * indirect lbns do not overlap with others so it is not important
7074          * to verify how many levels are required.
7075          */
7076         lbn = lblkno(fs, length);
7077         if (lbn >= UFS_NDADDR) {
7078                 /* Calculate the virtual lbn of the triple indirect. */
7079                 lbn = -lbn - (UFS_NIADDR - 1);
7080                 end = OFF_TO_IDX(lblktosize(fs, lbn));
7081         } else
7082                 end = extend;
7083         vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7084 }
7085
7086 /*
7087  * See if the buf bp is in the range eliminated by truncation.
7088  */
7089 static int
7090 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7091         struct buf *bp;
7092         int *blkoffp;
7093         ufs_lbn_t lastlbn;
7094         int lastoff;
7095         int flags;
7096 {
7097         ufs_lbn_t lbn;
7098
7099         *blkoffp = 0;
7100         /* Only match ext/normal blocks as appropriate. */
7101         if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7102             ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7103                 return (0);
7104         /* ALTDATA is always a full truncation. */
7105         if ((bp->b_xflags & BX_ALTDATA) != 0)
7106                 return (1);
7107         /* -1 is full truncation. */
7108         if (lastlbn == -1)
7109                 return (1);
7110         /*
7111          * If this is a partial truncate we only want those
7112          * blocks and indirect blocks that cover the range
7113          * we're after.
7114          */
7115         lbn = bp->b_lblkno;
7116         if (lbn < 0)
7117                 lbn = -(lbn + lbn_level(lbn));
7118         if (lbn < lastlbn)
7119                 return (0);
7120         /* Here we only truncate lblkno if it's partial. */
7121         if (lbn == lastlbn) {
7122                 if (lastoff == 0)
7123                         return (0);
7124                 *blkoffp = lastoff;
7125         }
7126         return (1);
7127 }
7128
/*
 * Eliminate any dependencies that exist in memory beyond lastlbn:lastoff.
 *
 * Both the dirty and the clean buffer lists of the vnode are walked.
 * Buffers selected by trunc_check_buf() have their dependencies handed to
 * deallocate_dependencies() (dirty list) or are invalidated or resized
 * directly (clean list).  A lastlbn of -1 is treated by trunc_check_buf()
 * as a full truncation.  flags carries IO_NORMAL and/or IO_EXT and selects
 * which class of buffers is considered.
 */
static void
trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
        struct inode *ip;
        struct freeblks *freeblks;
        ufs_lbn_t lastlbn;
        int lastoff;
        int flags;
{
        struct bufobj *bo;
        struct vnode *vp;
        struct buf *bp;
        int blkoff;

        /*
         * We must wait for any I/O in progress to finish so that
         * all potential buffers on the dirty list will be visible.
         * Once they are all there, walk the list and get rid of
         * any dependencies.
         */
        vp = ITOV(ip);
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
        drain_output(vp);
        /* Start with a clean slate: no dirty buffer has been examined. */
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
                bp->b_vflags &= ~BV_SCANNED;
restart:
        /*
         * The bufobj lock is dropped while a buffer is processed, so the
         * walk starts over from the head after each one.  BV_SCANNED
         * marks buffers that need not be looked at again.
         */
        TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
                if (bp->b_vflags & BV_SCANNED)
                        continue;
                if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
                        bp->b_vflags |= BV_SCANNED;
                        continue;
                }
                KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
                /*
                 * NOTE(review): a NULL return presumably means the buffer
                 * could not be acquired without the list changing; the
                 * scan is simply restarted.
                 */
                if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
                        goto restart;
                BO_UNLOCK(bo);
                /*
                 * A non-zero return (EBUSY for a partially truncated
                 * buffer, ERESTART after a journal wait) means the buffer
                 * must be kept, so it is released with bqrelse().  Zero
                 * means it was marked invalid and is released with
                 * brelse().
                 */
                if (deallocate_dependencies(bp, freeblks, blkoff))
                        bqrelse(bp);
                else
                        brelse(bp);
                BO_LOCK(bo);
                goto restart;
        }
        /*
         * Now do the work of vtruncbuf while also matching indirect blocks.
         */
        TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
                bp->b_vflags &= ~BV_SCANNED;
cleanrestart:
        TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
                if (bp->b_vflags & BV_SCANNED)
                        continue;
                if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
                        bp->b_vflags |= BV_SCANNED;
                        continue;
                }
                /*
                 * The bufobj lock is passed as the interlock.  On ENOLCK
                 * it is taken again and the scan restarts.
                 * NOTE(review): BUF_LOCK presumably drops the interlock
                 * in every case, since it is re-acquired on both paths.
                 */
                if (BUF_LOCK(bp,
                    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
                    BO_LOCKPTR(bo)) == ENOLCK) {
                        BO_LOCK(bo);
                        goto cleanrestart;
                }
                bp->b_vflags |= BV_SCANNED;
                bremfree(bp);
                if (blkoff != 0) {
                        /* Partially truncated block: resize and keep it. */
                        allocbuf(bp, blkoff);
                        bqrelse(bp);
                } else {
                        /* Entirely truncated: throw the buffer away. */
                        bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
                        brelse(bp);
                }
                BO_LOCK(bo);
                goto cleanrestart;
        }
        drain_output(vp);
        BO_UNLOCK(bo);
}
7210
7211 static int
7212 cancel_pagedep(pagedep, freeblks, blkoff)
7213         struct pagedep *pagedep;
7214         struct freeblks *freeblks;
7215         int blkoff;
7216 {
7217         struct jremref *jremref;
7218         struct jmvref *jmvref;
7219         struct dirrem *dirrem, *tmp;
7220         int i;
7221
7222         /*
7223          * Copy any directory remove dependencies to the list
7224          * to be processed after the freeblks proceeds.  If
7225          * directory entry never made it to disk they
7226          * can be dumped directly onto the work list.
7227          */
7228         LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7229                 /* Skip this directory removal if it is intended to remain. */
7230                 if (dirrem->dm_offset < blkoff)
7231                         continue;
7232                 /*
7233                  * If there are any dirrems we wait for the journal write
7234                  * to complete and then restart the buf scan as the lock
7235                  * has been dropped.
7236                  */
7237                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7238                         jwait(&jremref->jr_list, MNT_WAIT);
7239                         return (ERESTART);
7240                 }
7241                 LIST_REMOVE(dirrem, dm_next);
7242                 dirrem->dm_dirinum = pagedep->pd_ino;
7243                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7244         }
7245         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7246                 jwait(&jmvref->jm_list, MNT_WAIT);
7247                 return (ERESTART);
7248         }
7249         /*
7250          * When we're partially truncating a pagedep we just want to flush
7251          * journal entries and return.  There can not be any adds in the
7252          * truncated portion of the directory and newblk must remain if
7253          * part of the block remains.
7254          */
7255         if (blkoff != 0) {
7256                 struct diradd *dap;
7257
7258                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7259                         if (dap->da_offset > blkoff)
7260                                 panic("cancel_pagedep: diradd %p off %d > %d",
7261                                     dap, dap->da_offset, blkoff);
7262                 for (i = 0; i < DAHASHSZ; i++)
7263                         LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7264                                 if (dap->da_offset > blkoff)
7265                                         panic("cancel_pagedep: diradd %p off %d > %d",
7266                                             dap, dap->da_offset, blkoff);
7267                 return (0);
7268         }
7269         /*
7270          * There should be no directory add dependencies present
7271          * as the directory could not be truncated until all
7272          * children were removed.
7273          */
7274         KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7275             ("deallocate_dependencies: pendinghd != NULL"));
7276         for (i = 0; i < DAHASHSZ; i++)
7277                 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7278                     ("deallocate_dependencies: diraddhd != NULL"));
7279         if ((pagedep->pd_state & NEWBLOCK) != 0)
7280                 free_newdirblk(pagedep->pd_newdirblk);
7281         if (free_pagedep(pagedep) == 0)
7282                 panic("Failed to free pagedep %p", pagedep);
7283         return (0);
7284 }
7285
7286 /*
7287  * Reclaim any dependency structures from a buffer that is about to
7288  * be reallocated to a new vnode. The buffer must be locked, thus,
7289  * no I/O completion operations can occur while we are manipulating
7290  * its associated dependencies. The mutex is held so that other I/O's
7291  * associated with related dependencies do not occur.
7292  */
7293 static int
7294 deallocate_dependencies(bp, freeblks, off)
7295         struct buf *bp;
7296         struct freeblks *freeblks;
7297         int off;
7298 {
7299         struct indirdep *indirdep;
7300         struct pagedep *pagedep;
7301         struct worklist *wk, *wkn;
7302         struct ufsmount *ump;
7303
7304         ump = softdep_bp_to_mp(bp);
7305         if (ump == NULL)
7306                 goto done;
7307         ACQUIRE_LOCK(ump);
7308         LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7309                 switch (wk->wk_type) {
7310                 case D_INDIRDEP:
7311                         indirdep = WK_INDIRDEP(wk);
7312                         if (bp->b_lblkno >= 0 ||
7313                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7314                                 panic("deallocate_dependencies: not indir");
7315                         cancel_indirdep(indirdep, bp, freeblks);
7316                         continue;
7317
7318                 case D_PAGEDEP:
7319                         pagedep = WK_PAGEDEP(wk);
7320                         if (cancel_pagedep(pagedep, freeblks, off)) {
7321                                 FREE_LOCK(ump);
7322                                 return (ERESTART);
7323                         }
7324                         continue;
7325
7326                 case D_ALLOCINDIR:
7327                         /*
7328                          * Simply remove the allocindir, we'll find it via
7329                          * the indirdep where we can clear pointers if
7330                          * needed.
7331                          */
7332                         WORKLIST_REMOVE(wk);
7333                         continue;
7334
7335                 case D_FREEWORK:
7336                         /*
7337                          * A truncation is waiting for the zero'd pointers
7338                          * to be written.  It can be freed when the freeblks
7339                          * is journaled.
7340                          */
7341                         WORKLIST_REMOVE(wk);
7342                         wk->wk_state |= ONDEPLIST;
7343                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7344                         break;
7345
7346                 case D_ALLOCDIRECT:
7347                         if (off != 0)
7348                                 continue;
7349                         /* FALLTHROUGH */
7350                 default:
7351                         panic("deallocate_dependencies: Unexpected type %s",
7352                             TYPENAME(wk->wk_type));
7353                         /* NOTREACHED */
7354                 }
7355         }
7356         FREE_LOCK(ump);
7357 done:
7358         /*
7359          * Don't throw away this buf, we were partially truncating and
7360          * some deps may always remain.
7361          */
7362         if (off) {
7363                 allocbuf(bp, off);
7364                 bp->b_vflags |= BV_SCANNED;
7365                 return (EBUSY);
7366         }
7367         bp->b_flags |= B_INVAL | B_NOCACHE;
7368
7369         return (0);
7370 }
7371
7372 /*
7373  * An allocdirect is being canceled due to a truncate.  We must make sure
7374  * the journal entry is released in concert with the blkfree that releases
7375  * the storage.  Completed journal entries must not be released until the
7376  * space is no longer pointed to by the inode or in the bitmap.
7377  */
7378 static void
7379 cancel_allocdirect(adphead, adp, freeblks)
7380         struct allocdirectlst *adphead;
7381         struct allocdirect *adp;
7382         struct freeblks *freeblks;
7383 {
7384         struct freework *freework;
7385         struct newblk *newblk;
7386         struct worklist *wk;
7387
7388         TAILQ_REMOVE(adphead, adp, ad_next);
7389         newblk = (struct newblk *)adp;
7390         freework = NULL;
7391         /*
7392          * Find the correct freework structure.
7393          */
7394         LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7395                 if (wk->wk_type != D_FREEWORK)
7396                         continue;
7397                 freework = WK_FREEWORK(wk);
7398                 if (freework->fw_blkno == newblk->nb_newblkno)
7399                         break;
7400         }
7401         if (freework == NULL)
7402                 panic("cancel_allocdirect: Freework not found");
7403         /*
7404          * If a newblk exists at all we still have the journal entry that
7405          * initiated the allocation so we do not need to journal the free.
7406          */
7407         cancel_jfreeblk(freeblks, freework->fw_blkno);
7408         /*
7409          * If the journal hasn't been written the jnewblk must be passed
7410          * to the call to ffs_blkfree that reclaims the space.  We accomplish
7411          * this by linking the journal dependency into the freework to be
7412          * freed when freework_freeblock() is called.  If the journal has
7413          * been written we can simply reclaim the journal space when the
7414          * freeblks work is complete.
7415          */
7416         freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7417             &freeblks->fb_jwork);
7418         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7419 }
7420
7421
7422 /*
7423  * Cancel a new block allocation.  May be an indirect or direct block.  We
7424  * remove it from various lists and return any journal record that needs to
7425  * be resolved by the caller.
7426  *
7427  * A special consideration is made for indirects which were never pointed
7428  * at on disk and will never be found once this block is released.
7429  */
7430 static struct jnewblk *
7431 cancel_newblk(newblk, wk, wkhd)
7432         struct newblk *newblk;
7433         struct worklist *wk;
7434         struct workhead *wkhd;
7435 {
7436         struct jnewblk *jnewblk;
7437
7438         CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7439
7440         newblk->nb_state |= GOINGAWAY;
7441         /*
7442          * Previously we traversed the completedhd on each indirdep
7443          * attached to this newblk to cancel them and gather journal
7444          * work.  Since we need only the oldest journal segment and
7445          * the lowest point on the tree will always have the oldest
7446          * journal segment we are free to release the segments
7447          * of any subordinates and may leave the indirdep list to
7448          * indirdep_complete() when this newblk is freed.
7449          */
7450         if (newblk->nb_state & ONDEPLIST) {
7451                 newblk->nb_state &= ~ONDEPLIST;
7452                 LIST_REMOVE(newblk, nb_deps);
7453         }
7454         if (newblk->nb_state & ONWORKLIST)
7455                 WORKLIST_REMOVE(&newblk->nb_list);
7456         /*
7457          * If the journal entry hasn't been written we save a pointer to
7458          * the dependency that frees it until it is written or the
7459          * superseding operation completes.
7460          */
7461         jnewblk = newblk->nb_jnewblk;
7462         if (jnewblk != NULL && wk != NULL) {
7463                 newblk->nb_jnewblk = NULL;
7464                 jnewblk->jn_dep = wk;
7465         }
7466         if (!LIST_EMPTY(&newblk->nb_jwork))
7467                 jwork_move(wkhd, &newblk->nb_jwork);
7468         /*
7469          * When truncating we must free the newdirblk early to remove
7470          * the pagedep from the hash before returning.
7471          */
7472         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7473                 free_newdirblk(WK_NEWDIRBLK(wk));
7474         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7475                 panic("cancel_newblk: extra newdirblk");
7476
7477         return (jnewblk);
7478 }
7479
7480 /*
7481  * Schedule the freefrag associated with a newblk to be released once
7482  * the pointers are written and the previous block is no longer needed.
7483  */
7484 static void
7485 newblk_freefrag(newblk)
7486         struct newblk *newblk;
7487 {
7488         struct freefrag *freefrag;
7489
7490         if (newblk->nb_freefrag == NULL)
7491                 return;
7492         freefrag = newblk->nb_freefrag;
7493         newblk->nb_freefrag = NULL;
7494         freefrag->ff_state |= COMPLETE;
7495         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7496                 add_to_worklist(&freefrag->ff_list, 0);
7497 }
7498
7499 /*
7500  * Free a newblk. Generate a new freefrag work request if appropriate.
7501  * This must be called after the inode pointer and any direct block pointers
7502  * are valid or fully removed via truncate or frag extension.
7503  */
7504 static void
7505 free_newblk(newblk)
7506         struct newblk *newblk;
7507 {
7508         struct indirdep *indirdep;
7509         struct worklist *wk;
7510
7511         KASSERT(newblk->nb_jnewblk == NULL,
7512             ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7513         KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7514             ("free_newblk: unclaimed newblk"));
7515         LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7516         newblk_freefrag(newblk);
7517         if (newblk->nb_state & ONDEPLIST)
7518                 LIST_REMOVE(newblk, nb_deps);
7519         if (newblk->nb_state & ONWORKLIST)
7520                 WORKLIST_REMOVE(&newblk->nb_list);
7521         LIST_REMOVE(newblk, nb_hash);
7522         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7523                 free_newdirblk(WK_NEWDIRBLK(wk));
7524         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7525                 panic("free_newblk: extra newdirblk");
7526         while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7527                 indirdep_complete(indirdep);
7528         handle_jwork(&newblk->nb_jwork);
7529         WORKITEM_FREE(newblk, D_NEWBLK);
7530 }
7531
7532 /*
7533  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7534  */
7535 static void
7536 free_newdirblk(newdirblk)
7537         struct newdirblk *newdirblk;
7538 {
7539         struct pagedep *pagedep;
7540         struct diradd *dap;
7541         struct worklist *wk;
7542
7543         LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7544         WORKLIST_REMOVE(&newdirblk->db_list);
7545         /*
7546          * If the pagedep is still linked onto the directory buffer
7547          * dependency chain, then some of the entries on the
7548          * pd_pendinghd list may not be committed to disk yet. In
7549          * this case, we will simply clear the NEWBLOCK flag and
7550          * let the pd_pendinghd list be processed when the pagedep
7551          * is next written. If the pagedep is no longer on the buffer
7552          * dependency chain, then all the entries on the pd_pending
7553          * list are committed to disk and we can free them here.
7554          */
7555         pagedep = newdirblk->db_pagedep;
7556         pagedep->pd_state &= ~NEWBLOCK;
7557         if ((pagedep->pd_state & ONWORKLIST) == 0) {
7558                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7559                         free_diradd(dap, NULL);
7560                 /*
7561                  * If no dependencies remain, the pagedep will be freed.
7562                  */
7563                 free_pagedep(pagedep);
7564         }
7565         /* Should only ever be one item in the list. */
7566         while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7567                 WORKLIST_REMOVE(wk);
7568                 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7569         }
7570         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7571 }
7572
7573 /*
7574  * Prepare an inode to be freed. The actual free operation is not
7575  * done until the zero'ed inode has been written to disk.
7576  */
7577 void
7578 softdep_freefile(pvp, ino, mode)
7579         struct vnode *pvp;
7580         ino_t ino;
7581         int mode;
7582 {
7583         struct inode *ip = VTOI(pvp);
7584         struct inodedep *inodedep;
7585         struct freefile *freefile;
7586         struct freeblks *freeblks;
7587         struct ufsmount *ump;
7588
7589         ump = ITOUMP(ip);
7590         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7591             ("softdep_freefile called on non-softdep filesystem"));
7592         /*
7593          * This sets up the inode de-allocation dependency.
7594          */
7595         freefile = malloc(sizeof(struct freefile),
7596                 M_FREEFILE, M_SOFTDEP_FLAGS);
7597         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7598         freefile->fx_mode = mode;
7599         freefile->fx_oldinum = ino;
7600         freefile->fx_devvp = ump->um_devvp;
7601         LIST_INIT(&freefile->fx_jwork);
7602         UFS_LOCK(ump);
7603         ump->um_fs->fs_pendinginodes += 1;
7604         UFS_UNLOCK(ump);
7605
7606         /*
7607          * If the inodedep does not exist, then the zero'ed inode has
7608          * been written to disk. If the allocated inode has never been
7609          * written to disk, then the on-disk inode is zero'ed. In either
7610          * case we can free the file immediately.  If the journal was
7611          * canceled before being written the inode will never make it to
7612          * disk and we must send the canceled journal entrys to
7613          * ffs_freefile() to be cleared in conjunction with the bitmap.
7614          * Any blocks waiting on the inode to write can be safely freed
7615          * here as it will never been written.
7616          */
7617         ACQUIRE_LOCK(ump);
7618         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7619         if (inodedep) {
7620                 /*
7621                  * Clear out freeblks that no longer need to reference
7622                  * this inode.
7623                  */
7624                 while ((freeblks =
7625                     TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7626                         TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7627                             fb_next);
7628                         freeblks->fb_state &= ~ONDEPLIST;
7629                 }
7630                 /*
7631                  * Remove this inode from the unlinked list.
7632                  */
7633                 if (inodedep->id_state & UNLINKED) {
7634                         /*
7635                          * Save the journal work to be freed with the bitmap
7636                          * before we clear UNLINKED.  Otherwise it can be lost
7637                          * if the inode block is written.
7638                          */
7639                         handle_bufwait(inodedep, &freefile->fx_jwork);
7640                         clear_unlinked_inodedep(inodedep);
7641                         /*
7642                          * Re-acquire inodedep as we've dropped the
7643                          * per-filesystem lock in clear_unlinked_inodedep().
7644                          */
7645                         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7646                 }
7647         }
7648         if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7649                 FREE_LOCK(ump);
7650                 handle_workitem_freefile(freefile);
7651                 return;
7652         }
7653         if ((inodedep->id_state & DEPCOMPLETE) == 0)
7654                 inodedep->id_state |= GOINGAWAY;
7655         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7656         FREE_LOCK(ump);
7657         if (ip->i_number == ino)
7658                 ip->i_flag |= IN_MODIFIED;
7659 }
7660
7661 /*
7662  * Check to see if an inode has never been written to disk. If
7663  * so free the inodedep and return success, otherwise return failure.
7664  *
7665  * If we still have a bitmap dependency, then the inode has never
7666  * been written to disk. Drop the dependency as it is no longer
7667  * necessary since the inode is being deallocated. We set the
7668  * ALLCOMPLETE flags since the bitmap now properly shows that the
7669  * inode is not allocated. Even if the inode is actively being
7670  * written, it has been rolled back to its zero'ed state, so we
7671  * are ensured that a zero inode is what is on the disk. For short
7672  * lived files, this change will usually result in removing all the
7673  * dependencies from the inode so that it can be freed immediately.
7674  */
7675 static int
7676 check_inode_unwritten(inodedep)
7677         struct inodedep *inodedep;
7678 {
7679
7680         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7681
7682         if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7683             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7684             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7685             !LIST_EMPTY(&inodedep->id_bufwait) ||
7686             !LIST_EMPTY(&inodedep->id_inowait) ||
7687             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7688             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7689             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7690             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7691             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7692             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7693             inodedep->id_mkdiradd != NULL ||
7694             inodedep->id_nlinkdelta != 0)
7695                 return (0);
7696         /*
7697          * Another process might be in initiate_write_inodeblock_ufs[12]
7698          * trying to allocate memory without holding "Softdep Lock".
7699          */
7700         if ((inodedep->id_state & IOSTARTED) != 0 &&
7701             inodedep->id_savedino1 == NULL)
7702                 return (0);
7703
7704         if (inodedep->id_state & ONDEPLIST)
7705                 LIST_REMOVE(inodedep, id_deps);
7706         inodedep->id_state &= ~ONDEPLIST;
7707         inodedep->id_state |= ALLCOMPLETE;
7708         inodedep->id_bmsafemap = NULL;
7709         if (inodedep->id_state & ONWORKLIST)
7710                 WORKLIST_REMOVE(&inodedep->id_list);
7711         if (inodedep->id_savedino1 != NULL) {
7712                 free(inodedep->id_savedino1, M_SAVEDINO);
7713                 inodedep->id_savedino1 = NULL;
7714         }
7715         if (free_inodedep(inodedep) == 0)
7716                 panic("check_inode_unwritten: busy inode");
7717         return (1);
7718 }
7719
7720 static int
7721 check_inodedep_free(inodedep)
7722         struct inodedep *inodedep;
7723 {
7724
7725         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7726         if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7727             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7728             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7729             !LIST_EMPTY(&inodedep->id_bufwait) ||
7730             !LIST_EMPTY(&inodedep->id_inowait) ||
7731             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7732             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7733             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7734             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7735             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7736             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7737             inodedep->id_mkdiradd != NULL ||
7738             inodedep->id_nlinkdelta != 0 ||
7739             inodedep->id_savedino1 != NULL)
7740                 return (0);
7741         return (1);
7742 }
7743
7744 /*
7745  * Try to free an inodedep structure. Return 1 if it could be freed.
7746  */
7747 static int
7748 free_inodedep(inodedep)
7749         struct inodedep *inodedep;
7750 {
7751
7752         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7753         if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7754             !check_inodedep_free(inodedep))
7755                 return (0);
7756         if (inodedep->id_state & ONDEPLIST)
7757                 LIST_REMOVE(inodedep, id_deps);
7758         LIST_REMOVE(inodedep, id_hash);
7759         WORKITEM_FREE(inodedep, D_INODEDEP);
7760         return (1);
7761 }
7762
7763 /*
7764  * Free the block referenced by a freework structure.  The parent freeblks
7765  * structure is released and completed when the final cg bitmap reaches
7766  * the disk.  This routine may be freeing a jnewblk which never made it to
7767  * disk in which case we do not have to wait as the operation is undone
7768  * in memory immediately.
7769  */
7770 static void
7771 freework_freeblock(freework, key)
7772         struct freework *freework;
7773         u_long key;
7774 {
7775         struct freeblks *freeblks;
7776         struct jnewblk *jnewblk;
7777         struct ufsmount *ump;
7778         struct workhead wkhd;
7779         struct fs *fs;
7780         int bsize;
7781         int needj;
7782
7783         ump = VFSTOUFS(freework->fw_list.wk_mp);
7784         LOCK_OWNED(ump);
7785         /*
7786          * Handle partial truncate separately.
7787          */
7788         if (freework->fw_indir) {
7789                 complete_trunc_indir(freework);
7790                 return;
7791         }
7792         freeblks = freework->fw_freeblks;
7793         fs = ump->um_fs;
7794         needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7795         bsize = lfragtosize(fs, freework->fw_frags);
7796         LIST_INIT(&wkhd);
7797         /*
7798          * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7799          * on the indirblk hashtable and prevents premature freeing.
7800          */
7801         freework->fw_state |= DEPCOMPLETE;
7802         /*
7803          * SUJ needs to wait for the segment referencing freed indirect
7804          * blocks to expire so that we know the checker will not confuse
7805          * a re-allocated indirect block with its old contents.
7806          */
7807         if (needj && freework->fw_lbn <= -UFS_NDADDR)
7808                 indirblk_insert(freework);
7809         /*
7810          * If we are canceling an existing jnewblk pass it to the free
7811          * routine, otherwise pass the freeblk which will ultimately
7812          * release the freeblks.  If we're not journaling, we can just
7813          * free the freeblks immediately.
7814          */
7815         jnewblk = freework->fw_jnewblk;
7816         if (jnewblk != NULL) {
7817                 cancel_jnewblk(jnewblk, &wkhd);
7818                 needj = 0;
7819         } else if (needj) {
7820                 freework->fw_state |= DELAYEDFREE;
7821                 freeblks->fb_cgwait++;
7822                 WORKLIST_INSERT(&wkhd, &freework->fw_list);
7823         }
7824         FREE_LOCK(ump);
7825         freeblks_free(ump, freeblks, btodb(bsize));
7826         CTR4(KTR_SUJ,
7827             "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
7828             freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7829         ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7830             freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
7831         ACQUIRE_LOCK(ump);
7832         /*
7833          * The jnewblk will be discarded and the bits in the map never
7834          * made it to disk.  We can immediately free the freeblk.
7835          */
7836         if (needj == 0)
7837                 handle_written_freework(freework);
7838 }
7839
7840 /*
7841  * We enqueue freework items that need processing back on the freeblks and
7842  * add the freeblks to the worklist.  This makes it easier to find all work
7843  * required to flush a truncation in process_truncates().
7844  */
7845 static void
7846 freework_enqueue(freework)
7847         struct freework *freework;
7848 {
7849         struct freeblks *freeblks;
7850
7851         freeblks = freework->fw_freeblks;
7852         if ((freework->fw_state & INPROGRESS) == 0)
7853                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7854         if ((freeblks->fb_state &
7855             (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7856             LIST_EMPTY(&freeblks->fb_jblkdephd))
7857                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7858 }
7859
7860 /*
7861  * Start, continue, or finish the process of freeing an indirect block tree.
7862  * The free operation may be paused at any point with fw_off containing the
7863  * offset to restart from.  This enables us to implement some flow control
7864  * for large truncates which may fan out and generate a huge number of
7865  * dependencies.
7866  */
7867 static void
7868 handle_workitem_indirblk(freework)
7869         struct freework *freework;
7870 {
7871         struct freeblks *freeblks;
7872         struct ufsmount *ump;
7873         struct fs *fs;
7874
7875         freeblks = freework->fw_freeblks;
7876         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7877         fs = ump->um_fs;
7878         if (freework->fw_state & DEPCOMPLETE) {
7879                 handle_written_freework(freework);
7880                 return;
7881         }
7882         if (freework->fw_off == NINDIR(fs)) {
7883                 freework_freeblock(freework, SINGLETON_KEY);
7884                 return;
7885         }
7886         freework->fw_state |= INPROGRESS;
7887         FREE_LOCK(ump);
7888         indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7889             freework->fw_lbn);
7890         ACQUIRE_LOCK(ump);
7891 }
7892
7893 /*
7894  * Called when a freework structure attached to a cg buf is written.  The
7895  * ref on either the parent or the freeblks structure is released and
7896  * the freeblks is added back to the worklist if there is more work to do.
7897  */
7898 static void
7899 handle_written_freework(freework)
7900         struct freework *freework;
7901 {
7902         struct freeblks *freeblks;
7903         struct freework *parent;
7904
7905         freeblks = freework->fw_freeblks;
7906         parent = freework->fw_parent;
7907         if (freework->fw_state & DELAYEDFREE)
7908                 freeblks->fb_cgwait--;
7909         freework->fw_state |= COMPLETE;
7910         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7911                 WORKITEM_FREE(freework, D_FREEWORK);
7912         if (parent) {
7913                 if (--parent->fw_ref == 0)
7914                         freework_enqueue(parent);
7915                 return;
7916         }
7917         if (--freeblks->fb_ref != 0)
7918                 return;
7919         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7920             ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7921                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7922 }
7923
7924 /*
7925  * This workitem routine performs the block de-allocation.
7926  * The workitem is added to the pending list after the updated
7927  * inode block has been written to disk.  As mentioned above,
7928  * checks regarding the number of blocks de-allocated (compared
7929  * to the number of blocks allocated for the file) are also
7930  * performed in this function.
7931  */
7932 static int
7933 handle_workitem_freeblocks(freeblks, flags)
7934         struct freeblks *freeblks;
7935         int flags;
7936 {
7937         struct freework *freework;
7938         struct newblk *newblk;
7939         struct allocindir *aip;
7940         struct ufsmount *ump;
7941         struct worklist *wk;
7942         u_long key;
7943
7944         KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7945             ("handle_workitem_freeblocks: Journal entries not written."));
7946         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7947         key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
7948         ACQUIRE_LOCK(ump);
7949         while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7950                 WORKLIST_REMOVE(wk);
7951                 switch (wk->wk_type) {
7952                 case D_DIRREM:
7953                         wk->wk_state |= COMPLETE;
7954                         add_to_worklist(wk, 0);
7955                         continue;
7956
7957                 case D_ALLOCDIRECT:
7958                         free_newblk(WK_NEWBLK(wk));
7959                         continue;
7960
7961                 case D_ALLOCINDIR:
7962                         aip = WK_ALLOCINDIR(wk);
7963                         freework = NULL;
7964                         if (aip->ai_state & DELAYEDFREE) {
7965                                 FREE_LOCK(ump);
7966                                 freework = newfreework(ump, freeblks, NULL,
7967                                     aip->ai_lbn, aip->ai_newblkno,
7968                                     ump->um_fs->fs_frag, 0, 0);
7969                                 ACQUIRE_LOCK(ump);
7970                         }
7971                         newblk = WK_NEWBLK(wk);
7972                         if (newblk->nb_jnewblk) {
7973                                 freework->fw_jnewblk = newblk->nb_jnewblk;
7974                                 newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7975                                 newblk->nb_jnewblk = NULL;
7976                         }
7977                         free_newblk(newblk);
7978                         continue;
7979
7980                 case D_FREEWORK:
7981                         freework = WK_FREEWORK(wk);
7982                         if (freework->fw_lbn <= -UFS_NDADDR)
7983                                 handle_workitem_indirblk(freework);
7984                         else
7985                                 freework_freeblock(freework, key);
7986                         continue;
7987                 default:
7988                         panic("handle_workitem_freeblocks: Unknown type %s",
7989                             TYPENAME(wk->wk_type));
7990                 }
7991         }
7992         if (freeblks->fb_ref != 0) {
7993                 freeblks->fb_state &= ~INPROGRESS;
7994                 wake_worklist(&freeblks->fb_list);
7995                 freeblks = NULL;
7996         }
7997         FREE_LOCK(ump);
7998         ffs_blkrelease_finish(ump, key);
7999         if (freeblks)
8000                 return handle_complete_freeblocks(freeblks, flags);
8001         return (0);
8002 }
8003
8004 /*
8005  * Handle completion of block free via truncate.  This allows fs_pending
8006  * to track the actual free block count more closely than if we only updated
8007  * it at the end.  We must be careful to handle cases where the block count
8008  * on free was incorrect.
8009  */
8010 static void
8011 freeblks_free(ump, freeblks, blocks)
8012         struct ufsmount *ump;
8013         struct freeblks *freeblks;
8014         int blocks;
8015 {
8016         struct fs *fs;
8017         ufs2_daddr_t remain;
8018
8019         UFS_LOCK(ump);
8020         remain = -freeblks->fb_chkcnt;
8021         freeblks->fb_chkcnt += blocks;
8022         if (remain > 0) {
8023                 if (remain < blocks)
8024                         blocks = remain;
8025                 fs = ump->um_fs;
8026                 fs->fs_pendingblocks -= blocks;
8027         }
8028         UFS_UNLOCK(ump);
8029 }
8030
8031 /*
8032  * Once all of the freework workitems are complete we can retire the
8033  * freeblocks dependency and any journal work awaiting completion.  This
8034  * can not be called until all other dependencies are stable on disk.
8035  */
8036 static int
8037 handle_complete_freeblocks(freeblks, flags)
8038         struct freeblks *freeblks;
8039         int flags;
8040 {
8041         struct inodedep *inodedep;
8042         struct inode *ip;
8043         struct vnode *vp;
8044         struct fs *fs;
8045         struct ufsmount *ump;
8046         ufs2_daddr_t spare;
8047
8048         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8049         fs = ump->um_fs;
8050         flags = LK_EXCLUSIVE | flags;
8051         spare = freeblks->fb_chkcnt;
8052
8053         /*
8054          * If we did not release the expected number of blocks we may have
8055          * to adjust the inode block count here.  Only do so if it wasn't
8056          * a truncation to zero and the modrev still matches.
8057          */
8058         if (spare && freeblks->fb_len != 0) {
8059                 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8060                     flags, &vp, FFSV_FORCEINSMQ) != 0)
8061                         return (EBUSY);
8062                 ip = VTOI(vp);
8063                 if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
8064                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
8065                         ip->i_flag |= IN_CHANGE;
8066                         /*
8067                          * We must wait so this happens before the
8068                          * journal is reclaimed.
8069                          */
8070                         ffs_update(vp, 1);
8071                 }
8072                 vput(vp);
8073         }
8074         if (spare < 0) {
8075                 UFS_LOCK(ump);
8076                 fs->fs_pendingblocks += spare;
8077                 UFS_UNLOCK(ump);
8078         }
8079 #ifdef QUOTA
8080         /* Handle spare. */
8081         if (spare)
8082                 quotaadj(freeblks->fb_quota, ump, -spare);
8083         quotarele(freeblks->fb_quota);
8084 #endif
8085         ACQUIRE_LOCK(ump);
8086         if (freeblks->fb_state & ONDEPLIST) {
8087                 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8088                     0, &inodedep);
8089                 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8090                 freeblks->fb_state &= ~ONDEPLIST;
8091                 if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8092                         free_inodedep(inodedep);
8093         }
8094         /*
8095          * All of the freeblock deps must be complete prior to this call
8096          * so it's now safe to complete earlier outstanding journal entries.
8097          */
8098         handle_jwork(&freeblks->fb_jwork);
8099         WORKITEM_FREE(freeblks, D_FREEBLKS);
8100         FREE_LOCK(ump);
8101         return (0);
8102 }
8103
8104 /*
8105  * Release blocks associated with the freeblks and stored in the indirect
8106  * block dbn. If level is greater than SINGLE, the block is an indirect block
8107  * and recursive calls to indirtrunc must be used to cleanse other indirect
8108  * blocks.
8109  *
8110  * This handles partial and complete truncation of blocks.  Partial is noted
8111  * with goingaway == 0.  In this case the freework is completed after the
8112  * zero'd indirects are written to disk.  For full truncation the freework
8113  * is completed after the block is freed.
8114  */
8115 static void
8116 indir_trunc(freework, dbn, lbn)
8117         struct freework *freework;
8118         ufs2_daddr_t dbn;
8119         ufs_lbn_t lbn;
8120 {
8121         struct freework *nfreework;
8122         struct workhead wkhd;
8123         struct freeblks *freeblks;
8124         struct buf *bp;
8125         struct fs *fs;
8126         struct indirdep *indirdep;
8127         struct mount *mp;
8128         struct ufsmount *ump;
8129         ufs1_daddr_t *bap1;
8130         ufs2_daddr_t nb, nnb, *bap2;
8131         ufs_lbn_t lbnadd, nlbn;
8132         u_long key;
8133         int nblocks, ufs1fmt, freedblocks;
8134         int goingaway, freedeps, needj, level, cnt, i;
8135
8136         freeblks = freework->fw_freeblks;
8137         mp = freeblks->fb_list.wk_mp;
8138         ump = VFSTOUFS(mp);
8139         fs = ump->um_fs;
8140         /*
8141          * Get buffer of block pointers to be freed.  There are three cases:
8142          *
8143          * 1) Partial truncate caches the indirdep pointer in the freework
8144          *    which provides us a back copy to the save bp which holds the
8145          *    pointers we want to clear.  When this completes the zero
8146          *    pointers are written to the real copy.
8147          * 2) The indirect is being completely truncated, cancel_indirdep()
8148          *    eliminated the real copy and placed the indirdep on the saved
8149          *    copy.  The indirdep and buf are discarded when this completes.
8150          * 3) The indirect was not in memory, we read a copy off of the disk
8151          *    using the devvp and drop and invalidate the buffer when we're
8152          *    done.
8153          */
8154         goingaway = 1;
8155         indirdep = NULL;
8156         if (freework->fw_indir != NULL) {
8157                 goingaway = 0;
8158                 indirdep = freework->fw_indir;
8159                 bp = indirdep->ir_savebp;
8160                 if (bp == NULL || bp->b_blkno != dbn)
8161                         panic("indir_trunc: Bad saved buf %p blkno %jd",
8162                             bp, (intmax_t)dbn);
8163         } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8164                 /*
8165                  * The lock prevents the buf dep list from changing and
8166                  * indirects on devvp should only ever have one dependency.
8167                  */
8168                 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8169                 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8170                         panic("indir_trunc: Bad indirdep %p from buf %p",
8171                             indirdep, bp);
8172         } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8173             NOCRED, &bp) != 0) {
8174                 brelse(bp);
8175                 return;
8176         }
8177         ACQUIRE_LOCK(ump);
8178         /* Protects against a race with complete_trunc_indir(). */
8179         freework->fw_state &= ~INPROGRESS;
8180         /*
8181          * If we have an indirdep we need to enforce the truncation order
8182          * and discard it when it is complete.
8183          */
8184         if (indirdep) {
8185                 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8186                     !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8187                         /*
8188                          * Add the complete truncate to the list on the
8189                          * indirdep to enforce in-order processing.
8190                          */
8191                         if (freework->fw_indir == NULL)
8192                                 TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8193                                     freework, fw_next);
8194                         FREE_LOCK(ump);
8195                         return;
8196                 }
8197                 /*
8198                  * If we're goingaway, free the indirdep.  Otherwise it will
8199                  * linger until the write completes.
8200                  */
8201                 if (goingaway)
8202                         free_indirdep(indirdep);
8203         }
8204         FREE_LOCK(ump);
8205         /* Initialize pointers depending on block size. */
8206         if (ump->um_fstype == UFS1) {
8207                 bap1 = (ufs1_daddr_t *)bp->b_data;
8208                 nb = bap1[freework->fw_off];
8209                 ufs1fmt = 1;
8210                 bap2 = NULL;
8211         } else {
8212                 bap2 = (ufs2_daddr_t *)bp->b_data;
8213                 nb = bap2[freework->fw_off];
8214                 ufs1fmt = 0;
8215                 bap1 = NULL;
8216         }
8217         level = lbn_level(lbn);
8218         needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8219         lbnadd = lbn_offset(fs, level);
8220         nblocks = btodb(fs->fs_bsize);
8221         nfreework = freework;
8222         freedeps = 0;
8223         cnt = 0;
8224         /*
8225          * Reclaim blocks.  Traverses into nested indirect levels and
8226          * arranges for the current level to be freed when subordinates
8227          * are free when journaling.
8228          */
8229         key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8230         for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8231                 if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
8232                     fs->fs_bsize) != 0)
8233                         nb = 0;
8234                 if (i != NINDIR(fs) - 1) {
8235                         if (ufs1fmt)
8236                                 nnb = bap1[i+1];
8237                         else
8238                                 nnb = bap2[i+1];
8239                 } else
8240                         nnb = 0;
8241                 if (nb == 0)
8242                         continue;
8243                 cnt++;
8244                 if (level != 0) {
8245                         nlbn = (lbn + 1) - (i * lbnadd);
8246                         if (needj != 0) {
8247                                 nfreework = newfreework(ump, freeblks, freework,
8248                                     nlbn, nb, fs->fs_frag, 0, 0);
8249                                 freedeps++;
8250                         }
8251                         indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8252                 } else {
8253                         struct freedep *freedep;
8254
8255                         /*
8256                          * Attempt to aggregate freedep dependencies for
8257                          * all blocks being released to the same CG.
8258                          */
8259                         LIST_INIT(&wkhd);
8260                         if (needj != 0 &&
8261                             (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8262                                 freedep = newfreedep(freework);
8263                                 WORKLIST_INSERT_UNLOCKED(&wkhd,
8264                                     &freedep->fd_list);
8265                                 freedeps++;
8266                         }
8267                         CTR3(KTR_SUJ,
8268                             "indir_trunc: ino %jd blkno %jd size %d",
8269                             freeblks->fb_inum, nb, fs->fs_bsize);
8270                         ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8271                             fs->fs_bsize, freeblks->fb_inum,
8272                             freeblks->fb_vtype, &wkhd, key);
8273                 }
8274         }
8275         ffs_blkrelease_finish(ump, key);
8276         if (goingaway) {
8277                 bp->b_flags |= B_INVAL | B_NOCACHE;
8278                 brelse(bp);
8279         }
8280         freedblocks = 0;
8281         if (level == 0)
8282                 freedblocks = (nblocks * cnt);
8283         if (needj == 0)
8284                 freedblocks += nblocks;
8285         freeblks_free(ump, freeblks, freedblocks);
8286         /*
8287          * If we are journaling set up the ref counts and offset so this
8288          * indirect can be completed when its children are free.
8289          */
8290         if (needj) {
8291                 ACQUIRE_LOCK(ump);
8292                 freework->fw_off = i;
8293                 freework->fw_ref += freedeps;
8294                 freework->fw_ref -= NINDIR(fs) + 1;
8295                 if (level == 0)
8296                         freeblks->fb_cgwait += freedeps;
8297                 if (freework->fw_ref == 0)
8298                         freework_freeblock(freework, SINGLETON_KEY);
8299                 FREE_LOCK(ump);
8300                 return;
8301         }
8302         /*
8303          * If we're not journaling we can free the indirect now.
8304          */
8305         dbn = dbtofsb(fs, dbn);
8306         CTR3(KTR_SUJ,
8307             "indir_trunc 2: ino %jd blkno %jd size %d",
8308             freeblks->fb_inum, dbn, fs->fs_bsize);
8309         ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8310             freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
8311         /* Non SUJ softdep does single-threaded truncations. */
8312         if (freework->fw_blkno == dbn) {
8313                 freework->fw_state |= ALLCOMPLETE;
8314                 ACQUIRE_LOCK(ump);
8315                 handle_written_freework(freework);
8316                 FREE_LOCK(ump);
8317         }
8318         return;
8319 }
8320
8321 /*
8322  * Cancel an allocindir when it is removed via truncation.  When bp is not
8323  * NULL the indirect never appeared on disk and is scheduled to be freed
8324  * independently of the indir so we can more easily track journal work.
8325  */
8326 static void
8327 cancel_allocindir(aip, bp, freeblks, trunc)
8328         struct allocindir *aip;
8329         struct buf *bp;
8330         struct freeblks *freeblks;
8331         int trunc;
8332 {
8333         struct indirdep *indirdep;
8334         struct freefrag *freefrag;
8335         struct newblk *newblk;
8336
8337         newblk = (struct newblk *)aip;
8338         LIST_REMOVE(aip, ai_next);
8339         /*
8340          * We must eliminate the pointer in bp if it must be freed on its
8341          * own due to partial truncate or pending journal work.
8342          */
8343         if (bp && (trunc || newblk->nb_jnewblk)) {
8344                 /*
8345                  * Clear the pointer and mark the aip to be freed
8346                  * directly if it never existed on disk.
8347                  */
8348                 aip->ai_state |= DELAYEDFREE;
8349                 indirdep = aip->ai_indirdep;
8350                 if (indirdep->ir_state & UFS1FMT)
8351                         ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8352                 else
8353                         ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8354         }
8355         /*
8356          * When truncating the previous pointer will be freed via
8357          * savedbp.  Eliminate the freefrag which would dup free.
8358          */
8359         if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8360                 newblk->nb_freefrag = NULL;
8361                 if (freefrag->ff_jdep)
8362                         cancel_jfreefrag(
8363                             WK_JFREEFRAG(freefrag->ff_jdep));
8364                 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8365                 WORKITEM_FREE(freefrag, D_FREEFRAG);
8366         }
8367         /*
8368          * If the journal hasn't been written the jnewblk must be passed
8369          * to the call to ffs_blkfree that reclaims the space.  We accomplish
8370          * this by leaving the journal dependency on the newblk to be freed
8371          * when a freework is created in handle_workitem_freeblocks().
8372          */
8373         cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8374         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8375 }
8376
8377 /*
8378  * Create the mkdir dependencies for . and .. in a new directory.  Link them
8379  * in to a newdirblk so any subsequent additions are tracked properly.  The
8380  * caller is responsible for adding the mkdir1 dependency to the journal
8381  * and updating id_mkdiradd.  This function returns with the per-filesystem
8382  * lock held.
8383  */
8384 static struct mkdir *
8385 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8386         struct diradd *dap;
8387         ino_t newinum;
8388         ino_t dinum;
8389         struct buf *newdirbp;
8390         struct mkdir **mkdirp;
8391 {
8392         struct newblk *newblk;
8393         struct pagedep *pagedep;
8394         struct inodedep *inodedep;
8395         struct newdirblk *newdirblk;
8396         struct mkdir *mkdir1, *mkdir2;
8397         struct worklist *wk;
8398         struct jaddref *jaddref;
8399         struct ufsmount *ump;
8400         struct mount *mp;
8401
8402         mp = dap->da_list.wk_mp;
8403         ump = VFSTOUFS(mp);
8404         newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8405             M_SOFTDEP_FLAGS);
8406         workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8407         LIST_INIT(&newdirblk->db_mkdir);
8408         mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8409         workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8410         mkdir1->md_state = ATTACHED | MKDIR_BODY;
8411         mkdir1->md_diradd = dap;
8412         mkdir1->md_jaddref = NULL;
8413         mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8414         workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8415         mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8416         mkdir2->md_diradd = dap;
8417         mkdir2->md_jaddref = NULL;
8418         if (MOUNTEDSUJ(mp) == 0) {
8419                 mkdir1->md_state |= DEPCOMPLETE;
8420                 mkdir2->md_state |= DEPCOMPLETE;
8421         }
8422         /*
8423          * Dependency on "." and ".." being written to disk.
8424          */
8425         mkdir1->md_buf = newdirbp;
8426         ACQUIRE_LOCK(VFSTOUFS(mp));
8427         LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8428         /*
8429          * We must link the pagedep, allocdirect, and newdirblk for
8430          * the initial file page so the pointer to the new directory
8431          * is not written until the directory contents are live and
8432          * any subsequent additions are not marked live until the
8433          * block is reachable via the inode.
8434          */
8435         if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8436                 panic("setup_newdir: lost pagedep");
8437         LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8438                 if (wk->wk_type == D_ALLOCDIRECT)
8439                         break;
8440         if (wk == NULL)
8441                 panic("setup_newdir: lost allocdirect");
8442         if (pagedep->pd_state & NEWBLOCK)
8443                 panic("setup_newdir: NEWBLOCK already set");
8444         newblk = WK_NEWBLK(wk);
8445         pagedep->pd_state |= NEWBLOCK;
8446         pagedep->pd_newdirblk = newdirblk;
8447         newdirblk->db_pagedep = pagedep;
8448         WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8449         WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8450         /*
8451          * Look up the inodedep for the parent directory so that we
8452          * can link mkdir2 into the pending dotdot jaddref or
8453          * the inode write if there is none.  If the inode is
8454          * ALLCOMPLETE and no jaddref is present all dependencies have
8455          * been satisfied and mkdir2 can be freed.
8456          */
8457         inodedep_lookup(mp, dinum, 0, &inodedep);
8458         if (MOUNTEDSUJ(mp)) {
8459                 if (inodedep == NULL)
8460                         panic("setup_newdir: Lost parent.");
8461                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8462                     inoreflst);
8463                 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8464                     (jaddref->ja_state & MKDIR_PARENT),
8465                     ("setup_newdir: bad dotdot jaddref %p", jaddref));
8466                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8467                 mkdir2->md_jaddref = jaddref;
8468                 jaddref->ja_mkdir = mkdir2;
8469         } else if (inodedep == NULL ||
8470             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8471                 dap->da_state &= ~MKDIR_PARENT;
8472                 WORKITEM_FREE(mkdir2, D_MKDIR);
8473                 mkdir2 = NULL;
8474         } else {
8475                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8476                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8477         }
8478         *mkdirp = mkdir2;
8479
8480         return (mkdir1);
8481 }
8482
8483 /*
8484  * Directory entry addition dependencies.
8485  *
8486  * When adding a new directory entry, the inode (with its incremented link
8487  * count) must be written to disk before the directory entry's pointer to it.
8488  * Also, if the inode is newly allocated, the corresponding freemap must be
8489  * updated (on disk) before the directory entry's pointer. These requirements
8490  * are met via undo/redo on the directory entry's pointer, which consists
8491  * simply of the inode number.
8492  *
8493  * As directory entries are added and deleted, the free space within a
8494  * directory block can become fragmented.  The ufs filesystem will compact
8495  * a fragmented directory block to make space for a new entry. When this
8496  * occurs, the offsets of previously added entries change. Any "diradd"
8497  * dependency structures corresponding to these entries must be updated with
8498  * the new offsets.
8499  */
8500
8501 /*
8502  * This routine is called after the in-memory inode's link
8503  * count has been incremented, but before the directory entry's
8504  * pointer to the inode has been set.
      *
      * A diradd is allocated for the new entry and linked onto the pagedep
      * of the directory block.  On journaled (SUJ) mounts it is then tied to
      * the last jaddref on the inode's reference list; otherwise it is tied
      * to the inodedep of the referenced inode.  When newdirbp is non-NULL
      * the entry names a newly made directory and setup_newdir() supplies
      * the mkdir dependencies for it.
      *
      * Whiteout entries (newinum == UFS_WINO) create no dependencies; any
      * newdirbp is simply handed to bdwrite() and 0 is returned.
      *
      * Returns 1 when a newdirblk was attached for a block in the indirect
      * range (bp->b_lblkno >= UFS_NDADDR), which signals the caller to sync;
      * returns 0 in every other case.  The lock taken on ump is released
      * on every return path that follows its acquisition.
8505  */
8506 int
8507 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8508         struct buf *bp;         /* buffer containing directory block */
8509         struct inode *dp;       /* inode for directory */
8510         off_t diroffset;        /* offset of new entry in directory */
8511         ino_t newinum;          /* inode referenced by new directory entry */
8512         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
8513         int isnewblk;           /* entry is in a newly allocated block */
8514 {
8515         int offset;             /* offset of new entry within directory block */
8516         ufs_lbn_t lbn;          /* block in directory containing new entry */
8517         struct fs *fs;
8518         struct diradd *dap;
8519         struct newblk *newblk;
8520         struct pagedep *pagedep;
8521         struct inodedep *inodedep;
8522         struct newdirblk *newdirblk;
8523         struct mkdir *mkdir1, *mkdir2;
8524         struct jaddref *jaddref;
8525         struct ufsmount *ump;
8526         struct mount *mp;
8527         int isindir;
8528
8529         mp = ITOVFS(dp);
8530         ump = VFSTOUFS(mp);
8531         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8532             ("softdep_setup_directory_add called on non-softdep filesystem"));
8533         /*
8534          * Whiteouts have no dependencies.
8535          */
8536         if (newinum == UFS_WINO) {
8537                 if (newdirbp != NULL)
8538                         bdwrite(newdirbp);
8539                 return (0);
8540         }
8541         jaddref = NULL;
8542         mkdir1 = mkdir2 = NULL;
8543         fs = ump->um_fs;
8544         lbn = lblkno(fs, diroffset);
8545         offset = blkoff(fs, diroffset);
8546         dap = malloc(sizeof(struct diradd), M_DIRADD,
8547                 M_SOFTDEP_FLAGS|M_ZERO);
8548         workitem_alloc(&dap->da_list, D_DIRADD, mp);
8549         dap->da_offset = offset;
8550         dap->da_newinum = newinum;
8551         dap->da_state = ATTACHED;
8552         LIST_INIT(&dap->da_jwork);
             /*
              * A newdirblk is allocated only for the first entry of a newly
              * allocated block: offset 0 of the block when it lies in the
              * indirect range, offset 0 of the fragment otherwise.
              */
8553         isindir = bp->b_lblkno >= UFS_NDADDR;
8554         newdirblk = NULL;
8555         if (isnewblk &&
8556             (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8557                 newdirblk = malloc(sizeof(struct newdirblk),
8558                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8559                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8560                 LIST_INIT(&newdirblk->db_mkdir);
8561         }
8562         /*
8563          * If we're creating a new directory setup the dependencies and set
8564          * the dap state to wait for them.  Otherwise it's COMPLETE and
8565          * we can move on.
8566          */
8567         if (newdirbp == NULL) {
8568                 dap->da_state |= DEPCOMPLETE;
8569                 ACQUIRE_LOCK(ump);
8570         } else {
                     /*
                      * NOTE(review): this branch does not call ACQUIRE_LOCK();
                      * setup_newdir() is presumably expected to return with the
                      * lock held, since FREE_LOCK() is called below -- confirm.
                      */
8571                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8572                 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8573                     &mkdir2);
8574         }
8575         /*
8576          * Link into parent directory pagedep to await its being written.
8577          */
8578         pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8579 #ifdef INVARIANTS
             /* Two diradds at the same offset of one block are never expected. */
8580         if (diradd_lookup(pagedep, offset) != NULL)
8581                 panic("softdep_setup_directory_add: %p already at off %d\n",
8582                     diradd_lookup(pagedep, offset), offset);
8583 #endif
8584         dap->da_pagedep = pagedep;
8585         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8586             da_pdlist);
8587         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8588         /*
8589          * If we're journaling, link the diradd into the jaddref so it
8590          * may be completed after the journal entry is written.  Otherwise,
8591          * link the diradd into its inodedep.  If the inode is not yet
8592          * written place it on the bufwait list, otherwise do the post-inode
8593          * write processing to put it on the id_pendinghd list.
8594          */
8595         if (MOUNTEDSUJ(mp)) {
8596                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8597                     inoreflst);
8598                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8599                     ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8600                 jaddref->ja_diroff = diroffset;
8601                 jaddref->ja_diradd = dap;
8602                 add_to_journal(&jaddref->ja_list);
8603         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8604                 diradd_inode_written(dap, inodedep);
8605         else
8606                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8607         /*
8608          * Add the journal entries for . and .. links now that the primary
8609          * link is written.
8610          */
8611         if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
                     /*
                      * The reference just before the primary link's jaddref on
                      * id_inoreflst is asserted to be the one for "." (its inode
                      * is its own parent and it carries MKDIR_BODY).
                      */
8612                 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8613                     inoreflst, if_deps);
8614                 KASSERT(jaddref != NULL &&
8615                     jaddref->ja_ino == jaddref->ja_parent &&
8616                     (jaddref->ja_state & MKDIR_BODY),
8617                     ("softdep_setup_directory_add: bad dot jaddref %p",
8618                     jaddref));
8619                 mkdir1->md_jaddref = jaddref;
8620                 jaddref->ja_mkdir = mkdir1;
8621                 /*
8622                  * It is important that the dotdot journal entry
8623                  * is added prior to the dot entry since dot writes
8624                  * both the dot and dotdot links.  These both must
8625                  * be added after the primary link for the journal
8626                  * to remain consistent.
8627                  */
8628                 add_to_journal(&mkdir2->md_jaddref->ja_list);
8629                 add_to_journal(&jaddref->ja_list);
8630         }
8631         /*
8632          * If we are adding a new directory remember this diradd so that if
8633          * we rename it we can keep the dot and dotdot dependencies.  If
8634          * we are adding a new name for an inode that has a mkdiradd we
8635          * must be in rename and we have to move the dot and dotdot
8636          * dependencies to this new name.  The old name is being orphaned
8637          * soon.
8638          */
8639         if (mkdir1 != NULL) {
8640                 if (inodedep->id_mkdiradd != NULL)
8641                         panic("softdep_setup_directory_add: Existing mkdir");
8642                 inodedep->id_mkdiradd = dap;
8643         } else if (inodedep->id_mkdiradd)
8644                 merge_diradd(inodedep, dap);
8645         if (newdirblk != NULL) {
8646                 /*
8647                  * There is nothing to do if we are already tracking
8648                  * this block.
8649                  */
8650                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
8651                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8652                         FREE_LOCK(ump);
8653                         return (0);
8654                 }
                     /* The newblk for this block must still exist; hang the newdirblk on it. */
8655                 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8656                     == 0)
8657                         panic("softdep_setup_directory_add: lost entry");
8658                 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8659                 pagedep->pd_state |= NEWBLOCK;
8660                 pagedep->pd_newdirblk = newdirblk;
8661                 newdirblk->db_pagedep = pagedep;
8662                 FREE_LOCK(ump);
8663                 /*
8664                  * If we extended into an indirect signal direnter to sync.
8665                  */
8666                 if (isindir)
8667                         return (1);
8668                 return (0);
8669         }
8670         FREE_LOCK(ump);
8671         return (0);
8672 }
8673
8674 /*
8675  * This procedure is called to change the offset of a directory
8676  * entry when compacting a directory block which must be owned
8677  * exclusively by the caller. Note that the actual entry movement
8678  * must be done in this procedure to ensure that no I/O completions
8679  * occur while the move is in progress.
      *
      * Any diradd recorded at the old offset is updated to the new offset.
      * On journaled (SUJ) mounts a jmvref describing the move is always
      * created and added to the journal.  The entry itself is copied with
      * bcopy() while the lock is still held.
8680  */
8681 void
8682 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8683         struct buf *bp;         /* Buffer holding directory block. */
8684         struct inode *dp;       /* inode for directory */
8685         caddr_t base;           /* address of dp->i_offset */
8686         caddr_t oldloc;         /* address of old directory location */
8687         caddr_t newloc;         /* address of new directory location */
8688         int entrysize;          /* size of directory entry */
8689 {
8690         int offset, oldoffset, newoffset;
8691         struct pagedep *pagedep;
8692         struct jmvref *jmvref;
8693         struct diradd *dap;
8694         struct direct *de;
8695         struct mount *mp;
8696         struct ufsmount *ump;
8697         ufs_lbn_t lbn;
8698         int flags;
8699
8700         mp = ITOVFS(dp);
8701         ump = VFSTOUFS(mp);
8702         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8703             ("softdep_change_directoryentry_offset called on "
8704              "non-softdep filesystem"));
8705         de = (struct direct *)oldloc;
8706         jmvref = NULL;
8707         flags = 0;
8708         /*
8709          * Moves are always journaled as it would be too complex to
8710          * determine if any affected adds or removes are present in the
8711          * journal.
8712          */
8713         if (MOUNTEDSUJ(mp)) {
                     /*
                      * The jmvref is created before the lock is taken and records
                      * the old and new positions as offsets within the directory.
                      */
8714                 flags = DEPALLOC;
8715                 jmvref = newjmvref(dp, de->d_ino,
8716                     dp->i_offset + (oldloc - base),
8717                     dp->i_offset + (newloc - base));
8718         }
             /* The diradd lookup below uses offsets relative to the directory block. */
8719         lbn = lblkno(ump->um_fs, dp->i_offset);
8720         offset = blkoff(ump->um_fs, dp->i_offset);
8721         oldoffset = offset + (oldloc - base);
8722         newoffset = offset + (newloc - base);
8723         ACQUIRE_LOCK(ump);
             /*
              * NOTE(review): a zero return skips the diradd update.  When flags is
              * DEPALLOC the code at done: still dereferences pagedep, so
              * pagedep_lookup() is presumably expected to have set it in that
              * case -- confirm against pagedep_lookup().
              */
8724         if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8725                 goto done;
8726         dap = diradd_lookup(pagedep, oldoffset);
8727         if (dap) {
8728                 dap->da_offset = newoffset;
                     /* From here on the two variables hold hash bucket indexes. */
8729                 newoffset = DIRADDHASH(newoffset);
8730                 oldoffset = DIRADDHASH(oldoffset);
                     /*
                      * A diradd that is ALLCOMPLETE is left where it is; only an
                      * incomplete one is moved to the bucket for its new offset.
                      */
8731                 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8732                     newoffset != oldoffset) {
8733                         LIST_REMOVE(dap, da_pdlist);
8734                         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8735                             dap, da_pdlist);
8736                 }
8737         }
8738 done:
8739         if (jmvref) {
8740                 jmvref->jm_pagedep = pagedep;
8741                 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8742                 add_to_journal(&jmvref->jm_list);
8743         }
             /* Move the entry itself before the lock is dropped. */
8744         bcopy(oldloc, newloc, entrysize);
8745         FREE_LOCK(ump);
8746 }
8747
8748 /*
8749  * Move the mkdir dependencies and journal work from one diradd to another
8750  * when renaming a directory.  The new name must depend on the mkdir deps
8751  * completing as the old name did.  Directories can only have one valid link
8752  * at a time so one must be canonical.
8753  */
8754 static void
8755 merge_diradd(inodedep, newdap)
8756         struct inodedep *inodedep;
8757         struct diradd *newdap;
8758 {
8759         struct diradd *olddap;
8760         struct mkdir *mkdir, *nextmd;
8761         struct ufsmount *ump;
8762         short state;
8763
8764         olddap = inodedep->id_mkdiradd;
8765         inodedep->id_mkdiradd = newdap;
8766         if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8767                 newdap->da_state &= ~DEPCOMPLETE;
8768                 ump = VFSTOUFS(inodedep->id_list.wk_mp);
8769                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8770                      mkdir = nextmd) {
8771                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8772                         if (mkdir->md_diradd != olddap)
8773                                 continue;
8774                         mkdir->md_diradd = newdap;
8775                         state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8776                         newdap->da_state |= state;
8777                         olddap->da_state &= ~state;
8778                         if ((olddap->da_state &
8779                             (MKDIR_PARENT | MKDIR_BODY)) == 0)
8780                                 break;
8781                 }
8782                 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8783                         panic("merge_diradd: unfound ref");
8784         }
8785         /*
8786          * Any mkdir related journal items are not safe to be freed until
8787          * the new name is stable.
8788          */
8789         jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8790         olddap->da_state |= DEPCOMPLETE;
8791         complete_diradd(olddap);
8792 }
8793
8794 /*
8795  * Move the diradd to the pending list when all diradd dependencies are
8796  * complete.
8797  */
8798 static void
8799 complete_diradd(dap)
8800         struct diradd *dap;
8801 {
8802         struct pagedep *pagedep;
8803
8804         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8805                 if (dap->da_state & DIRCHG)
8806                         pagedep = dap->da_previous->dm_pagedep;
8807                 else
8808                         pagedep = dap->da_pagedep;
8809                 LIST_REMOVE(dap, da_pdlist);
8810                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8811         }
8812 }
8813
8814 /*
8815  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8816  * add entries and conditionally journal the remove.
      *
      * A remove reference is freed when cancel_jaddref() returns 0 for the
      * matching add reference; any remove reference that is still non-NULL
      * afterwards is handed to journal_jremref().  A NULL jremref means no
      * remove references were allocated, in which case the diradd is simply
      * freed.  In every case the diradd is released through free_diradd().
8817  */
8818 static void
8819 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8820         struct diradd *dap;
8821         struct dirrem *dirrem;
8822         struct jremref *jremref;
8823         struct jremref *dotremref;
8824         struct jremref *dotdotremref;
8825 {
8826         struct inodedep *inodedep;
8827         struct jaddref *jaddref;
8828         struct inoref *inoref;
8829         struct ufsmount *ump;
8830         struct mkdir *mkdir;
8831
8832         /*
8833          * If no remove references were allocated we're on a non-journaled
8834          * filesystem and can skip the cancel step.
8835          */
8836         if (jremref == NULL) {
8837                 free_diradd(dap, NULL);
8838                 return;
8839         }
8840         /*
8841          * Cancel the primary name and free it if it does not require
8842          * journaling.
              *
              * NOTE(review): when this lookup fails, inodedep is used further
              * down with whatever value inodedep_lookup() left in it
              * (presumably NULL, which journal_jremref() handles by doing its
              * own lookup) -- confirm.
8843          */
8844         if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8845             0, &inodedep) != 0) {
8846                 /* Abort the addref that references this diradd.  */
8847                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8848                         if (inoref->if_list.wk_type != D_JADDREF)
8849                                 continue;
8850                         jaddref = (struct jaddref *)inoref;
8851                         if (jaddref->ja_diradd != dap)
8852                                 continue;
8853                         if (cancel_jaddref(jaddref, inodedep,
8854                             &dirrem->dm_jwork) == 0) {
8855                                 free_jremref(jremref);
8856                                 jremref = NULL;
8857                         }
8858                         break;
8859                 }
8860         }
8861         /*
8862          * Cancel subordinate names and free them if they do not require
8863          * journaling.
8864          */
8865         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8866                 ump = VFSTOUFS(dap->da_list.wk_mp);
8867                 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8868                         if (mkdir->md_diradd != dap)
8869                                 continue;
8870                         if ((jaddref = mkdir->md_jaddref) == NULL)
8871                                 continue;
                             /* Detach the jaddref from the mkdir before cancelling it. */
8872                         mkdir->md_jaddref = NULL;
                             /*
                              * A MKDIR_PARENT mkdir pairs with dotdotremref and is
                              * cancelled with a NULL inodedep; any other mkdir pairs
                              * with dotremref and uses the inodedep looked up above.
                              */
8873                         if (mkdir->md_state & MKDIR_PARENT) {
8874                                 if (cancel_jaddref(jaddref, NULL,
8875                                     &dirrem->dm_jwork) == 0) {
8876                                         free_jremref(dotdotremref);
8877                                         dotdotremref = NULL;
8878                                 }
8879                         } else {
8880                                 if (cancel_jaddref(jaddref, inodedep,
8881                                     &dirrem->dm_jwork) == 0) {
8882                                         free_jremref(dotremref);
8883                                         dotremref = NULL;
8884                                 }
8885                         }
8886                 }
8887         }
8888
             /* Journal whichever remove references survived the cancellation. */
8889         if (jremref)
8890                 journal_jremref(dirrem, jremref, inodedep);
8891         if (dotremref)
8892                 journal_jremref(dirrem, dotremref, inodedep);
8893         if (dotdotremref)
8894                 journal_jremref(dirrem, dotdotremref, NULL);
             /* Hand the diradd's pending journal work to the dirrem, then free it. */
8895         jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8896         free_diradd(dap, &dirrem->dm_jwork);
8897 }
8898
8899 /*
8900  * Free a diradd dependency structure.
      *
      * The softdep lock must be held (LOCK_OWNED).  The diradd is unlinked
      * from its pagedep list and, if queued, from its worklist; any mkdir
      * structures that still point at it are freed; its journal work is
      * passed to handle_jwork(); and the diradd itself is released.  If the
      * diradd carries DIRCHG, the saved dirrem is marked COMPLETE and is put
      * on the worklist when its jremref list is empty.
      *
      * NOTE(review): the wkhd argument is not referenced in this function.
8901  */
8902 static void
8903 free_diradd(dap, wkhd)
8904         struct diradd *dap;
8905         struct workhead *wkhd;
8906 {
8907         struct dirrem *dirrem;
8908         struct pagedep *pagedep;
8909         struct inodedep *inodedep;
8910         struct mkdir *mkdir, *nextmd;
8911         struct ufsmount *ump;
8912
8913         ump = VFSTOUFS(dap->da_list.wk_mp);
8914         LOCK_OWNED(ump);
8915         LIST_REMOVE(dap, da_pdlist);
8916         if (dap->da_state & ONWORKLIST)
8917                 WORKLIST_REMOVE(&dap->da_list);
8918         if ((dap->da_state & DIRCHG) == 0) {
8919                 pagedep = dap->da_pagedep;
8920         } else {
                     /*
                      * The pagedep is reached through the saved dirrem, which is
                      * completed here and queued unless remove references are
                      * still attached to it.
                      */
8921                 dirrem = dap->da_previous;
8922                 pagedep = dirrem->dm_pagedep;
8923                 dirrem->dm_dirinum = pagedep->pd_ino;
8924                 dirrem->dm_state |= COMPLETE;
8925                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8926                         add_to_worklist(&dirrem->dm_list, 0);
8927         }
             /* Drop the inodedep's back pointer if it names this diradd. */
8928         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8929             0, &inodedep) != 0)
8930                 if (inodedep->id_mkdiradd == dap)
8931                         inodedep->id_mkdiradd = NULL;
8932         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
                     /*
                      * nextmd is fetched before the body runs because the current
                      * mkdir may be unlinked and freed inside the loop.
                      */
8933                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8934                      mkdir = nextmd) {
8935                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8936                         if (mkdir->md_diradd != dap)
8937                                 continue;
8938                         dap->da_state &=
8939                             ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8940                         LIST_REMOVE(mkdir, md_mkdirs);
8941                         if (mkdir->md_state & ONWORKLIST)
8942                                 WORKLIST_REMOVE(&mkdir->md_list);
8943                         if (mkdir->md_jaddref != NULL)
8944                                 panic("free_diradd: Unexpected jaddref");
8945                         WORKITEM_FREE(mkdir, D_MKDIR);
8946                         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8947                                 break;
8948                 }
                     /* Every flagged mkdir bit must have had a matching mkdir. */
8949                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8950                         panic("free_diradd: unfound ref");
8951         }
             /*
              * NOTE(review): this test relies on inodedep_lookup() leaving
              * inodedep NULL when the lookup above failed -- confirm.
              */
8952         if (inodedep)
8953                 free_inodedep(inodedep);
8954         /*
8955          * Free any journal segments waiting for the directory write.
8956          */
8957         handle_jwork(&dap->da_jwork);
8958         WORKITEM_FREE(dap, D_DIRADD);
8959 }
8960
8961 /*
8962  * Directory entry removal dependencies.
8963  *
8964  * When removing a directory entry, the entry's inode pointer must be
8965  * zeroed on disk before the corresponding inode's link count is decremented
8966  * (possibly freeing the inode for re-use). This dependency is handled by
8967  * updating the directory entry but delaying the inode count reduction until
8968  * after the directory block has been written to disk. After this point, the
8969  * inode count can be decremented whenever it is convenient.
8970  */
8971
8972 /*
8973  * This routine should be called immediately after removing
8974  * a directory entry.  The inode's link count should not be
8975  * decremented by the calling procedure -- the soft updates
8976  * code will do this task when it is safe.
      *
      * The softdep lock is acquired inside newdirrem() and released here
      * on both paths before returning.
8977  */
8978 void
8979 softdep_setup_remove(bp, dp, ip, isrmdir)
8980         struct buf *bp;         /* buffer containing directory block */
8981         struct inode *dp;       /* inode for the directory being modified */
8982         struct inode *ip;       /* inode for directory entry being removed */
8983         int isrmdir;            /* indicates if doing RMDIR */
8984 {
8985         struct dirrem *dirrem, *prevdirrem;
8986         struct inodedep *inodedep;
8987         struct ufsmount *ump;
8988         int direct;
8989
8990         ump = ITOUMP(ip);
8991         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
8992             ("softdep_setup_remove called on non-softdep filesystem"));
8993         /*
8994          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8995          * newdirrem() to setup the full directory remove which requires
8996          * isrmdir > 1.
8997          */
8998         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8999         /*
9000          * Add the dirrem to the inodedep's pending remove list for quick
9001          * discovery later.
9002          */
9003         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
9004                 panic("softdep_setup_remove: Lost inodedep.");
9005         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
9006         dirrem->dm_state |= ONDEPLIST;
9007         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9008
9009         /*
9010          * If the COMPLETE flag is clear, then there were no active
9011          * entries and we want to roll back to a zeroed entry until
9012          * the new inode is committed to disk. If the COMPLETE flag is
9013          * set then we have deleted an entry that never made it to
9014          * disk. If the entry we deleted resulted from a name change,
9015          * then the old name still resides on disk. We cannot delete
9016          * its inode (returned to us in prevdirrem) until the zeroed
9017          * directory entry gets to disk. The new inode has never been
9018          * referenced on the disk, so can be deleted immediately.
9019          */
9020         if ((dirrem->dm_state & COMPLETE) == 0) {
9021                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
9022                     dm_next);
9023                 FREE_LOCK(ump);
9024         } else {
9025                 if (prevdirrem != NULL)
9026                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
9027                             prevdirrem, dm_next);
9028                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
                     /*
                      * Sample the jremref list while still locked.  The remove is
                      * processed right away, after the lock is dropped, only when
                      * no remove references are attached to the dirrem.
                      */
9029                 direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
9030                 FREE_LOCK(ump);
9031                 if (direct)
9032                         handle_workitem_remove(dirrem, 0);
9033         }
9034 }
9035
9036 /*
9037  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
9038  * pd_pendinghd list of a pagedep.
9039  */
9040 static struct diradd *
9041 diradd_lookup(pagedep, offset)
9042         struct pagedep *pagedep;
9043         int offset;
9044 {
9045         struct diradd *dap;
9046
9047         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
9048                 if (dap->da_offset == offset)
9049                         return (dap);
9050         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
9051                 if (dap->da_offset == offset)
9052                         return (dap);
9053         return (NULL);
9054 }
9055
9056 /*
9057  * Search for a .. diradd dependency in a directory that is being removed.
9058  * If the directory was renamed to a new parent we have a diradd rather
9059  * than a mkdir for the .. entry.  We need to cancel it now before
9060  * it is found in truncate().
9061  */
9062 static struct jremref *
9063 cancel_diradd_dotdot(ip, dirrem, jremref)
9064         struct inode *ip;
9065         struct dirrem *dirrem;
9066         struct jremref *jremref;
9067 {
9068         struct pagedep *pagedep;
9069         struct diradd *dap;
9070         struct worklist *wk;
9071
9072         if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
9073                 return (jremref);
9074         dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9075         if (dap == NULL)
9076                 return (jremref);
9077         cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9078         /*
9079          * Mark any journal work as belonging to the parent so it is freed
9080          * with the .. reference.
9081          */
9082         LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9083                 wk->wk_state |= MKDIR_PARENT;
9084         return (NULL);
9085 }
9086
9087 /*
9088  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9089  * replace it with a dirrem/diradd pair as a result of re-parenting a
9090  * directory.  This ensures that we don't simultaneously have a mkdir and
9091  * a diradd for the same .. entry.
9092  */
9093 static struct jremref *
9094 cancel_mkdir_dotdot(ip, dirrem, jremref)
9095         struct inode *ip;
9096         struct dirrem *dirrem;
9097         struct jremref *jremref;
9098 {
9099         struct inodedep *inodedep;
9100         struct jaddref *jaddref;
9101         struct ufsmount *ump;
9102         struct mkdir *mkdir;
9103         struct diradd *dap;
9104         struct mount *mp;
9105
9106         mp = ITOVFS(ip);
9107         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9108                 return (jremref);
9109         dap = inodedep->id_mkdiradd;
9110         if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9111                 return (jremref);
9112         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9113         for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9114             mkdir = LIST_NEXT(mkdir, md_mkdirs))
9115                 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9116                         break;
9117         if (mkdir == NULL)
9118                 panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9119         if ((jaddref = mkdir->md_jaddref) != NULL) {
9120                 mkdir->md_jaddref = NULL;
9121                 jaddref->ja_state &= ~MKDIR_PARENT;
9122                 if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
9123                         panic("cancel_mkdir_dotdot: Lost parent inodedep");
9124                 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9125                         journal_jremref(dirrem, jremref, inodedep);
9126                         jremref = NULL;
9127                 }
9128         }
9129         if (mkdir->md_state & ONWORKLIST)
9130                 WORKLIST_REMOVE(&mkdir->md_list);
9131         mkdir->md_state |= ALLCOMPLETE;
9132         complete_mkdir(mkdir);
9133         return (jremref);
9134 }
9135
9136 static void
9137 journal_jremref(dirrem, jremref, inodedep)
9138         struct dirrem *dirrem;
9139         struct jremref *jremref;
9140         struct inodedep *inodedep;
9141 {
9142
9143         if (inodedep == NULL)
9144                 if (inodedep_lookup(jremref->jr_list.wk_mp,
9145                     jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9146                         panic("journal_jremref: Lost inodedep");
9147         LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9148         TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9149         add_to_journal(&jremref->jr_list);
9150 }
9151
9152 static void
9153 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9154         struct dirrem *dirrem;
9155         struct jremref *jremref;
9156         struct jremref *dotremref;
9157         struct jremref *dotdotremref;
9158 {
9159         struct inodedep *inodedep;
9160
9161
9162         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9163             &inodedep) == 0)
9164                 panic("dirrem_journal: Lost inodedep");
9165         journal_jremref(dirrem, jremref, inodedep);
9166         if (dotremref)
9167                 journal_jremref(dirrem, dotremref, inodedep);
9168         if (dotdotremref)
9169                 journal_jremref(dirrem, dotdotremref, NULL);
9170 }
9171
9172 /*
9173  * Allocate a new dirrem if appropriate and return it along with
9174  * its associated pagedep. Called without a lock, returns with lock.
9175  */
9176 static struct dirrem *
9177 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9178         struct buf *bp;         /* buffer containing directory block */
9179         struct inode *dp;       /* inode for the directory being modified */
9180         struct inode *ip;       /* inode for directory entry being removed */
9181         int isrmdir;            /* indicates if doing RMDIR */
9182         struct dirrem **prevdirremp; /* previously referenced inode, if any */
9183 {
9184         int offset;
9185         ufs_lbn_t lbn;
9186         struct diradd *dap;
9187         struct dirrem *dirrem;
9188         struct pagedep *pagedep;
9189         struct jremref *jremref;
9190         struct jremref *dotremref;
9191         struct jremref *dotdotremref;
9192         struct vnode *dvp;
9193         struct ufsmount *ump;
9194
9195         /*
9196          * Whiteouts have no deletion dependencies.
9197          */
9198         if (ip == NULL)
9199                 panic("newdirrem: whiteout");
9200         dvp = ITOV(dp);
9201         ump = ITOUMP(dp);
9202
9203         /*
9204          * If the system is over its limit and our filesystem is
9205          * responsible for more than our share of that usage and
9206          * we are not a snapshot, request some inodedep cleanup.
9207          * Limiting the number of dirrem structures will also limit
9208          * the number of freefile and freeblks structures.
9209          */
9210         ACQUIRE_LOCK(ump);
9211         if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
9212                 schedule_cleanup(UFSTOVFS(ump));
9213         else
9214                 FREE_LOCK(ump);
9215         dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9216             M_ZERO);
9217         workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9218         LIST_INIT(&dirrem->dm_jremrefhd);
9219         LIST_INIT(&dirrem->dm_jwork);
9220         dirrem->dm_state = isrmdir ? RMDIR : 0;
9221         dirrem->dm_oldinum = ip->i_number;
9222         *prevdirremp = NULL;
9223         /*
9224          * Allocate remove reference structures to track journal write
9225          * dependencies.  We will always have one for the link and
9226          * when doing directories we will always have one more for dot.
9227          * When renaming a directory we skip the dotdot link change so
9228          * this is not needed.
9229          */
9230         jremref = dotremref = dotdotremref = NULL;
9231         if (DOINGSUJ(dvp)) {
9232                 if (isrmdir) {
9233                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9234                             ip->i_effnlink + 2);
9235                         dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9236                             ip->i_effnlink + 1);
9237                         dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9238                             dp->i_effnlink + 1);
9239                         dotdotremref->jr_state |= MKDIR_PARENT;
9240                 } else
9241                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9242                             ip->i_effnlink + 1);
9243         }
9244         ACQUIRE_LOCK(ump);
9245         lbn = lblkno(ump->um_fs, dp->i_offset);
9246         offset = blkoff(ump->um_fs, dp->i_offset);
9247         pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
9248             &pagedep);
9249         dirrem->dm_pagedep = pagedep;
9250         dirrem->dm_offset = offset;
9251         /*
9252          * If we're renaming a .. link to a new directory, cancel any
9253          * existing MKDIR_PARENT mkdir.  If it has already been canceled
9254          * the jremref is preserved for any potential diradd in this
9255          * location.  This can not coincide with a rmdir.
9256          */
9257         if (dp->i_offset == DOTDOT_OFFSET) {
9258                 if (isrmdir)
9259                         panic("newdirrem: .. directory change during remove?");
9260                 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9261         }
9262         /*
9263          * If we're removing a directory search for the .. dependency now and
9264          * cancel it.  Any pending journal work will be added to the dirrem
9265          * to be completed when the workitem remove completes.
9266          */
9267         if (isrmdir)
9268                 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9269         /*
9270          * Check for a diradd dependency for the same directory entry.
9271          * If present, then both dependencies become obsolete and can
9272          * be de-allocated.
9273          */
9274         dap = diradd_lookup(pagedep, offset);
9275         if (dap == NULL) {
9276                 /*
9277                  * Link the jremref structures into the dirrem so they are
9278                  * written prior to the pagedep.
9279                  */
9280                 if (jremref)
9281                         dirrem_journal(dirrem, jremref, dotremref,
9282                             dotdotremref);
9283                 return (dirrem);
9284         }
9285         /*
9286          * Must be ATTACHED at this point.
9287          */
9288         if ((dap->da_state & ATTACHED) == 0)
9289                 panic("newdirrem: not ATTACHED");
9290         if (dap->da_newinum != ip->i_number)
9291                 panic("newdirrem: inum %ju should be %ju",
9292                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9293         /*
9294          * If we are deleting a changed name that never made it to disk,
9295          * then return the dirrem describing the previous inode (which
9296          * represents the inode currently referenced from this entry on disk).
9297          */
9298         if ((dap->da_state & DIRCHG) != 0) {
9299                 *prevdirremp = dap->da_previous;
9300                 dap->da_state &= ~DIRCHG;
9301                 dap->da_pagedep = pagedep;
9302         }
9303         /*
9304          * We are deleting an entry that never made it to disk.
9305          * Mark it COMPLETE so we can delete its inode immediately.
9306          */
9307         dirrem->dm_state |= COMPLETE;
9308         cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9309 #ifdef INVARIANTS
9310         if (isrmdir == 0) {
9311                 struct worklist *wk;
9312
9313                 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9314                         if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9315                                 panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9316         }
9317 #endif
9318
9319         return (dirrem);
9320 }
9321
9322 /*
9323  * Directory entry change dependencies.
9324  *
9325  * Changing an existing directory entry requires that an add operation
9326  * be completed first followed by a deletion. The semantics for the addition
9327  * are identical to the description of adding a new entry above except
9328  * that the rollback is to the old inode number rather than zero. Once
9329  * the addition dependency is completed, the removal is done as described
9330  * in the removal routine above.
9331  */
9332
9333 /*
9334  * This routine should be called immediately after changing
9335  * a directory entry.  The inode's link count should not be
9336  * decremented by the calling procedure -- the soft updates
9337  * code will perform this task when it is safe.
9338  */
9339 void
9340 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9341         struct buf *bp;         /* buffer containing directory block */
9342         struct inode *dp;       /* inode for the directory being modified */
9343         struct inode *ip;       /* inode for directory entry being removed */
9344         ino_t newinum;          /* new inode number for changed entry */
9345         int isrmdir;            /* indicates if doing RMDIR */
9346 {
9347         int offset;
9348         struct diradd *dap = NULL;
9349         struct dirrem *dirrem, *prevdirrem;
9350         struct pagedep *pagedep;
9351         struct inodedep *inodedep;
9352         struct jaddref *jaddref;
9353         struct mount *mp;
9354         struct ufsmount *ump;
9355
9356         mp = ITOVFS(dp);
9357         ump = VFSTOUFS(mp);
9358         offset = blkoff(ump->um_fs, dp->i_offset);
9359         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9360            ("softdep_setup_directory_change called on non-softdep filesystem"));
9361
9362         /*
9363          * Whiteouts do not need diradd dependencies.
9364          */
9365         if (newinum != UFS_WINO) {
9366                 dap = malloc(sizeof(struct diradd),
9367                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9368                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
9369                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9370                 dap->da_offset = offset;
9371                 dap->da_newinum = newinum;
9372                 LIST_INIT(&dap->da_jwork);
9373         }
9374
9375         /*
9376          * Allocate a new dirrem and ACQUIRE_LOCK.
9377          */
9378         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9379         pagedep = dirrem->dm_pagedep;
9380         /*
9381          * The possible values for isrmdir:
9382          *      0 - non-directory file rename
9383          *      1 - directory rename within same directory
9384          *   inum - directory rename to new directory of given inode number
9385          * When renaming to a new directory, we are both deleting and
9386          * creating a new directory entry, so the link count on the new
9387          * directory should not change. Thus we do not need the followup
9388          * dirrem which is usually done in handle_workitem_remove. We set
9389          * the DIRCHG flag to tell handle_workitem_remove to skip the
9390          * followup dirrem.
9391          */
9392         if (isrmdir > 1)
9393                 dirrem->dm_state |= DIRCHG;
9394
9395         /*
9396          * Whiteouts have no additional dependencies,
9397          * so just put the dirrem on the correct list.
9398          */
9399         if (newinum == UFS_WINO) {
9400                 if ((dirrem->dm_state & COMPLETE) == 0) {
9401                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9402                             dm_next);
9403                 } else {
9404                         dirrem->dm_dirinum = pagedep->pd_ino;
9405                         if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9406                                 add_to_worklist(&dirrem->dm_list, 0);
9407                 }
9408                 FREE_LOCK(ump);
9409                 return;
9410         }
9411         /*
9412          * Add the dirrem to the inodedep's pending remove list for quick
9413          * discovery later.  A valid nlinkdelta ensures that this lookup
9414          * will not fail.
9415          */
9416         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9417                 panic("softdep_setup_directory_change: Lost inodedep.");
9418         dirrem->dm_state |= ONDEPLIST;
9419         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9420
9421         /*
9422          * If the COMPLETE flag is clear, then there were no active
9423          * entries and we want to roll back to the previous inode until
9424          * the new inode is committed to disk. If the COMPLETE flag is
9425          * set, then we have deleted an entry that never made it to disk.
9426          * If the entry we deleted resulted from a name change, then the old
9427          * inode reference still resides on disk. Any rollback that we do
9428          * needs to be to that old inode (returned to us in prevdirrem). If
9429          * the entry we deleted resulted from a create, then there is
9430          * no entry on the disk, so we want to roll back to zero rather
9431          * than the uncommitted inode. In either of the COMPLETE cases we
9432          * want to immediately free the unwritten and unreferenced inode.
9433          */
9434         if ((dirrem->dm_state & COMPLETE) == 0) {
9435                 dap->da_previous = dirrem;
9436         } else {
9437                 if (prevdirrem != NULL) {
9438                         dap->da_previous = prevdirrem;
9439                 } else {
9440                         dap->da_state &= ~DIRCHG;
9441                         dap->da_pagedep = pagedep;
9442                 }
9443                 dirrem->dm_dirinum = pagedep->pd_ino;
9444                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9445                         add_to_worklist(&dirrem->dm_list, 0);
9446         }
9447         /*
9448          * Lookup the jaddref for this journal entry.  We must finish
9449          * initializing it and make the diradd write dependent on it.
9450          * If we're not journaling, put it on the id_bufwait list if the
9451          * inode is not yet written. If it is written, do the post-inode
9452          * write processing to put it on the id_pendinghd list.
9453          */
9454         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9455         if (MOUNTEDSUJ(mp)) {
9456                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9457                     inoreflst);
9458                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9459                     ("softdep_setup_directory_change: bad jaddref %p",
9460                     jaddref));
9461                 jaddref->ja_diroff = dp->i_offset;
9462                 jaddref->ja_diradd = dap;
9463                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9464                     dap, da_pdlist);
9465                 add_to_journal(&jaddref->ja_list);
9466         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9467                 dap->da_state |= COMPLETE;
9468                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9469                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9470         } else {
9471                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9472                     dap, da_pdlist);
9473                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9474         }
9475         /*
9476          * If we're making a new name for a directory that has not been
9477          * committed when need to move the dot and dotdot references to
9478          * this new name.
9479          */
9480         if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9481                 merge_diradd(inodedep, dap);
9482         FREE_LOCK(ump);
9483 }
9484
9485 /*
9486  * Called whenever the link count on an inode is changed.
9487  * It creates an inode dependency so that the new reference(s)
9488  * to the inode cannot be committed to disk until the updated
9489  * inode has been written.
9490  */
9491 void
9492 softdep_change_linkcnt(ip)
9493         struct inode *ip;       /* the inode with the increased link count */
9494 {
9495         struct inodedep *inodedep;
9496         struct ufsmount *ump;
9497
9498         ump = ITOUMP(ip);
9499         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9500             ("softdep_change_linkcnt called on non-softdep filesystem"));
9501         ACQUIRE_LOCK(ump);
9502         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
9503         if (ip->i_nlink < ip->i_effnlink)
9504                 panic("softdep_change_linkcnt: bad delta");
9505         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9506         FREE_LOCK(ump);
9507 }
9508
9509 /*
9510  * Attach a sbdep dependency to the superblock buf so that we can keep
9511  * track of the head of the linked list of referenced but unlinked inodes.
9512  */
9513 void
9514 softdep_setup_sbupdate(ump, fs, bp)
9515         struct ufsmount *ump;
9516         struct fs *fs;
9517         struct buf *bp;
9518 {
9519         struct sbdep *sbdep;
9520         struct worklist *wk;
9521
9522         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9523             ("softdep_setup_sbupdate called on non-softdep filesystem"));
9524         LIST_FOREACH(wk, &bp->b_dep, wk_list)
9525                 if (wk->wk_type == D_SBDEP)
9526                         break;
9527         if (wk != NULL)
9528                 return;
9529         sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9530         workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9531         sbdep->sb_fs = fs;
9532         sbdep->sb_ump = ump;
9533         ACQUIRE_LOCK(ump);
9534         WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9535         FREE_LOCK(ump);
9536 }
9537
9538 /*
9539  * Return the first unlinked inodedep which is ready to be the head of the
9540  * list.  The inodedep and all those after it must have valid next pointers.
9541  */
9542 static struct inodedep *
9543 first_unlinked_inodedep(ump)
9544         struct ufsmount *ump;
9545 {
9546         struct inodedep *inodedep;
9547         struct inodedep *idp;
9548
9549         LOCK_OWNED(ump);
9550         for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9551             inodedep; inodedep = idp) {
9552                 if ((inodedep->id_state & UNLINKNEXT) == 0)
9553                         return (NULL);
9554                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9555                 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9556                         break;
9557                 if ((inodedep->id_state & UNLINKPREV) == 0)
9558                         break;
9559         }
9560         return (inodedep);
9561 }
9562
9563 /*
9564  * Set the sujfree unlinked head pointer prior to writing a superblock.
9565  */
9566 static void
9567 initiate_write_sbdep(sbdep)
9568         struct sbdep *sbdep;
9569 {
9570         struct inodedep *inodedep;
9571         struct fs *bpfs;
9572         struct fs *fs;
9573
9574         bpfs = sbdep->sb_fs;
9575         fs = sbdep->sb_ump->um_fs;
9576         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9577         if (inodedep) {
9578                 fs->fs_sujfree = inodedep->id_ino;
9579                 inodedep->id_state |= UNLINKPREV;
9580         } else
9581                 fs->fs_sujfree = 0;
9582         bpfs->fs_sujfree = fs->fs_sujfree;
9583 }
9584
9585 /*
9586  * After a superblock is written determine whether it must be written again
9587  * due to a changing unlinked list head.
9588  */
9589 static int
9590 handle_written_sbdep(sbdep, bp)
9591         struct sbdep *sbdep;
9592         struct buf *bp;
9593 {
9594         struct inodedep *inodedep;
9595         struct fs *fs;
9596
9597         LOCK_OWNED(sbdep->sb_ump);
9598         fs = sbdep->sb_fs;
9599         /*
9600          * If the superblock doesn't match the in-memory list start over.
9601          */
9602         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9603         if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9604             (inodedep == NULL && fs->fs_sujfree != 0)) {
9605                 bdirty(bp);
9606                 return (1);
9607         }
9608         WORKITEM_FREE(sbdep, D_SBDEP);
9609         if (fs->fs_sujfree == 0)
9610                 return (0);
9611         /*
9612          * Now that we have a record of this inode in stable store allow it
9613          * to be written to free up pending work.  Inodes may see a lot of
9614          * write activity after they are unlinked which we must not hold up.
9615          */
9616         for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9617                 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9618                         panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9619                             inodedep, inodedep->id_state);
9620                 if (inodedep->id_state & UNLINKONLIST)
9621                         break;
9622                 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9623         }
9624
9625         return (0);
9626 }
9627
9628 /*
9629  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9630  */
9631 static void
9632 unlinked_inodedep(mp, inodedep)
9633         struct mount *mp;
9634         struct inodedep *inodedep;
9635 {
9636         struct ufsmount *ump;
9637
9638         ump = VFSTOUFS(mp);
9639         LOCK_OWNED(ump);
9640         if (MOUNTEDSUJ(mp) == 0)
9641                 return;
9642         ump->um_fs->fs_fmod = 1;
9643         if (inodedep->id_state & UNLINKED)
9644                 panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9645         inodedep->id_state |= UNLINKED;
9646         TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9647 }
9648
9649 /*
9650  * Remove an inodedep from the unlinked inodedep list.  This may require
9651  * disk writes if the inode has made it that far.
9652  */
9653 static void
9654 clear_unlinked_inodedep(inodedep)
9655         struct inodedep *inodedep;
9656 {
9657         struct ufs2_dinode *dip;
9658         struct ufsmount *ump;
9659         struct inodedep *idp;
9660         struct inodedep *idn;
9661         struct fs *fs;
9662         struct buf *bp;
9663         ino_t ino;
9664         ino_t nino;
9665         ino_t pino;
9666         int error;
9667
9668         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9669         fs = ump->um_fs;
9670         ino = inodedep->id_ino;
9671         error = 0;
9672         for (;;) {
9673                 LOCK_OWNED(ump);
9674                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9675                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9676                     inodedep));
9677                 /*
9678                  * If nothing has yet been written simply remove us from
9679                  * the in memory list and return.  This is the most common
9680                  * case where handle_workitem_remove() loses the final
9681                  * reference.
9682                  */
9683                 if ((inodedep->id_state & UNLINKLINKS) == 0)
9684                         break;
9685                 /*
9686                  * If we have a NEXT pointer and no PREV pointer we can simply
9687                  * clear NEXT's PREV and remove ourselves from the list.  Be
9688                  * careful not to clear PREV if the superblock points at
9689                  * next as well.
9690                  */
9691                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9692                 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9693                         if (idn && fs->fs_sujfree != idn->id_ino)
9694                                 idn->id_state &= ~UNLINKPREV;
9695                         break;
9696                 }
9697                 /*
9698                  * Here we have an inodedep which is actually linked into
9699                  * the list.  We must remove it by forcing a write to the
9700                  * link before us, whether it be the superblock or an inode.
9701                  * Unfortunately the list may change while we're waiting
9702                  * on the buf lock for either resource so we must loop until
9703                  * we lock the right one.  If both the superblock and an
9704                  * inode point to this inode we must clear the inode first
9705                  * followed by the superblock.
9706                  */
9707                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9708                 pino = 0;
9709                 if (idp && (idp->id_state & UNLINKNEXT))
9710                         pino = idp->id_ino;
9711                 FREE_LOCK(ump);
9712                 if (pino == 0) {
9713                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9714                             (int)fs->fs_sbsize, 0, 0, 0);
9715                 } else {
9716                         error = bread(ump->um_devvp,
9717                             fsbtodb(fs, ino_to_fsba(fs, pino)),
9718                             (int)fs->fs_bsize, NOCRED, &bp);
9719                         if (error)
9720                                 brelse(bp);
9721                 }
9722                 ACQUIRE_LOCK(ump);
9723                 if (error)
9724                         break;
9725                 /* If the list has changed restart the loop. */
9726                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9727                 nino = 0;
9728                 if (idp && (idp->id_state & UNLINKNEXT))
9729                         nino = idp->id_ino;
9730                 if (nino != pino ||
9731                     (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9732                         FREE_LOCK(ump);
9733                         brelse(bp);
9734                         ACQUIRE_LOCK(ump);
9735                         continue;
9736                 }
9737                 nino = 0;
9738                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9739                 if (idn)
9740                         nino = idn->id_ino;
9741                 /*
9742                  * Remove us from the in memory list.  After this we cannot
9743                  * access the inodedep.
9744                  */
9745                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9746                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9747                     inodedep));
9748                 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9749                 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9750                 FREE_LOCK(ump);
9751                 /*
9752                  * The predecessor's next pointer is manually updated here
9753                  * so that the NEXT flag is never cleared for an element
9754                  * that is in the list.
9755                  */
9756                 if (pino == 0) {
9757                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9758                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9759                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9760                             bp);
9761                 } else if (fs->fs_magic == FS_UFS1_MAGIC) {
9762                         ((struct ufs1_dinode *)bp->b_data +
9763                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9764                 } else {
9765                         dip = (struct ufs2_dinode *)bp->b_data +
9766                             ino_to_fsbo(fs, pino);
9767                         dip->di_freelink = nino;
9768                         ffs_update_dinode_ckhash(fs, dip);
9769                 }
9770                 /*
9771                  * If the bwrite fails we have no recourse to recover.  The
9772                  * filesystem is corrupted already.
9773                  */
9774                 bwrite(bp);
9775                 ACQUIRE_LOCK(ump);
9776                 /*
9777                  * If the superblock pointer still needs to be cleared force
9778                  * a write here.
9779                  */
9780                 if (fs->fs_sujfree == ino) {
9781                         FREE_LOCK(ump);
9782                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9783                             (int)fs->fs_sbsize, 0, 0, 0);
9784                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9785                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9786                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9787                             bp);
9788                         bwrite(bp);
9789                         ACQUIRE_LOCK(ump);
9790                 }
9791
9792                 if (fs->fs_sujfree != ino)
9793                         return;
9794                 panic("clear_unlinked_inodedep: Failed to clear free head");
9795         }
9796         if (inodedep->id_ino == fs->fs_sujfree)
9797                 panic("clear_unlinked_inodedep: Freeing head of free list");
9798         inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9799         TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9800         return;
9801 }
9802
9803 /*
9804  * This workitem decrements the inode's link count.
9805  * If the link count reaches zero, the file is removed.
9806  */
9807 static int
9808 handle_workitem_remove(dirrem, flags)
9809         struct dirrem *dirrem;
9810         int flags;
9811 {
9812         struct inodedep *inodedep;
9813         struct workhead dotdotwk;
9814         struct worklist *wk;
9815         struct ufsmount *ump;
9816         struct mount *mp;
9817         struct vnode *vp;
9818         struct inode *ip;
9819         ino_t oldinum;
9820
9821         if (dirrem->dm_state & ONWORKLIST)
9822                 panic("handle_workitem_remove: dirrem %p still on worklist",
9823                     dirrem);
9824         oldinum = dirrem->dm_oldinum;
9825         mp = dirrem->dm_list.wk_mp;
9826         ump = VFSTOUFS(mp);
9827         flags |= LK_EXCLUSIVE;
9828         if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9829                 return (EBUSY);
9830         ip = VTOI(vp);
9831         ACQUIRE_LOCK(ump);
9832         if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9833                 panic("handle_workitem_remove: lost inodedep");
9834         if (dirrem->dm_state & ONDEPLIST)
9835                 LIST_REMOVE(dirrem, dm_inonext);
9836         KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9837             ("handle_workitem_remove:  Journal entries not written."));
9838
9839         /*
9840          * Move all dependencies waiting on the remove to complete
9841          * from the dirrem to the inode inowait list to be completed
9842          * after the inode has been updated and written to disk.  Any
9843          * marked MKDIR_PARENT are saved to be completed when the .. ref
9844          * is removed.
9845          */
9846         LIST_INIT(&dotdotwk);
9847         while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9848                 WORKLIST_REMOVE(wk);
9849                 if (wk->wk_state & MKDIR_PARENT) {
9850                         wk->wk_state &= ~MKDIR_PARENT;
9851                         WORKLIST_INSERT(&dotdotwk, wk);
9852                         continue;
9853                 }
9854                 WORKLIST_INSERT(&inodedep->id_inowait, wk);
9855         }
9856         LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9857         /*
9858          * Normal file deletion.
9859          */
9860         if ((dirrem->dm_state & RMDIR) == 0) {
9861                 ip->i_nlink--;
9862                 DIP_SET(ip, i_nlink, ip->i_nlink);
9863                 ip->i_flag |= IN_CHANGE;
9864                 if (ip->i_nlink < ip->i_effnlink)
9865                         panic("handle_workitem_remove: bad file delta");
9866                 if (ip->i_nlink == 0)
9867                         unlinked_inodedep(mp, inodedep);
9868                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9869                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9870                     ("handle_workitem_remove: worklist not empty. %s",
9871                     TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9872                 WORKITEM_FREE(dirrem, D_DIRREM);
9873                 FREE_LOCK(ump);
9874                 goto out;
9875         }
9876         /*
9877          * Directory deletion. Decrement reference count for both the
9878          * just deleted parent directory entry and the reference for ".".
9879          * Arrange to have the reference count on the parent decremented
9880          * to account for the loss of "..".
9881          */
9882         ip->i_nlink -= 2;
9883         DIP_SET(ip, i_nlink, ip->i_nlink);
9884         ip->i_flag |= IN_CHANGE;
9885         if (ip->i_nlink < ip->i_effnlink)
9886                 panic("handle_workitem_remove: bad dir delta");
9887         if (ip->i_nlink == 0)
9888                 unlinked_inodedep(mp, inodedep);
9889         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9890         /*
9891          * Rename a directory to a new parent. Since, we are both deleting
9892          * and creating a new directory entry, the link count on the new
9893          * directory should not change. Thus we skip the followup dirrem.
9894          */
9895         if (dirrem->dm_state & DIRCHG) {
9896                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9897                     ("handle_workitem_remove: DIRCHG and worklist not empty."));
9898                 WORKITEM_FREE(dirrem, D_DIRREM);
9899                 FREE_LOCK(ump);
9900                 goto out;
9901         }
9902         dirrem->dm_state = ONDEPLIST;
9903         dirrem->dm_oldinum = dirrem->dm_dirinum;
9904         /*
9905          * Place the dirrem on the parent's diremhd list.
9906          */
9907         if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9908                 panic("handle_workitem_remove: lost dir inodedep");
9909         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9910         /*
9911          * If the allocated inode has never been written to disk, then
9912          * the on-disk inode is zero'ed and we can remove the file
9913          * immediately.  When journaling if the inode has been marked
9914          * unlinked and not DEPCOMPLETE we know it can never be written.
9915          */
9916         inodedep_lookup(mp, oldinum, 0, &inodedep);
9917         if (inodedep == NULL ||
9918             (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9919             check_inode_unwritten(inodedep)) {
9920                 FREE_LOCK(ump);
9921                 vput(vp);
9922                 return handle_workitem_remove(dirrem, flags);
9923         }
9924         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9925         FREE_LOCK(ump);
9926         ip->i_flag |= IN_CHANGE;
9927 out:
9928         ffs_update(vp, 0);
9929         vput(vp);
9930         return (0);
9931 }
9932
9933 /*
9934  * Inode de-allocation dependencies.
9935  *
9936  * When an inode's link count is reduced to zero, it can be de-allocated. We
9937  * found it convenient to postpone de-allocation until after the inode is
9938  * written to disk with its new link count (zero).  At this point, all of the
9939  * on-disk inode's block pointers are nullified and, with careful dependency
9940  * list ordering, all dependencies related to the inode will be satisfied and
9941  * the corresponding dependency structures de-allocated.  So, if/when the
9942  * inode is reused, there will be no mixing of old dependencies with new
9943  * ones.  This artificial dependency is set up by the block de-allocation
9944  * procedure above (softdep_setup_freeblocks) and completed by the
9945  * following procedure.
9946  */
9947 static void
9948 handle_workitem_freefile(freefile)
9949         struct freefile *freefile;
9950 {
9951         struct workhead wkhd;
9952         struct fs *fs;
9953         struct ufsmount *ump;
9954         int error;
9955 #ifdef INVARIANTS
9956         struct inodedep *idp;
9957 #endif
9958
9959         ump = VFSTOUFS(freefile->fx_list.wk_mp);
9960         fs = ump->um_fs;
9961 #ifdef INVARIANTS
9962         ACQUIRE_LOCK(ump);
9963         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9964         FREE_LOCK(ump);
9965         if (error)
9966                 panic("handle_workitem_freefile: inodedep %p survived", idp);
9967 #endif
9968         UFS_LOCK(ump);
9969         fs->fs_pendinginodes -= 1;
9970         UFS_UNLOCK(ump);
9971         LIST_INIT(&wkhd);
9972         LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9973         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9974             freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9975                 softdep_error("handle_workitem_freefile", error);
9976         ACQUIRE_LOCK(ump);
9977         WORKITEM_FREE(freefile, D_FREEFILE);
9978         FREE_LOCK(ump);
9979 }
9980
9981
9982 /*
9983  * Helper function which unlinks marker element from work list and returns
9984  * the next element on the list.
9985  */
9986 static __inline struct worklist *
9987 markernext(struct worklist *marker)
9988 {
9989         struct worklist *next;
9990
9991         next = LIST_NEXT(marker, wk_list);
9992         LIST_REMOVE(marker, wk_list);
9993         return next;
9994 }
9995
9996 /*
9997  * Disk writes.
9998  *
9999  * The dependency structures constructed above are most actively used when file
10000  * system blocks are written to disk.  No constraints are placed on when a
10001  * block can be written, but unsatisfied update dependencies are made safe by
10002  * modifying (or replacing) the source memory for the duration of the disk
10003  * write.  When the disk write completes, the memory block is again brought
10004  * up-to-date.
10005  *
10006  * In-core inode structure reclamation.
10007  *
10008  * Because there are a finite number of "in-core" inode structures, they are
10009  * reused regularly.  By transferring all inode-related dependencies to the
10010  * in-memory inode block and indexing them separately (via "inodedep"s), we
10011  * can allow "in-core" inode structures to be reused at any time and avoid
10012  * any increase in contention.
10013  *
10014  * Called just before entering the device driver to initiate a new disk I/O.
10015  * The buffer must be locked, thus, no I/O completion operations can occur
10016  * while we are manipulating its associated dependencies.
10017  */
10018 static void
10019 softdep_disk_io_initiation(bp)
10020         struct buf *bp;         /* structure describing disk write to occur */
10021 {
10022         struct worklist *wk;
10023         struct worklist marker;
10024         struct inodedep *inodedep;
10025         struct freeblks *freeblks;
10026         struct jblkdep *jblkdep;
10027         struct newblk *newblk;
10028         struct ufsmount *ump;
10029
10030         /*
10031          * We only care about write operations. There should never
10032          * be dependencies for reads.
10033          */
10034         if (bp->b_iocmd != BIO_WRITE)
10035                 panic("softdep_disk_io_initiation: not write");
10036
10037         if (bp->b_vflags & BV_BKGRDINPROG)
10038                 panic("softdep_disk_io_initiation: Writing buffer with "
10039                     "background write in progress: %p", bp);
10040
10041         ump = softdep_bp_to_mp(bp);
10042         if (ump == NULL)
10043                 return;
10044
10045         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
10046         PHOLD(curproc);                 /* Don't swap out kernel stack */
10047         ACQUIRE_LOCK(ump);
10048         /*
10049          * Do any necessary pre-I/O processing.
10050          */
10051         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
10052              wk = markernext(&marker)) {
10053                 LIST_INSERT_AFTER(wk, &marker, wk_list);
10054                 switch (wk->wk_type) {
10055
10056                 case D_PAGEDEP:
10057                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
10058                         continue;
10059
10060                 case D_INODEDEP:
10061                         inodedep = WK_INODEDEP(wk);
10062                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10063                                 initiate_write_inodeblock_ufs1(inodedep, bp);
10064                         else
10065                                 initiate_write_inodeblock_ufs2(inodedep, bp);
10066                         continue;
10067
10068                 case D_INDIRDEP:
10069                         initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10070                         continue;
10071
10072                 case D_BMSAFEMAP:
10073                         initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10074                         continue;
10075
10076                 case D_JSEG:
10077                         WK_JSEG(wk)->js_buf = NULL;
10078                         continue;
10079
10080                 case D_FREEBLKS:
10081                         freeblks = WK_FREEBLKS(wk);
10082                         jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10083                         /*
10084                          * We have to wait for the freeblks to be journaled
10085                          * before we can write an inodeblock with updated
10086                          * pointers.  Be careful to arrange the marker so
10087                          * we revisit the freeblks if it's not removed by
10088                          * the first jwait().
10089                          */
10090                         if (jblkdep != NULL) {
10091                                 LIST_REMOVE(&marker, wk_list);
10092                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10093                                 jwait(&jblkdep->jb_list, MNT_WAIT);
10094                         }
10095                         continue;
10096                 case D_ALLOCDIRECT:
10097                 case D_ALLOCINDIR:
10098                         /*
10099                          * We have to wait for the jnewblk to be journaled
10100                          * before we can write to a block if the contents
10101                          * may be confused with an earlier file's indirect
10102                          * at recovery time.  Handle the marker as described
10103                          * above.
10104                          */
10105                         newblk = WK_NEWBLK(wk);
10106                         if (newblk->nb_jnewblk != NULL &&
10107                             indirblk_lookup(newblk->nb_list.wk_mp,
10108                             newblk->nb_newblkno)) {
10109                                 LIST_REMOVE(&marker, wk_list);
10110                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10111                                 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10112                         }
10113                         continue;
10114
10115                 case D_SBDEP:
10116                         initiate_write_sbdep(WK_SBDEP(wk));
10117                         continue;
10118
10119                 case D_MKDIR:
10120                 case D_FREEWORK:
10121                 case D_FREEDEP:
10122                 case D_JSEGDEP:
10123                         continue;
10124
10125                 default:
10126                         panic("handle_disk_io_initiation: Unexpected type %s",
10127                             TYPENAME(wk->wk_type));
10128                         /* NOTREACHED */
10129                 }
10130         }
10131         FREE_LOCK(ump);
10132         PRELE(curproc);                 /* Allow swapout of kernel stack */
10133 }
10134
10135 /*
10136  * Called from within the procedure above to deal with unsatisfied
10137  * allocation dependencies in a directory. The buffer must be locked,
10138  * thus, no I/O completion operations can occur while we are
10139  * manipulating its associated dependencies.
10140  */
10141 static void
10142 initiate_write_filepage(pagedep, bp)
10143         struct pagedep *pagedep;
10144         struct buf *bp;
10145 {
10146         struct jremref *jremref;
10147         struct jmvref *jmvref;
10148         struct dirrem *dirrem;
10149         struct diradd *dap;
10150         struct direct *ep;
10151         int i;
10152
10153         if (pagedep->pd_state & IOSTARTED) {
10154                 /*
10155                  * This can only happen if there is a driver that does not
10156                  * understand chaining. Here biodone will reissue the call
10157                  * to strategy for the incomplete buffers.
10158                  */
10159                 printf("initiate_write_filepage: already started\n");
10160                 return;
10161         }
10162         pagedep->pd_state |= IOSTARTED;
10163         /*
10164          * Wait for all journal remove dependencies to hit the disk.
10165          * We can not allow any potentially conflicting directory adds
10166          * to be visible before removes and rollback is too difficult.
10167          * The per-filesystem lock may be dropped and re-acquired, however
10168          * we hold the buf locked so the dependency can not go away.
10169          */
10170         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10171                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10172                         jwait(&jremref->jr_list, MNT_WAIT);
10173         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10174                 jwait(&jmvref->jm_list, MNT_WAIT);
10175         for (i = 0; i < DAHASHSZ; i++) {
10176                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10177                         ep = (struct direct *)
10178                             ((char *)bp->b_data + dap->da_offset);
10179                         if (ep->d_ino != dap->da_newinum)
10180                                 panic("%s: dir inum %ju != new %ju",
10181                                     "initiate_write_filepage",
10182                                     (uintmax_t)ep->d_ino,
10183                                     (uintmax_t)dap->da_newinum);
10184                         if (dap->da_state & DIRCHG)
10185                                 ep->d_ino = dap->da_previous->dm_oldinum;
10186                         else
10187                                 ep->d_ino = 0;
10188                         dap->da_state &= ~ATTACHED;
10189                         dap->da_state |= UNDONE;
10190                 }
10191         }
10192 }
10193
10194 /*
10195  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10196  * Note that any bug fixes made to this routine must be done in the
10197  * version found below.
10198  *
10199  * Called from within the procedure above to deal with unsatisfied
10200  * allocation dependencies in an inodeblock. The buffer must be
10201  * locked, thus, no I/O completion operations can occur while we
10202  * are manipulating its associated dependencies.
10203  */
10204 static void
10205 initiate_write_inodeblock_ufs1(inodedep, bp)
10206         struct inodedep *inodedep;
10207         struct buf *bp;                 /* The inode block */
10208 {
10209         struct allocdirect *adp, *lastadp;
10210         struct ufs1_dinode *dp;
10211         struct ufs1_dinode *sip;
10212         struct inoref *inoref;
10213         struct ufsmount *ump;
10214         struct fs *fs;
10215         ufs_lbn_t i;
10216 #ifdef INVARIANTS
10217         ufs_lbn_t prevlbn = 0;
10218 #endif
10219         int deplist;
10220
10221         if (inodedep->id_state & IOSTARTED)
10222                 panic("initiate_write_inodeblock_ufs1: already started");
10223         inodedep->id_state |= IOSTARTED;
10224         fs = inodedep->id_fs;
10225         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10226         LOCK_OWNED(ump);
10227         dp = (struct ufs1_dinode *)bp->b_data +
10228             ino_to_fsbo(fs, inodedep->id_ino);
10229
10230         /*
10231          * If we're on the unlinked list but have not yet written our
10232          * next pointer initialize it here.
10233          */
10234         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10235                 struct inodedep *inon;
10236
10237                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10238                 dp->di_freelink = inon ? inon->id_ino : 0;
10239         }
10240         /*
10241          * If the bitmap is not yet written, then the allocated
10242          * inode cannot be written to disk.
10243          */
10244         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10245                 if (inodedep->id_savedino1 != NULL)
10246                         panic("initiate_write_inodeblock_ufs1: I/O underway");
10247                 FREE_LOCK(ump);
10248                 sip = malloc(sizeof(struct ufs1_dinode),
10249                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10250                 ACQUIRE_LOCK(ump);
10251                 inodedep->id_savedino1 = sip;
10252                 *inodedep->id_savedino1 = *dp;
10253                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10254                 dp->di_gen = inodedep->id_savedino1->di_gen;
10255                 dp->di_freelink = inodedep->id_savedino1->di_freelink;
10256                 return;
10257         }
10258         /*
10259          * If no dependencies, then there is nothing to roll back.
10260          */
10261         inodedep->id_savedsize = dp->di_size;
10262         inodedep->id_savedextsize = 0;
10263         inodedep->id_savednlink = dp->di_nlink;
10264         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10265             TAILQ_EMPTY(&inodedep->id_inoreflst))
10266                 return;
10267         /*
10268          * Revert the link count to that of the first unwritten journal entry.
10269          */
10270         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10271         if (inoref)
10272                 dp->di_nlink = inoref->if_nlink;
10273         /*
10274          * Set the dependencies to busy.
10275          */
10276         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10277              adp = TAILQ_NEXT(adp, ad_next)) {
10278 #ifdef INVARIANTS
10279                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10280                         panic("softdep_write_inodeblock: lbn order");
10281                 prevlbn = adp->ad_offset;
10282                 if (adp->ad_offset < UFS_NDADDR &&
10283                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10284                         panic("initiate_write_inodeblock_ufs1: "
10285                             "direct pointer #%jd mismatch %d != %jd",
10286                             (intmax_t)adp->ad_offset,
10287                             dp->di_db[adp->ad_offset],
10288                             (intmax_t)adp->ad_newblkno);
10289                 if (adp->ad_offset >= UFS_NDADDR &&
10290                     dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10291                         panic("initiate_write_inodeblock_ufs1: "
10292                             "indirect pointer #%jd mismatch %d != %jd",
10293                             (intmax_t)adp->ad_offset - UFS_NDADDR,
10294                             dp->di_ib[adp->ad_offset - UFS_NDADDR],
10295                             (intmax_t)adp->ad_newblkno);
10296                 deplist |= 1 << adp->ad_offset;
10297                 if ((adp->ad_state & ATTACHED) == 0)
10298                         panic("initiate_write_inodeblock_ufs1: "
10299                             "Unknown state 0x%x", adp->ad_state);
10300 #endif /* INVARIANTS */
10301                 adp->ad_state &= ~ATTACHED;
10302                 adp->ad_state |= UNDONE;
10303         }
10304         /*
10305          * The on-disk inode cannot claim to be any larger than the last
10306          * fragment that has been written. Otherwise, the on-disk inode
10307          * might have fragments that were not the last block in the file
10308          * which would corrupt the filesystem.
10309          */
10310         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10311              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10312                 if (adp->ad_offset >= UFS_NDADDR)
10313                         break;
10314                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10315                 /* keep going until hitting a rollback to a frag */
10316                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10317                         continue;
10318                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10319                 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10320 #ifdef INVARIANTS
10321                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10322                                 panic("initiate_write_inodeblock_ufs1: "
10323                                     "lost dep1");
10324 #endif /* INVARIANTS */
10325                         dp->di_db[i] = 0;
10326                 }
10327                 for (i = 0; i < UFS_NIADDR; i++) {
10328 #ifdef INVARIANTS
10329                         if (dp->di_ib[i] != 0 &&
10330                             (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10331                                 panic("initiate_write_inodeblock_ufs1: "
10332                                     "lost dep2");
10333 #endif /* INVARIANTS */
10334                         dp->di_ib[i] = 0;
10335                 }
10336                 return;
10337         }
10338         /*
10339          * If we have zero'ed out the last allocated block of the file,
10340          * roll back the size to the last currently allocated block.
10341          * We know that this last allocated block is a full-sized as
10342          * we already checked for fragments in the loop above.
10343          */
10344         if (lastadp != NULL &&
10345             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10346                 for (i = lastadp->ad_offset; i >= 0; i--)
10347                         if (dp->di_db[i] != 0)
10348                                 break;
10349                 dp->di_size = (i + 1) * fs->fs_bsize;
10350         }
10351         /*
10352          * The only dependencies are for indirect blocks.
10353          *
10354          * The file size for indirect block additions is not guaranteed.
10355          * Such a guarantee would be non-trivial to achieve. The conventional
10356          * synchronous write implementation also does not make this guarantee.
10357          * Fsck should catch and fix discrepancies. Arguably, the file size
10358          * can be over-estimated without destroying integrity when the file
10359          * moves into the indirect blocks (i.e., is large). If we want to
10360          * postpone fsck, we are stuck with this argument.
10361          */
10362         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10363                 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10364 }
10365
10366 /*
10367  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10368  * Note that any bug fixes made to this routine must be done in the
10369  * version found above.
10370  *
10371  * Called from within the procedure above to deal with unsatisfied
10372  * allocation dependencies in an inodeblock. The buffer must be
10373  * locked, thus, no I/O completion operations can occur while we
10374  * are manipulating its associated dependencies.
10375  */
10376 static void
10377 initiate_write_inodeblock_ufs2(inodedep, bp)
10378         struct inodedep *inodedep;
10379         struct buf *bp;                 /* The inode block */
10380 {
10381         struct allocdirect *adp, *lastadp;
10382         struct ufs2_dinode *dp;
10383         struct ufs2_dinode *sip;
10384         struct inoref *inoref;
10385         struct ufsmount *ump;
10386         struct fs *fs;
10387         ufs_lbn_t i;
10388 #ifdef INVARIANTS
10389         ufs_lbn_t prevlbn = 0;
10390 #endif
10391         int deplist;
10392
10393         if (inodedep->id_state & IOSTARTED)
10394                 panic("initiate_write_inodeblock_ufs2: already started");
10395         inodedep->id_state |= IOSTARTED;
10396         fs = inodedep->id_fs;
10397         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10398         LOCK_OWNED(ump);
10399         dp = (struct ufs2_dinode *)bp->b_data +
10400             ino_to_fsbo(fs, inodedep->id_ino);
10401
10402         /*
10403          * If we're on the unlinked list but have not yet written our
10404          * next pointer initialize it here.
10405          */
10406         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10407                 struct inodedep *inon;
10408
10409                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10410                 dp->di_freelink = inon ? inon->id_ino : 0;
10411                 ffs_update_dinode_ckhash(fs, dp);
10412         }
10413         /*
10414          * If the bitmap is not yet written, then the allocated
10415          * inode cannot be written to disk.
10416          */
10417         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10418                 if (inodedep->id_savedino2 != NULL)
10419                         panic("initiate_write_inodeblock_ufs2: I/O underway");
10420                 FREE_LOCK(ump);
10421                 sip = malloc(sizeof(struct ufs2_dinode),
10422                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10423                 ACQUIRE_LOCK(ump);
10424                 inodedep->id_savedino2 = sip;
10425                 *inodedep->id_savedino2 = *dp;
10426                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10427                 dp->di_gen = inodedep->id_savedino2->di_gen;
10428                 dp->di_freelink = inodedep->id_savedino2->di_freelink;
10429                 return;
10430         }
10431         /*
10432          * If no dependencies, then there is nothing to roll back.
10433          */
10434         inodedep->id_savedsize = dp->di_size;
10435         inodedep->id_savedextsize = dp->di_extsize;
10436         inodedep->id_savednlink = dp->di_nlink;
10437         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10438             TAILQ_EMPTY(&inodedep->id_extupdt) &&
10439             TAILQ_EMPTY(&inodedep->id_inoreflst))
10440                 return;
10441         /*
10442          * Revert the link count to that of the first unwritten journal entry.
10443          */
10444         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10445         if (inoref)
10446                 dp->di_nlink = inoref->if_nlink;
10447
10448         /*
10449          * Set the ext data dependencies to busy.
10450          */
10451         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10452              adp = TAILQ_NEXT(adp, ad_next)) {
10453 #ifdef INVARIANTS
10454                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10455                         panic("initiate_write_inodeblock_ufs2: lbn order");
10456                 prevlbn = adp->ad_offset;
10457                 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10458                         panic("initiate_write_inodeblock_ufs2: "
10459                             "ext pointer #%jd mismatch %jd != %jd",
10460                             (intmax_t)adp->ad_offset,
10461                             (intmax_t)dp->di_extb[adp->ad_offset],
10462                             (intmax_t)adp->ad_newblkno);
10463                 deplist |= 1 << adp->ad_offset;
10464                 if ((adp->ad_state & ATTACHED) == 0)
10465                         panic("initiate_write_inodeblock_ufs2: Unknown "
10466                             "state 0x%x", adp->ad_state);
10467 #endif /* INVARIANTS */
10468                 adp->ad_state &= ~ATTACHED;
10469                 adp->ad_state |= UNDONE;
10470         }
10471         /*
10472          * The on-disk inode cannot claim to be any larger than the last
10473          * fragment that has been written. Otherwise, the on-disk inode
10474          * might have fragments that were not the last block in the ext
10475          * data which would corrupt the filesystem.
10476          */
10477         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10478              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10479                 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10480                 /* keep going until hitting a rollback to a frag */
10481                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10482                         continue;
10483                 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10484                 for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
10485 #ifdef INVARIANTS
10486                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10487                                 panic("initiate_write_inodeblock_ufs2: "
10488                                     "lost dep1");
10489 #endif /* INVARIANTS */
10490                         dp->di_extb[i] = 0;
10491                 }
10492                 lastadp = NULL;
10493                 break;
10494         }
10495         /*
10496          * If we have zero'ed out the last allocated block of the ext
10497          * data, roll back the size to the last currently allocated block.
10498          * We know that this last allocated block is a full-sized as
10499          * we already checked for fragments in the loop above.
10500          */
10501         if (lastadp != NULL &&
10502             dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10503                 for (i = lastadp->ad_offset; i >= 0; i--)
10504                         if (dp->di_extb[i] != 0)
10505                                 break;
10506                 dp->di_extsize = (i + 1) * fs->fs_bsize;
10507         }
10508         /*
10509          * Set the file data dependencies to busy.
10510          */
10511         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10512              adp = TAILQ_NEXT(adp, ad_next)) {
10513 #ifdef INVARIANTS
10514                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10515                         panic("softdep_write_inodeblock: lbn order");
10516                 if ((adp->ad_state & ATTACHED) == 0)
10517                         panic("inodedep %p and adp %p not attached", inodedep, adp);
10518                 prevlbn = adp->ad_offset;
10519                 if (adp->ad_offset < UFS_NDADDR &&
10520                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10521                         panic("initiate_write_inodeblock_ufs2: "
10522                             "direct pointer #%jd mismatch %jd != %jd",
10523                             (intmax_t)adp->ad_offset,
10524                             (intmax_t)dp->di_db[adp->ad_offset],
10525                             (intmax_t)adp->ad_newblkno);
10526                 if (adp->ad_offset >= UFS_NDADDR &&
10527                     dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10528                         panic("initiate_write_inodeblock_ufs2: "
10529                             "indirect pointer #%jd mismatch %jd != %jd",
10530                             (intmax_t)adp->ad_offset - UFS_NDADDR,
10531                             (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
10532                             (intmax_t)adp->ad_newblkno);
10533                 deplist |= 1 << adp->ad_offset;
10534                 if ((adp->ad_state & ATTACHED) == 0)
10535                         panic("initiate_write_inodeblock_ufs2: Unknown "
10536                              "state 0x%x", adp->ad_state);
10537 #endif /* INVARIANTS */
10538                 adp->ad_state &= ~ATTACHED;
10539                 adp->ad_state |= UNDONE;
10540         }
10541         /*
10542          * The on-disk inode cannot claim to be any larger than the last
10543          * fragment that has been written. Otherwise, the on-disk inode
10544          * might have fragments that were not the last block in the file
10545          * which would corrupt the filesystem.
10546          */
10547         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10548              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10549                 if (adp->ad_offset >= UFS_NDADDR)
10550                         break;
10551                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10552                 /* keep going until hitting a rollback to a frag */
10553                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10554                         continue;
10555                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10556                 for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10557 #ifdef INVARIANTS
10558                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10559                                 panic("initiate_write_inodeblock_ufs2: "
10560                                     "lost dep2");
10561 #endif /* INVARIANTS */
10562                         dp->di_db[i] = 0;
10563                 }
10564                 for (i = 0; i < UFS_NIADDR; i++) {
10565 #ifdef INVARIANTS
10566                         if (dp->di_ib[i] != 0 &&
10567                             (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10568                                 panic("initiate_write_inodeblock_ufs2: "
10569                                     "lost dep3");
10570 #endif /* INVARIANTS */
10571                         dp->di_ib[i] = 0;
10572                 }
10573                 ffs_update_dinode_ckhash(fs, dp);
10574                 return;
10575         }
10576         /*
10577          * If we have zero'ed out the last allocated block of the file,
10578          * roll back the size to the last currently allocated block.
10579          * We know that this last allocated block is a full-sized as
10580          * we already checked for fragments in the loop above.
10581          */
10582         if (lastadp != NULL &&
10583             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10584                 for (i = lastadp->ad_offset; i >= 0; i--)
10585                         if (dp->di_db[i] != 0)
10586                                 break;
10587                 dp->di_size = (i + 1) * fs->fs_bsize;
10588         }
10589         /*
10590          * The only dependencies are for indirect blocks.
10591          *
10592          * The file size for indirect block additions is not guaranteed.
10593          * Such a guarantee would be non-trivial to achieve. The conventional
10594          * synchronous write implementation also does not make this guarantee.
10595          * Fsck should catch and fix discrepancies. Arguably, the file size
10596          * can be over-estimated without destroying integrity when the file
10597          * moves into the indirect blocks (i.e., is large). If we want to
10598          * postpone fsck, we are stuck with this argument.
10599          */
10600         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10601                 dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10602         ffs_update_dinode_ckhash(fs, dp);
10603 }
10604
10605 /*
10606  * Cancel an indirdep as a result of truncation.  Release all of the
10607  * children allocindirs and place their journal work on the appropriate
10608  * list.
10609  */
10610 static void
10611 cancel_indirdep(indirdep, bp, freeblks)
10612         struct indirdep *indirdep;
10613         struct buf *bp;
10614         struct freeblks *freeblks;
10615 {
10616         struct allocindir *aip;
10617
10618         /*
10619          * None of the indirect pointers will ever be visible,
10620          * so they can simply be tossed. GOINGAWAY ensures
10621          * that allocated pointers will be saved in the buffer
10622          * cache until they are freed. Note that they will
10623          * only be able to be found by their physical address
10624          * since the inode mapping the logical address will
10625          * be gone. The save buffer used for the safe copy
10626          * was allocated in setup_allocindir_phase2 using
10627          * the physical address so it could be used for this
10628          * purpose. Hence we swap the safe copy with the real
10629          * copy, allowing the safe copy to be freed and holding
10630          * on to the real copy for later use in indir_trunc.
10631          */
10632         if (indirdep->ir_state & GOINGAWAY)
10633                 panic("cancel_indirdep: already gone");
10634         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10635                 indirdep->ir_state |= DEPCOMPLETE;
10636                 LIST_REMOVE(indirdep, ir_next);
10637         }
10638         indirdep->ir_state |= GOINGAWAY;
10639         /*
10640          * Pass in bp for blocks still have journal writes
10641          * pending so we can cancel them on their own.
10642          */
10643         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10644                 cancel_allocindir(aip, bp, freeblks, 0);
10645         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10646                 cancel_allocindir(aip, NULL, freeblks, 0);
10647         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10648                 cancel_allocindir(aip, NULL, freeblks, 0);
10649         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10650                 cancel_allocindir(aip, NULL, freeblks, 0);
10651         /*
10652          * If there are pending partial truncations we need to keep the
10653          * old block copy around until they complete.  This is because
10654          * the current b_data is not a perfect superset of the available
10655          * blocks.
10656          */
10657         if (TAILQ_EMPTY(&indirdep->ir_trunc))
10658                 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10659         else
10660                 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10661         WORKLIST_REMOVE(&indirdep->ir_list);
10662         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10663         indirdep->ir_bp = NULL;
10664         indirdep->ir_freeblks = freeblks;
10665 }
10666
10667 /*
10668  * Free an indirdep once it no longer has new pointers to track.
10669  */
10670 static void
10671 free_indirdep(indirdep)
10672         struct indirdep *indirdep;
10673 {
10674
10675         KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10676             ("free_indirdep: Indir trunc list not empty."));
10677         KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10678             ("free_indirdep: Complete head not empty."));
10679         KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10680             ("free_indirdep: write head not empty."));
10681         KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10682             ("free_indirdep: done head not empty."));
10683         KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10684             ("free_indirdep: deplist head not empty."));
10685         KASSERT((indirdep->ir_state & DEPCOMPLETE),
10686             ("free_indirdep: %p still on newblk list.", indirdep));
10687         KASSERT(indirdep->ir_saveddata == NULL,
10688             ("free_indirdep: %p still has saved data.", indirdep));
10689         if (indirdep->ir_state & ONWORKLIST)
10690                 WORKLIST_REMOVE(&indirdep->ir_list);
10691         WORKITEM_FREE(indirdep, D_INDIRDEP);
10692 }
10693
10694 /*
10695  * Called before a write to an indirdep.  This routine is responsible for
10696  * rolling back pointers to a safe state which includes only those
10697  * allocindirs which have been completed.
10698  */
10699 static void
10700 initiate_write_indirdep(indirdep, bp)
10701         struct indirdep *indirdep;
10702         struct buf *bp;
10703 {
10704         struct ufsmount *ump;
10705
10706         indirdep->ir_state |= IOSTARTED;
10707         if (indirdep->ir_state & GOINGAWAY)
10708                 panic("disk_io_initiation: indirdep gone");
10709         /*
10710          * If there are no remaining dependencies, this will be writing
10711          * the real pointers.
10712          */
10713         if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10714             TAILQ_EMPTY(&indirdep->ir_trunc))
10715                 return;
10716         /*
10717          * Replace up-to-date version with safe version.
10718          */
10719         if (indirdep->ir_saveddata == NULL) {
10720                 ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10721                 LOCK_OWNED(ump);
10722                 FREE_LOCK(ump);
10723                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10724                     M_SOFTDEP_FLAGS);
10725                 ACQUIRE_LOCK(ump);
10726         }
10727         indirdep->ir_state &= ~ATTACHED;
10728         indirdep->ir_state |= UNDONE;
10729         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10730         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10731             bp->b_bcount);
10732 }
10733
10734 /*
10735  * Called when an inode has been cleared in a cg bitmap.  This finally
10736  * eliminates any canceled jaddrefs
10737  */
10738 void
10739 softdep_setup_inofree(mp, bp, ino, wkhd)
10740         struct mount *mp;
10741         struct buf *bp;
10742         ino_t ino;
10743         struct workhead *wkhd;
10744 {
10745         struct worklist *wk, *wkn;
10746         struct inodedep *inodedep;
10747         struct ufsmount *ump;
10748         uint8_t *inosused;
10749         struct cg *cgp;
10750         struct fs *fs;
10751
10752         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10753             ("softdep_setup_inofree called on non-softdep filesystem"));
10754         ump = VFSTOUFS(mp);
10755         ACQUIRE_LOCK(ump);
10756         fs = ump->um_fs;
10757         cgp = (struct cg *)bp->b_data;
10758         inosused = cg_inosused(cgp);
10759         if (isset(inosused, ino % fs->fs_ipg))
10760                 panic("softdep_setup_inofree: inode %ju not freed.",
10761                     (uintmax_t)ino);
10762         if (inodedep_lookup(mp, ino, 0, &inodedep))
10763                 panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10764                     (uintmax_t)ino, inodedep);
10765         if (wkhd) {
10766                 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10767                         if (wk->wk_type != D_JADDREF)
10768                                 continue;
10769                         WORKLIST_REMOVE(wk);
10770                         /*
10771                          * We can free immediately even if the jaddref
10772                          * isn't attached in a background write as now
10773                          * the bitmaps are reconciled.
10774                          */
10775                         wk->wk_state |= COMPLETE | ATTACHED;
10776                         free_jaddref(WK_JADDREF(wk));
10777                 }
10778                 jwork_move(&bp->b_dep, wkhd);
10779         }
10780         FREE_LOCK(ump);
10781 }
10782
10783
10784 /*
10785  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10786  * map.  Any dependencies waiting for the write to clear are added to the
10787  * buf's list and any jnewblks that are being canceled are discarded
10788  * immediately.
10789  */
10790 void
10791 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10792         struct mount *mp;
10793         struct buf *bp;
10794         ufs2_daddr_t blkno;
10795         int frags;
10796         struct workhead *wkhd;
10797 {
10798         struct bmsafemap *bmsafemap;
10799         struct jnewblk *jnewblk;
10800         struct ufsmount *ump;
10801         struct worklist *wk;
10802         struct fs *fs;
10803 #ifdef INVARIANTS
10804         uint8_t *blksfree;
10805         struct cg *cgp;
10806         ufs2_daddr_t jstart;
10807         ufs2_daddr_t jend;
10808         ufs2_daddr_t end;
10809         long bno;
10810         int i;
10811 #endif
10812
10813         CTR3(KTR_SUJ,
10814             "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10815             blkno, frags, wkhd);
10816
10817         ump = VFSTOUFS(mp);
10818         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10819             ("softdep_setup_blkfree called on non-softdep filesystem"));
10820         ACQUIRE_LOCK(ump);
10821         /* Lookup the bmsafemap so we track when it is dirty. */
10822         fs = ump->um_fs;
10823         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10824         /*
10825          * Detach any jnewblks which have been canceled.  They must linger
10826          * until the bitmap is cleared again by ffs_blkfree() to prevent
10827          * an unjournaled allocation from hitting the disk.
10828          */
10829         if (wkhd) {
10830                 while ((wk = LIST_FIRST(wkhd)) != NULL) {
10831                         CTR2(KTR_SUJ,
10832                             "softdep_setup_blkfree: blkno %jd wk type %d",
10833                             blkno, wk->wk_type);
10834                         WORKLIST_REMOVE(wk);
10835                         if (wk->wk_type != D_JNEWBLK) {
10836                                 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10837                                 continue;
10838                         }
10839                         jnewblk = WK_JNEWBLK(wk);
10840                         KASSERT(jnewblk->jn_state & GOINGAWAY,
10841                             ("softdep_setup_blkfree: jnewblk not canceled."));
10842 #ifdef INVARIANTS
10843                         /*
10844                          * Assert that this block is free in the bitmap
10845                          * before we discard the jnewblk.
10846                          */
10847                         cgp = (struct cg *)bp->b_data;
10848                         blksfree = cg_blksfree(cgp);
10849                         bno = dtogd(fs, jnewblk->jn_blkno);
10850                         for (i = jnewblk->jn_oldfrags;
10851                             i < jnewblk->jn_frags; i++) {
10852                                 if (isset(blksfree, bno + i))
10853                                         continue;
10854                                 panic("softdep_setup_blkfree: not free");
10855                         }
10856 #endif
10857                         /*
10858                          * Even if it's not attached we can free immediately
10859                          * as the new bitmap is correct.
10860                          */
10861                         wk->wk_state |= COMPLETE | ATTACHED;
10862                         free_jnewblk(jnewblk);
10863                 }
10864         }
10865
10866 #ifdef INVARIANTS
10867         /*
10868          * Assert that we are not freeing a block which has an outstanding
10869          * allocation dependency.
10870          */
10871         fs = VFSTOUFS(mp)->um_fs;
10872         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10873         end = blkno + frags;
10874         LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10875                 /*
10876                  * Don't match against blocks that will be freed when the
10877                  * background write is done.
10878                  */
10879                 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10880                     (COMPLETE | DEPCOMPLETE))
10881                         continue;
10882                 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10883                 jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10884                 if ((blkno >= jstart && blkno < jend) ||
10885                     (end > jstart && end <= jend)) {
10886                         printf("state 0x%X %jd - %d %d dep %p\n",
10887                             jnewblk->jn_state, jnewblk->jn_blkno,
10888                             jnewblk->jn_oldfrags, jnewblk->jn_frags,
10889                             jnewblk->jn_dep);
10890                         panic("softdep_setup_blkfree: "
10891                             "%jd-%jd(%d) overlaps with %jd-%jd",
10892                             blkno, end, frags, jstart, jend);
10893                 }
10894         }
10895 #endif
10896         FREE_LOCK(ump);
10897 }
10898
10899 /*
10900  * Revert a block allocation when the journal record that describes it
10901  * is not yet written.
10902  */
10903 static int
10904 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10905         struct jnewblk *jnewblk;
10906         struct fs *fs;
10907         struct cg *cgp;
10908         uint8_t *blksfree;
10909 {
10910         ufs1_daddr_t fragno;
10911         long cgbno, bbase;
10912         int frags, blk;
10913         int i;
10914
10915         frags = 0;
10916         cgbno = dtogd(fs, jnewblk->jn_blkno);
10917         /*
10918          * We have to test which frags need to be rolled back.  We may
10919          * be operating on a stale copy when doing background writes.
10920          */
10921         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10922                 if (isclr(blksfree, cgbno + i))
10923                         frags++;
10924         if (frags == 0)
10925                 return (0);
10926         /*
10927          * This is mostly ffs_blkfree() sans some validation and
10928          * superblock updates.
10929          */
10930         if (frags == fs->fs_frag) {
10931                 fragno = fragstoblks(fs, cgbno);
10932                 ffs_setblock(fs, blksfree, fragno);
10933                 ffs_clusteracct(fs, cgp, fragno, 1);
10934                 cgp->cg_cs.cs_nbfree++;
10935         } else {
10936                 cgbno += jnewblk->jn_oldfrags;
10937                 bbase = cgbno - fragnum(fs, cgbno);
10938                 /* Decrement the old frags.  */
10939                 blk = blkmap(fs, blksfree, bbase);
10940                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10941                 /* Deallocate the fragment */
10942                 for (i = 0; i < frags; i++)
10943                         setbit(blksfree, cgbno + i);
10944                 cgp->cg_cs.cs_nffree += frags;
10945                 /* Add back in counts associated with the new frags */
10946                 blk = blkmap(fs, blksfree, bbase);
10947                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10948                 /* If a complete block has been reassembled, account for it. */
10949                 fragno = fragstoblks(fs, bbase);
10950                 if (ffs_isblock(fs, blksfree, fragno)) {
10951                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
10952                         ffs_clusteracct(fs, cgp, fragno, 1);
10953                         cgp->cg_cs.cs_nbfree++;
10954                 }
10955         }
10956         stat_jnewblk++;
10957         jnewblk->jn_state &= ~ATTACHED;
10958         jnewblk->jn_state |= UNDONE;
10959
10960         return (frags);
10961 }
10962
10963 static void
10964 initiate_write_bmsafemap(bmsafemap, bp)
10965         struct bmsafemap *bmsafemap;
10966         struct buf *bp;                 /* The cg block. */
10967 {
10968         struct jaddref *jaddref;
10969         struct jnewblk *jnewblk;
10970         uint8_t *inosused;
10971         uint8_t *blksfree;
10972         struct cg *cgp;
10973         struct fs *fs;
10974         ino_t ino;
10975
10976         /*
10977          * If this is a background write, we did this at the time that
10978          * the copy was made, so do not need to do it again.
10979          */
10980         if (bmsafemap->sm_state & IOSTARTED)
10981                 return;
10982         bmsafemap->sm_state |= IOSTARTED;
10983         /*
10984          * Clear any inode allocations which are pending journal writes.
10985          */
10986         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10987                 cgp = (struct cg *)bp->b_data;
10988                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10989                 inosused = cg_inosused(cgp);
10990                 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10991                         ino = jaddref->ja_ino % fs->fs_ipg;
10992                         if (isset(inosused, ino)) {
10993                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
10994                                         cgp->cg_cs.cs_ndir--;
10995                                 cgp->cg_cs.cs_nifree++;
10996                                 clrbit(inosused, ino);
10997                                 jaddref->ja_state &= ~ATTACHED;
10998                                 jaddref->ja_state |= UNDONE;
10999                                 stat_jaddref++;
11000                         } else
11001                                 panic("initiate_write_bmsafemap: inode %ju "
11002                                     "marked free", (uintmax_t)jaddref->ja_ino);
11003                 }
11004         }
11005         /*
11006          * Clear any block allocations which are pending journal writes.
11007          */
11008         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11009                 cgp = (struct cg *)bp->b_data;
11010                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11011                 blksfree = cg_blksfree(cgp);
11012                 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11013                         if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
11014                                 continue;
11015                         panic("initiate_write_bmsafemap: block %jd "
11016                             "marked free", jnewblk->jn_blkno);
11017                 }
11018         }
11019         /*
11020          * Move allocation lists to the written lists so they can be
11021          * cleared once the block write is complete.
11022          */
11023         LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
11024             inodedep, id_deps);
11025         LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11026             newblk, nb_deps);
11027         LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
11028             wk_list);
11029 }
11030
11031 /*
11032  * This routine is called during the completion interrupt
11033  * service routine for a disk write (from the procedure called
11034  * by the device driver to inform the filesystem caches of
11035  * a request completion).  It should be called early in this
11036  * procedure, before the block is made available to other
11037  * processes or other routines are called.
11038  *
11039  */
11040 static void
11041 softdep_disk_write_complete(bp)
11042         struct buf *bp;         /* describes the completed disk write */
11043 {
11044         struct worklist *wk;
11045         struct worklist *owk;
11046         struct ufsmount *ump;
11047         struct workhead reattach;
11048         struct freeblks *freeblks;
11049         struct buf *sbp;
11050
11051         ump = softdep_bp_to_mp(bp);
11052         KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL,
11053             ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL "
11054              "with outstanding dependencies for buffer %p", bp));
11055         if (ump == NULL)
11056                 return;
11057         /*
11058          * If an error occurred while doing the write, then the data
11059          * has not hit the disk and the dependencies cannot be processed.
11060          * But we do have to go through and roll forward any dependencies
11061          * that were rolled back before the disk write.
11062          */
11063         sbp = NULL;
11064         ACQUIRE_LOCK(ump);
11065         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11066                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11067                         switch (wk->wk_type) {
11068
11069                         case D_PAGEDEP:
11070                                 handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11071                                 continue;
11072
11073                         case D_INODEDEP:
11074                                 handle_written_inodeblock(WK_INODEDEP(wk),
11075                                     bp, 0);
11076                                 continue;
11077
11078                         case D_BMSAFEMAP:
11079                                 handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11080                                     bp, 0);
11081                                 continue;
11082
11083                         case D_INDIRDEP:
11084                                 handle_written_indirdep(WK_INDIRDEP(wk),
11085                                     bp, &sbp, 0);
11086                                 continue;
11087                         default:
11088                                 /* nothing to roll forward */
11089                                 continue;
11090                         }
11091                 }
11092                 FREE_LOCK(ump);
11093                 if (sbp)
11094                         brelse(sbp);
11095                 return;
11096         }
11097         LIST_INIT(&reattach);
11098
11099         /*
11100          * Ump SU lock must not be released anywhere in this code segment.
11101          */
11102         owk = NULL;
11103         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11104                 WORKLIST_REMOVE(wk);
11105                 atomic_add_long(&dep_write[wk->wk_type], 1);
11106                 if (wk == owk)
11107                         panic("duplicate worklist: %p\n", wk);
11108                 owk = wk;
11109                 switch (wk->wk_type) {
11110
11111                 case D_PAGEDEP:
11112                         if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11113                             WRITESUCCEEDED))
11114                                 WORKLIST_INSERT(&reattach, wk);
11115                         continue;
11116
11117                 case D_INODEDEP:
11118                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11119                             WRITESUCCEEDED))
11120                                 WORKLIST_INSERT(&reattach, wk);
11121                         continue;
11122
11123                 case D_BMSAFEMAP:
11124                         if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11125                             WRITESUCCEEDED))
11126                                 WORKLIST_INSERT(&reattach, wk);
11127                         continue;
11128
11129                 case D_MKDIR:
11130                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11131                         continue;
11132
11133                 case D_ALLOCDIRECT:
11134                         wk->wk_state |= COMPLETE;
11135                         handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11136                         continue;
11137
11138                 case D_ALLOCINDIR:
11139                         wk->wk_state |= COMPLETE;
11140                         handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11141                         continue;
11142
11143                 case D_INDIRDEP:
11144                         if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11145                             WRITESUCCEEDED))
11146                                 WORKLIST_INSERT(&reattach, wk);
11147                         continue;
11148
11149                 case D_FREEBLKS:
11150                         wk->wk_state |= COMPLETE;
11151                         freeblks = WK_FREEBLKS(wk);
11152                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11153                             LIST_EMPTY(&freeblks->fb_jblkdephd))
11154                                 add_to_worklist(wk, WK_NODELAY);
11155                         continue;
11156
11157                 case D_FREEWORK:
11158                         handle_written_freework(WK_FREEWORK(wk));
11159                         break;
11160
11161                 case D_JSEGDEP:
11162                         free_jsegdep(WK_JSEGDEP(wk));
11163                         continue;
11164
11165                 case D_JSEG:
11166                         handle_written_jseg(WK_JSEG(wk), bp);
11167                         continue;
11168
11169                 case D_SBDEP:
11170                         if (handle_written_sbdep(WK_SBDEP(wk), bp))
11171                                 WORKLIST_INSERT(&reattach, wk);
11172                         continue;
11173
11174                 case D_FREEDEP:
11175                         free_freedep(WK_FREEDEP(wk));
11176                         continue;
11177
11178                 default:
11179                         panic("handle_disk_write_complete: Unknown type %s",
11180                             TYPENAME(wk->wk_type));
11181                         /* NOTREACHED */
11182                 }
11183         }
11184         /*
11185          * Reattach any requests that must be redone.
11186          */
11187         while ((wk = LIST_FIRST(&reattach)) != NULL) {
11188                 WORKLIST_REMOVE(wk);
11189                 WORKLIST_INSERT(&bp->b_dep, wk);
11190         }
11191         FREE_LOCK(ump);
11192         if (sbp)
11193                 brelse(sbp);
11194 }
11195
11196 /*
11197  * Called from within softdep_disk_write_complete above.
11198  */
11199 static void
11200 handle_allocdirect_partdone(adp, wkhd)
11201         struct allocdirect *adp;        /* the completed allocdirect */
11202         struct workhead *wkhd;          /* Work to do when inode is writtne. */
11203 {
11204         struct allocdirectlst *listhead;
11205         struct allocdirect *listadp;
11206         struct inodedep *inodedep;
11207         long bsize;
11208
11209         LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
11210         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11211                 return;
11212         /*
11213          * The on-disk inode cannot claim to be any larger than the last
11214          * fragment that has been written. Otherwise, the on-disk inode
11215          * might have fragments that were not the last block in the file
11216          * which would corrupt the filesystem. Thus, we cannot free any
11217          * allocdirects after one whose ad_oldblkno claims a fragment as
11218          * these blocks must be rolled back to zero before writing the inode.
11219          * We check the currently active set of allocdirects in id_inoupdt
11220          * or id_extupdt as appropriate.
11221          */
11222         inodedep = adp->ad_inodedep;
11223         bsize = inodedep->id_fs->fs_bsize;
11224         if (adp->ad_state & EXTDATA)
11225                 listhead = &inodedep->id_extupdt;
11226         else
11227                 listhead = &inodedep->id_inoupdt;
11228         TAILQ_FOREACH(listadp, listhead, ad_next) {
11229                 /* found our block */
11230                 if (listadp == adp)
11231                         break;
11232                 /* continue if ad_oldlbn is not a fragment */
11233                 if (listadp->ad_oldsize == 0 ||
11234                     listadp->ad_oldsize == bsize)
11235                         continue;
11236                 /* hit a fragment */
11237                 return;
11238         }
11239         /*
11240          * If we have reached the end of the current list without
11241          * finding the just finished dependency, then it must be
11242          * on the future dependency list. Future dependencies cannot
11243          * be freed until they are moved to the current list.
11244          */
11245         if (listadp == NULL) {
11246 #ifdef INVARIANTS
11247                 if (adp->ad_state & EXTDATA)
11248                         listhead = &inodedep->id_newextupdt;
11249                 else
11250                         listhead = &inodedep->id_newinoupdt;
11251                 TAILQ_FOREACH(listadp, listhead, ad_next)
11252                         /* found our block */
11253                         if (listadp == adp)
11254                                 break;
11255                 if (listadp == NULL)
11256                         panic("handle_allocdirect_partdone: lost dep");
11257 #endif /* INVARIANTS */
11258                 return;
11259         }
11260         /*
11261          * If we have found the just finished dependency, then queue
11262          * it along with anything that follows it that is complete.
11263          * Since the pointer has not yet been written in the inode
11264          * as the dependency prevents it, place the allocdirect on the
11265          * bufwait list where it will be freed once the pointer is
11266          * valid.
11267          */
11268         if (wkhd == NULL)
11269                 wkhd = &inodedep->id_bufwait;
11270         for (; adp; adp = listadp) {
11271                 listadp = TAILQ_NEXT(adp, ad_next);
11272                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11273                         return;
11274                 TAILQ_REMOVE(listhead, adp, ad_next);
11275                 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11276         }
11277 }
11278
11279 /*
11280  * Called from within softdep_disk_write_complete above.  This routine
11281  * completes successfully written allocindirs.
11282  */
11283 static void
11284 handle_allocindir_partdone(aip)
11285         struct allocindir *aip;         /* the completed allocindir */
11286 {
11287         struct indirdep *indirdep;
11288
11289         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11290                 return;
11291         indirdep = aip->ai_indirdep;
11292         LIST_REMOVE(aip, ai_next);
11293         /*
11294          * Don't set a pointer while the buffer is undergoing IO or while
11295          * we have active truncations.
11296          */
11297         if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11298                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11299                 return;
11300         }
11301         if (indirdep->ir_state & UFS1FMT)
11302                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11303                     aip->ai_newblkno;
11304         else
11305                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11306                     aip->ai_newblkno;
11307         /*
11308          * Await the pointer write before freeing the allocindir.
11309          */
11310         LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11311 }
11312
11313 /*
11314  * Release segments held on a jwork list.
11315  */
11316 static void
11317 handle_jwork(wkhd)
11318         struct workhead *wkhd;
11319 {
11320         struct worklist *wk;
11321
11322         while ((wk = LIST_FIRST(wkhd)) != NULL) {
11323                 WORKLIST_REMOVE(wk);
11324                 switch (wk->wk_type) {
11325                 case D_JSEGDEP:
11326                         free_jsegdep(WK_JSEGDEP(wk));
11327                         continue;
11328                 case D_FREEDEP:
11329                         free_freedep(WK_FREEDEP(wk));
11330                         continue;
11331                 case D_FREEFRAG:
11332                         rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11333                         WORKITEM_FREE(wk, D_FREEFRAG);
11334                         continue;
11335                 case D_FREEWORK:
11336                         handle_written_freework(WK_FREEWORK(wk));
11337                         continue;
11338                 default:
11339                         panic("handle_jwork: Unknown type %s\n",
11340                             TYPENAME(wk->wk_type));
11341                 }
11342         }
11343 }
11344
11345 /*
11346  * Handle the bufwait list on an inode when it is safe to release items
11347  * held there.  This normally happens after an inode block is written but
11348  * may be delayed and handled later if there are pending journal items that
11349  * are not yet safe to be released.
11350  */
11351 static struct freefile *
11352 handle_bufwait(inodedep, refhd)
11353         struct inodedep *inodedep;
11354         struct workhead *refhd;
11355 {
11356         struct jaddref *jaddref;
11357         struct freefile *freefile;
11358         struct worklist *wk;
11359
11360         freefile = NULL;
11361         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11362                 WORKLIST_REMOVE(wk);
11363                 switch (wk->wk_type) {
11364                 case D_FREEFILE:
11365                         /*
11366                          * We defer adding freefile to the worklist
11367                          * until all other additions have been made to
11368                          * ensure that it will be done after all the
11369                          * old blocks have been freed.
11370                          */
11371                         if (freefile != NULL)
11372                                 panic("handle_bufwait: freefile");
11373                         freefile = WK_FREEFILE(wk);
11374                         continue;
11375
11376                 case D_MKDIR:
11377                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11378                         continue;
11379
11380                 case D_DIRADD:
11381                         diradd_inode_written(WK_DIRADD(wk), inodedep);
11382                         continue;
11383
11384                 case D_FREEFRAG:
11385                         wk->wk_state |= COMPLETE;
11386                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11387                                 add_to_worklist(wk, 0);
11388                         continue;
11389
11390                 case D_DIRREM:
11391                         wk->wk_state |= COMPLETE;
11392                         add_to_worklist(wk, 0);
11393                         continue;
11394
11395                 case D_ALLOCDIRECT:
11396                 case D_ALLOCINDIR:
11397                         free_newblk(WK_NEWBLK(wk));
11398                         continue;
11399
11400                 case D_JNEWBLK:
11401                         wk->wk_state |= COMPLETE;
11402                         free_jnewblk(WK_JNEWBLK(wk));
11403                         continue;
11404
11405                 /*
11406                  * Save freed journal segments and add references on
11407                  * the supplied list which will delay their release
11408                  * until the cg bitmap is cleared on disk.
11409                  */
11410                 case D_JSEGDEP:
11411                         if (refhd == NULL)
11412                                 free_jsegdep(WK_JSEGDEP(wk));
11413                         else
11414                                 WORKLIST_INSERT(refhd, wk);
11415                         continue;
11416
11417                 case D_JADDREF:
11418                         jaddref = WK_JADDREF(wk);
11419                         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11420                             if_deps);
11421                         /*
11422                          * Transfer any jaddrefs to the list to be freed with
11423                          * the bitmap if we're handling a removed file.
11424                          */
11425                         if (refhd == NULL) {
11426                                 wk->wk_state |= COMPLETE;
11427                                 free_jaddref(jaddref);
11428                         } else
11429                                 WORKLIST_INSERT(refhd, wk);
11430                         continue;
11431
11432                 default:
11433                         panic("handle_bufwait: Unknown type %p(%s)",
11434                             wk, TYPENAME(wk->wk_type));
11435                         /* NOTREACHED */
11436                 }
11437         }
11438         return (freefile);
11439 }
11440 /*
11441  * Called from within softdep_disk_write_complete above to restore
11442  * in-memory inode block contents to their most up-to-date state. Note
11443  * that this routine is always called from interrupt level with further
11444  * interrupts from this device blocked.
11445  *
11446  * If the write did not succeed, we will do all the roll-forward
11447  * operations, but we will not take the actions that will allow its
11448  * dependencies to be processed.
11449  */
11450 static int
11451 handle_written_inodeblock(inodedep, bp, flags)
11452         struct inodedep *inodedep;
11453         struct buf *bp;         /* buffer containing the inode block */
11454         int flags;
11455 {
11456         struct freefile *freefile;
11457         struct allocdirect *adp, *nextadp;
11458         struct ufs1_dinode *dp1 = NULL;
11459         struct ufs2_dinode *dp2 = NULL;
11460         struct workhead wkhd;
11461         int hadchanges, fstype;
11462         ino_t freelink;
11463
11464         LIST_INIT(&wkhd);
11465         hadchanges = 0;
11466         freefile = NULL;
11467         if ((inodedep->id_state & IOSTARTED) == 0)
11468                 panic("handle_written_inodeblock: not started");
11469         inodedep->id_state &= ~IOSTARTED;
11470         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11471                 fstype = UFS1;
11472                 dp1 = (struct ufs1_dinode *)bp->b_data +
11473                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11474                 freelink = dp1->di_freelink;
11475         } else {
11476                 fstype = UFS2;
11477                 dp2 = (struct ufs2_dinode *)bp->b_data +
11478                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11479                 freelink = dp2->di_freelink;
11480         }
11481         /*
11482          * Leave this inodeblock dirty until it's in the list.
11483          */
11484         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11485             (flags & WRITESUCCEEDED)) {
11486                 struct inodedep *inon;
11487
11488                 inon = TAILQ_NEXT(inodedep, id_unlinked);
11489                 if ((inon == NULL && freelink == 0) ||
11490                     (inon && inon->id_ino == freelink)) {
11491                         if (inon)
11492                                 inon->id_state |= UNLINKPREV;
11493                         inodedep->id_state |= UNLINKNEXT;
11494                 }
11495                 hadchanges = 1;
11496         }
11497         /*
11498          * If we had to rollback the inode allocation because of
11499          * bitmaps being incomplete, then simply restore it.
11500          * Keep the block dirty so that it will not be reclaimed until
11501          * all associated dependencies have been cleared and the
11502          * corresponding updates written to disk.
11503          */
11504         if (inodedep->id_savedino1 != NULL) {
11505                 hadchanges = 1;
11506                 if (fstype == UFS1)
11507                         *dp1 = *inodedep->id_savedino1;
11508                 else
11509                         *dp2 = *inodedep->id_savedino2;
11510                 free(inodedep->id_savedino1, M_SAVEDINO);
11511                 inodedep->id_savedino1 = NULL;
11512                 if ((bp->b_flags & B_DELWRI) == 0)
11513                         stat_inode_bitmap++;
11514                 bdirty(bp);
11515                 /*
11516                  * If the inode is clear here and GOINGAWAY it will never
11517                  * be written.  Process the bufwait and clear any pending
11518                  * work which may include the freefile.
11519                  */
11520                 if (inodedep->id_state & GOINGAWAY)
11521                         goto bufwait;
11522                 return (1);
11523         }
11524         if (flags & WRITESUCCEEDED)
11525                 inodedep->id_state |= COMPLETE;
11526         /*
11527          * Roll forward anything that had to be rolled back before
11528          * the inode could be updated.
11529          */
11530         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11531                 nextadp = TAILQ_NEXT(adp, ad_next);
11532                 if (adp->ad_state & ATTACHED)
11533                         panic("handle_written_inodeblock: new entry");
11534                 if (fstype == UFS1) {
11535                         if (adp->ad_offset < UFS_NDADDR) {
11536                                 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11537                                         panic("%s %s #%jd mismatch %d != %jd",
11538                                             "handle_written_inodeblock:",
11539                                             "direct pointer",
11540                                             (intmax_t)adp->ad_offset,
11541                                             dp1->di_db[adp->ad_offset],
11542                                             (intmax_t)adp->ad_oldblkno);
11543                                 dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11544                         } else {
11545                                 if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
11546                                     0)
11547                                         panic("%s: %s #%jd allocated as %d",
11548                                             "handle_written_inodeblock",
11549                                             "indirect pointer",
11550                                             (intmax_t)adp->ad_offset -
11551                                             UFS_NDADDR,
11552                                             dp1->di_ib[adp->ad_offset -
11553                                             UFS_NDADDR]);
11554                                 dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
11555                                     adp->ad_newblkno;
11556                         }
11557                 } else {
11558                         if (adp->ad_offset < UFS_NDADDR) {
11559                                 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11560                                         panic("%s: %s #%jd %s %jd != %jd",
11561                                             "handle_written_inodeblock",
11562                                             "direct pointer",
11563                                             (intmax_t)adp->ad_offset, "mismatch",
11564                                             (intmax_t)dp2->di_db[adp->ad_offset],
11565                                             (intmax_t)adp->ad_oldblkno);
11566                                 dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11567                         } else {
11568                                 if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
11569                                     0)
11570                                         panic("%s: %s #%jd allocated as %jd",
11571                                             "handle_written_inodeblock",
11572                                             "indirect pointer",
11573                                             (intmax_t)adp->ad_offset -
11574                                             UFS_NDADDR,
11575                                             (intmax_t)
11576                                             dp2->di_ib[adp->ad_offset -
11577                                             UFS_NDADDR]);
11578                                 dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
11579                                     adp->ad_newblkno;
11580                         }
11581                 }
11582                 adp->ad_state &= ~UNDONE;
11583                 adp->ad_state |= ATTACHED;
11584                 hadchanges = 1;
11585         }
11586         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11587                 nextadp = TAILQ_NEXT(adp, ad_next);
11588                 if (adp->ad_state & ATTACHED)
11589                         panic("handle_written_inodeblock: new entry");
11590                 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11591                         panic("%s: direct pointers #%jd %s %jd != %jd",
11592                             "handle_written_inodeblock",
11593                             (intmax_t)adp->ad_offset, "mismatch",
11594                             (intmax_t)dp2->di_extb[adp->ad_offset],
11595                             (intmax_t)adp->ad_oldblkno);
11596                 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11597                 adp->ad_state &= ~UNDONE;
11598                 adp->ad_state |= ATTACHED;
11599                 hadchanges = 1;
11600         }
11601         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11602                 stat_direct_blk_ptrs++;
11603         /*
11604          * Reset the file size to its most up-to-date value.
11605          */
11606         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11607                 panic("handle_written_inodeblock: bad size");
11608         if (inodedep->id_savednlink > UFS_LINK_MAX)
11609                 panic("handle_written_inodeblock: Invalid link count "
11610                     "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
11611                     inodedep);
11612         if (fstype == UFS1) {
11613                 if (dp1->di_nlink != inodedep->id_savednlink) {
11614                         dp1->di_nlink = inodedep->id_savednlink;
11615                         hadchanges = 1;
11616                 }
11617                 if (dp1->di_size != inodedep->id_savedsize) {
11618                         dp1->di_size = inodedep->id_savedsize;
11619                         hadchanges = 1;
11620                 }
11621         } else {
11622                 if (dp2->di_nlink != inodedep->id_savednlink) {
11623                         dp2->di_nlink = inodedep->id_savednlink;
11624                         hadchanges = 1;
11625                 }
11626                 if (dp2->di_size != inodedep->id_savedsize) {
11627                         dp2->di_size = inodedep->id_savedsize;
11628                         hadchanges = 1;
11629                 }
11630                 if (dp2->di_extsize != inodedep->id_savedextsize) {
11631                         dp2->di_extsize = inodedep->id_savedextsize;
11632                         hadchanges = 1;
11633                 }
11634         }
11635         inodedep->id_savedsize = -1;
11636         inodedep->id_savedextsize = -1;
11637         inodedep->id_savednlink = -1;
11638         /*
11639          * If there were any rollbacks in the inode block, then it must be
11640          * marked dirty so that its will eventually get written back in
11641          * its correct form.
11642          */
11643         if (hadchanges) {
11644                 if (fstype == UFS2)
11645                         ffs_update_dinode_ckhash(inodedep->id_fs, dp2);
11646                 bdirty(bp);
11647         }
11648 bufwait:
11649         /*
11650          * If the write did not succeed, we have done all the roll-forward
11651          * operations, but we cannot take the actions that will allow its
11652          * dependencies to be processed.
11653          */
11654         if ((flags & WRITESUCCEEDED) == 0)
11655                 return (hadchanges);
11656         /*
11657          * Process any allocdirects that completed during the update.
11658          */
11659         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11660                 handle_allocdirect_partdone(adp, &wkhd);
11661         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11662                 handle_allocdirect_partdone(adp, &wkhd);
11663         /*
11664          * Process deallocations that were held pending until the
11665          * inode had been written to disk. Freeing of the inode
11666          * is delayed until after all blocks have been freed to
11667          * avoid creation of new <vfsid, inum, lbn> triples
11668          * before the old ones have been deleted.  Completely
11669          * unlinked inodes are not processed until the unlinked
11670          * inode list is written or the last reference is removed.
11671          */
11672         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11673                 freefile = handle_bufwait(inodedep, NULL);
11674                 if (freefile && !LIST_EMPTY(&wkhd)) {
11675                         WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11676                         freefile = NULL;
11677                 }
11678         }
11679         /*
11680          * Move rolled forward dependency completions to the bufwait list
11681          * now that those that were already written have been processed.
11682          */
11683         if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11684                 panic("handle_written_inodeblock: bufwait but no changes");
11685         jwork_move(&inodedep->id_bufwait, &wkhd);
11686
11687         if (freefile != NULL) {
11688                 /*
11689                  * If the inode is goingaway it was never written.  Fake up
11690                  * the state here so free_inodedep() can succeed.
11691                  */
11692                 if (inodedep->id_state & GOINGAWAY)
11693                         inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11694                 if (free_inodedep(inodedep) == 0)
11695                         panic("handle_written_inodeblock: live inodedep %p",
11696                             inodedep);
11697                 add_to_worklist(&freefile->fx_list, 0);
11698                 return (0);
11699         }
11700
11701         /*
11702          * If no outstanding dependencies, free it.
11703          */
11704         if (free_inodedep(inodedep) ||
11705             (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11706              TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11707              TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11708              LIST_FIRST(&inodedep->id_bufwait) == 0))
11709                 return (0);
11710         return (hadchanges);
11711 }
11712
11713 /*
11714  * Perform needed roll-forwards and kick off any dependencies that
11715  * can now be processed.
11716  *
11717  * If the write did not succeed, we will do all the roll-forward
11718  * operations, but we will not take the actions that will allow its
11719  * dependencies to be processed.
11720  */
11721 static int
11722 handle_written_indirdep(indirdep, bp, bpp, flags)
11723         struct indirdep *indirdep;
11724         struct buf *bp;
11725         struct buf **bpp;
11726         int flags;
11727 {
11728         struct allocindir *aip;
11729         struct buf *sbp;
11730         int chgs;
11731
11732         if (indirdep->ir_state & GOINGAWAY)
11733                 panic("handle_written_indirdep: indirdep gone");
11734         if ((indirdep->ir_state & IOSTARTED) == 0)
11735                 panic("handle_written_indirdep: IO not started");
11736         chgs = 0;
11737         /*
11738          * If there were rollbacks revert them here.
11739          */
11740         if (indirdep->ir_saveddata) {
11741                 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11742                 if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11743                         free(indirdep->ir_saveddata, M_INDIRDEP);
11744                         indirdep->ir_saveddata = NULL;
11745                 }
11746                 chgs = 1;
11747         }
11748         indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11749         indirdep->ir_state |= ATTACHED;
11750         /*
11751          * If the write did not succeed, we have done all the roll-forward
11752          * operations, but we cannot take the actions that will allow its
11753          * dependencies to be processed.
11754          */
11755         if ((flags & WRITESUCCEEDED) == 0) {
11756                 stat_indir_blk_ptrs++;
11757                 bdirty(bp);
11758                 return (1);
11759         }
11760         /*
11761          * Move allocindirs with written pointers to the completehd if
11762          * the indirdep's pointer is not yet written.  Otherwise
11763          * free them here.
11764          */
11765         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11766                 LIST_REMOVE(aip, ai_next);
11767                 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11768                         LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11769                             ai_next);
11770                         newblk_freefrag(&aip->ai_block);
11771                         continue;
11772                 }
11773                 free_newblk(&aip->ai_block);
11774         }
11775         /*
11776          * Move allocindirs that have finished dependency processing from
11777          * the done list to the write list after updating the pointers.
11778          */
11779         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11780                 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11781                         handle_allocindir_partdone(aip);
11782                         if (aip == LIST_FIRST(&indirdep->ir_donehd))
11783                                 panic("disk_write_complete: not gone");
11784                         chgs = 1;
11785                 }
11786         }
11787         /*
11788          * Preserve the indirdep if there were any changes or if it is not
11789          * yet valid on disk.
11790          */
11791         if (chgs) {
11792                 stat_indir_blk_ptrs++;
11793                 bdirty(bp);
11794                 return (1);
11795         }
11796         /*
11797          * If there were no changes we can discard the savedbp and detach
11798          * ourselves from the buf.  We are only carrying completed pointers
11799          * in this case.
11800          */
11801         sbp = indirdep->ir_savebp;
11802         sbp->b_flags |= B_INVAL | B_NOCACHE;
11803         indirdep->ir_savebp = NULL;
11804         indirdep->ir_bp = NULL;
11805         if (*bpp != NULL)
11806                 panic("handle_written_indirdep: bp already exists.");
11807         *bpp = sbp;
11808         /*
11809          * The indirdep may not be freed until its parent points at it.
11810          */
11811         if (indirdep->ir_state & DEPCOMPLETE)
11812                 free_indirdep(indirdep);
11813
11814         return (0);
11815 }
11816
11817 /*
11818  * Process a diradd entry after its dependent inode has been written.
11819  */
11820 static void
11821 diradd_inode_written(dap, inodedep)
11822         struct diradd *dap;
11823         struct inodedep *inodedep;
11824 {
11825
11826         LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
11827         dap->da_state |= COMPLETE;
11828         complete_diradd(dap);
11829         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11830 }
11831
11832 /*
11833  * Returns true if the bmsafemap will have rollbacks when written.  Must only
11834  * be called with the per-filesystem lock and the buf lock on the cg held.
11835  */
11836 static int
11837 bmsafemap_backgroundwrite(bmsafemap, bp)
11838         struct bmsafemap *bmsafemap;
11839         struct buf *bp;
11840 {
11841         int dirty;
11842
11843         LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11844         dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11845             !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11846         /*
11847          * If we're initiating a background write we need to process the
11848          * rollbacks as they exist now, not as they exist when IO starts.
11849          * No other consumers will look at the contents of the shadowed
11850          * buf so this is safe to do here.
11851          */
11852         if (bp->b_xflags & BX_BKGRDMARKER)
11853                 initiate_write_bmsafemap(bmsafemap, bp);
11854
11855         return (dirty);
11856 }
11857
11858 /*
11859  * Re-apply an allocation when a cg write is complete.
11860  */
11861 static int
11862 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11863         struct jnewblk *jnewblk;
11864         struct fs *fs;
11865         struct cg *cgp;
11866         uint8_t *blksfree;
11867 {
11868         ufs1_daddr_t fragno;
11869         ufs2_daddr_t blkno;
11870         long cgbno, bbase;
11871         int frags, blk;
11872         int i;
11873
11874         frags = 0;
11875         cgbno = dtogd(fs, jnewblk->jn_blkno);
11876         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11877                 if (isclr(blksfree, cgbno + i))
11878                         panic("jnewblk_rollforward: re-allocated fragment");
11879                 frags++;
11880         }
11881         if (frags == fs->fs_frag) {
11882                 blkno = fragstoblks(fs, cgbno);
11883                 ffs_clrblock(fs, blksfree, (long)blkno);
11884                 ffs_clusteracct(fs, cgp, blkno, -1);
11885                 cgp->cg_cs.cs_nbfree--;
11886         } else {
11887                 bbase = cgbno - fragnum(fs, cgbno);
11888                 cgbno += jnewblk->jn_oldfrags;
11889                 /* If a complete block had been reassembled, account for it. */
11890                 fragno = fragstoblks(fs, bbase);
11891                 if (ffs_isblock(fs, blksfree, fragno)) {
11892                         cgp->cg_cs.cs_nffree += fs->fs_frag;
11893                         ffs_clusteracct(fs, cgp, fragno, -1);
11894                         cgp->cg_cs.cs_nbfree--;
11895                 }
11896                 /* Decrement the old frags.  */
11897                 blk = blkmap(fs, blksfree, bbase);
11898                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11899                 /* Allocate the fragment */
11900                 for (i = 0; i < frags; i++)
11901                         clrbit(blksfree, cgbno + i);
11902                 cgp->cg_cs.cs_nffree -= frags;
11903                 /* Add back in counts associated with the new frags */
11904                 blk = blkmap(fs, blksfree, bbase);
11905                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11906         }
11907         return (frags);
11908 }
11909
11910 /*
11911  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11912  * changes if it's not a background write.  Set all written dependencies
11913  * to DEPCOMPLETE and free the structure if possible.
11914  *
11915  * If the write did not succeed, we will do all the roll-forward
11916  * operations, but we will not take the actions that will allow its
11917  * dependencies to be processed.
11918  */
11919 static int
11920 handle_written_bmsafemap(bmsafemap, bp, flags)
11921         struct bmsafemap *bmsafemap;
11922         struct buf *bp;
11923         int flags;
11924 {
11925         struct newblk *newblk;
11926         struct inodedep *inodedep;
11927         struct jaddref *jaddref, *jatmp;
11928         struct jnewblk *jnewblk, *jntmp;
11929         struct ufsmount *ump;
11930         uint8_t *inosused;
11931         uint8_t *blksfree;
11932         struct cg *cgp;
11933         struct fs *fs;
11934         ino_t ino;
11935         int foreground;
11936         int chgs;
11937
11938         if ((bmsafemap->sm_state & IOSTARTED) == 0)
11939                 panic("handle_written_bmsafemap: Not started\n");
11940         ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11941         chgs = 0;
11942         bmsafemap->sm_state &= ~IOSTARTED;
11943         foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11944         /*
11945          * If write was successful, release journal work that was waiting
11946          * on the write. Otherwise move the work back.
11947          */
11948         if (flags & WRITESUCCEEDED)
11949                 handle_jwork(&bmsafemap->sm_freewr);
11950         else
11951                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11952                     worklist, wk_list);
11953
11954         /*
11955          * Restore unwritten inode allocation pending jaddref writes.
11956          */
11957         if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11958                 cgp = (struct cg *)bp->b_data;
11959                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11960                 inosused = cg_inosused(cgp);
11961                 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11962                     ja_bmdeps, jatmp) {
11963                         if ((jaddref->ja_state & UNDONE) == 0)
11964                                 continue;
11965                         ino = jaddref->ja_ino % fs->fs_ipg;
11966                         if (isset(inosused, ino))
11967                                 panic("handle_written_bmsafemap: "
11968                                     "re-allocated inode");
11969                         /* Do the roll-forward only if it's a real copy. */
11970                         if (foreground) {
11971                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
11972                                         cgp->cg_cs.cs_ndir++;
11973                                 cgp->cg_cs.cs_nifree--;
11974                                 setbit(inosused, ino);
11975                                 chgs = 1;
11976                         }
11977                         jaddref->ja_state &= ~UNDONE;
11978                         jaddref->ja_state |= ATTACHED;
11979                         free_jaddref(jaddref);
11980                 }
11981         }
11982         /*
11983          * Restore any block allocations which are pending journal writes.
11984          */
11985         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11986                 cgp = (struct cg *)bp->b_data;
11987                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11988                 blksfree = cg_blksfree(cgp);
11989                 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11990                     jntmp) {
11991                         if ((jnewblk->jn_state & UNDONE) == 0)
11992                                 continue;
11993                         /* Do the roll-forward only if it's a real copy. */
11994                         if (foreground &&
11995                             jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11996                                 chgs = 1;
11997                         jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11998                         jnewblk->jn_state |= ATTACHED;
11999                         free_jnewblk(jnewblk);
12000                 }
12001         }
12002         /*
12003          * If the write did not succeed, we have done all the roll-forward
12004          * operations, but we cannot take the actions that will allow its
12005          * dependencies to be processed.
12006          */
12007         if ((flags & WRITESUCCEEDED) == 0) {
12008                 LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
12009                     newblk, nb_deps);
12010                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12011                     worklist, wk_list);
12012                 if (foreground)
12013                         bdirty(bp);
12014                 return (1);
12015         }
12016         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
12017                 newblk->nb_state |= DEPCOMPLETE;
12018                 newblk->nb_state &= ~ONDEPLIST;
12019                 newblk->nb_bmsafemap = NULL;
12020                 LIST_REMOVE(newblk, nb_deps);
12021                 if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
12022                         handle_allocdirect_partdone(
12023                             WK_ALLOCDIRECT(&newblk->nb_list), NULL);
12024                 else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
12025                         handle_allocindir_partdone(
12026                             WK_ALLOCINDIR(&newblk->nb_list));
12027                 else if (newblk->nb_list.wk_type != D_NEWBLK)
12028                         panic("handle_written_bmsafemap: Unexpected type: %s",
12029                             TYPENAME(newblk->nb_list.wk_type));
12030         }
12031         while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
12032                 inodedep->id_state |= DEPCOMPLETE;
12033                 inodedep->id_state &= ~ONDEPLIST;
12034                 LIST_REMOVE(inodedep, id_deps);
12035                 inodedep->id_bmsafemap = NULL;
12036         }
12037         LIST_REMOVE(bmsafemap, sm_next);
12038         if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
12039             LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
12040             LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
12041             LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
12042             LIST_EMPTY(&bmsafemap->sm_freehd)) {
12043                 LIST_REMOVE(bmsafemap, sm_hash);
12044                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
12045                 return (0);
12046         }
12047         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
12048         if (foreground)
12049                 bdirty(bp);
12050         return (1);
12051 }
12052
12053 /*
12054  * Try to free a mkdir dependency.
12055  */
12056 static void
12057 complete_mkdir(mkdir)
12058         struct mkdir *mkdir;
12059 {
12060         struct diradd *dap;
12061
12062         if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
12063                 return;
12064         LIST_REMOVE(mkdir, md_mkdirs);
12065         dap = mkdir->md_diradd;
12066         dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
12067         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
12068                 dap->da_state |= DEPCOMPLETE;
12069                 complete_diradd(dap);
12070         }
12071         WORKITEM_FREE(mkdir, D_MKDIR);
12072 }
12073
12074 /*
12075  * Handle the completion of a mkdir dependency.
12076  */
12077 static void
12078 handle_written_mkdir(mkdir, type)
12079         struct mkdir *mkdir;
12080         int type;
12081 {
12082
12083         if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12084                 panic("handle_written_mkdir: bad type");
12085         mkdir->md_state |= COMPLETE;
12086         complete_mkdir(mkdir);
12087 }
12088
12089 static int
12090 free_pagedep(pagedep)
12091         struct pagedep *pagedep;
12092 {
12093         int i;
12094
12095         if (pagedep->pd_state & NEWBLOCK)
12096                 return (0);
12097         if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12098                 return (0);
12099         for (i = 0; i < DAHASHSZ; i++)
12100                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12101                         return (0);
12102         if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12103                 return (0);
12104         if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12105                 return (0);
12106         if (pagedep->pd_state & ONWORKLIST)
12107                 WORKLIST_REMOVE(&pagedep->pd_list);
12108         LIST_REMOVE(pagedep, pd_hash);
12109         WORKITEM_FREE(pagedep, D_PAGEDEP);
12110
12111         return (1);
12112 }
12113
12114 /*
12115  * Called from within softdep_disk_write_complete above.
12116  * A write operation was just completed. Removed inodes can
12117  * now be freed and associated block pointers may be committed.
12118  * Note that this routine is always called from interrupt level
12119  * with further interrupts from this device blocked.
12120  *
12121  * If the write did not succeed, we will do all the roll-forward
12122  * operations, but we will not take the actions that will allow its
12123  * dependencies to be processed.
       *
       * The pagedep must have IOSTARTED set on entry (otherwise we panic);
       * the flag is cleared here.  Returns 1 when the buffer has been
       * marked dirty again with bdirty() (entries were rolled forward, or
       * the write failed), and 0 after free_pagedep() has been tried.
12124  */
12125 static int
12126 handle_written_filepage(pagedep, bp, flags)
12127         struct pagedep *pagedep;
12128         struct buf *bp;         /* buffer containing the written page */
12129         int flags;              /* WRITESUCCEEDED is tested below */
12130 {
12131         struct dirrem *dirrem;
12132         struct diradd *dap, *nextdap;
12133         struct direct *ep;
12134         int i, chgs;
12135
12136         if ((pagedep->pd_state & IOSTARTED) == 0)
12137                 panic("handle_written_filepage: not started");
12138         pagedep->pd_state &= ~IOSTARTED;
              /* Failed write: skip the commit steps, only restore entries. */
12139         if ((flags & WRITESUCCEEDED) == 0)
12140                 goto rollforward;
12141         /*
12142          * Process any directory removals that have been committed.
12143          */
12144         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12145                 LIST_REMOVE(dirrem, dm_next);
12146                 dirrem->dm_state |= COMPLETE;
12147                 dirrem->dm_dirinum = pagedep->pd_ino;
12148                 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12149                     ("handle_written_filepage: Journal entries not written."));
12150                 add_to_worklist(&dirrem->dm_list, 0);
12151         }
12152         /*
12153          * Free any directory additions that have been committed.
12154          * If it is a newly allocated block, we have to wait until
12155          * the on-disk directory inode claims the new block.
12156          */
12157         if ((pagedep->pd_state & NEWBLOCK) == 0)
12158                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12159                         free_diradd(dap, NULL);
12160 rollforward:
12161         /*
12162          * Uncommitted directory entries must be restored.
12163          */
12164         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
                      /*
                       * nextdap is fetched before the body runs because dap
                       * may be moved to pd_pendinghd below.
                       */
12165                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12166                      dap = nextdap) {
12167                         nextdap = LIST_NEXT(dap, da_pdlist);
12168                         if (dap->da_state & ATTACHED)
12169                                 panic("handle_written_filepage: attached");
                              /* Put da_newinum back into the entry at da_offset. */
12170                         ep = (struct direct *)
12171                             ((char *)bp->b_data + dap->da_offset);
12172                         ep->d_ino = dap->da_newinum;
12173                         dap->da_state &= ~UNDONE;
12174                         dap->da_state |= ATTACHED;
12175                         chgs = 1;
12176                         /*
12177                          * If the inode referenced by the directory has
12178                          * been written out, then the dependency can be
12179                          * moved to the pending list.
12180                          */
12181                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12182                                 LIST_REMOVE(dap, da_pdlist);
12183                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12184                                     da_pdlist);
12185                         }
12186                 }
12187         }
12188         /*
12189          * If there were any rollbacks in the directory, then it must be
12190          * marked dirty so that it will eventually get written back in
12191          * its correct form.
12192          */
12193         if (chgs || (flags & WRITESUCCEEDED) == 0) {
                      /* Bump the statistic only if bp was not already B_DELWRI. */
12194                 if ((bp->b_flags & B_DELWRI) == 0)
12195                         stat_dir_entry++;
12196                 bdirty(bp);
12197                 return (1);
12198         }
12199         /*
12200          * If we are not waiting for a new directory block to be
12201          * claimed by its inode, then the pagedep will be freed.
12202          * Otherwise it will remain to track any new entries on
12203          * the page in case they are fsync'ed.
12204          */
12205         free_pagedep(pagedep);
12206         return (0);
12207 }
12208
12209 /*
12210  * Writing back in-core inode structures.
12211  *
12212  * The filesystem only accesses an inode's contents when it occupies an
12213  * "in-core" inode structure.  These "in-core" structures are separate from
12214  * the page frames used to cache inode blocks.  Only the latter are
12215  * transferred to/from the disk.  So, when the updated contents of the
12216  * "in-core" inode structure are copied to the corresponding in-memory inode
12217  * block, the dependencies are also transferred.  The following procedure is
12218  * called when copying a dirty "in-core" inode to a cached inode block.
12219  */
12220
12221 /*
12222  * Called when an inode is loaded from disk. If the effective link count
12223  * differed from the actual link count when it was last flushed, then we
12224  * need to ensure that the correct effective link count is put back.
12225  */
12226 void
12227 softdep_load_inodeblock(ip)
12228         struct inode *ip;       /* the "in_core" copy of the inode */
12229 {
12230         struct inodedep *inodedep;
12231         struct ufsmount *ump;
12232
12233         ump = ITOUMP(ip);
12234         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12235             ("softdep_load_inodeblock called on non-softdep filesystem"));
12236         /*
12237          * Check for alternate nlink count.
12238          */
12239         ip->i_effnlink = ip->i_nlink;
12240         ACQUIRE_LOCK(ump);
12241         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12242                 FREE_LOCK(ump);
12243                 return;
12244         }
12245         ip->i_effnlink -= inodedep->id_nlinkdelta;
12246         FREE_LOCK(ump);
12247 }
12248
12249 /*
12250  * This routine is called just before the "in-core" inode
12251  * information is to be copied to the in-memory inode block.
12252  * Recall that an inode block contains several inodes. If
12253  * the force flag is set, then the dependencies will be
12254  * cleared so that the update can always be made. Note that
12255  * the buffer is locked when this routine is called, so we
12256  * will never be in the middle of writing the inode block
12257  * to disk.
       *
       * The "force flag" above is the waitfor argument.  The softdep lock
       * is taken here and released on every return path.
12258  */
12259 void
12260 softdep_update_inodeblock(ip, bp, waitfor)
12261         struct inode *ip;       /* the "in_core" copy of the inode */
12262         struct buf *bp;         /* the buffer containing the inode block */
12263         int waitfor;            /* nonzero => update must be allowed */
12264 {
12265         struct inodedep *inodedep;
12266         struct inoref *inoref;
12267         struct ufsmount *ump;
12268         struct worklist *wk;
12269         struct mount *mp;
12270         struct buf *ibp;
12271         struct fs *fs;
12272         int error;
12273
12274         ump = ITOUMP(ip);
12275         mp = UFSTOVFS(ump);
12276         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12277             ("softdep_update_inodeblock called on non-softdep filesystem"));
12278         fs = ump->um_fs;
12279         /*
12280          * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12281          * does not have access to the in-core ip so must write directly into
12282          * the inode block buffer when setting freelink.
12283          */
12284         if (fs->fs_magic == FS_UFS1_MAGIC)
12285                 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12286                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12287         else
12288                 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12289                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12290         /*
12291          * If the effective link count is not equal to the actual link
12292          * count, then we must track the difference in an inodedep while
12293          * the inode is (potentially) tossed out of the cache. Otherwise,
12294          * if there is no existing inodedep, then there are no dependencies
12295          * to track.
12296          */
12297         ACQUIRE_LOCK(ump);
12298 again:
12299         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12300                 FREE_LOCK(ump);
12301                 if (ip->i_effnlink != ip->i_nlink)
12302                         panic("softdep_update_inodeblock: bad link count");
12303                 return;
12304         }
12305         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12306                 panic("softdep_update_inodeblock: bad delta");
12307         /*
12308          * If we're flushing all dependencies we must also move any waiting
12309          * for journal writes onto the bufwait list prior to I/O.
12310          */
12311         if (waitfor) {
12312                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12313                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12314                             == DEPCOMPLETE) {
                                      /*
                                       * After waiting, start over from the
                                       * lookup instead of continuing the walk.
                                       */
12315                                 jwait(&inoref->if_list, MNT_WAIT);
12316                                 goto again;
12317                         }
12318                 }
12319         }
12320         /*
12321          * Changes have been initiated. Anything depending on these
12322          * changes cannot occur until this inode has been written.
12323          */
12324         inodedep->id_state &= ~COMPLETE;
12325         if ((inodedep->id_state & ONWORKLIST) == 0)
12326                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12327         /*
12328          * Any new dependencies associated with the incore inode must
12329          * now be moved to the list associated with the buffer holding
12330          * the in-memory copy of the inode. Once merged process any
12331          * allocdirects that are completed by the merger.
12332          */
12333         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12334         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12335                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12336                     NULL);
12337         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12338         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12339                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12340                     NULL);
12341         /*
12342          * Now that the inode has been pushed into the buffer, the
12343          * operations dependent on the inode being written to disk
12344          * can be moved to the id_bufwait so that they will be
12345          * processed when the buffer I/O completes.
12346          */
12347         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12348                 WORKLIST_REMOVE(wk);
12349                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12350         }
12351         /*
12352          * Newly allocated inodes cannot be written until the bitmap
12353          * that allocates them have been written (indicated by
12354          * DEPCOMPLETE being set in id_state). If we are doing a
12355          * forced sync (e.g., an fsync on a file), we force the bitmap
12356          * to be written so that the update can be done.
12357          */
12358         if (waitfor == 0) {
12359                 FREE_LOCK(ump);
12360                 return;
12361         }
12362 retry:
              /* DEPCOMPLETE or GOINGAWAY already set: no bitmap write to force. */
12363         if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12364                 FREE_LOCK(ump);
12365                 return;
12366         }
12367         ibp = inodedep->id_bmsafemap->sm_buf;
12368         ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12369         if (ibp == NULL) {
12370                 /*
12371                  * If ibp came back as NULL, the dependency could have been
12372                  * freed while we slept.  Look it up again, and check to see
12373                  * that it has completed.
12374                  */
12375                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12376                         goto retry;
12377                 FREE_LOCK(ump);
12378                 return;
12379         }
              /* Write the bitmap buffer with the softdep lock dropped. */
12380         FREE_LOCK(ump);
12381         if ((error = bwrite(ibp)) != 0)
12382                 softdep_error("softdep_update_inodeblock: bwrite", error);
12383 }
12384
12385 /*
12386  * Merge the a new inode dependency list (such as id_newinoupdt) into an
12387  * old inode dependency list (such as id_inoupdt).
12388  */
12389 static void
12390 merge_inode_lists(newlisthead, oldlisthead)
12391         struct allocdirectlst *newlisthead;
12392         struct allocdirectlst *oldlisthead;
12393 {
12394         struct allocdirect *listadp, *newadp;
12395
12396         newadp = TAILQ_FIRST(newlisthead);
12397         if (newadp != NULL)
12398                 LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp));
12399         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12400                 if (listadp->ad_offset < newadp->ad_offset) {
12401                         listadp = TAILQ_NEXT(listadp, ad_next);
12402                         continue;
12403                 }
12404                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12405                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12406                 if (listadp->ad_offset == newadp->ad_offset) {
12407                         allocdirect_merge(oldlisthead, newadp,
12408                             listadp);
12409                         listadp = newadp;
12410                 }
12411                 newadp = TAILQ_FIRST(newlisthead);
12412         }
12413         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12414                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12415                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12416         }
12417 }
12418
12419 /*
12420  * If we are doing an fsync, then we must ensure that any directory
12421  * entries for the inode have been written after the inode gets to disk.
       *
       * Returns 0 at once if the mount is not using soft updates or the
       * inode has no inodedep.  Otherwise, for each diradd found at the head
       * of id_pendinghd, the parent directory vnode is obtained and the
       * directory block holding the entry is written with bwrite().
       * Returns ENOENT if the mount cannot be busied or vp becomes doomed
       * while it was unlocked, or the error from ffs_vgetf(), ffs_update(),
       * ffs_syncvnode(), bread() or bwrite().
12422  */
12423 int
12424 softdep_fsync(vp)
12425         struct vnode *vp;       /* the "in_core" copy of the inode */
12426 {
12427         struct inodedep *inodedep;
12428         struct pagedep *pagedep;
12429         struct inoref *inoref;
12430         struct ufsmount *ump;
12431         struct worklist *wk;
12432         struct diradd *dap;
12433         struct mount *mp;
12434         struct vnode *pvp;
12435         struct inode *ip;
12436         struct buf *bp;
12437         struct fs *fs;
12438         struct thread *td = curthread;
12439         int error, flushparent, pagedep_new_block;
12440         ino_t parentino;
12441         ufs_lbn_t lbn;
12442
12443         ip = VTOI(vp);
12444         mp = vp->v_mount;
12445         ump = VFSTOUFS(mp);
12446         fs = ump->um_fs;
12447         if (MOUNTEDSOFTDEP(mp) == 0)
12448                 return (0);
12449         ACQUIRE_LOCK(ump);
12450 restart:
12451         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12452                 FREE_LOCK(ump);
12453                 return (0);
12454         }
              /*
               * Wait on any inoref that is DEPCOMPLETE but not GOINGAWAY, then
               * start over from the lookup.
               */
12455         TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12456                 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12457                     == DEPCOMPLETE) {
12458                         jwait(&inoref->if_list, MNT_WAIT);
12459                         goto restart;
12460                 }
12461         }
12462         if (!LIST_EMPTY(&inodedep->id_inowait) ||
12463             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12464             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12465             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12466             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12467                 panic("softdep_fsync: pending ops %p", inodedep);
              /* The softdep lock is held at the top of every iteration. */
12468         for (error = 0, flushparent = 0; ; ) {
12469                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12470                         break;
12471                 if (wk->wk_type != D_DIRADD)
12472                         panic("softdep_fsync: Unexpected type %s",
12473                             TYPENAME(wk->wk_type));
12474                 dap = WK_DIRADD(wk);
12475                 /*
12476                  * Flush our parent if this directory entry has a MKDIR_PARENT
12477                  * dependency or is contained in a newly allocated block.
12478                  */
12479                 if (dap->da_state & DIRCHG)
12480                         pagedep = dap->da_previous->dm_pagedep;
12481                 else
12482                         pagedep = dap->da_pagedep;
                      /*
                       * Copy what is needed out of the pagedep now; the
                       * softdep lock is dropped below.
                       */
12483                 parentino = pagedep->pd_ino;
12484                 lbn = pagedep->pd_lbn;
12485                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12486                         panic("softdep_fsync: dirty");
12487                 if ((dap->da_state & MKDIR_PARENT) ||
12488                     (pagedep->pd_state & NEWBLOCK))
12489                         flushparent = 1;
12490                 else
12491                         flushparent = 0;
12492                 /*
12493                  * If we are being fsync'ed as part of vgone'ing this vnode,
12494                  * then we will not be able to release and recover the
12495                  * vnode below, so we just have to give up on writing its
12496                  * directory entry out. It will eventually be written, just
12497                  * not now, but then the user was not asking to have it
12498                  * written, so we are not breaking any promises.
12499                  */
12500                 if (vp->v_iflag & VI_DOOMED)
12501                         break;
12502                 /*
12503                  * We prevent deadlock by always fetching inodes from the
12504                  * root, moving down the directory tree. Thus, when fetching
12505                  * our parent directory, we first try to get the lock. If
12506                  * that fails, we must unlock ourselves before requesting
12507                  * the lock on our parent. See the comment in ufs_lookup
12508                  * for details on possible races.
12509                  */
12510                 FREE_LOCK(ump);
12511                 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12512                     FFSV_FORCEINSMQ)) {
                              /*
                               * Try to busy the mount without sleeping.  If that
                               * fails, hold a mount reference and drop the vnode
                               * lock around a blocking vfs_busy().
                               */
12513                         error = vfs_busy(mp, MBF_NOWAIT);
12514                         if (error != 0) {
12515                                 vfs_ref(mp);
12516                                 VOP_UNLOCK(vp, 0);
12517                                 error = vfs_busy(mp, 0);
12518                                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12519                                 vfs_rel(mp);
12520                                 if (error != 0)
12521                                         return (ENOENT);
12522                                 if (vp->v_iflag & VI_DOOMED) {
12523                                         vfs_unbusy(mp);
12524                                         return (ENOENT);
12525                                 }
12526                         }
                              /* Blocking fetch of the parent with vp unlocked. */
12527                         VOP_UNLOCK(vp, 0);
12528                         error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12529                             &pvp, FFSV_FORCEINSMQ);
12530                         vfs_unbusy(mp);
12531                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                              /* vp was unlocked above, so re-check VI_DOOMED. */
12532                         if (vp->v_iflag & VI_DOOMED) {
12533                                 if (error == 0)
12534                                         vput(pvp);
12535                                 error = ENOENT;
12536                         }
12537                         if (error != 0)
12538                                 return (error);
12539                 }
12540                 /*
12541                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12542                  * that are contained in direct blocks will be resolved by
12543                  * doing a ffs_update. Pagedeps contained in indirect blocks
12544                  * may require a complete sync'ing of the directory. So, we
12545                  * try the cheap and fast ffs_update first, and if that fails,
12546                  * then we do the slower ffs_syncvnode of the directory.
12547                  */
12548                 if (flushparent) {
                              /* Nonzero while the softdep lock is held in here. */
12549                         int locked;
12550
12551                         if ((error = ffs_update(pvp, 1)) != 0) {
12552                                 vput(pvp);
12553                                 return (error);
12554                         }
12555                         ACQUIRE_LOCK(ump);
12556                         locked = 1;
                              /*
                               * The lock was dropped, so the inodedep and its first
                               * pending diradd are looked up again before NEWBLOCK
                               * is re-read.
                               */
12557                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12558                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12559                                         if (wk->wk_type != D_DIRADD)
12560                                                 panic("softdep_fsync: Unexpected type %s",
12561                                                       TYPENAME(wk->wk_type));
12562                                         dap = WK_DIRADD(wk);
12563                                         if (dap->da_state & DIRCHG)
12564                                                 pagedep = dap->da_previous->dm_pagedep;
12565                                         else
12566                                                 pagedep = dap->da_pagedep;
12567                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12568                                         FREE_LOCK(ump);
12569                                         locked = 0;
12570                                         if (pagedep_new_block && (error =
12571                                             ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12572                                                 vput(pvp);
12573                                                 return (error);
12574                                         }
12575                                 }
12576                         }
12577                         if (locked)
12578                                 FREE_LOCK(ump);
12579                 }
12580                 /*
12581                  * Flush directory page containing the inode's name.
12582                  */
12583                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12584                     &bp);
12585                 if (error == 0)
12586                         error = bwrite(bp);
12587                 else
12588                         brelse(bp);
12589                 vput(pvp);
12590                 if (error != 0)
12591                         return (error);
                      /* Re-take the lock; stop if the inodedep no longer exists. */
12592                 ACQUIRE_LOCK(ump);
12593                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12594                         break;
12595         }
12596         FREE_LOCK(ump);
12597         return (0);
12598 }
12599
12600 /*
12601  * Flush all the dirty bitmaps associated with the block device
12602  * before flushing the rest of the dirty blocks so as to reduce
12603  * the number of dependencies that will have to be rolled back.
12604  *
12605  * XXX Unused?
       *
       * Panics if vp is not a disk vnode.  Each qualifying buffer is written
       * with bawrite() and the scan of the dirty list then starts over;
       * once no such buffer is left, drain_output() is called on vp.
12606  */
12607 void
12608 softdep_fsync_mountdev(vp)
12609         struct vnode *vp;
12610 {
12611         struct buf *bp, *nbp;
12612         struct worklist *wk;
12613         struct bufobj *bo;
12614
12615         if (!vn_isdisk(vp, NULL))
12616                 panic("softdep_fsync_mountdev: vnode not a disk");
12617         bo = &vp->v_bufobj;
12618 restart:
12619         BO_LOCK(bo);
12620         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12621                 /*
12622                  * If it is already scheduled, skip to the next buffer.
12623                  */
12624                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12625                         continue;
12626
12627                 if ((bp->b_flags & B_DELWRI) == 0)
12628                         panic("softdep_fsync_mountdev: not dirty");
12629                 /*
12630                  * We are only interested in bitmaps with outstanding
12631                  * dependencies.
12632                  */
12633                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12634                     wk->wk_type != D_BMSAFEMAP ||
12635                     (bp->b_vflags & BV_BKGRDINPROG)) {
12636                         BUF_UNLOCK(bp);
12637                         continue;
12638                 }
                      /*
                       * The bufobj lock is released before the write is issued,
                       * so the dirty list is rescanned from the top afterwards.
                       */
12639                 BO_UNLOCK(bo);
12640                 bremfree(bp);
12641                 (void) bawrite(bp);
12642                 goto restart;
12643         }
              /* Reached with the bufobj lock still held. */
12644         drain_output(vp);
12645         BO_UNLOCK(bo);
12646 }
12647
12648 /*
12649  * Sync all cylinder groups that were dirty at the time this function is
12650  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12651  * is used to flush freedep activity that may be holding up writes to an
12652  * indirect block.
       *
       * With waitfor == MNT_NOWAIT the buffers are written with bawrite();
       * otherwise bwrite() is used and the first error it returns stops the
       * scan.  Returns that error, or 0.
12653  */
12654 static int
12655 sync_cgs(mp, waitfor)
12656         struct mount *mp;
12657         int waitfor;
12658 {
12659         struct bmsafemap *bmsafemap;
12660         struct bmsafemap *sentinel;
12661         struct ufsmount *ump;
12662         struct buf *bp;
12663         int error;
12664
              /*
               * The sentinel is a zeroed bmsafemap with sm_cg == -1.  It is
               * linked into softdep_dirtycg only to mark the scan position.
               */
12665         sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12666         sentinel->sm_cg = -1;
12667         ump = VFSTOUFS(mp);
12668         error = 0;
12669         ACQUIRE_LOCK(ump);
12670         LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
              /* Always examine whatever currently follows the sentinel. */
12671         for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12672             bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12673                 /* Skip sentinels and cgs with no work to release. */
12674                 if (bmsafemap->sm_cg == -1 ||
12675                     (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12676                     LIST_EMPTY(&bmsafemap->sm_freewr))) {
12677                         LIST_REMOVE(sentinel, sm_next);
12678                         LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12679                         continue;
12680                 }
12681                 /*
12682                  * If we don't get the lock and we're waiting try again, if
12683                  * not move on to the next buf and try to sync it.
12684                  */
12685                 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
                      /* Sentinel is left in place, so the next pass retries. */
12686                 if (bp == NULL && waitfor == MNT_WAIT)
12687                         continue;
                      /* Step the sentinel past this entry before writing. */
12688                 LIST_REMOVE(sentinel, sm_next);
12689                 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12690                 if (bp == NULL)
12691                         continue;
                      /* The write itself is issued without the softdep lock. */
12692                 FREE_LOCK(ump);
12693                 if (waitfor == MNT_NOWAIT)
12694                         bawrite(bp);
12695                 else
12696                         error = bwrite(bp);
12697                 ACQUIRE_LOCK(ump);
12698                 if (error)
12699                         break;
12700         }
12701         LIST_REMOVE(sentinel, sm_next);
12702         FREE_LOCK(ump);
12703         free(sentinel, M_BMSAFEMAP);
12704         return (error);
12705 }
12706
12707 /*
12708  * This routine is called when we are trying to synchronously flush a
12709  * file. This routine must eliminate any filesystem metadata dependencies
12710  * so that the syncing routine can succeed.
12711  */
12712 int
12713 softdep_sync_metadata(struct vnode *vp)
12714 {
12715         struct inode *ip;
12716         int error;
12717
12718         ip = VTOI(vp);
12719         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12720             ("softdep_sync_metadata called on non-softdep filesystem"));
12721         /*
12722          * Ensure that any direct block dependencies have been cleared,
12723          * truncations are started, and inode references are journaled.
12724          */
12725         ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
12726         /*
12727          * Write all journal records to prevent rollbacks on devvp.
12728          */
12729         if (vp->v_type == VCHR)
12730                 softdep_flushjournal(vp->v_mount);
12731         error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12732         /*
12733          * Ensure that all truncates are written so we won't find deps on
12734          * indirect blocks.
12735          */
12736         process_truncates(vp);
12737         FREE_LOCK(VFSTOUFS(vp->v_mount));
12738
12739         return (error);
12740 }
12741
12742 /*
12743  * This routine is called when we are attempting to sync a buf with
12744  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12745  * other IO it can but returns EBUSY if the buffer is not yet able to
12746  * be written.  Dependencies which will not cause rollbacks will always
12747  * return 0.
12748  */
12749 int
12750 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12751 {
12752         struct indirdep *indirdep;
12753         struct pagedep *pagedep;
12754         struct allocindir *aip;
12755         struct newblk *newblk;
12756         struct ufsmount *ump;
12757         struct buf *nbp;
12758         struct worklist *wk;
12759         int i, error;
12760
12761         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12762             ("softdep_sync_buf called on non-softdep filesystem"));
12763         /*
12764          * For VCHR we just don't want to force flush any dependencies that
12765          * will cause rollbacks.
12766          */
12767         if (vp->v_type == VCHR) {
12768                 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12769                         return (EBUSY);
12770                 return (0);
12771         }
12772         ump = VFSTOUFS(vp->v_mount);
12773         ACQUIRE_LOCK(ump);
12774         /*
12775          * As we hold the buffer locked, none of its dependencies
12776          * will disappear.
12777          */
12778         error = 0;
12779 top:
12780         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12781                 switch (wk->wk_type) {
12782
12783                 case D_ALLOCDIRECT:
12784                 case D_ALLOCINDIR:
12785                         newblk = WK_NEWBLK(wk);
12786                         if (newblk->nb_jnewblk != NULL) {
12787                                 if (waitfor == MNT_NOWAIT) {
12788                                         error = EBUSY;
12789                                         goto out_unlock;
12790                                 }
12791                                 jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12792                                 goto top;
12793                         }
12794                         if (newblk->nb_state & DEPCOMPLETE ||
12795                             waitfor == MNT_NOWAIT)
12796                                 continue;
12797                         nbp = newblk->nb_bmsafemap->sm_buf;
12798                         nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12799                         if (nbp == NULL)
12800                                 goto top;
12801                         FREE_LOCK(ump);
12802                         if ((error = bwrite(nbp)) != 0)
12803                                 goto out;
12804                         ACQUIRE_LOCK(ump);
12805                         continue;
12806
12807                 case D_INDIRDEP:
12808                         indirdep = WK_INDIRDEP(wk);
12809                         if (waitfor == MNT_NOWAIT) {
12810                                 if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12811                                     !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12812                                         error = EBUSY;
12813                                         goto out_unlock;
12814                                 }
12815                         }
12816                         if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12817                                 panic("softdep_sync_buf: truncation pending.");
12818                 restart:
12819                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12820                                 newblk = (struct newblk *)aip;
12821                                 if (newblk->nb_jnewblk != NULL) {
12822                                         jwait(&newblk->nb_jnewblk->jn_list,
12823                                             waitfor);
12824                                         goto restart;
12825                                 }
12826                                 if (newblk->nb_state & DEPCOMPLETE)
12827                                         continue;
12828                                 nbp = newblk->nb_bmsafemap->sm_buf;
12829                                 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12830                                 if (nbp == NULL)
12831                                         goto restart;
12832                                 FREE_LOCK(ump);
12833                                 if ((error = bwrite(nbp)) != 0)
12834                                         goto out;
12835                                 ACQUIRE_LOCK(ump);
12836                                 goto restart;
12837                         }
12838                         continue;
12839
12840                 case D_PAGEDEP:
12841                         /*
12842                          * Only flush directory entries in synchronous passes.
12843                          */
12844                         if (waitfor != MNT_WAIT) {
12845                                 error = EBUSY;
12846                                 goto out_unlock;
12847                         }
12848                         /*
12849                          * While syncing snapshots, we must allow recursive
12850                          * lookups.
12851                          */
12852                         BUF_AREC(bp);
12853                         /*
12854                          * We are trying to sync a directory that may
12855                          * have dependencies on both its own metadata
12856                          * and/or dependencies on the inodes of any
12857                          * recently allocated files. We walk its diradd
12858                          * lists pushing out the associated inode.
12859                          */
12860                         pagedep = WK_PAGEDEP(wk);
12861                         for (i = 0; i < DAHASHSZ; i++) {
12862                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12863                                         continue;
12864                                 if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12865                                     &pagedep->pd_diraddhd[i]))) {
12866                                         BUF_NOREC(bp);
12867                                         goto out_unlock;
12868                                 }
12869                         }
12870                         BUF_NOREC(bp);
12871                         continue;
12872
12873                 case D_FREEWORK:
12874                 case D_FREEDEP:
12875                 case D_JSEGDEP:
12876                 case D_JNEWBLK:
12877                         continue;
12878
12879                 default:
12880                         panic("softdep_sync_buf: Unknown type %s",
12881                             TYPENAME(wk->wk_type));
12882                         /* NOTREACHED */
12883                 }
12884         }
12885 out_unlock:
12886         FREE_LOCK(ump);
12887 out:
12888         return (error);
12889 }
12890
12891 /*
12892  * Flush the dependencies associated with an inodedep.
12893  */
12894 static int
12895 flush_inodedep_deps(vp, mp, ino)
12896         struct vnode *vp;
12897         struct mount *mp;
12898         ino_t ino;
12899 {
12900         struct inodedep *inodedep;
12901         struct inoref *inoref;
12902         struct ufsmount *ump;
12903         int error, waitfor;
12904
12905         /*
12906          * This work is done in two passes. The first pass grabs most
12907          * of the buffers and begins asynchronously writing them. The
12908          * only way to wait for these asynchronous writes is to sleep
12909          * on the filesystem vnode which may stay busy for a long time
12910          * if the filesystem is active. So, instead, we make a second
12911          * pass over the dependencies blocking on each write. In the
12912          * usual case we will be blocking against a write that we
12913          * initiated, so when it is done the dependency will have been
12914          * resolved. Thus the second pass is expected to end quickly.
12915          * We give a brief window at the top of the loop to allow
12916          * any pending I/O to complete.
12917          */
12918         ump = VFSTOUFS(mp);
12919         LOCK_OWNED(ump);
12920         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12921                 if (error)
12922                         return (error);
12923                 FREE_LOCK(ump);
12924                 ACQUIRE_LOCK(ump);
12925 restart:
12926                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12927                         return (0);
12928                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12929                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12930                             == DEPCOMPLETE) {
12931                                 jwait(&inoref->if_list, MNT_WAIT);
12932                                 goto restart;
12933                         }
12934                 }
12935                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12936                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12937                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12938                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12939                         continue;
12940                 /*
12941                  * If pass2, we are done, otherwise do pass 2.
12942                  */
12943                 if (waitfor == MNT_WAIT)
12944                         break;
12945                 waitfor = MNT_WAIT;
12946         }
12947         /*
12948          * Try freeing inodedep in case all dependencies have been removed.
12949          */
12950         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12951                 (void) free_inodedep(inodedep);
12952         return (0);
12953 }
12954
12955 /*
12956  * Flush an inode dependency list.
12957  */
12958 static int
12959 flush_deplist(listhead, waitfor, errorp)
12960         struct allocdirectlst *listhead;
12961         int waitfor;
12962         int *errorp;
12963 {
12964         struct allocdirect *adp;
12965         struct newblk *newblk;
12966         struct ufsmount *ump;
12967         struct buf *bp;
12968
12969         if ((adp = TAILQ_FIRST(listhead)) == NULL)
12970                 return (0);
12971         ump = VFSTOUFS(adp->ad_list.wk_mp);
12972         LOCK_OWNED(ump);
12973         TAILQ_FOREACH(adp, listhead, ad_next) {
12974                 newblk = (struct newblk *)adp;
12975                 if (newblk->nb_jnewblk != NULL) {
12976                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12977                         return (1);
12978                 }
12979                 if (newblk->nb_state & DEPCOMPLETE)
12980                         continue;
12981                 bp = newblk->nb_bmsafemap->sm_buf;
12982                 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12983                 if (bp == NULL) {
12984                         if (waitfor == MNT_NOWAIT)
12985                                 continue;
12986                         return (1);
12987                 }
12988                 FREE_LOCK(ump);
12989                 if (waitfor == MNT_NOWAIT)
12990                         bawrite(bp);
12991                 else
12992                         *errorp = bwrite(bp);
12993                 ACQUIRE_LOCK(ump);
12994                 return (1);
12995         }
12996         return (0);
12997 }
12998
12999 /*
13000  * Flush dependencies associated with an allocdirect block.
13001  */
13002 static int
13003 flush_newblk_dep(vp, mp, lbn)
13004         struct vnode *vp;
13005         struct mount *mp;
13006         ufs_lbn_t lbn;
13007 {
13008         struct newblk *newblk;
13009         struct ufsmount *ump;
13010         struct bufobj *bo;
13011         struct inode *ip;
13012         struct buf *bp;
13013         ufs2_daddr_t blkno;
13014         int error;
13015
13016         error = 0;
13017         bo = &vp->v_bufobj;
13018         ip = VTOI(vp);
13019         blkno = DIP(ip, i_db[lbn]);
13020         if (blkno == 0)
13021                 panic("flush_newblk_dep: Missing block");
13022         ump = VFSTOUFS(mp);
13023         ACQUIRE_LOCK(ump);
13024         /*
13025          * Loop until all dependencies related to this block are satisfied.
13026          * We must be careful to restart after each sleep in case a write
13027          * completes some part of this process for us.
13028          */
13029         for (;;) {
13030                 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
13031                         FREE_LOCK(ump);
13032                         break;
13033                 }
13034                 if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
13035                         panic("flush_newblk_dep: Bad newblk %p", newblk);
13036                 /*
13037                  * Flush the journal.
13038                  */
13039                 if (newblk->nb_jnewblk != NULL) {
13040                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13041                         continue;
13042                 }
13043                 /*
13044                  * Write the bitmap dependency.
13045                  */
13046                 if ((newblk->nb_state & DEPCOMPLETE) == 0) {
13047                         bp = newblk->nb_bmsafemap->sm_buf;
13048                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13049                         if (bp == NULL)
13050                                 continue;
13051                         FREE_LOCK(ump);
13052                         error = bwrite(bp);
13053                         if (error)
13054                                 break;
13055                         ACQUIRE_LOCK(ump);
13056                         continue;
13057                 }
13058                 /*
13059                  * Write the buffer.
13060                  */
13061                 FREE_LOCK(ump);
13062                 BO_LOCK(bo);
13063                 bp = gbincore(bo, lbn);
13064                 if (bp != NULL) {
13065                         error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
13066                             LK_INTERLOCK, BO_LOCKPTR(bo));
13067                         if (error == ENOLCK) {
13068                                 ACQUIRE_LOCK(ump);
13069                                 error = 0;
13070                                 continue; /* Slept, retry */
13071                         }
13072                         if (error != 0)
13073                                 break;  /* Failed */
13074                         if (bp->b_flags & B_DELWRI) {
13075                                 bremfree(bp);
13076                                 error = bwrite(bp);
13077                                 if (error)
13078                                         break;
13079                         } else
13080                                 BUF_UNLOCK(bp);
13081                 } else
13082                         BO_UNLOCK(bo);
13083                 /*
13084                  * We have to wait for the direct pointers to
13085                  * point at the newdirblk before the dependency
13086                  * will go away.
13087                  */
13088                 error = ffs_update(vp, 1);
13089                 if (error)
13090                         break;
13091                 ACQUIRE_LOCK(ump);
13092         }
13093         return (error);
13094 }
13095
13096 /*
13097  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13098  */
13099 static int
13100 flush_pagedep_deps(pvp, mp, diraddhdp)
13101         struct vnode *pvp;
13102         struct mount *mp;
13103         struct diraddhd *diraddhdp;
13104 {
13105         struct inodedep *inodedep;
13106         struct inoref *inoref;
13107         struct ufsmount *ump;
13108         struct diradd *dap;
13109         struct vnode *vp;
13110         int error = 0;
13111         struct buf *bp;
13112         ino_t inum;
13113         struct diraddhd unfinished;
13114
13115         LIST_INIT(&unfinished);
13116         ump = VFSTOUFS(mp);
13117         LOCK_OWNED(ump);
13118 restart:
13119         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13120                 /*
13121                  * Flush ourselves if this directory entry
13122                  * has a MKDIR_PARENT dependency.
13123                  */
13124                 if (dap->da_state & MKDIR_PARENT) {
13125                         FREE_LOCK(ump);
13126                         if ((error = ffs_update(pvp, 1)) != 0)
13127                                 break;
13128                         ACQUIRE_LOCK(ump);
13129                         /*
13130                          * If that cleared dependencies, go on to next.
13131                          */
13132                         if (dap != LIST_FIRST(diraddhdp))
13133                                 continue;
13134                         /*
13135                          * All MKDIR_PARENT dependencies and all the
13136                          * NEWBLOCK pagedeps that are contained in direct
13137                          * blocks were resolved by doing above ffs_update.
13138                          * Pagedeps contained in indirect blocks may
13139                          * require a complete sync'ing of the directory.
13140                          * We are in the midst of doing a complete sync,
13141                          * so if they are not resolved in this pass we
13142                          * defer them for now as they will be sync'ed by
13143                          * our caller shortly.
13144                          */
13145                         LIST_REMOVE(dap, da_pdlist);
13146                         LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13147                         continue;
13148                 }
13149                 /*
13150                  * A newly allocated directory must have its "." and
13151                  * ".." entries written out before its name can be
13152                  * committed in its parent.
13153                  */
13154                 inum = dap->da_newinum;
13155                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13156                         panic("flush_pagedep_deps: lost inode1");
13157                 /*
13158                  * Wait for any pending journal adds to complete so we don't
13159                  * cause rollbacks while syncing.
13160                  */
13161                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13162                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13163                             == DEPCOMPLETE) {
13164                                 jwait(&inoref->if_list, MNT_WAIT);
13165                                 goto restart;
13166                         }
13167                 }
13168                 if (dap->da_state & MKDIR_BODY) {
13169                         FREE_LOCK(ump);
13170                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13171                             FFSV_FORCEINSMQ)))
13172                                 break;
13173                         error = flush_newblk_dep(vp, mp, 0);
13174                         /*
13175                          * If we still have the dependency we might need to
13176                          * update the vnode to sync the new link count to
13177                          * disk.
13178                          */
13179                         if (error == 0 && dap == LIST_FIRST(diraddhdp))
13180                                 error = ffs_update(vp, 1);
13181                         vput(vp);
13182                         if (error != 0)
13183                                 break;
13184                         ACQUIRE_LOCK(ump);
13185                         /*
13186                          * If that cleared dependencies, go on to next.
13187                          */
13188                         if (dap != LIST_FIRST(diraddhdp))
13189                                 continue;
13190                         if (dap->da_state & MKDIR_BODY) {
13191                                 inodedep_lookup(UFSTOVFS(ump), inum, 0,
13192                                     &inodedep);
13193                                 panic("flush_pagedep_deps: MKDIR_BODY "
13194                                     "inodedep %p dap %p vp %p",
13195                                     inodedep, dap, vp);
13196                         }
13197                 }
13198                 /*
13199                  * Flush the inode on which the directory entry depends.
13200                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13201                  * the only remaining dependency is that the updated inode
13202                  * count must get pushed to disk. The inode has already
13203                  * been pushed into its inode buffer (via VOP_UPDATE) at
13204                  * the time of the reference count change. So we need only
13205                  * locate that buffer, ensure that there will be no rollback
13206                  * caused by a bitmap dependency, then write the inode buffer.
13207                  */
13208 retry:
13209                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13210                         panic("flush_pagedep_deps: lost inode");
13211                 /*
13212                  * If the inode still has bitmap dependencies,
13213                  * push them to disk.
13214                  */
13215                 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13216                         bp = inodedep->id_bmsafemap->sm_buf;
13217                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13218                         if (bp == NULL)
13219                                 goto retry;
13220                         FREE_LOCK(ump);
13221                         if ((error = bwrite(bp)) != 0)
13222                                 break;
13223                         ACQUIRE_LOCK(ump);
13224                         if (dap != LIST_FIRST(diraddhdp))
13225                                 continue;
13226                 }
13227                 /*
13228                  * If the inode is still sitting in a buffer waiting
13229                  * to be written or waiting for the link count to be
13230                  * adjusted update it here to flush it to disk.
13231                  */
13232                 if (dap == LIST_FIRST(diraddhdp)) {
13233                         FREE_LOCK(ump);
13234                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13235                             FFSV_FORCEINSMQ)))
13236                                 break;
13237                         error = ffs_update(vp, 1);
13238                         vput(vp);
13239                         if (error)
13240                                 break;
13241                         ACQUIRE_LOCK(ump);
13242                 }
13243                 /*
13244                  * If we have failed to get rid of all the dependencies
13245                  * then something is seriously wrong.
13246                  */
13247                 if (dap == LIST_FIRST(diraddhdp)) {
13248                         inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13249                         panic("flush_pagedep_deps: failed to flush "
13250                             "inodedep %p ino %ju dap %p",
13251                             inodedep, (uintmax_t)inum, dap);
13252                 }
13253         }
13254         if (error)
13255                 ACQUIRE_LOCK(ump);
13256         while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13257                 LIST_REMOVE(dap, da_pdlist);
13258                 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13259         }
13260         return (error);
13261 }
13262
13263 /*
13264  * A large burst of file addition or deletion activity can drive the
13265  * memory load excessively high. First attempt to slow things down
13266  * using the techniques below. If that fails, this routine requests
13267  * the offending operations to fall back to running synchronously
13268  * until the memory load returns to a reasonable level.
13269  */
13270 int
13271 softdep_slowdown(vp)
13272         struct vnode *vp;
13273 {
13274         struct ufsmount *ump;
13275         int jlow;
13276         int max_softdeps_hard;
13277
13278         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13279             ("softdep_slowdown called on non-softdep filesystem"));
13280         ump = VFSTOUFS(vp->v_mount);
13281         ACQUIRE_LOCK(ump);
13282         jlow = 0;
13283         /*
13284          * Check for journal space if needed.
13285          */
13286         if (DOINGSUJ(vp)) {
13287                 if (journal_space(ump, 0) == 0)
13288                         jlow = 1;
13289         }
13290         /*
13291          * If the system is under its limits and our filesystem is
13292          * not responsible for more than our share of the usage and
13293          * we are not low on journal space, then no need to slow down.
13294          */
13295         max_softdeps_hard = max_softdeps * 11 / 10;
13296         if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13297             dep_current[D_INODEDEP] < max_softdeps_hard &&
13298             dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13299             dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13300             ump->softdep_curdeps[D_DIRREM] <
13301             (max_softdeps_hard / 2) / stat_flush_threads &&
13302             ump->softdep_curdeps[D_INODEDEP] <
13303             max_softdeps_hard / stat_flush_threads &&
13304             ump->softdep_curdeps[D_INDIRDEP] <
13305             (max_softdeps_hard / 1000) / stat_flush_threads &&
13306             ump->softdep_curdeps[D_FREEBLKS] <
13307             max_softdeps_hard / stat_flush_threads) {
13308                 FREE_LOCK(ump);
13309                 return (0);
13310         }
13311         /*
13312          * If the journal is low or our filesystem is over its limit
13313          * then speedup the cleanup.
13314          */
13315         if (ump->softdep_curdeps[D_INDIRDEP] <
13316             (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13317                 softdep_speedup(ump);
13318         stat_sync_limit_hit += 1;
13319         FREE_LOCK(ump);
13320         /*
13321          * We only slow down the rate at which new dependencies are
13322          * generated if we are not using journaling. With journaling,
13323          * the cleanup should always be sufficient to keep things
13324          * under control.
13325          */
13326         if (DOINGSUJ(vp))
13327                 return (0);
13328         return (1);
13329 }
13330
13331 /*
13332  * Called by the allocation routines when they are about to fail
13333  * in the hope that we can free up the requested resource (inodes
13334  * or disk space).
13335  *
13336  * First check to see if the work list has anything on it. If it has,
13337  * clean up entries until we successfully free the requested resource.
13338  * Because this process holds inodes locked, we cannot handle any remove
13339  * requests that might block on a locked inode as that could lead to
13340  * deadlock. If the worklist yields none of the requested resource,
13341  * start syncing out vnodes to free up the needed space.
13342  */
13343 int
13344 softdep_request_cleanup(fs, vp, cred, resource)
13345         struct fs *fs;
13346         struct vnode *vp;
13347         struct ucred *cred;
13348         int resource;
13349 {
13350         struct ufsmount *ump;
13351         struct mount *mp;
13352         long starttime;
13353         ufs2_daddr_t needed;
13354         int error, failed_vnode;
13355
13356         /*
13357          * If we are being called because of a process doing a
13358          * copy-on-write, then it is not safe to process any
13359          * worklist items as we will recurse into the copyonwrite
13360          * routine.  This will result in an incoherent snapshot.
13361          * If the vnode that we hold is a snapshot, we must avoid
13362          * handling other resources that could cause deadlock.
13363          */
13364         if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13365                 return (0);
13366
13367         if (resource == FLUSH_BLOCKS_WAIT)
13368                 stat_cleanup_blkrequests += 1;
13369         else
13370                 stat_cleanup_inorequests += 1;
13371
13372         mp = vp->v_mount;
13373         ump = VFSTOUFS(mp);
13374         mtx_assert(UFS_MTX(ump), MA_OWNED);
13375         UFS_UNLOCK(ump);
13376         error = ffs_update(vp, 1);
13377         if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13378                 UFS_LOCK(ump);
13379                 return (0);
13380         }
13381         /*
13382          * If we are in need of resources, start by cleaning up
13383          * any block removals associated with our inode.
13384          */
13385         ACQUIRE_LOCK(ump);
13386         process_removes(vp);
13387         process_truncates(vp);
13388         FREE_LOCK(ump);
13389         /*
13390          * Now clean up at least as many resources as we will need.
13391          *
13392          * When requested to clean up inodes, the number that are needed
13393          * is set by the number of simultaneous writers (mnt_writeopcount)
13394          * plus a bit of slop (2) in case some more writers show up while
13395          * we are cleaning.
13396          *
13397          * When requested to free up space, the amount of space that
13398          * we need is enough blocks to allocate a full-sized segment
13399          * (fs_contigsumsize). The number of such segments that will
13400          * be needed is set by the number of simultaneous writers
13401          * (mnt_writeopcount) plus a bit of slop (2) in case some more
13402          * writers show up while we are cleaning.
13403          *
13404          * Additionally, if we are unpriviledged and allocating space,
13405          * we need to ensure that we clean up enough blocks to get the
13406          * needed number of blocks over the threshold of the minimum
13407          * number of blocks required to be kept free by the filesystem
13408          * (fs_minfree).
13409          */
13410         if (resource == FLUSH_INODES_WAIT) {
13411                 needed = vp->v_mount->mnt_writeopcount + 2;
13412         } else if (resource == FLUSH_BLOCKS_WAIT) {
13413                 needed = (vp->v_mount->mnt_writeopcount + 2) *
13414                     fs->fs_contigsumsize;
13415                 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
13416                         needed += fragstoblks(fs,
13417                             roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13418                             fs->fs_cstotal.cs_nffree, fs->fs_frag));
13419         } else {
13420                 UFS_LOCK(ump);
13421                 printf("softdep_request_cleanup: Unknown resource type %d\n",
13422                     resource);
13423                 return (0);
13424         }
13425         starttime = time_second;
13426 retry:
13427         if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13428             fs->fs_cstotal.cs_nbfree <= needed) ||
13429             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13430             fs->fs_cstotal.cs_nifree <= needed)) {
13431                 ACQUIRE_LOCK(ump);
13432                 if (ump->softdep_on_worklist > 0 &&
13433                     process_worklist_item(UFSTOVFS(ump),
13434                     ump->softdep_on_worklist, LK_NOWAIT) != 0)
13435                         stat_worklist_push += 1;
13436                 FREE_LOCK(ump);
13437         }
13438         /*
13439          * If we still need resources and there are no more worklist
13440          * entries to process to obtain them, we have to start flushing
13441          * the dirty vnodes to force the release of additional requests
13442          * to the worklist that we can then process to reap addition
13443          * resources. We walk the vnodes associated with the mount point
13444          * until we get the needed worklist requests that we can reap.
13445          *
13446          * If there are several threads all needing to clean the same
13447          * mount point, only one is allowed to walk the mount list.
13448          * When several threads all try to walk the same mount list,
13449          * they end up competing with each other and often end up in
13450          * livelock. This approach ensures that forward progress is
13451          * made at the cost of occational ENOSPC errors being returned
13452          * that might otherwise have been avoided.
13453          */
13454         error = 1;
13455         if ((resource == FLUSH_BLOCKS_WAIT &&
13456              fs->fs_cstotal.cs_nbfree <= needed) ||
13457             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13458              fs->fs_cstotal.cs_nifree <= needed)) {
13459                 ACQUIRE_LOCK(ump);
13460                 if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13461                         ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13462                         FREE_LOCK(ump);
13463                         failed_vnode = softdep_request_cleanup_flush(mp, ump);
13464                         ACQUIRE_LOCK(ump);
13465                         ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13466                         FREE_LOCK(ump);
13467                         if (ump->softdep_on_worklist > 0) {
13468                                 stat_cleanup_retries += 1;
13469                                 if (!failed_vnode)
13470                                         goto retry;
13471                         }
13472                 } else {
13473                         FREE_LOCK(ump);
13474                         error = 0;
13475                 }
13476                 stat_cleanup_failures += 1;
13477         }
13478         if (time_second - starttime > stat_cleanup_high_delay)
13479                 stat_cleanup_high_delay = time_second - starttime;
13480         UFS_LOCK(ump);
13481         return (error);
13482 }
13483
13484 /*
13485  * Scan the vnodes for the specified mount point flushing out any
13486  * vnodes that can be locked without waiting. Finally, try to flush
13487  * the device associated with the mount point if it can be locked
13488  * without waiting.
13489  *
13490  * We return 0 if we were able to lock every vnode in our scan.
13491  * If we had to skip one or more vnodes, we return 1.
13492  */
13493 static int
13494 softdep_request_cleanup_flush(mp, ump)
13495         struct mount *mp;
13496         struct ufsmount *ump;
13497 {
13498         struct thread *td;
13499         struct vnode *lvp, *mvp;
13500         int failed_vnode;
13501
13502         failed_vnode = 0;
13503         td = curthread;
13504         MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13505                 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13506                         VI_UNLOCK(lvp);
13507                         continue;
13508                 }
13509                 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13510                     td) != 0) {
13511                         failed_vnode = 1;
13512                         continue;
13513                 }
13514                 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
13515                         vput(lvp);
13516                         continue;
13517                 }
13518                 (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13519                 vput(lvp);
13520         }
13521         lvp = ump->um_devvp;
13522         if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13523                 VOP_FSYNC(lvp, MNT_NOWAIT, td);
13524                 VOP_UNLOCK(lvp, 0);
13525         }
13526         return (failed_vnode);
13527 }
13528
13529 static bool
13530 softdep_excess_items(struct ufsmount *ump, int item)
13531 {
13532
13533         KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13534         return (dep_current[item] > max_softdeps &&
13535             ump->softdep_curdeps[item] > max_softdeps /
13536             stat_flush_threads);
13537 }
13538
13539 static void
13540 schedule_cleanup(struct mount *mp)
13541 {
13542         struct ufsmount *ump;
13543         struct thread *td;
13544
13545         ump = VFSTOUFS(mp);
13546         LOCK_OWNED(ump);
13547         FREE_LOCK(ump);
13548         td = curthread;
13549         if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13550             (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13551                 /*
13552                  * No ast is delivered to kernel threads, so nobody
13553                  * would deref the mp.  Some kernel threads
13554                  * explicitely check for AST, e.g. NFS daemon does
13555                  * this in the serving loop.
13556                  */
13557                 return;
13558         }
13559         if (td->td_su != NULL)
13560                 vfs_rel(td->td_su);
13561         vfs_ref(mp);
13562         td->td_su = mp;
13563         thread_lock(td);
13564         td->td_flags |= TDF_ASTPENDING;
13565         thread_unlock(td);
13566 }
13567
13568 static void
13569 softdep_ast_cleanup_proc(struct thread *td)
13570 {
13571         struct mount *mp;
13572         struct ufsmount *ump;
13573         int error;
13574         bool req;
13575
13576         while ((mp = td->td_su) != NULL) {
13577                 td->td_su = NULL;
13578                 error = vfs_busy(mp, MBF_NOWAIT);
13579                 vfs_rel(mp);
13580                 if (error != 0)
13581                         return;
13582                 if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13583                         ump = VFSTOUFS(mp);
13584                         for (;;) {
13585                                 req = false;
13586                                 ACQUIRE_LOCK(ump);
13587                                 if (softdep_excess_items(ump, D_INODEDEP)) {
13588                                         req = true;
13589                                         request_cleanup(mp, FLUSH_INODES);
13590                                 }
13591                                 if (softdep_excess_items(ump, D_DIRREM)) {
13592                                         req = true;
13593                                         request_cleanup(mp, FLUSH_BLOCKS);
13594                                 }
13595                                 FREE_LOCK(ump);
13596                                 if (softdep_excess_items(ump, D_NEWBLK) ||
13597                                     softdep_excess_items(ump, D_ALLOCDIRECT) ||
13598                                     softdep_excess_items(ump, D_ALLOCINDIR)) {
13599                                         error = vn_start_write(NULL, &mp,
13600                                             V_WAIT);
13601                                         if (error == 0) {
13602                                                 req = true;
13603                                                 VFS_SYNC(mp, MNT_WAIT);
13604                                                 vn_finished_write(mp);
13605                                         }
13606                                 }
13607                                 if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13608                                         break;
13609                         }
13610                 }
13611                 vfs_unbusy(mp);
13612         }
13613         if ((mp = td->td_su) != NULL) {
13614                 td->td_su = NULL;
13615                 vfs_rel(mp);
13616         }
13617 }
13618
/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 *
 * mp is the mount point being throttled and resource is one of
 * FLUSH_INODES, FLUSH_INODES_WAIT, FLUSH_BLOCKS or FLUSH_BLOCKS_WAIT;
 * any other value panics.
 *
 * The per-mount softdep lock must be held on entry and is held again
 * on return.  On the path that sleeps, it is dropped for the duration
 * of the sleep and reacquired afterwards.
 *
 * Returns 0 when nothing was done to delay the caller (the caller is
 * flagged TDP_SOFTDEP or TDP_NORUNNINGBUF, or softdep_speedup()
 * succeeded for a non-_WAIT request), and 1 otherwise.
 */
static int
request_cleanup(mp, resource)
        struct mount *mp;
        int resource;
{
        struct thread *td = curthread;
        struct ufsmount *ump;

        ump = VFSTOUFS(mp);
        LOCK_OWNED(ump);
        /*
         * We never hold up the filesystem syncer or buf daemon.
         */
        if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
                return (0);
        /*
         * First check to see if the work list has gotten backlogged.
         * If it has, co-opt this process to help clean up two entries.
         * Because this process may hold inodes locked, we cannot
         * handle any remove requests that might block on a locked
         * inode as that could lead to deadlock.  We set TDP_SOFTDEP
         * to avoid recursively processing the worklist.
         *
         * Note that stat_worklist_push is bumped by two regardless of
         * how many items process_worklist_item() actually handled; its
         * return value is not examined.
         */
        if (ump->softdep_on_worklist > max_softdeps / 10) {
                td->td_pflags |= TDP_SOFTDEP;
                process_worklist_item(mp, 2, LK_NOWAIT);
                td->td_pflags &= ~TDP_SOFTDEP;
                stat_worklist_push += 2;
                return(1);
        }
        /*
         * Next, we attempt to speed up the syncer process. If that
         * is successful, then we allow the process to continue.
         * The _WAIT variants never take this shortcut; they always go
         * on to post a request and pause below.
         */
        if (softdep_speedup(ump) &&
            resource != FLUSH_BLOCKS_WAIT &&
            resource != FLUSH_INODES_WAIT)
                return(0);
        /*
         * If we are resource constrained on inode dependencies, try
         * flushing some dirty inodes. Otherwise, we are constrained
         * by file deletions, so try accelerating flushes of directories
         * with removal dependencies. We would like to do the cleanup
         * here, but we probably hold an inode locked at this point and
         * that might deadlock against one that we try to clean. So,
         * the best that we can do is request the syncer daemon to do
         * the cleanup for us.
         *
         * The request counters are updated under the global softdep
         * lock.  stat_countp selects which "limit hit" statistic
         * pause_timer() will charge the waiters to; it is assigned
         * after the global lock has been released.
         */
        switch (resource) {

        case FLUSH_INODES:
        case FLUSH_INODES_WAIT:
                ACQUIRE_GBLLOCK(&lk);
                stat_ino_limit_push += 1;
                req_clear_inodedeps += 1;
                FREE_GBLLOCK(&lk);
                stat_countp = &stat_ino_limit_hit;
                break;

        case FLUSH_BLOCKS:
        case FLUSH_BLOCKS_WAIT:
                ACQUIRE_GBLLOCK(&lk);
                stat_blk_limit_push += 1;
                req_clear_remove += 1;
                FREE_GBLLOCK(&lk);
                stat_countp = &stat_blk_limit_hit;
                break;

        default:
                panic("request_cleanup: unknown type");
        }
        /*
         * Hopefully the syncer daemon will catch up and awaken us.
         * We wait at most max(tickdelay, 2) ticks before proceeding in
         * any case: msleep() itself has no timeout, the wakeup comes
         * either from check_clear_deps() or from the pause_timer()
         * callout armed here.
         *
         * The global lock is taken before the per-mount lock is
         * dropped, so there is no window in which neither is held.
         */
        ACQUIRE_GBLLOCK(&lk);
        FREE_LOCK(ump);
        proc_waiting += 1;
        /* Arm the timer only if one is not already pending. */
        if (callout_pending(&softdep_callout) == FALSE)
                callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
                    pause_timer, 0);

        /* Kernel threads are counted as waiters but never put to sleep. */
        if ((td->td_pflags & TDP_KTHREAD) == 0)
                msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
        proc_waiting -= 1;
        FREE_GBLLOCK(&lk);
        ACQUIRE_LOCK(ump);
        return (1);
}
13712
13713 /*
13714  * Awaken processes pausing in request_cleanup and clear proc_waiting
13715  * to indicate that there is no longer a timer running. Pause_timer
13716  * will be called with the global softdep mutex (&lk) locked.
13717  */
13718 static void
13719 pause_timer(arg)
13720         void *arg;
13721 {
13722
13723         GBLLOCK_OWNED(&lk);
13724         /*
13725          * The callout_ API has acquired mtx and will hold it around this
13726          * function call.
13727          */
13728         *stat_countp += proc_waiting;
13729         wakeup(&proc_waiting);
13730 }
13731
13732 /*
13733  * If requested, try removing inode or removal dependencies.
13734  */
13735 static void
13736 check_clear_deps(mp)
13737         struct mount *mp;
13738 {
13739
13740         /*
13741          * If we are suspended, it may be because of our using
13742          * too many inodedeps, so help clear them out.
13743          */
13744         if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13745                 clear_inodedeps(mp);
13746         /*
13747          * General requests for cleanup of backed up dependencies
13748          */
13749         ACQUIRE_GBLLOCK(&lk);
13750         if (req_clear_inodedeps) {
13751                 req_clear_inodedeps -= 1;
13752                 FREE_GBLLOCK(&lk);
13753                 clear_inodedeps(mp);
13754                 ACQUIRE_GBLLOCK(&lk);
13755                 wakeup(&proc_waiting);
13756         }
13757         if (req_clear_remove) {
13758                 req_clear_remove -= 1;
13759                 FREE_GBLLOCK(&lk);
13760                 clear_remove(mp);
13761                 ACQUIRE_GBLLOCK(&lk);
13762                 wakeup(&proc_waiting);
13763         }
13764         FREE_GBLLOCK(&lk);
13765 }
13766
13767 /*
13768  * Flush out a directory with at least one removal dependency in an effort to
13769  * reduce the number of dirrem, freefile, and freeblks dependency structures.
13770  */
13771 static void
13772 clear_remove(mp)
13773         struct mount *mp;
13774 {
13775         struct pagedep_hashhead *pagedephd;
13776         struct pagedep *pagedep;
13777         struct ufsmount *ump;
13778         struct vnode *vp;
13779         struct bufobj *bo;
13780         int error, cnt;
13781         ino_t ino;
13782
13783         ump = VFSTOUFS(mp);
13784         LOCK_OWNED(ump);
13785
13786         for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13787                 pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13788                 if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13789                         ump->pagedep_nextclean = 0;
13790                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13791                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
13792                                 continue;
13793                         ino = pagedep->pd_ino;
13794                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13795                                 continue;
13796                         FREE_LOCK(ump);
13797
13798                         /*
13799                          * Let unmount clear deps
13800                          */
13801                         error = vfs_busy(mp, MBF_NOWAIT);
13802                         if (error != 0)
13803                                 goto finish_write;
13804                         error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13805                              FFSV_FORCEINSMQ);
13806                         vfs_unbusy(mp);
13807                         if (error != 0) {
13808                                 softdep_error("clear_remove: vget", error);
13809                                 goto finish_write;
13810                         }
13811                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13812                                 softdep_error("clear_remove: fsync", error);
13813                         bo = &vp->v_bufobj;
13814                         BO_LOCK(bo);
13815                         drain_output(vp);
13816                         BO_UNLOCK(bo);
13817                         vput(vp);
13818                 finish_write:
13819                         vn_finished_write(mp);
13820                         ACQUIRE_LOCK(ump);
13821                         return;
13822                 }
13823         }
13824 }
13825
/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 *
 * Called with the per-mount softdep lock held.  The lock is dropped
 * around the work done on each vnode and is held again on every
 * return path.
 */
static void
clear_inodedeps(mp)
        struct mount *mp;
{
        struct inodedep_hashhead *inodedephd;
        struct inodedep *inodedep;
        struct ufsmount *ump;
        struct vnode *vp;
        struct fs *fs;
        int error, cnt;
        ino_t firstino, lastino, ino;

        ump = VFSTOUFS(mp);
        fs = ump->um_fs;
        LOCK_OWNED(ump);
        /*
         * Pick an inode dependency to be cleared.  The choice is not
         * truly random: the hash table is walked round-robin starting
         * at inodedep_nextclean, and the first entry of the first
         * non-empty bucket is used.
         * We will then gather up all the inodes in its block
         * that have dependencies and flush them out.
         */
        for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
                inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
                if (ump->inodedep_nextclean > ump->inodedep_hash_size)
                        ump->inodedep_nextclean = 0;
                if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
                        break;
        }
        /* Every bucket was empty: there is nothing to clear. */
        if (inodedep == NULL)
                return;
        /*
         * Find the last inode in the block with dependencies.
         * firstino is the first inode number of the inode block that
         * holds the chosen inode.  The downward scan stops at firstino
         * without looking it up, so lastino ends up equal to firstino
         * when no higher inode in the block has an inodedep.
         * (inodedep_lookup() presumably returns non-zero when an
         * inodedep exists for the inode -- confirm at its definition.)
         */
        firstino = rounddown2(inodedep->id_ino, INOPB(fs));
        for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
                if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
                        break;
        /*
         * Asynchronously push all but the last inode with dependencies.
         * Synchronously push the last inode with dependencies to ensure
         * that the inode block gets written to free up the inodedeps.
         */
        for (ino = firstino; ino <= lastino; ino++) {
                /* Skip inodes without dependencies. */
                if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
                        continue;
                /* Skip this inode if a write cannot be started right now. */
                if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
                        continue;
                FREE_LOCK(ump);
                error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
                if (error != 0) {
                        vn_finished_write(mp);
                        ACQUIRE_LOCK(ump);
                        return;
                }
                if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
                    FFSV_FORCEINSMQ)) != 0) {
                        softdep_error("clear_inodedeps: vget", error);
                        vfs_unbusy(mp);
                        vn_finished_write(mp);
                        ACQUIRE_LOCK(ump);
                        return;
                }
                /* The busy reference is released once the vnode is in hand. */
                vfs_unbusy(mp);
                if (ino == lastino) {
                        if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
                                softdep_error("clear_inodedeps: fsync1", error);
                } else {
                        if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
                                softdep_error("clear_inodedeps: fsync2", error);
                        /* Wait for the asynchronous writes just started. */
                        BO_LOCK(&vp->v_bufobj);
                        drain_output(vp);
                        BO_UNLOCK(&vp->v_bufobj);
                }
                vput(vp);
                vn_finished_write(mp);
                /* Retake the lock for the next lookup, or for the caller. */
                ACQUIRE_LOCK(ump);
        }
}
13907
13908 void
13909 softdep_buf_append(bp, wkhd)
13910         struct buf *bp;
13911         struct workhead *wkhd;
13912 {
13913         struct worklist *wk;
13914         struct ufsmount *ump;
13915
13916         if ((wk = LIST_FIRST(wkhd)) == NULL)
13917                 return;
13918         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13919             ("softdep_buf_append called on non-softdep filesystem"));
13920         ump = VFSTOUFS(wk->wk_mp);
13921         ACQUIRE_LOCK(ump);
13922         while ((wk = LIST_FIRST(wkhd)) != NULL) {
13923                 WORKLIST_REMOVE(wk);
13924                 WORKLIST_INSERT(&bp->b_dep, wk);
13925         }
13926         FREE_LOCK(ump);
13927
13928 }
13929
13930 void
13931 softdep_inode_append(ip, cred, wkhd)
13932         struct inode *ip;
13933         struct ucred *cred;
13934         struct workhead *wkhd;
13935 {
13936         struct buf *bp;
13937         struct fs *fs;
13938         struct ufsmount *ump;
13939         int error;
13940
13941         ump = ITOUMP(ip);
13942         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
13943             ("softdep_inode_append called on non-softdep filesystem"));
13944         fs = ump->um_fs;
13945         error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13946             (int)fs->fs_bsize, cred, &bp);
13947         if (error) {
13948                 bqrelse(bp);
13949                 softdep_freework(wkhd);
13950                 return;
13951         }
13952         softdep_buf_append(bp, wkhd);
13953         bqrelse(bp);
13954 }
13955
13956 void
13957 softdep_freework(wkhd)
13958         struct workhead *wkhd;
13959 {
13960         struct worklist *wk;
13961         struct ufsmount *ump;
13962
13963         if ((wk = LIST_FIRST(wkhd)) == NULL)
13964                 return;
13965         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13966             ("softdep_freework called on non-softdep filesystem"));
13967         ump = VFSTOUFS(wk->wk_mp);
13968         ACQUIRE_LOCK(ump);
13969         handle_jwork(wkhd);
13970         FREE_LOCK(ump);
13971 }
13972
13973 static struct ufsmount *
13974 softdep_bp_to_mp(bp)
13975         struct buf *bp;
13976 {
13977         struct mount *mp;
13978         struct vnode *vp;
13979
13980         if (LIST_EMPTY(&bp->b_dep))
13981                 return (NULL);
13982         vp = bp->b_vp;
13983         KASSERT(vp != NULL,
13984             ("%s, buffer with dependencies lacks vnode", __func__));
13985
13986         /*
13987          * The ump mount point is stable after we get a correct
13988          * pointer, since bp is locked and this prevents unmount from
13989          * proceeding.  But to get to it, we cannot dereference bp->b_dep
13990          * head wk_mp, because we do not yet own SU ump lock and
13991          * workitem might be freed while dereferenced.
13992          */
13993 retry:
13994         switch (vp->v_type) {
13995         case VCHR:
13996                 VI_LOCK(vp);
13997                 mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
13998                 VI_UNLOCK(vp);
13999                 if (mp == NULL)
14000                         goto retry;
14001                 break;
14002         case VREG:
14003         case VDIR:
14004         case VLNK:
14005         case VFIFO:
14006         case VSOCK:
14007                 mp = vp->v_mount;
14008                 break;
14009         case VBLK:
14010                 vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
14011                 /* FALLTHROUGH */
14012         case VNON:
14013         case VBAD:
14014         case VMARKER:
14015                 mp = NULL;
14016                 break;
14017         default:
14018                 vn_printf(vp, "unknown vnode type");
14019                 mp = NULL;
14020                 break;
14021         }
14022         return (VFSTOUFS(mp));
14023 }
14024
14025 /*
14026  * Function to determine if the buffer has outstanding dependencies
14027  * that will cause a roll-back if the buffer is written. If wantcount
14028  * is set, return number of dependencies, otherwise just yes or no.
14029  */
14030 static int
14031 softdep_count_dependencies(bp, wantcount)
14032         struct buf *bp;
14033         int wantcount;
14034 {
14035         struct worklist *wk;
14036         struct ufsmount *ump;
14037         struct bmsafemap *bmsafemap;
14038         struct freework *freework;
14039         struct inodedep *inodedep;
14040         struct indirdep *indirdep;
14041         struct freeblks *freeblks;
14042         struct allocindir *aip;
14043         struct pagedep *pagedep;
14044         struct dirrem *dirrem;
14045         struct newblk *newblk;
14046         struct mkdir *mkdir;
14047         struct diradd *dap;
14048         int i, retval;
14049
14050         ump = softdep_bp_to_mp(bp);
14051         if (ump == NULL)
14052                 return (0);
14053         retval = 0;
14054         ACQUIRE_LOCK(ump);
14055         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
14056                 switch (wk->wk_type) {
14057
14058                 case D_INODEDEP:
14059                         inodedep = WK_INODEDEP(wk);
14060                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
14061                                 /* bitmap allocation dependency */
14062                                 retval += 1;
14063                                 if (!wantcount)
14064                                         goto out;
14065                         }
14066                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
14067                                 /* direct block pointer dependency */
14068                                 retval += 1;
14069                                 if (!wantcount)
14070                                         goto out;
14071                         }
14072                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
14073                                 /* direct block pointer dependency */
14074                                 retval += 1;
14075                                 if (!wantcount)
14076                                         goto out;
14077                         }
14078                         if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14079                                 /* Add reference dependency. */
14080                                 retval += 1;
14081                                 if (!wantcount)
14082                                         goto out;
14083                         }
14084                         continue;
14085
14086                 case D_INDIRDEP:
14087                         indirdep = WK_INDIRDEP(wk);
14088
14089                         TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14090                                 /* indirect truncation dependency */
14091                                 retval += 1;
14092                                 if (!wantcount)
14093                                         goto out;
14094                         }
14095
14096                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14097                                 /* indirect block pointer dependency */
14098                                 retval += 1;
14099                                 if (!wantcount)
14100                                         goto out;
14101                         }
14102                         continue;
14103
14104                 case D_PAGEDEP:
14105                         pagedep = WK_PAGEDEP(wk);
14106                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14107                                 if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14108                                         /* Journal remove ref dependency. */
14109                                         retval += 1;
14110                                         if (!wantcount)
14111                                                 goto out;
14112                                 }
14113                         }
14114                         for (i = 0; i < DAHASHSZ; i++) {
14115
14116                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14117                                         /* directory entry dependency */
14118                                         retval += 1;
14119                                         if (!wantcount)
14120                                                 goto out;
14121                                 }
14122                         }
14123                         continue;
14124
14125                 case D_BMSAFEMAP:
14126                         bmsafemap = WK_BMSAFEMAP(wk);
14127                         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14128                                 /* Add reference dependency. */
14129                                 retval += 1;
14130                                 if (!wantcount)
14131                                         goto out;
14132                         }
14133                         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14134                                 /* Allocate block dependency. */
14135                                 retval += 1;
14136                                 if (!wantcount)
14137                                         goto out;
14138                         }
14139                         continue;
14140
14141                 case D_FREEBLKS:
14142                         freeblks = WK_FREEBLKS(wk);
14143                         if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14144                                 /* Freeblk journal dependency. */
14145                                 retval += 1;
14146                                 if (!wantcount)
14147                                         goto out;
14148                         }
14149                         continue;
14150
14151                 case D_ALLOCDIRECT:
14152                 case D_ALLOCINDIR:
14153                         newblk = WK_NEWBLK(wk);
14154                         if (newblk->nb_jnewblk) {
14155                                 /* Journal allocate dependency. */
14156                                 retval += 1;
14157                                 if (!wantcount)
14158                                         goto out;
14159                         }
14160                         continue;
14161
14162                 case D_MKDIR:
14163                         mkdir = WK_MKDIR(wk);
14164                         if (mkdir->md_jaddref) {
14165                                 /* Journal reference dependency. */
14166                                 retval += 1;
14167                                 if (!wantcount)
14168                                         goto out;
14169                         }
14170                         continue;
14171
14172                 case D_FREEWORK:
14173                 case D_FREEDEP:
14174                 case D_JSEGDEP:
14175                 case D_JSEG:
14176                 case D_SBDEP:
14177                         /* never a dependency on these blocks */
14178                         continue;
14179
14180                 default:
14181                         panic("softdep_count_dependencies: Unexpected type %s",
14182                             TYPENAME(wk->wk_type));
14183                         /* NOTREACHED */
14184                 }
14185         }
14186 out:
14187         FREE_LOCK(ump);
14188         return (retval);
14189 }
14190
14191 /*
14192  * Acquire exclusive access to a buffer.
14193  * Must be called with a locked mtx parameter.
14194  * Return acquired buffer or NULL on failure.
14195  */
14196 static struct buf *
14197 getdirtybuf(bp, lock, waitfor)
14198         struct buf *bp;
14199         struct rwlock *lock;
14200         int waitfor;
14201 {
14202         int error;
14203
14204         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
14205                 if (waitfor != MNT_WAIT)
14206                         return (NULL);
14207                 error = BUF_LOCK(bp,
14208                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
14209                 /*
14210                  * Even if we successfully acquire bp here, we have dropped
14211                  * lock, which may violates our guarantee.
14212                  */
14213                 if (error == 0)
14214                         BUF_UNLOCK(bp);
14215                 else if (error != ENOLCK)
14216                         panic("getdirtybuf: inconsistent lock: %d", error);
14217                 rw_wlock(lock);
14218                 return (NULL);
14219         }
14220         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14221                 if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
14222                         rw_wunlock(lock);
14223                         BO_LOCK(bp->b_bufobj);
14224                         BUF_UNLOCK(bp);
14225                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14226                                 bp->b_vflags |= BV_BKGRDWAIT;
14227                                 msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14228                                        PRIBIO | PDROP, "getbuf", 0);
14229                         } else
14230                                 BO_UNLOCK(bp->b_bufobj);
14231                         rw_wlock(lock);
14232                         return (NULL);
14233                 }
14234                 BUF_UNLOCK(bp);
14235                 if (waitfor != MNT_WAIT)
14236                         return (NULL);
14237 #ifdef DEBUG_VFS_LOCKS
14238                 if (bp->b_vp->v_type != VCHR)
14239                         ASSERT_BO_WLOCKED(bp->b_bufobj);
14240 #endif
14241                 bp->b_vflags |= BV_BKGRDWAIT;
14242                 rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14243                 return (NULL);
14244         }
14245         if ((bp->b_flags & B_DELWRI) == 0) {
14246                 BUF_UNLOCK(bp);
14247                 return (NULL);
14248         }
14249         bremfree(bp);
14250         return (bp);
14251 }
14252
14253
14254 /*
14255  * Check if it is safe to suspend the file system now.  On entry,
14256  * the vnode interlock for devvp should be held.  Return 0 with
14257  * the mount interlock held if the file system can be suspended now,
14258  * otherwise return EAGAIN with the mount interlock held.
14259  */
14260 int
14261 softdep_check_suspend(struct mount *mp,
14262                       struct vnode *devvp,
14263                       int softdep_depcnt,
14264                       int softdep_accdepcnt,
14265                       int secondary_writes,
14266                       int secondary_accwrites)
14267 {
14268         struct bufobj *bo;
14269         struct ufsmount *ump;
14270         struct inodedep *inodedep;
14271         int error, unlinked;
14272
14273         bo = &devvp->v_bufobj;
14274         ASSERT_BO_WLOCKED(bo);
14275
14276         /*
14277          * If we are not running with soft updates, then we need only
14278          * deal with secondary writes as we try to suspend.
14279          */
14280         if (MOUNTEDSOFTDEP(mp) == 0) {
14281                 MNT_ILOCK(mp);
14282                 while (mp->mnt_secondary_writes != 0) {
14283                         BO_UNLOCK(bo);
14284                         msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14285                             (PUSER - 1) | PDROP, "secwr", 0);
14286                         BO_LOCK(bo);
14287                         MNT_ILOCK(mp);
14288                 }
14289
14290                 /*
14291                  * Reasons for needing more work before suspend:
14292                  * - Dirty buffers on devvp.
14293                  * - Secondary writes occurred after start of vnode sync loop
14294                  */
14295                 error = 0;
14296                 if (bo->bo_numoutput > 0 ||
14297                     bo->bo_dirty.bv_cnt > 0 ||
14298                     secondary_writes != 0 ||
14299                     mp->mnt_secondary_writes != 0 ||
14300                     secondary_accwrites != mp->mnt_secondary_accwrites)
14301                         error = EAGAIN;
14302                 BO_UNLOCK(bo);
14303                 return (error);
14304         }
14305
14306         /*
14307          * If we are running with soft updates, then we need to coordinate
14308          * with them as we try to suspend.
14309          */
14310         ump = VFSTOUFS(mp);
14311         for (;;) {
14312                 if (!TRY_ACQUIRE_LOCK(ump)) {
14313                         BO_UNLOCK(bo);
14314                         ACQUIRE_LOCK(ump);
14315                         FREE_LOCK(ump);
14316                         BO_LOCK(bo);
14317                         continue;
14318                 }
14319                 MNT_ILOCK(mp);
14320                 if (mp->mnt_secondary_writes != 0) {
14321                         FREE_LOCK(ump);
14322                         BO_UNLOCK(bo);
14323                         msleep(&mp->mnt_secondary_writes,
14324                                MNT_MTX(mp),
14325                                (PUSER - 1) | PDROP, "secwr", 0);
14326                         BO_LOCK(bo);
14327                         continue;
14328                 }
14329                 break;
14330         }
14331
14332         unlinked = 0;
14333         if (MOUNTEDSUJ(mp)) {
14334                 for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14335                     inodedep != NULL;
14336                     inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14337                         if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14338                             UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14339                             UNLINKONLIST) ||
14340                             !check_inodedep_free(inodedep))
14341                                 continue;
14342                         unlinked++;
14343                 }
14344         }
14345
14346         /*
14347          * Reasons for needing more work before suspend:
14348          * - Dirty buffers on devvp.
14349          * - Softdep activity occurred after start of vnode sync loop
14350          * - Secondary writes occurred after start of vnode sync loop
14351          */
14352         error = 0;
14353         if (bo->bo_numoutput > 0 ||
14354             bo->bo_dirty.bv_cnt > 0 ||
14355             softdep_depcnt != unlinked ||
14356             ump->softdep_deps != unlinked ||
14357             softdep_accdepcnt != ump->softdep_accdeps ||
14358             secondary_writes != 0 ||
14359             mp->mnt_secondary_writes != 0 ||
14360             secondary_accwrites != mp->mnt_secondary_accwrites)
14361                 error = EAGAIN;
14362         FREE_LOCK(ump);
14363         BO_UNLOCK(bo);
14364         return (error);
14365 }
14366
14367
14368 /*
14369  * Get the number of dependency structures for the file system, both
14370  * the current number and the total number allocated.  These will
14371  * later be used to detect that softdep processing has occurred.
14372  */
14373 void
14374 softdep_get_depcounts(struct mount *mp,
14375                       int *softdep_depsp,
14376                       int *softdep_accdepsp)
14377 {
14378         struct ufsmount *ump;
14379
14380         if (MOUNTEDSOFTDEP(mp) == 0) {
14381                 *softdep_depsp = 0;
14382                 *softdep_accdepsp = 0;
14383                 return;
14384         }
14385         ump = VFSTOUFS(mp);
14386         ACQUIRE_LOCK(ump);
14387         *softdep_depsp = ump->softdep_deps;
14388         *softdep_accdepsp = ump->softdep_accdeps;
14389         FREE_LOCK(ump);
14390 }
14391
14392 /*
14393  * Wait for pending output on a vnode to complete.
14394  */
14395 static void
14396 drain_output(vp)
14397         struct vnode *vp;
14398 {
14399
14400         ASSERT_VOP_LOCKED(vp, "drain_output");
14401         (void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14402 }
14403
14404 /*
14405  * Called whenever a buffer that is being invalidated or reallocated
14406  * contains dependencies. This should only happen if an I/O error has
14407  * occurred. The routine is called with the buffer locked.
14408  */
14409 static void
14410 softdep_deallocate_dependencies(bp)
14411         struct buf *bp;
14412 {
14413
14414         if ((bp->b_ioflags & BIO_ERROR) == 0)
14415                 panic("softdep_deallocate_dependencies: dangling deps");
14416         if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14417                 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14418         else
14419                 printf("softdep_deallocate_dependencies: "
14420                     "got error %d while accessing filesystem\n", bp->b_error);
14421         if (bp->b_error != ENXIO)
14422                 panic("softdep_deallocate_dependencies: unrecovered I/O error");
14423 }
14424
/*
 * Function to handle asynchronous write errors in the filesystem.
 * Currently this only prints "func" (a caller-supplied label) and the
 * error number to the console.
 */
static void
softdep_error(char *func, int error)
{

        /* XXX should do something better! */
        printf("%s: got error %d while accessing filesystem\n", func, error);
}
14437
14438 #ifdef DDB
14439
14440 /* exported to ffs_vfsops.c */
14441 extern void db_print_ffs(struct ufsmount *ump);
14442 void
14443 db_print_ffs(struct ufsmount *ump)
14444 {
14445         db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
14446             ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
14447         db_printf("    fs %p su_wl %d su_deps %d su_req %d\n",
14448             ump->um_fs, ump->softdep_on_worklist,
14449             ump->softdep_deps, ump->softdep_req);
14450 }
14451
14452 static void
14453 worklist_print(struct worklist *wk, int verbose)
14454 {
14455
14456         if (!verbose) {
14457                 db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk,
14458                     (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS);
14459                 return;
14460         }
14461         db_printf("worklist: %p type %s state 0x%b next %p\n    ", wk,
14462             TYPENAME(wk->wk_type), (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS,
14463             LIST_NEXT(wk, wk_list));
14464         db_print_ffs(VFSTOUFS(wk->wk_mp));
14465 }
14466
14467 static void
14468 inodedep_print(struct inodedep *inodedep, int verbose)
14469 {
14470
14471         worklist_print(&inodedep->id_list, 0);
14472         db_printf("    fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
14473             inodedep->id_fs,
14474             (intmax_t)inodedep->id_ino,
14475             (intmax_t)fsbtodb(inodedep->id_fs,
14476                 ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14477             (intmax_t)inodedep->id_nlinkdelta,
14478             (intmax_t)inodedep->id_savednlink);
14479
14480         if (verbose == 0)
14481                 return;
14482
14483         db_printf("    bmsafemap %p, mkdiradd %p, inoreflst %p\n",
14484             inodedep->id_bmsafemap,
14485             inodedep->id_mkdiradd,
14486             TAILQ_FIRST(&inodedep->id_inoreflst));
14487         db_printf("    dirremhd %p, pendinghd %p, bufwait %p\n",
14488             LIST_FIRST(&inodedep->id_dirremhd),
14489             LIST_FIRST(&inodedep->id_pendinghd),
14490             LIST_FIRST(&inodedep->id_bufwait));
14491         db_printf("    inowait %p, inoupdt %p, newinoupdt %p\n",
14492             LIST_FIRST(&inodedep->id_inowait),
14493             TAILQ_FIRST(&inodedep->id_inoupdt),
14494             TAILQ_FIRST(&inodedep->id_newinoupdt));
14495         db_printf("    extupdt %p, newextupdt %p, freeblklst %p\n",
14496             TAILQ_FIRST(&inodedep->id_extupdt),
14497             TAILQ_FIRST(&inodedep->id_newextupdt),
14498             TAILQ_FIRST(&inodedep->id_freeblklst));
14499         db_printf("    saveino %p, savedsize %jd, savedextsize %jd\n",
14500             inodedep->id_savedino1,
14501             (intmax_t)inodedep->id_savedsize,
14502             (intmax_t)inodedep->id_savedextsize);
14503 }
14504
14505 static void
14506 newblk_print(struct newblk *nbp)
14507 {
14508
14509         worklist_print(&nbp->nb_list, 0);
14510         db_printf("    newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
14511         db_printf("    jnewblk %p, bmsafemap %p, freefrag %p\n",
14512             &nbp->nb_jnewblk,
14513             &nbp->nb_bmsafemap,
14514             &nbp->nb_freefrag);
14515         db_printf("    indirdeps %p, newdirblk %p, jwork %p\n",
14516             LIST_FIRST(&nbp->nb_indirdeps),
14517             LIST_FIRST(&nbp->nb_newdirblk),
14518             LIST_FIRST(&nbp->nb_jwork));
14519 }
14520
14521 static void
14522 allocdirect_print(struct allocdirect *adp)
14523 {
14524
14525         newblk_print(&adp->ad_block);
14526         db_printf("    oldblkno %jd, oldsize %ld, newsize %ld\n",
14527             adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
14528         db_printf("    offset %d, inodedep %p\n",
14529             adp->ad_offset, adp->ad_inodedep);
14530 }
14531
14532 static void
14533 allocindir_print(struct allocindir *aip)
14534 {
14535
14536         newblk_print(&aip->ai_block);
14537         db_printf("    oldblkno %jd, lbn %jd\n",
14538             (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn);
14539         db_printf("    offset %d, indirdep %p\n",
14540             aip->ai_offset, aip->ai_indirdep);
14541 }
14542
14543 static void
14544 mkdir_print(struct mkdir *mkdir)
14545 {
14546
14547         worklist_print(&mkdir->md_list, 0);
14548         db_printf("    diradd %p, jaddref %p, buf %p\n",
14549                 mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf);
14550 }
14551
14552 DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
14553 {
14554
14555         if (have_addr == 0) {
14556                 db_printf("inodedep address required\n");
14557                 return;
14558         }
14559         inodedep_print((struct inodedep*)addr, 1);
14560 }
14561
14562 DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
14563 {
14564         struct inodedep_hashhead *inodedephd;
14565         struct inodedep *inodedep;
14566         struct ufsmount *ump;
14567         int cnt;
14568
14569         if (have_addr == 0) {
14570                 db_printf("ufsmount address required\n");
14571                 return;
14572         }
14573         ump = (struct ufsmount *)addr;
14574         for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14575                 inodedephd = &ump->inodedep_hashtbl[cnt];
14576                 LIST_FOREACH(inodedep, inodedephd, id_hash) {
14577                         inodedep_print(inodedep, 0);
14578                 }
14579         }
14580 }
14581
14582 DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
14583 {
14584
14585         if (have_addr == 0) {
14586                 db_printf("worklist address required\n");
14587                 return;
14588         }
14589         worklist_print((struct worklist *)addr, 1);
14590 }
14591
14592 DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
14593 {
14594         struct worklist *wk;
14595         struct workhead *wkhd;
14596
14597         if (have_addr == 0) {
14598                 db_printf("worklist address required "
14599                     "(for example value in bp->b_dep)\n");
14600                 return;
14601         }
14602         /*
14603          * We often do not have the address of the worklist head but
14604          * instead a pointer to its first entry (e.g., we have the
14605          * contents of bp->b_dep rather than &bp->b_dep). But the back
14606          * pointer of bp->b_dep will point at the head of the list, so
14607          * we cheat and use that instead. If we are in the middle of
14608          * a list we will still get the same result, so nothing
14609          * unexpected will result.
14610          */
14611         wk = (struct worklist *)addr;
14612         if (wk == NULL)
14613                 return;
14614         wkhd = (struct workhead *)wk->wk_list.le_prev;
14615         LIST_FOREACH(wk, wkhd, wk_list) {
14616                 switch(wk->wk_type) {
14617                 case D_INODEDEP:
14618                         inodedep_print(WK_INODEDEP(wk), 0);
14619                         continue;
14620                 case D_ALLOCDIRECT:
14621                         allocdirect_print(WK_ALLOCDIRECT(wk));
14622                         continue;
14623                 case D_ALLOCINDIR:
14624                         allocindir_print(WK_ALLOCINDIR(wk));
14625                         continue;
14626                 case D_MKDIR:
14627                         mkdir_print(WK_MKDIR(wk));
14628                         continue;
14629                 default:
14630                         worklist_print(wk, 0);
14631                         continue;
14632                 }
14633         }
14634 }
14635
14636 DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
14637 {
14638         if (have_addr == 0) {
14639                 db_printf("mkdir address required\n");
14640                 return;
14641         }
14642         mkdir_print((struct mkdir *)addr);
14643 }
14644
14645 DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
14646 {
14647         struct mkdirlist *mkdirlisthd;
14648         struct mkdir *mkdir;
14649
14650         if (have_addr == 0) {
14651                 db_printf("mkdir listhead address required\n");
14652                 return;
14653         }
14654         mkdirlisthd = (struct mkdirlist *)addr;
14655         LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14656                 mkdir_print(mkdir);
14657                 if (mkdir->md_diradd != NULL) {
14658                         db_printf("    ");
14659                         worklist_print(&mkdir->md_diradd->da_list, 0);
14660                 }
14661                 if (mkdir->md_jaddref != NULL) {
14662                         db_printf("    ");
14663                         worklist_print(&mkdir->md_jaddref->ja_list, 0);
14664                 }
14665         }
14666 }
14667
14668 DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
14669 {
14670         if (have_addr == 0) {
14671                 db_printf("allocdirect address required\n");
14672                 return;
14673         }
14674         allocdirect_print((struct allocdirect *)addr);
14675 }
14676
14677 DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
14678 {
14679         if (have_addr == 0) {
14680                 db_printf("allocindir address required\n");
14681                 return;
14682         }
14683         allocindir_print((struct allocindir *)addr);
14684 }
14685
14686 #endif /* DDB */
14687
14688 #endif /* SOFTUPDATES */