sys/ufs/ffs/ffs_softdep.c

   1 /*-
   2  * Copyright 1998, 2000 Marshall Kirk McKusick.
   3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
   4  * All rights reserved.
   5  *
   6  * The soft updates code is derived from the appendix of a University
   7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
   8  * "Soft Updates: A Solution to the Metadata Update Problem in File
   9  * Systems", CSE-TR-254-95, August 1995).
  10  *
  11  * Further information about soft updates can be obtained from:
  12  *
  13  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
  14  *      1614 Oxford Street              mckusick@mckusick.com
  15  *      Berkeley, CA 94709-1608         +1-510-843-9542
  16  *      USA
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  *
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38  *
  39  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  40  */
  41
  42 #include <sys/cdefs.h>
  43 __FBSDID("$FreeBSD$");
  44
  45 #include "opt_ffs.h"
  46 #include "opt_quota.h"
  47 #include "opt_ddb.h"
  48
  49 /*
  50  * For now we want the safety net that the DEBUG flag provides.
  51  */
  52 #ifndef DEBUG
  53 #define DEBUG
  54 #endif
  55
  56 #include <sys/param.h>
  57 #include <sys/kernel.h>
  58 #include <sys/systm.h>
  59 #include <sys/bio.h>
  60 #include <sys/buf.h>
  61 #include <sys/kdb.h>
  62 #include <sys/kthread.h>
  63 #include <sys/ktr.h>
  64 #include <sys/limits.h>
  65 #include <sys/lock.h>
  66 #include <sys/malloc.h>
  67 #include <sys/mount.h>
  68 #include <sys/mutex.h>
  69 #include <sys/namei.h>
  70 #include <sys/priv.h>
  71 #include <sys/proc.h>
  72 #include <sys/rwlock.h>
  73 #include <sys/stat.h>
  74 #include <sys/sysctl.h>
  75 #include <sys/syslog.h>
  76 #include <sys/vnode.h>
  77 #include <sys/conf.h>
  78
  79 #include <ufs/ufs/dir.h>
  80 #include <ufs/ufs/extattr.h>
  81 #include <ufs/ufs/quota.h>
  82 #include <ufs/ufs/inode.h>
  83 #include <ufs/ufs/ufsmount.h>
  84 #include <ufs/ffs/fs.h>
  85 #include <ufs/ffs/softdep.h>
  86 #include <ufs/ffs/ffs_extern.h>
  87 #include <ufs/ufs/ufs_extern.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_extern.h>
  91 #include <vm/vm_object.h>
  92
  93 #include <geom/geom.h>
  94
  95 #include <ddb/ddb.h>
  96
  97 #define KTR_SUJ 0       /* Define to KTR_SPARE. */
  98
  99 #ifndef SOFTUPDATES
 100
 101 int
 102 softdep_flushfiles(oldmnt, flags, td)
 103         struct mount *oldmnt;
 104         int flags;
 105         struct thread *td;
 106 {
 107
 108         panic("softdep_flushfiles called");
 109 }
 110
 111 int
 112 softdep_mount(devvp, mp, fs, cred)
 113         struct vnode *devvp;
 114         struct mount *mp;
 115         struct fs *fs;
 116         struct ucred *cred;
 117 {
 118
 119         return (0);
 120 }
 121
 122 void
 123 softdep_initialize()
 124 {
 125
 126         return;
 127 }
 128
 129 void
 130 softdep_uninitialize()
 131 {
 132
 133         return;
 134 }
 135
 136 void
 137 softdep_unmount(mp)
 138         struct mount *mp;
 139 {
 140
 141         panic("softdep_unmount called");
 142 }
 143
 144 void
 145 softdep_setup_sbupdate(ump, fs, bp)
 146         struct ufsmount *ump;
 147         struct fs *fs;
 148         struct buf *bp;
 149 {
 150
 151         panic("softdep_setup_sbupdate called");
 152 }
 153
 154 void
 155 softdep_setup_inomapdep(bp, ip, newinum, mode)
 156         struct buf *bp;
 157         struct inode *ip;
 158         ino_t newinum;
 159         int mode;
 160 {
 161
 162         panic("softdep_setup_inomapdep called");
 163 }
 164
 165 void
 166 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 167         struct buf *bp;
 168         struct mount *mp;
 169         ufs2_daddr_t newblkno;
 170         int frags;
 171         int oldfrags;
 172 {
 173
 174         panic("softdep_setup_blkmapdep called");
 175 }
 176
 177 void
 178 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 179         struct inode *ip;
 180         ufs_lbn_t lbn;
 181         ufs2_daddr_t newblkno;
 182         ufs2_daddr_t oldblkno;
 183         long newsize;
 184         long oldsize;
 185         struct buf *bp;
 186 {
 187
 188         panic("softdep_setup_allocdirect called");
 189 }
 190
 191 void
 192 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 193         struct inode *ip;
 194         ufs_lbn_t lbn;
 195         ufs2_daddr_t newblkno;
 196         ufs2_daddr_t oldblkno;
 197         long newsize;
 198         long oldsize;
 199         struct buf *bp;
 200 {
 201
 202         panic("softdep_setup_allocext called");
 203 }
 204
 205 void
 206 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 207         struct inode *ip;
 208         ufs_lbn_t lbn;
 209         struct buf *bp;
 210         int ptrno;
 211         ufs2_daddr_t newblkno;
 212         ufs2_daddr_t oldblkno;
 213         struct buf *nbp;
 214 {
 215
 216         panic("softdep_setup_allocindir_page called");
 217 }
 218
 219 void
 220 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 221         struct buf *nbp;
 222         struct inode *ip;
 223         struct buf *bp;
 224         int ptrno;
 225         ufs2_daddr_t newblkno;
 226 {
 227
 228         panic("softdep_setup_allocindir_meta called");
 229 }
 230
 231 void
 232 softdep_journal_freeblocks(ip, cred, length, flags)
 233         struct inode *ip;
 234         struct ucred *cred;
 235         off_t length;
 236         int flags;
 237 {
 238
 239         panic("softdep_journal_freeblocks called");
 240 }
 241
 242 void
 243 softdep_journal_fsync(ip)
 244         struct inode *ip;
 245 {
 246
 247         panic("softdep_journal_fsync called");
 248 }
 249
 250 void
 251 softdep_setup_freeblocks(ip, length, flags)
 252         struct inode *ip;
 253         off_t length;
 254         int flags;
 255 {
 256
 257         panic("softdep_setup_freeblocks called");
 258 }
 259
 260 void
 261 softdep_freefile(pvp, ino, mode)
 262                 struct vnode *pvp;
 263                 ino_t ino;
 264                 int mode;
 265 {
 266
 267         panic("softdep_freefile called");
 268 }
 269
 270 int
 271 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 272         struct buf *bp;
 273         struct inode *dp;
 274         off_t diroffset;
 275         ino_t newinum;
 276         struct buf *newdirbp;
 277         int isnewblk;
 278 {
 279
 280         panic("softdep_setup_directory_add called");
 281 }
 282
 283 void
 284 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 285         struct buf *bp;
 286         struct inode *dp;
 287         caddr_t base;
 288         caddr_t oldloc;
 289         caddr_t newloc;
 290         int entrysize;
 291 {
 292
 293         panic("softdep_change_directoryentry_offset called");
 294 }
 295
 296 void
 297 softdep_setup_remove(bp, dp, ip, isrmdir)
 298         struct buf *bp;
 299         struct inode *dp;
 300         struct inode *ip;
 301         int isrmdir;
 302 {
 303
 304         panic("softdep_setup_remove called");
 305 }
 306
 307 void
 308 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 309         struct buf *bp;
 310         struct inode *dp;
 311         struct inode *ip;
 312         ino_t newinum;
 313         int isrmdir;
 314 {
 315
 316         panic("softdep_setup_directory_change called");
 317 }
 318
 319 void
 320 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
 321         struct mount *mp;
 322         struct buf *bp;
 323         ufs2_daddr_t blkno;
 324         int frags;
 325         struct workhead *wkhd;
 326 {
 327
 328         panic("%s called", __FUNCTION__);
 329 }
 330
 331 void
 332 softdep_setup_inofree(mp, bp, ino, wkhd)
 333         struct mount *mp;
 334         struct buf *bp;
 335         ino_t ino;
 336         struct workhead *wkhd;
 337 {
 338
 339         panic("%s called", __FUNCTION__);
 340 }
 341
 342 void
 343 softdep_setup_unlink(dp, ip)
 344         struct inode *dp;
 345         struct inode *ip;
 346 {
 347
 348         panic("%s called", __FUNCTION__);
 349 }
 350
 351 void
 352 softdep_setup_link(dp, ip)
 353         struct inode *dp;
 354         struct inode *ip;
 355 {
 356
 357         panic("%s called", __FUNCTION__);
 358 }
 359
 360 void
 361 softdep_revert_link(dp, ip)
 362         struct inode *dp;
 363         struct inode *ip;
 364 {
 365
 366         panic("%s called", __FUNCTION__);
 367 }
 368
 369 void
 370 softdep_setup_rmdir(dp, ip)
 371         struct inode *dp;
 372         struct inode *ip;
 373 {
 374
 375         panic("%s called", __FUNCTION__);
 376 }
 377
 378 void
 379 softdep_revert_rmdir(dp, ip)
 380         struct inode *dp;
 381         struct inode *ip;
 382 {
 383
 384         panic("%s called", __FUNCTION__);
 385 }
 386
 387 void
 388 softdep_setup_create(dp, ip)
 389         struct inode *dp;
 390         struct inode *ip;
 391 {
 392
 393         panic("%s called", __FUNCTION__);
 394 }
 395
 396 void
 397 softdep_revert_create(dp, ip)
 398         struct inode *dp;
 399         struct inode *ip;
 400 {
 401
 402         panic("%s called", __FUNCTION__);
 403 }
 404
 405 void
 406 softdep_setup_mkdir(dp, ip)
 407         struct inode *dp;
 408         struct inode *ip;
 409 {
 410
 411         panic("%s called", __FUNCTION__);
 412 }
 413
 414 void
 415 softdep_revert_mkdir(dp, ip)
 416         struct inode *dp;
 417         struct inode *ip;
 418 {
 419
 420         panic("%s called", __FUNCTION__);
 421 }
 422
 423 void
 424 softdep_setup_dotdot_link(dp, ip)
 425         struct inode *dp;
 426         struct inode *ip;
 427 {
 428
 429         panic("%s called", __FUNCTION__);
 430 }
 431
 432 int
 433 softdep_prealloc(vp, waitok)
 434         struct vnode *vp;
 435         int waitok;
 436 {
 437
 438         panic("%s called", __FUNCTION__);
 439 }
 440
 441 int
 442 softdep_journal_lookup(mp, vpp)
 443         struct mount *mp;
 444         struct vnode **vpp;
 445 {
 446
 447         return (ENOENT);
 448 }
 449
 450 void
 451 softdep_change_linkcnt(ip)
 452         struct inode *ip;
 453 {
 454
 455         panic("softdep_change_linkcnt called");
 456 }
 457
 458 void
 459 softdep_load_inodeblock(ip)
 460         struct inode *ip;
 461 {
 462
 463         panic("softdep_load_inodeblock called");
 464 }
 465
 466 void
 467 softdep_update_inodeblock(ip, bp, waitfor)
 468         struct inode *ip;
 469         struct buf *bp;
 470         int waitfor;
 471 {
 472
 473         panic("softdep_update_inodeblock called");
 474 }
 475
 476 int
 477 softdep_fsync(vp)
 478         struct vnode *vp;       /* the "in_core" copy of the inode */
 479 {
 480
 481         return (0);
 482 }
 483
 484 void
 485 softdep_fsync_mountdev(vp)
 486         struct vnode *vp;
 487 {
 488
 489         return;
 490 }
 491
 492 int
 493 softdep_flushworklist(oldmnt, countp, td)
 494         struct mount *oldmnt;
 495         int *countp;
 496         struct thread *td;
 497 {
 498
 499         *countp = 0;
 500         return (0);
 501 }
 502
  503 int
  504 softdep_sync_metadata(struct vnode *vp)
  505 {
  506         /* Stub for kernels built without SOFTUPDATES; any call panics. */
  507         panic("softdep_sync_metadata called");
  508 }
 509
  510 int
  511 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
  512 {
  513         /* Stub for kernels built without SOFTUPDATES; any call panics. */
  514         panic("softdep_sync_buf called");
  515 }
 516
 517 int
 518 softdep_slowdown(vp)
 519         struct vnode *vp;
 520 {
 521
 522         panic("softdep_slowdown called");
 523 }
 524
 525 int
 526 softdep_request_cleanup(fs, vp, cred, resource)
 527         struct fs *fs;
 528         struct vnode *vp;
 529         struct ucred *cred;
 530         int resource;
 531 {
 532
 533         return (0);
 534 }
 535
 536 int
 537 softdep_check_suspend(struct mount *mp,
 538                       struct vnode *devvp,
 539                       int softdep_depcnt,
 540                       int softdep_accdepcnt,
 541                       int secondary_writes,
 542                       int secondary_accwrites)
 543 {
 544         struct bufobj *bo;
 545         int error;
 546
 547         (void) softdep_depcnt,
 548         (void) softdep_accdepcnt;
 549
 550         bo = &devvp->v_bufobj;
 551         ASSERT_BO_WLOCKED(bo);
 552
 553         MNT_ILOCK(mp);
 554         while (mp->mnt_secondary_writes != 0) {
 555                 BO_UNLOCK(bo);
 556                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
 557                     (PUSER - 1) | PDROP, "secwr", 0);
 558                 BO_LOCK(bo);
 559                 MNT_ILOCK(mp);
 560         }
 561
 562         /*
 563          * Reasons for needing more work before suspend:
 564          * - Dirty buffers on devvp.
 565          * - Secondary writes occurred after start of vnode sync loop
 566          */
 567         error = 0;
 568         if (bo->bo_numoutput > 0 ||
 569             bo->bo_dirty.bv_cnt > 0 ||
 570             secondary_writes != 0 ||
 571             mp->mnt_secondary_writes != 0 ||
 572             secondary_accwrites != mp->mnt_secondary_accwrites)
 573                 error = EAGAIN;
 574         BO_UNLOCK(bo);
 575         return (error);
 576 }
 577
 578 void
 579 softdep_get_depcounts(struct mount *mp,
 580                       int *softdepactivep,
 581                       int *softdepactiveaccp)
 582 {
 583         (void) mp;
 584         *softdepactivep = 0;
 585         *softdepactiveaccp = 0;
 586 }
 587
 588 void
 589 softdep_buf_append(bp, wkhd)
 590         struct buf *bp;
 591         struct workhead *wkhd;
 592 {
 593
 594         panic("softdep_buf_appendwork called");
 595 }
 596
 597 void
 598 softdep_inode_append(ip, cred, wkhd)
 599         struct inode *ip;
 600         struct ucred *cred;
 601         struct workhead *wkhd;
 602 {
 603
 604         panic("softdep_inode_appendwork called");
 605 }
 606
 607 void
 608 softdep_freework(wkhd)
 609         struct workhead *wkhd;
 610 {
 611
 612         panic("softdep_freework called");
 613 }
 614
 615 #else
  616 /* Soft updates are compiled in: declare the "softupdates" kernel feature. */
  617 FEATURE(softupdates, "FFS soft-updates support");
  618 /* debug.softdep sysctl tree with one child node per counter kind. */
  619 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
  620     "soft updates stats");
  621 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
  622     "total dependencies allocated");
  623 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
  624     "high use dependencies allocated");
  625 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
  626     "current dependencies allocated");
  627 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
  628     "current dependencies written");
  629 /* Per-type counters, indexed by D_*; exported read-only by SOFTDEP_TYPE(). */
  630 unsigned long dep_current[D_LAST + 1];
  631 unsigned long dep_highuse[D_LAST + 1];
  632 unsigned long dep_total[D_LAST + 1];
  633 unsigned long dep_write[D_LAST + 1];
 634
 635 #define SOFTDEP_TYPE(type, str, long)                                   \
 636     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
 637     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
 638         &dep_total[D_ ## type], 0, "");                                 \
 639     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
 640         &dep_current[D_ ## type], 0, "");                               \
 641     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
 642         &dep_highuse[D_ ## type], 0, "");                               \
 643     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
 644         &dep_write[D_ ## type], 0, "");
  645 /* One malloc type plus total/current/highuse/write counters per dep type. */
  646 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
  647 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
  648 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
  649     "Block or frag allocated from cyl group map");
  650 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
  651 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
  652 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
  653 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
  654 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
  655 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
  656 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
  657 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
  658 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
  659 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
  660 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
  661 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
  662 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
  663 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
  664 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
  665 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
  666 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
  667 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
  668 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
  669 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
  670 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
  671 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
  672 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
  673 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
  674 /* Malloc type for worklist sentinels; it is the final memtype[] entry. */
  675 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
  676 /* Malloc types that have no slot in the memtype[] table. */
  677 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
  678 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
  679 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
  680 /* NOTE(review): presumably the malloc(9) flags for softdep allocations. */
  681 #define M_SOFTDEP_FLAGS (M_WAITOK)
 682
  683 /*
  684  * Translate from workitem type (D_*) to malloc type (M_*).
  685  * MUST match the defines above, such that memtype[D_XXX] == M_XXX.
  686  */
  687 static struct malloc_type *memtype[] = {
  688         M_PAGEDEP,
  689         M_INODEDEP,
  690         M_BMSAFEMAP,
  691         M_NEWBLK,
  692         M_ALLOCDIRECT,
  693         M_INDIRDEP,
  694         M_ALLOCINDIR,
  695         M_FREEFRAG,
  696         M_FREEBLKS,
  697         M_FREEFILE,
  698         M_DIRADD,
  699         M_MKDIR,
  700         M_DIRREM,
  701         M_NEWDIRBLK,
  702         M_FREEWORK,
  703         M_FREEDEP,
  704         M_JADDREF,
  705         M_JREMREF,
  706         M_JMVREF,
  707         M_JNEWBLK,
  708         M_JFREEBLK,
  709         M_JFREEFRAG,
  710         M_JSEG,
  711         M_JSEGDEP,
  712         M_SBDEP,
  713         M_JTRUNC,
  714         M_JFSYNC,
  715         M_SENTINEL
  716 };
  717 /* Map a workitem type (D_*) to its malloc type through memtype[]. */
  718 #define DtoM(type) (memtype[type])
  719
  720 /*
  721  * Printable name of a workitem type's malloc type; "???" if out of range.
  722  */
  723 #define TYPENAME(type)  \
  724         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
 725 /*
 726  * End system adaptation definitions.
 727  */
  728 /* Offsets of the dotdot_ino and dot_ino members within struct dirtemplate. */
  729 #define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
  730 #define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
 731
 732 /*
 733  * Internal function prototypes.
 734  */
 735 static  void check_clear_deps(struct mount *);
 736 static  void softdep_error(char *, int);
 737 static  int softdep_process_worklist(struct mount *, int);
 738 static  int softdep_waitidle(struct mount *);
 739 static  void drain_output(struct vnode *);
 740 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
 741 static  void clear_remove(struct mount *);
 742 static  void clear_inodedeps(struct mount *);
 743 static  void unlinked_inodedep(struct mount *, struct inodedep *);
 744 static  void clear_unlinked_inodedep(struct inodedep *);
 745 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 746 static  int flush_pagedep_deps(struct vnode *, struct mount *,
 747             struct diraddhd *);
 748 static  int free_pagedep(struct pagedep *);
 749 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 750 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 751 static  int flush_deplist(struct allocdirectlst *, int, int *);
 752 static  int sync_cgs(struct mount *, int);
 753 static  int handle_written_filepage(struct pagedep *, struct buf *);
 754 static  int handle_written_sbdep(struct sbdep *, struct buf *);
 755 static  void initiate_write_sbdep(struct sbdep *);
 756 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 757 static  int handle_written_indirdep(struct indirdep *, struct buf *,
 758             struct buf**);
 759 static  int handle_written_inodeblock(struct inodedep *, struct buf *);
 760 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 761             uint8_t *);
 762 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
 763 static  void handle_written_jaddref(struct jaddref *);
 764 static  void handle_written_jremref(struct jremref *);
 765 static  void handle_written_jseg(struct jseg *, struct buf *);
 766 static  void handle_written_jnewblk(struct jnewblk *);
 767 static  void handle_written_jblkdep(struct jblkdep *);
 768 static  void handle_written_jfreefrag(struct jfreefrag *);
 769 static  void complete_jseg(struct jseg *);
 770 static  void complete_jsegs(struct jseg *);
 771 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 772 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 773 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
 774 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
 775 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
 776 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
 777 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
 778 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
 779 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
 780 static  inline void inoref_write(struct inoref *, struct jseg *,
 781             struct jrefrec *);
 782 static  void handle_allocdirect_partdone(struct allocdirect *,
 783             struct workhead *);
 784 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
 785             struct workhead *);
 786 static  void indirdep_complete(struct indirdep *);
 787 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
 788 static  void indirblk_insert(struct freework *);
 789 static  void indirblk_remove(struct freework *);
 790 static  void handle_allocindir_partdone(struct allocindir *);
 791 static  void initiate_write_filepage(struct pagedep *, struct buf *);
 792 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
 793 static  void handle_written_mkdir(struct mkdir *, int);
 794 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
 795             uint8_t *);
 796 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
 797 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 798 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 799 static  void handle_workitem_freefile(struct freefile *);
 800 static  int handle_workitem_remove(struct dirrem *, int);
 801 static  struct dirrem *newdirrem(struct buf *, struct inode *,
 802             struct inode *, int, struct dirrem **);
 803 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
 804             struct buf *);
 805 static  void cancel_indirdep(struct indirdep *, struct buf *,
 806             struct freeblks *);
 807 static  void free_indirdep(struct indirdep *);
 808 static  void free_diradd(struct diradd *, struct workhead *);
 809 static  void merge_diradd(struct inodedep *, struct diradd *);
 810 static  void complete_diradd(struct diradd *);
 811 static  struct diradd *diradd_lookup(struct pagedep *, int);
 812 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
 813             struct jremref *);
 814 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
 815             struct jremref *);
 816 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
 817             struct jremref *, struct jremref *);
 818 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
 819             struct jremref *);
 820 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
 821             struct freeblks *, int);
 822 static  int setup_trunc_indir(struct freeblks *, struct inode *,
 823             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
 824 static  void complete_trunc_indir(struct freework *);
 825 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
 826             int);
 827 static  void complete_mkdir(struct mkdir *);
 828 static  void free_newdirblk(struct newdirblk *);
 829 static  void free_jremref(struct jremref *);
 830 static  void free_jaddref(struct jaddref *);
 831 static  void free_jsegdep(struct jsegdep *);
 832 static  void free_jsegs(struct jblocks *);
 833 static  void rele_jseg(struct jseg *);
 834 static  void free_jseg(struct jseg *, struct jblocks *);
 835 static  void free_jnewblk(struct jnewblk *);
 836 static  void free_jblkdep(struct jblkdep *);
 837 static  void free_jfreefrag(struct jfreefrag *);
 838 static  void free_freedep(struct freedep *);
 839 static  void journal_jremref(struct dirrem *, struct jremref *,
 840             struct inodedep *);
 841 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
 842 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
 843             struct workhead *);
 844 static  void cancel_jfreefrag(struct jfreefrag *);
 845 static  inline void setup_freedirect(struct freeblks *, struct inode *,
 846             int, int);
 847 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
 848 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
 849             ufs_lbn_t, int);
 850 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
 851 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
 852 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
 853 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
 854 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
 855 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
 856             int, int);
 857 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
 858 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
 859 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
 860 static  void newblk_freefrag(struct newblk*);
 861 static  void free_newblk(struct newblk *);
 862 static  void cancel_allocdirect(struct allocdirectlst *,
 863             struct allocdirect *, struct freeblks *);
 864 static  int check_inode_unwritten(struct inodedep *);
 865 static  int free_inodedep(struct inodedep *);
 866 static  void freework_freeblock(struct freework *);
 867 static  void freework_enqueue(struct freework *);
 868 static  int handle_workitem_freeblocks(struct freeblks *, int);
 869 static  int handle_complete_freeblocks(struct freeblks *, int);
 870 static  void handle_workitem_indirblk(struct freework *);
 871 static  void handle_written_freework(struct freework *);
 872 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 873 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
 874             struct workhead *);
 875 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
 876             struct inodedep *, struct allocindir *, ufs_lbn_t);
 877 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 878             ufs2_daddr_t, ufs_lbn_t);
 879 static  void handle_workitem_freefrag(struct freefrag *);
 880 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
 881             ufs_lbn_t);
 882 static  void allocdirect_merge(struct allocdirectlst *,
 883             struct allocdirect *, struct allocdirect *);
 884 static  struct freefrag *allocindir_merge(struct allocindir *,
 885             struct allocindir *);
 886 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
 887             struct bmsafemap **);
 888 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
 889             int cg, struct bmsafemap *);
 890 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
 891             struct newblk **);
 892 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
 893 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
 894             struct inodedep **);
 895 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 896 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
 897             int, struct pagedep **);
 898 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 899             struct pagedep **);
 900 static  void pause_timer(void *);
 901 static  int request_cleanup(struct mount *, int);
 902 static  int process_worklist_item(struct mount *, int, int);
 903 static  void process_removes(struct vnode *);
 904 static  void process_truncates(struct vnode *);
 905 static  void jwork_move(struct workhead *, struct workhead *);
 906 static  void jwork_insert(struct workhead *, struct jsegdep *);
 907 static  void add_to_worklist(struct worklist *, int);
 908 static  void wake_worklist(struct worklist *);
 909 static  void wait_worklist(struct worklist *, char *);
 910 static  void remove_from_worklist(struct worklist *);
 911 static  void softdep_flush(void);
 912 static  void softdep_flushjournal(struct mount *);
 913 static  int softdep_speedup(void);
 914 static  void worklist_speedup(struct mount *);
 915 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
 916 static  void journal_unmount(struct ufsmount *);
 917 static  int journal_space(struct ufsmount *, int);
 918 static  void journal_suspend(struct ufsmount *);
 919 static  int journal_unsuspend(struct ufsmount *ump);
 920 static  void softdep_prelink(struct vnode *, struct vnode *);
 921 static  void add_to_journal(struct worklist *);
 922 static  void remove_from_journal(struct worklist *);
 923 static  void softdep_process_journal(struct mount *, struct worklist *, int);
 924 static  struct jremref *newjremref(struct dirrem *, struct inode *,
 925             struct inode *ip, off_t, nlink_t);
 926 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
 927             uint16_t);
 928 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
 929             uint16_t);
 930 static  inline struct jsegdep *inoref_jseg(struct inoref *);
 931 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
 932 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
 933             ufs2_daddr_t, int);
 934 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
 935 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
 936 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
 937 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
 938             ufs2_daddr_t, long, ufs_lbn_t);
 939 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
 940             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
 941 static  int jwait(struct worklist *, int);
 942 static  struct inodedep *inodedep_lookup_ip(struct inode *);
 943 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
 944 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
 945 static  void handle_jwork(struct workhead *);
 946 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
 947             struct mkdir **);
 948 static  struct jblocks *jblocks_create(void);
 949 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
 950 static  void jblocks_free(struct jblocks *, struct mount *, int);
 951 static  void jblocks_destroy(struct jblocks *);
 952 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
 953
 954 /*
 955  * Exported softdep operations.
 956  */
 957 static  void softdep_disk_io_initiation(struct buf *);
 958 static  void softdep_disk_write_complete(struct buf *);
 959 static  void softdep_deallocate_dependencies(struct buf *);
 960 static  int softdep_count_dependencies(struct buf *bp, int);
 961
 962 /*
 963  * Global lock over all of soft updates.
 964  */
 965 static struct rwlock lk;
 966 RW_SYSINIT(softdep_lock, &lk, "Softdep Lock");
 967
 968 /*
 969  * Allow per-filesystem soft-updates locking.
 970  * For now all use the same global lock defined above.
      *
      * Each macro takes a struct ufsmount pointer and operates on the lock
      * pointer kept in its um_softdep->sd_fslock field:
      *   LOCK_PTR(ump)          the lock pointer itself
      *   TRY_ACQUIRE_LOCK(ump)  rw_try_wlock(): write-lock attempt that does
      *                          not sleep
      *   ACQUIRE_LOCK(ump)      rw_wlock(): take the lock for writing
      *   FREE_LOCK(ump)         rw_wunlock(): release the write lock
      *   LOCK_OWNED(ump)        rw_assert(RA_WLOCKED): assert it is write-held
 971  */
 972 #define LOCK_PTR(ump)           ((ump)->um_softdep->sd_fslock)
 973 #define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock((ump)->um_softdep->sd_fslock)
 974 #define ACQUIRE_LOCK(ump)       rw_wlock((ump)->um_softdep->sd_fslock)
 975 #define FREE_LOCK(ump)          rw_wunlock((ump)->um_softdep->sd_fslock)
 976 #define LOCK_OWNED(ump)         rw_assert((ump)->um_softdep->sd_fslock, \
 977                                     RA_WLOCKED)
 978
     /* Allow / disallow recursive acquisition of the buffer's b_lock. */
 979 #define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
 980 #define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
 981
 982 /*
 983  * Worklist queue management.
 984  * These routines require that the lock be held.
      *
      * WORKLIST_INSERT(head, item) sets ONWORKLIST in item->wk_state and links
      * the item at the head of the list; WORKLIST_REMOVE(item) clears the flag
      * and unlinks it.  Without DEBUG these are plain macros and the _UNLOCKED
      * variants are aliases.  With DEBUG they call worklist_insert() and
      * worklist_remove(), which panic on a double insert or on removal of an
      * item that is not on a list, and which assert lock ownership unless the
      * _UNLOCKED variant is used.
 985  */
 986 #ifndef /* NOT */ DEBUG
 987 #define WORKLIST_INSERT(head, item) do {        \
 988         (item)->wk_state |= ONWORKLIST;         \
 989         LIST_INSERT_HEAD(head, item, wk_list);  \
 990 } while (0)
 991 #define WORKLIST_REMOVE(item) do {              \
 992         (item)->wk_state &= ~ONWORKLIST;        \
 993         LIST_REMOVE(item, wk_list);             \
 994 } while (0)
 995 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
 996 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
 997
 998 #else /* DEBUG */
 999 static  void worklist_insert(struct workhead *, struct worklist *, int);
1000 static  void worklist_remove(struct worklist *, int);
1001
     /* The last argument tells the helper whether to assert lock ownership. */
1002 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1003 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1004 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1005 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1006
     /*
      * Checked insert: link item at the head of the list, panicking if it is
      * already marked ONWORKLIST.  When locked is non-zero the softdep lock of
      * the item's mount must be held.
      */
1007 static void
1008 worklist_insert(head, item, locked)
1009         struct workhead *head;
1010         struct worklist *item;
1011         int locked;
1012 {
1013
1014         if (locked)
1015                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1016         if (item->wk_state & ONWORKLIST)
1017                 panic("worklist_insert: %p %s(0x%X) already on list",
1018                     item, TYPENAME(item->wk_type), item->wk_state);
1019         item->wk_state |= ONWORKLIST;
1020         LIST_INSERT_HEAD(head, item, wk_list);
1021 }
1022
     /*
      * Checked remove: unlink item from whatever list it is on, panicking if
      * it is not marked ONWORKLIST.  When locked is non-zero the softdep lock
      * of the item's mount must be held.
      */
1023 static void
1024 worklist_remove(item, locked)
1025         struct worklist *item;
1026         int locked;
1027 {
1028
1029         if (locked)
1030                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1031         if ((item->wk_state & ONWORKLIST) == 0)
1032                 panic("worklist_remove: %p %s(0x%X) not on list",
1033                     item, TYPENAME(item->wk_type), item->wk_state);
1034         item->wk_state &= ~ONWORKLIST;
1035         LIST_REMOVE(item, wk_list);
1036 }
1037 #endif /* DEBUG */
1038
1039 /*
1040  * Merge two jsegdeps keeping only the oldest one as newer references
1041  * can't be discarded until after older references.
      *
      * 'two' may be NULL, in which case 'one' is returned untouched; 'one' is
      * dereferenced unconditionally and must not be NULL.  Otherwise the entry
      * whose segment has the larger js_seq ('two' on a tie) is taken off its
      * worklist and handed to free_jsegdep(), and the other entry is returned.
      * The entry given up may be 'one' itself, so callers must continue with
      * the returned pointer only.
1042  */
1043 static inline struct jsegdep *
1044 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1045 {
1046         struct jsegdep *swp;
1047
1048         if (two == NULL)
1049                 return (one);
1050
             /* Arrange for 'one' to be the older (lower sequence) entry. */
1051         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1052                 swp = one;
1053                 one = two;
1054                 two = swp;
1055         }
1056         WORKLIST_REMOVE(&two->jd_list);
1057         free_jsegdep(two);
1058
1059         return (one);
1060 }
1061
1062 /*
1063  * If two freedeps are compatible free one to reduce list size.
      *
      * Two freedeps count as compatible when they point at the same
      * fd_freework; in that case 'two' is removed from its worklist and passed
      * to free_freedep().  'two' may be NULL.  'one' is returned in every case,
      * including when nothing was merged.
1064  */
1065 static inline struct freedep *
1066 freedep_merge(struct freedep *one, struct freedep *two)
1067 {
1068         if (two == NULL)
1069                 return (one);
1070
1071         if (one->fd_freework == two->fd_freework) {
1072                 WORKLIST_REMOVE(&two->fd_list);
1073                 free_freedep(two);
1074         }
1075         return (one);
1076 }
1077
1078 /*
1079  * Move journal work from one list to another.  Duplicate freedeps and
1080  * jsegdeps are coalesced to keep the lists as small as possible.
      *
      * dst and src must be different lists.  The items already on dst are
      * coalesced first, then every item of src is moved to the head of dst and
      * coalesced with what is there.  On return src is empty.
1081  */
1082 static void
1083 jwork_move(dst, src)
1084         struct workhead *dst;
1085         struct workhead *src;
1086 {
1087         struct freedep *freedep;
1088         struct jsegdep *jsegdep;
1089         struct worklist *wkn;
1090         struct worklist *wk;
1091
1092         KASSERT(dst != src,
1093             ("jwork_move: dst == src"));
1094         freedep = NULL;
1095         jsegdep = NULL;
             /*
              * The _SAFE iterator is required because a merge may unlink the
              * current item.  jsegdep_merge() can also release wk itself, so
              * wk must not be examined again once it has been merged; hence
              * the "else if" rather than a second independent test.
              */
1096         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1097                 if (wk->wk_type == D_JSEGDEP)
1098                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1099                 else if (wk->wk_type == D_FREEDEP)
1100                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1101         }
1102
1103         while ((wk = LIST_FIRST(src)) != NULL) {
1104                 WORKLIST_REMOVE(wk);
1105                 WORKLIST_INSERT(dst, wk);
1106                 if (wk->wk_type == D_JSEGDEP) {
1107                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
                             /* wk may be gone now; do not touch it again. */
1108                         continue;
1109                 }
1110                 if (wk->wk_type == D_FREEDEP)
1111                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1112         }
1113 }
1114
     /*
      * Add a jsegdep to a journal work list while keeping the list small.
      *
      * If dst holds no D_JSEGDEP item, jsegdep is simply inserted.  Otherwise
      * it is compared with the first D_JSEGDEP found (later ones are not
      * examined): when jsegdep has the lower js_seq it replaces that entry,
      * which is released with free_jsegdep(); when it does not, jsegdep itself
      * is released.  Callers must therefore not use jsegdep after this call.
      */
1115 static void
1116 jwork_insert(dst, jsegdep)
1117         struct workhead *dst;
1118         struct jsegdep *jsegdep;
1119 {
1120         struct jsegdep *jsegdepn;
1121         struct worklist *wk;
1122
1123         LIST_FOREACH(wk, dst, wk_list)
1124                 if (wk->wk_type == D_JSEGDEP)
1125                         break;
             /* wk is NULL when the scan reached the end without a match. */
1126         if (wk == NULL) {
1127                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1128                 return;
1129         }
1130         jsegdepn = WK_JSEGDEP(wk);
1131         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1132                 WORKLIST_REMOVE(wk);
1133                 free_jsegdep(jsegdepn);
1134                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1135         } else
1136                 free_jsegdep(jsegdep);
1137 }
1138
1139 /*
1140  * Routines for tracking and managing workitems.
      *
      * WORKITEM_FREE() and WORKITEM_REASSIGN() cast their first argument to a
      * struct worklist pointer before calling the function of the same name,
      * so callers can pass a pointer to a dependency structure directly.
      * NOTE(review): this presumably relies on the worklist being the first
      * member of every dependency structure -- confirm against softdep.h.
1141  */
1142 static  void workitem_free(struct worklist *, int);
1143 static  void workitem_alloc(struct worklist *, int, struct mount *);
1144 static  void workitem_reassign(struct worklist *, int);
1145
1146 #define WORKITEM_FREE(item, type) \
1147         workitem_free((struct worklist *)(item), (type))
1148 #define WORKITEM_REASSIGN(item, type) \
1149         workitem_reassign((struct worklist *)(item), (type))
1150
     /*
      * Release a work item and drop it from the dependency accounting.
      *
      * item is the work item to release; type selects the malloc type passed
      * to free() via DtoM().  The softdep lock of the item's mount must be
      * held.  The global dep_current[] and per-mount softdep_curdeps[]
      * counters are indexed by item->wk_type, not by type.
      */
1151 static void
1152 workitem_free(item, type)
1153         struct worklist *item;
1154         int type;
1155 {
1156         struct ufsmount *ump;
1157
1158 #ifdef DEBUG
1159         if (item->wk_state & ONWORKLIST)
1160                 panic("workitem_free: %s(0x%X) still on list",
1161                     TYPENAME(item->wk_type), item->wk_state);
             /* A mismatch is tolerated only when the caller passes D_NEWBLK. */
1162         if (item->wk_type != type && type != D_NEWBLK)
1163                 panic("workitem_free: type mismatch %s != %s",
1164                     TYPENAME(item->wk_type), TYPENAME(type));
1165 #endif
             /* Wake any thread sleeping on this item before it disappears. */
1166         if (item->wk_state & IOWAITING)
1167                 wakeup(item);
1168         ump = VFSTOUFS(item->wk_mp);
1169         LOCK_OWNED(ump);
1170         KASSERT(ump->softdep_deps > 0,
1171             ("workitem_free: %s: softdep_deps going negative",
1172             ump->um_fs->fs_fsmnt));
             /* Last dependency gone: wake a waiter if softdep_req is set. */
1173         if (--ump->softdep_deps == 0 && ump->softdep_req)
1174                 wakeup(&ump->softdep_deps);
1175         KASSERT(dep_current[item->wk_type] > 0,
1176             ("workitem_free: %s: dep_current[%s] going negative",
1177             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1178         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1179             ("workitem_free: %s: softdep_curdeps[%s] going negative",
1180             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1181         dep_current[item->wk_type]--;
1182         ump->softdep_curdeps[item->wk_type] -= 1;
1183         free(item, DtoM(type));
1184 }
1185
     /*
      * Initialise the worklist header of a freshly allocated work item and
      * account for it.
      *
      * Sets the item's type and mount, clears its state, and then, under the
      * softdep lock of mp, bumps the global per-type counters (current, high
      * water mark, total) and the per-mount counters.  The lock is taken and
      * released here.
      * NOTE(review): the caller presumably must not already hold that lock --
      * confirm whether it is recursive.
      */
1186 static void
1187 workitem_alloc(item, type, mp)
1188         struct worklist *item;
1189         int type;
1190         struct mount *mp;
1191 {
1192         struct ufsmount *ump;
1193
1194         item->wk_type = type;
1195         item->wk_mp = mp;
1196         item->wk_state = 0;
1197
1198         ump = VFSTOUFS(mp);
1199         ACQUIRE_LOCK(ump);
1200         dep_current[type]++;
1201         if (dep_current[type] > dep_highuse[type])
1202                 dep_highuse[type] = dep_current[type];
1203         dep_total[type]++;
1204         ump->softdep_curdeps[type] += 1;
1205         ump->softdep_deps++;
1206         ump->softdep_accdeps++;
1207         FREE_LOCK(ump);
1208 }
1209
     /*
      * Change the type of an existing work item to newtype and move its
      * accounting from the old type to the new one.
      *
      * The softdep lock of the item's mount must be held.  The per-mount and
      * global "current" counts move from the old type to newtype; the high
      * water mark and running total of newtype are updated as for a new
      * allocation.  The per-mount softdep_deps total is left unchanged.
      */
1210 static void
1211 workitem_reassign(item, newtype)
1212         struct worklist *item;
1213         int newtype;
1214 {
1215         struct ufsmount *ump;
1216
1217         ump = VFSTOUFS(item->wk_mp);
1218         LOCK_OWNED(ump);
1219         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1220             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1221             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1222         ump->softdep_curdeps[item->wk_type] -= 1;
1223         ump->softdep_curdeps[newtype] += 1;
1224         KASSERT(dep_current[item->wk_type] > 0,
1225             ("workitem_reassign: %s: dep_current[%s] going negative",
1226             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1227         dep_current[item->wk_type]--;
1228         dep_current[newtype]++;
1229         if (dep_current[newtype] > dep_highuse[newtype])
1230                 dep_highuse[newtype] = dep_current[newtype];
1231         dep_total[newtype]++;
1232         item->wk_type = newtype;
1233 }
1234
1235 /*
1236  * Workitem queue management
1237  */
1238 static int max_softdeps;        /* maximum number of structs before slowdown */
1239 static int maxindirdeps = 50;   /* max number of indirdeps before slowdown */
1240 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
1241 static int proc_waiting;        /* tracks whether we have a timeout posted */
1242 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
1243 static struct callout softdep_callout;
1244 static struct mount *req_pending;
1245 #define ALLCLEAN ((struct mount *)-1)
1246 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
1247 static int req_clear_remove;    /* syncer process flush some freeblks */
1248 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1249
1250 /*
1251  * runtime statistics
1252  */
1253 static int stat_softdep_mounts; /* number of softdep mounted filesystems */
1254 static int stat_worklist_push;  /* number of worklist cleanups */
1255 static int stat_blk_limit_push; /* number of times block limit neared */
1256 static int stat_ino_limit_push; /* number of times inode limit neared */
1257 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
1258 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
1259 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
1260 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
1261 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
1262 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1263 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
1264 static int stat_jaddref;        /* bufs redirtied as ino bitmap can not write */
1265 static int stat_jnewblk;        /* bufs redirtied as blk bitmap can not write */
1266 static int stat_journal_min;    /* Times hit journal min threshold */
1267 static int stat_journal_low;    /* Times hit journal low threshold */
1268 static int stat_journal_wait;   /* Times blocked in jwait(). */
1269 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
1270 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
1271 static int stat_jwait_inode;    /* Times blocked in jwait() for inodes. */
1272 static int stat_jwait_newblk;   /* Times blocked in jwait() for newblks. */
1273 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1274 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1275 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1276 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1277 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1278
     /*
      * Export the tunables and statistics above under the _debug_softdep
      * sysctl node.  Each entry carries a description string so that the
      * value can be identified without reading this file.
      */
1279 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1280     &max_softdeps, 0, "Maximum number of structs before slowdown");
1281 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1282     &tickdelay, 0, "Number of ticks to pause during slowdown");
1283 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW,
1284     &maxindirdeps, 0, "Maximum number of indirdeps before slowdown");
1285 SYSCTL_INT(_debug_softdep, OID_AUTO, softdep_mounts, CTLFLAG_RD,
1286     &stat_softdep_mounts, 0, "Number of softdep mounted filesystems");
1287 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1288     &stat_worklist_push, 0, "Number of worklist cleanups");
1289 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1290     &stat_blk_limit_push, 0, "Number of times block limit neared");
1291 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1292     &stat_ino_limit_push, 0, "Number of times inode limit neared");
1293 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1294     &stat_blk_limit_hit, 0, "Number of times block slowdown imposed");
1295 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1296     &stat_ino_limit_hit, 0, "Number of times inode slowdown imposed");
1297 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1298     &stat_sync_limit_hit, 0, "Number of synchronous slowdowns imposed");
1299 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1300     &stat_indir_blk_ptrs, 0, "Bufs redirtied as indir ptrs not written");
1301 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1302     &stat_inode_bitmap, 0, "Bufs redirtied as inode bitmap not written");
1303 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1304     &stat_direct_blk_ptrs, 0, "Bufs redirtied as direct ptrs not written");
1305 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1306     &stat_dir_entry, 0, "Bufs redirtied as dir entry cannot write");
1307 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1308     &stat_jaddref, 0, "Bufs redirtied as ino bitmap can not write");
1309 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1310     &stat_jnewblk, 0, "Bufs redirtied as blk bitmap can not write");
1311 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1312     &stat_journal_low, 0, "Times hit journal low threshold");
1313 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1314     &stat_journal_min, 0, "Times hit journal min threshold");
1315 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1316     &stat_journal_wait, 0, "Times blocked in jwait()");
1317 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1318     &stat_jwait_filepage, 0, "Times blocked in jwait() for filepage");
1319 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1320     &stat_jwait_freeblks, 0, "Times blocked in jwait() for freeblks");
1321 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1322     &stat_jwait_inode, 0, "Times blocked in jwait() for inodes");
1323 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1324     &stat_jwait_newblk, 0, "Times blocked in jwait() for newblks");
1325 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1326     &stat_cleanup_blkrequests, 0, "Number of block cleanup requests");
1327 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1328     &stat_cleanup_inorequests, 0, "Number of inode cleanup requests");
1329 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1330     &stat_cleanup_high_delay, 0, "Maximum cleanup delay (in ticks)");
1331 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1332     &stat_cleanup_retries, 0, "Number of cleanups that needed to flush");
1333 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1334     &stat_cleanup_failures, 0, "Number of cleanup requests that failed");
1335 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1336     &softdep_flushcache, 0, "Should we do BIO_FLUSH?");
1337
1338 SYSCTL_DECL(_vfs_ffs);
1339
1340 /* Whether to recompute the summary at mount time */
1341 static int compute_summary_at_mount = 0;
1342 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1343            &compute_summary_at_mount, 0, "Recompute summary at mount");
1344 static struct proc *softdepproc;
1345 static struct kproc_desc softdep_kp = {
1346         "softdepflush",
1347         softdep_flush,
1348         &softdepproc
1349 };
1350 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
1351     &softdep_kp);
1352
     /*
      * Main loop of the "softdepflush" kernel process registered through
      * softdep_kp.  It never returns.
      *
      * Each pass walks the mount list and calls softdep_process_worklist()
      * on every soft-updates mount it can busy without waiting.  If work
      * remains and progress was made, another pass starts immediately;
      * otherwise the thread sleeps on req_pending for at most hz ticks unless
      * a request is already pending, and then clears the request.
      */
1353 static void
1354 softdep_flush(void)
1355 {
1356         struct mount *nmp;
1357         struct mount *mp;
1358         struct ufsmount *ump;
1359         struct thread *td;
1360         int remaining;
1361         int progress;
1362
1363         td = curthread;
1364         td->td_pflags |= TDP_NORUNNINGBUF;
1365
1366         for (;;) {
1367                 kproc_suspend_check(softdepproc);
1368                 remaining = progress = 0;
1369                 mtx_lock(&mountlist_mtx);
1370                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
1371                         nmp = TAILQ_NEXT(mp, mnt_list);
1372                         if (MOUNTEDSOFTDEP(mp) == 0)
1373                                 continue;
                             /*
                              * NOTE(review): on success vfs_busy() with
                              * MBF_MNTLSTLOCK presumably returns with
                              * mountlist_mtx released, which is why it is
                              * re-taken below -- confirm against vfs_busy().
                              */
1374                         if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1375                                 continue;
1376                         ump = VFSTOUFS(mp);
1377                         progress += softdep_process_worklist(mp, 0);
1378                         remaining += ump->softdep_on_worklist;
1379                         mtx_lock(&mountlist_mtx);
                             /* Re-read the successor now the mutex is held again. */
1380                         nmp = TAILQ_NEXT(mp, mnt_list);
1381                         vfs_unbusy(mp);
1382                 }
1383                 mtx_unlock(&mountlist_mtx);
1384                 if (remaining && progress)
1385                         continue;
1386                 rw_wlock(&lk);
1387                 if (req_pending == NULL)
1388                         msleep(&req_pending, &lk, PVM, "sdflush", hz);
1389                 req_pending = NULL;
1390                 rw_wunlock(&lk);
1391         }
1392 }
1393
     /*
      * Ask the flush thread to run soon on behalf of mp.
      *
      * The global softdep lock lk must be write-held.  If no request is
      * pending, mp is recorded in req_pending and any thread sleeping on
      * req_pending is woken; an already pending request is left untouched.
      */
1394 static void
1395 worklist_speedup(mp)
1396         struct mount *mp;
1397 {
1398         rw_assert(&lk, RA_WLOCKED);
             /* req_pending is a pointer; compare against NULL, not 0. */
1399         if (req_pending == NULL) {
1400                 req_pending = mp;
1401                 wakeup(&req_pending);
1402         }
1403 }
1404
     /*
      * Request faster flushing everywhere: post an ALLCLEAN request to the
      * flush thread, call bd_speedup(), and return whatever speedup_syncer()
      * returns.  Because worklist_speedup() asserts it, the global softdep
      * lock lk must be write-held by the caller.
      */
1405 static int
1406 softdep_speedup(void)
1407 {
1408
1409         worklist_speedup(ALLCLEAN);
1410         bd_speedup();
1411         return (speedup_syncer());
1412 }
1413
1414 /*
1415  * Add an item to the end of the work queue.
1416  * This routine requires that the lock be held.
1417  * This is the only routine that adds items to the list.
1418  * The following routine is the only one that removes items
1419  * and does so in order from first to last.
      *
      * flags may contain WK_HEAD, which places the item at the head of the
      * queue instead of the end, and WK_NODELAY, which additionally calls
      * worklist_speedup() for the item's mount.  Panics if the item is
      * already marked ONWORKLIST.
1420  */
1421
1422 #define WK_HEAD         0x0001  /* Add to HEAD. */
1423 #define WK_NODELAY      0x0002  /* Process immediately. */
1424
1425 static void
1426 add_to_worklist(wk, flags)
1427         struct worklist *wk;
1428         int flags;
1429 {
1430         struct ufsmount *ump;
1431
1432         ump = VFSTOUFS(wk->wk_mp);
1433         LOCK_OWNED(ump);
1434         if (wk->wk_state & ONWORKLIST)
1435                 panic("add_to_worklist: %s(0x%X) already on list",
1436                     TYPENAME(wk->wk_type), wk->wk_state);
1437         wk->wk_state |= ONWORKLIST;
             /*
              * An empty queue has no usable tail pointer, so the first item
              * is inserted at the head and becomes the tail as well.
              */
1438         if (ump->softdep_on_worklist == 0) {
1439                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1440                 ump->softdep_worklist_tail = wk;
1441         } else if (flags & WK_HEAD) {
1442                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1443         } else {
1444                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1445                 ump->softdep_worklist_tail = wk;
1446         }
1447         ump->softdep_on_worklist += 1;
1448         if (flags & WK_NODELAY)
1449                 worklist_speedup(wk->wk_mp);
1450 }
1451
1452 /*
1453  * Remove the item to be processed. If we are removing the last
1454  * item on the list, we need to recalculate the tail pointer.
      *
      * Also decrements the per-mount softdep_on_worklist count.
1455  */
1456 static void
1457 remove_from_worklist(wk)
1458         struct worklist *wk;
1459 {
1460         struct ufsmount *ump;
1461
1462         ump = VFSTOUFS(wk->wk_mp);
1463         WORKLIST_REMOVE(wk);
             /*
              * The new tail is derived from the removed item's le_prev link,
              * which is read after the item has been unlinked.
              * NOTE(review): the cast presumably relies on wk_list being the
              * first member of struct worklist and on LIST_REMOVE leaving
              * le_prev intact -- confirm.  When the queue becomes empty the
              * value is not a real item, but add_to_worklist() resets the
              * tail whenever softdep_on_worklist is zero.
              */
1464         if (ump->softdep_worklist_tail == wk)
1465                 ump->softdep_worklist_tail =
1466                     (struct worklist *)wk->wk_list.le_prev;
1467         ump->softdep_on_worklist -= 1;
1468 }
1469
     /*
      * Wake any thread sleeping on wk.  Nothing is done unless IOWAITING is
      * set in the item's state; the flag is cleared before the wakeup.
      */
1470 static void
1471 wake_worklist(wk)
1472         struct worklist *wk;
1473 {
1474         if (wk->wk_state & IOWAITING) {
1475                 wk->wk_state &= ~IOWAITING;
1476                 wakeup(wk);
1477         }
1478 }
1479
     /*
      * Sleep until another thread issues a wakeup on wk.
      *
      * Marks the item IOWAITING and sleeps on its address with msleep(),
      * passing the softdep lock of the item's mount, wmesg as the wait
      * message and no timeout.
      */
1480 static void
1481 wait_worklist(wk, wmesg)
1482         struct worklist *wk;
1483         char *wmesg;
1484 {
1485         struct ufsmount *ump;
1486
1487         ump = VFSTOUFS(wk->wk_mp);
1488         wk->wk_state |= IOWAITING;
1489         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1490 }
1491
1492 /*
1493  * Process that runs once per second to handle items in the background queue.
1494  *
1495  * Note that we ensure that everything is done in the order in which they
1496  * appear in the queue. The code below depends on this property to ensure
1497  * that blocks of a file are freed before the inode itself is freed. This
1498  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1499  * until all the old ones have been purged from the dependency lists.
      *
      * mp must not be NULL; zero is returned at once if it is not mounted
      * with soft updates.  When full is non-zero the journal is processed
      * with MNT_WAIT and the one second limit does not apply; when it is zero
      * journal_unsuspend() is called before returning.  The return value is
      * the sum of the values returned by process_worklist_item().
1500  */
1501 static int
1502 softdep_process_worklist(mp, full)
1503         struct mount *mp;
1504         int full;
1505 {
1506         int cnt, matchcnt;
1507         struct ufsmount *ump;
1508         long starttime;
1509
1510         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1511         if (MOUNTEDSOFTDEP(mp) == 0)
1512                 return (0);
1513         matchcnt = 0;
1514         ump = VFSTOUFS(mp);
1515         ACQUIRE_LOCK(ump);
1516         starttime = time_second;
1517         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1518         check_clear_deps(mp);
1519         while (ump->softdep_on_worklist > 0) {
                     /*
                      * NOTE(review): process_worklist_item() returns -1 when
                      * TDP_COWINPROGRESS is set; that value does not end the
                      * loop and is added to matchcnt -- confirm intended.
                      */
1520                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1521                         break;
1522                 else
1523                         matchcnt += cnt;
1524                 check_clear_deps(mp);
1525                 /*
1526                  * We do not generally want to stop for buffer space, but if
1527                  * we are really being a buffer hog, we will stop and wait.
1528                  */
1529                 if (should_yield()) {
1530                         FREE_LOCK(ump);
1531                         kern_yield(PRI_USER);
1532                         bwillwrite();
1533                         ACQUIRE_LOCK(ump);
1534                 }
1535                 /*
1536                  * Never allow processing to run for more than one
1537                  * second. This gives the syncer thread the opportunity
1538                  * to pause if appropriate.
1539                  */
1540                 if (!full && starttime != time_second)
1541                         break;
1542         }
1543         if (full == 0)
1544                 journal_unsuspend(ump);
1545         FREE_LOCK(ump);
1546         return (matchcnt);
1547 }
1548
1549 /*
1550  * Process all removes associated with a vnode if we are running out of
1551  * journal space.  Any other process which attempts to flush these will
1552  * be unable as we have the vnodes locked.
      *
      * Called and returns with the softdep lock of the vnode's mount held;
      * the lock is dropped while each dirrem is handled.  Returns once the
      * inode has no inodedep, or no dirrem on it is both COMPLETE and
      * ONWORKLIST.
1553  */
1554 static void
1555 process_removes(vp)
1556         struct vnode *vp;
1557 {
1558         struct inodedep *inodedep;
1559         struct dirrem *dirrem;
1560         struct ufsmount *ump;
1561         struct mount *mp;
1562         ino_t inum;
1563
1564         mp = vp->v_mount;
1565         ump = VFSTOUFS(mp);
1566         LOCK_OWNED(ump);
1567         inum = VTOI(vp)->i_number;
1568         for (;;) {
1569 top:
                     /* Look the inodedep up afresh on every pass. */
1570                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1571                         return;
1572                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1573                         /*
1574                          * If another thread is trying to lock this vnode
1575                          * it will fail but we must wait for it to do so
1576                          * before we can proceed.
1577                          */
1578                         if (dirrem->dm_state & INPROGRESS) {
1579                                 wait_worklist(&dirrem->dm_list, "pwrwait");
1580                                 goto top;
1581                         }
1582                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1583                             (COMPLETE | ONWORKLIST))
1584                                 break;
1585                 }
                     /* The scan ran off the end: nothing left to process. */
1586                 if (dirrem == NULL)
1587                         return;
1588                 remove_from_worklist(&dirrem->dm_list);
1589                 FREE_LOCK(ump);
1590                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1591                         panic("process_removes: suspended filesystem");
1592                 handle_workitem_remove(dirrem, 0);
1593                 vn_finished_secondary_write(mp);
1594                 ACQUIRE_LOCK(ump);
1595         }
1596 }
1597
1598 /*
1599  * Process all truncations associated with a vnode if we are running out
1600  * of journal space.  This is called when the vnode lock is already held
1601  * and no other process can clear the truncation.  Nothing is returned,
1602  * so callers cannot tell from this function whether any work was done.
      *
      * Called and returns with the softdep lock of the vnode's mount held;
      * the lock is dropped around ffs_update(), handle_workitem_freeblocks(),
      * sync_cgs() and ffs_sync_snap().  Each pass of the outer loop acts on
      * the first freeblks that needs attention and then rescans; the loop
      * ends when the inodedep is gone or a full scan finds nothing to do.
1603  */
1604 static void
1605 process_truncates(vp)
1606         struct vnode *vp;
1607 {
1608         struct inodedep *inodedep;
1609         struct freeblks *freeblks;
1610         struct ufsmount *ump;
1611         struct mount *mp;
1612         ino_t inum;
1613         int cgwait;
1614
1615         mp = vp->v_mount;
1616         ump = VFSTOUFS(mp);
1617         LOCK_OWNED(ump);
1618         inum = VTOI(vp)->i_number;
1619         for (;;) {
1620                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1621                         return;
1622                 cgwait = 0;
1623                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1624                         /* Journal entries not yet written.  */
1625                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1626                                 jwait(&LIST_FIRST(
1627                                     &freeblks->fb_jblkdephd)->jb_list,
1628                                     MNT_WAIT);
1629                                 break;
1630                         }
1631                         /* Another thread is executing this item. */
1632                         if (freeblks->fb_state & INPROGRESS) {
1633                                 wait_worklist(&freeblks->fb_list, "ptrwait");
1634                                 break;
1635                         }
1636                         /* Freeblks is waiting on a inode write. */
1637                         if ((freeblks->fb_state & COMPLETE) == 0) {
1638                                 FREE_LOCK(ump);
1639                                 ffs_update(vp, 1);
1640                                 ACQUIRE_LOCK(ump);
1641                                 break;
1642                         }
                             /* Ready to run: claim it and process it here. */
1643                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1644                             (ALLCOMPLETE | ONWORKLIST)) {
1645                                 remove_from_worklist(&freeblks->fb_list);
1646                                 freeblks->fb_state |= INPROGRESS;
1647                                 FREE_LOCK(ump);
1648                                 if (vn_start_secondary_write(NULL, &mp,
1649                                     V_NOWAIT))
1650                                         panic("process_truncates: "
1651                                             "suspended filesystem");
1652                                 handle_workitem_freeblocks(freeblks, 0);
1653                                 vn_finished_secondary_write(mp);
1654                                 ACQUIRE_LOCK(ump);
1655                                 break;
1656                         }
                             /* Count items with a non-zero fb_cgwait. */
1657                         if (freeblks->fb_cgwait)
1658                                 cgwait++;
1659                 }
                     /* Some item had fb_cgwait set: sync, then rescan. */
1660                 if (cgwait) {
1661                         FREE_LOCK(ump);
1662                         sync_cgs(mp, MNT_WAIT);
1663                         ffs_sync_snap(mp, MNT_WAIT);
1664                         ACQUIRE_LOCK(ump);
1665                         continue;
1666                 }
                     /* A full scan without a break leaves freeblks NULL. */
1667                 if (freeblks == NULL)
1668                         break;
1669         }
1670         return;
1671 }
1672
1673 /*
1674  * Process one item on the worklist.
1675  */
1676 static int
1677 process_worklist_item(mp, target, flags)
1678         struct mount *mp;
1679         int target;
1680         int flags;
1681 {
1682         struct worklist sentinel;
1683         struct worklist *wk;
1684         struct ufsmount *ump;
1685         int matchcnt;
1686         int error;
1687
1688         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1689         /*
1690          * If we are being called because of a process doing a
1691          * copy-on-write, then it is not safe to write as we may
1692          * recurse into the copy-on-write routine.
1693          */
1694         if (curthread->td_pflags & TDP_COWINPROGRESS)
1695                 return (-1);
1696         PHOLD(curproc); /* Don't let the stack go away. */
1697         ump = VFSTOUFS(mp);
1698         LOCK_OWNED(ump);
1699         matchcnt = 0;
1700         sentinel.wk_mp = NULL;
1701         sentinel.wk_type = D_SENTINEL;
1702         LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1703         for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1704             wk = LIST_NEXT(&sentinel, wk_list)) {
1705                 if (wk->wk_type == D_SENTINEL) {
1706                         LIST_REMOVE(&sentinel, wk_list);
1707                         LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1708                         continue;
1709                 }
1710                 if (wk->wk_state & INPROGRESS)
1711                         panic("process_worklist_item: %p already in progress.",
1712                             wk);
1713                 wk->wk_state |= INPROGRESS;
1714                 remove_from_worklist(wk);
1715                 FREE_LOCK(ump);
1716                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1717                         panic("process_worklist_item: suspended filesystem");
1718                 switch (wk->wk_type) {
1719                 case D_DIRREM:
1720                         /* removal of a directory entry */
1721                         error = handle_workitem_remove(WK_DIRREM(wk), flags);
1722                         break;
1723
1724                 case D_FREEBLKS:
1725                         /* releasing blocks and/or fragments from a file */
1726                         error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1727                             flags);
1728                         break;
1729
1730                 case D_FREEFRAG:
1731                         /* releasing a fragment when replaced as a file grows */
1732                         handle_workitem_freefrag(WK_FREEFRAG(wk));
1733                         error = 0;
1734                         break;
1735
1736                 case D_FREEFILE:
1737                         /* releasing an inode when its link count drops to 0 */
1738                         handle_workitem_freefile(WK_FREEFILE(wk));
1739                         error = 0;
1740                         break;
1741
1742                 default:
1743                         panic("%s_process_worklist: Unknown type %s",
1744                             "softdep", TYPENAME(wk->wk_type));
1745                         /* NOTREACHED */
1746                 }
1747                 vn_finished_secondary_write(mp);
1748                 ACQUIRE_LOCK(ump);
1749                 if (error == 0) {
1750                         if (++matchcnt == target)
1751                                 break;
1752                         continue;
1753                 }
1754                 /*
1755                  * We have to retry the worklist item later.  Wake up any
1756                  * waiters who may be able to complete it immediately and
1757                  * add the item back to the head so we don't try to execute
1758                  * it again.
1759                  */
1760                 wk->wk_state &= ~INPROGRESS;
1761                 wake_worklist(wk);
1762                 add_to_worklist(wk, WK_HEAD);
1763         }
1764         LIST_REMOVE(&sentinel, wk_list);
1765         /* Sentinal could've become the tail from remove_from_worklist. */
1766         if (ump->softdep_worklist_tail == &sentinel)
1767                 ump->softdep_worklist_tail =
1768                     (struct worklist *)sentinel.wk_list.le_prev;
1769         PRELE(curproc);
1770         return (matchcnt);
1771 }
1772
1773 /*
1774  * Move dependencies from one buffer to another.
1775  */
1776 int
1777 softdep_move_dependencies(oldbp, newbp)
1778         struct buf *oldbp;
1779         struct buf *newbp;
1780 {
1781         struct worklist *wk, *wktail;
1782         struct ufsmount *ump;
1783         int dirty;
1784
1785         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1786                 return (0);
1787         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1788             ("softdep_move_dependencies called on non-softdep filesystem"));
1789         dirty = 0;
1790         wktail = NULL;
1791         ump = VFSTOUFS(wk->wk_mp);
1792         ACQUIRE_LOCK(ump);
1793         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1794                 LIST_REMOVE(wk, wk_list);
1795                 if (wk->wk_type == D_BMSAFEMAP &&
1796                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1797                         dirty = 1;
1798                 if (wktail == 0)
1799                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1800                 else
1801                         LIST_INSERT_AFTER(wktail, wk, wk_list);
1802                 wktail = wk;
1803         }
1804         FREE_LOCK(ump);
1805
1806         return (dirty);
1807 }
1808
1809 /*
1810  * Purge the work list of all items associated with a particular mount point.
1811  */
1812 int
1813 softdep_flushworklist(oldmnt, countp, td)
1814         struct mount *oldmnt;
1815         int *countp;
1816         struct thread *td;
1817 {
1818         struct vnode *devvp;
1819         int count, error = 0;
1820         struct ufsmount *ump;
1821
1822         /*
1823          * Alternately flush the block device associated with the mount
1824          * point and process any dependencies that the flushing
1825          * creates. We continue until no more worklist dependencies
1826          * are found.
1827          */
1828         *countp = 0;
1829         ump = VFSTOUFS(oldmnt);
1830         devvp = ump->um_devvp;
1831         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1832                 *countp += count;
1833                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1834                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1835                 VOP_UNLOCK(devvp, 0);
1836                 if (error)
1837                         break;
1838         }
1839         return (error);
1840 }
1841
1842 static int
1843 softdep_waitidle(struct mount *mp)
1844 {
1845         struct ufsmount *ump;
1846         int error;
1847         int i;
1848
1849         ump = VFSTOUFS(mp);
1850         ACQUIRE_LOCK(ump);
1851         for (i = 0; i < 10 && ump->softdep_deps; i++) {
1852                 ump->softdep_req = 1;
1853                 if (ump->softdep_on_worklist)
1854                         panic("softdep_waitidle: work added after flush.");
1855                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM, "softdeps", 1);
1856         }
1857         ump->softdep_req = 0;
1858         FREE_LOCK(ump);
1859         error = 0;
1860         if (i == 10) {
1861                 error = EBUSY;
1862                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1863                     mp);
1864         }
1865
1866         return (error);
1867 }
1868
1869 /*
1870  * Flush all vnodes and worklist items associated with a specified mount point.
1871  */
1872 int
1873 softdep_flushfiles(oldmnt, flags, td)
1874         struct mount *oldmnt;
1875         int flags;
1876         struct thread *td;
1877 {
1878 #ifdef QUOTA
1879         struct ufsmount *ump;
1880         int i;
1881 #endif
1882         int error, early, depcount, loopcnt, retry_flush_count, retry;
1883         int morework;
1884
1885         KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1886             ("softdep_flushfiles called on non-softdep filesystem"));
1887         loopcnt = 10;
1888         retry_flush_count = 3;
1889 retry_flush:
1890         error = 0;
1891
1892         /*
1893          * Alternately flush the vnodes associated with the mount
1894          * point and process any dependencies that the flushing
1895          * creates. In theory, this loop can happen at most twice,
1896          * but we give it a few extra just to be sure.
1897          */
1898         for (; loopcnt > 0; loopcnt--) {
1899                 /*
1900                  * Do another flush in case any vnodes were brought in
1901                  * as part of the cleanup operations.
1902                  */
1903                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1904                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1905                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1906                         break;
1907                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1908                     depcount == 0)
1909                         break;
1910         }
1911         /*
1912          * If we are unmounting then it is an error to fail. If we
1913          * are simply trying to downgrade to read-only, then filesystem
1914          * activity can keep us busy forever, so we just fail with EBUSY.
1915          */
1916         if (loopcnt == 0) {
1917                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1918                         panic("softdep_flushfiles: looping");
1919                 error = EBUSY;
1920         }
1921         if (!error)
1922                 error = softdep_waitidle(oldmnt);
1923         if (!error) {
1924                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
1925                         retry = 0;
1926                         MNT_ILOCK(oldmnt);
1927                         KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
1928                             ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
1929                         morework = oldmnt->mnt_nvnodelistsize > 0;
1930 #ifdef QUOTA
1931                         ump = VFSTOUFS(oldmnt);
1932                         UFS_LOCK(ump);
1933                         for (i = 0; i < MAXQUOTAS; i++) {
1934                                 if (ump->um_quotas[i] != NULLVP)
1935                                         morework = 1;
1936                         }
1937                         UFS_UNLOCK(ump);
1938 #endif
1939                         if (morework) {
1940                                 if (--retry_flush_count > 0) {
1941                                         retry = 1;
1942                                         loopcnt = 3;
1943                                 } else
1944                                         error = EBUSY;
1945                         }
1946                         MNT_IUNLOCK(oldmnt);
1947                         if (retry)
1948                                 goto retry_flush;
1949                 }
1950         }
1951         return (error);
1952 }
1953
1954 /*
1955  * Structure hashing.
1956  *
1957  * There are four types of structures that can be looked up:
1958  *      1) pagedep structures identified by mount point, inode number,
1959  *         and logical block.
1960  *      2) inodedep structures identified by mount point and inode number.
1961  *      3) newblk structures identified by mount point and
1962  *         physical block number.
1963  *      4) bmsafemap structures identified by mount point and
1964  *         cylinder group number.
1965  *
1966  * The "pagedep" and "inodedep" dependency structures are hashed
1967  * separately from the file blocks and inodes to which they correspond.
1968  * This separation helps when the in-memory copy of an inode or
1969  * file block must be replaced. It also obviates the need to access
1970  * an inode or file page when simply updating (or de-allocating)
1971  * dependency structures. Lookup of newblk structures is needed to
1972  * find newly allocated blocks when trying to associate them with
1973  * their allocdirect or allocindir structure.
1974  *
1975  * The lookup routines optionally create and hash a new instance when
1976  * an existing entry is not found. The bmsafemap lookup routine always
1977  * allocates a new structure if an existing one is not found.
1978  */
/* Flag bits accepted by the dependency lookup routines. */
#define DEPALLOC        (1 << 0)        /* allocate structure if lookup fails */
#define NODELAY         (1 << 1)        /* cannot do background work */
1981
/*
 * Structures and routines associated with pagedep caching.
 *
 * PAGEDEP_HASH evaluates to the address of the hash chain head for an
 * (inode number, logical block) pair: the two are added and the sum is
 * masked with the mount's pagedep_hash_size.
 */
#define PAGEDEP_HASH(ump, inum, lbn)                                    \
        (&((ump)->pagedep_hashtbl                                       \
            [((inum) + (lbn)) & ((ump)->pagedep_hash_size)]))
1987
1988 static int
1989 pagedep_find(pagedephd, ino, lbn, pagedeppp)
1990         struct pagedep_hashhead *pagedephd;
1991         ino_t ino;
1992         ufs_lbn_t lbn;
1993         struct pagedep **pagedeppp;
1994 {
1995         struct pagedep *pagedep;
1996
1997         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
1998                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
1999                         *pagedeppp = pagedep;
2000                         return (1);
2001                 }
2002         }
2003         *pagedeppp = NULL;
2004         return (0);
2005 }
2006 /*
2007  * Look up a pagedep. Return 1 if found, 0 otherwise.
2008  * If not found, allocate if DEPALLOC flag is passed.
2009  * Found or allocated entry is returned in pagedeppp.
2010  * This routine must be called with splbio interrupts blocked.
2011  */
2012 static int
2013 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2014         struct mount *mp;
2015         struct buf *bp;
2016         ino_t ino;
2017         ufs_lbn_t lbn;
2018         int flags;
2019         struct pagedep **pagedeppp;
2020 {
2021         struct pagedep *pagedep;
2022         struct pagedep_hashhead *pagedephd;
2023         struct worklist *wk;
2024         struct ufsmount *ump;
2025         int ret;
2026         int i;
2027
2028         ump = VFSTOUFS(mp);
2029         LOCK_OWNED(ump);
2030         if (bp) {
2031                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2032                         if (wk->wk_type == D_PAGEDEP) {
2033                                 *pagedeppp = WK_PAGEDEP(wk);
2034                                 return (1);
2035                         }
2036                 }
2037         }
2038         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2039         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2040         if (ret) {
2041                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2042                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2043                 return (1);
2044         }
2045         if ((flags & DEPALLOC) == 0)
2046                 return (0);
2047         FREE_LOCK(ump);
2048         pagedep = malloc(sizeof(struct pagedep),
2049             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2050         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2051         ACQUIRE_LOCK(ump);
2052         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2053         if (*pagedeppp) {
2054                 /*
2055                  * This should never happen since we only create pagedeps
2056                  * with the vnode lock held.  Could be an assert.
2057                  */
2058                 WORKITEM_FREE(pagedep, D_PAGEDEP);
2059                 return (ret);
2060         }
2061         pagedep->pd_ino = ino;
2062         pagedep->pd_lbn = lbn;
2063         LIST_INIT(&pagedep->pd_dirremhd);
2064         LIST_INIT(&pagedep->pd_pendinghd);
2065         for (i = 0; i < DAHASHSZ; i++)
2066                 LIST_INIT(&pagedep->pd_diraddhd[i]);
2067         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2068         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2069         *pagedeppp = pagedep;
2070         return (0);
2071 }
2072
/*
 * Structures and routines associated with inodedep caching.
 *
 * INODEDEP_HASH evaluates to the address of the hash chain head for an
 * inode number, selected by masking the number with the mount's
 * inodedep_hash_size.
 */
#define INODEDEP_HASH(ump, inum)                                        \
        (&((ump)->inodedep_hashtbl[(inum) & ((ump)->inodedep_hash_size)]))
2078
2079 static int
2080 inodedep_find(inodedephd, inum, inodedeppp)
2081         struct inodedep_hashhead *inodedephd;
2082         ino_t inum;
2083         struct inodedep **inodedeppp;
2084 {
2085         struct inodedep *inodedep;
2086
2087         LIST_FOREACH(inodedep, inodedephd, id_hash)
2088                 if (inum == inodedep->id_ino)
2089                         break;
2090         if (inodedep) {
2091                 *inodedeppp = inodedep;
2092                 return (1);
2093         }
2094         *inodedeppp = NULL;
2095
2096         return (0);
2097 }
2098 /*
2099  * Look up an inodedep. Return 1 if found, 0 if not found.
2100  * If not found, allocate if DEPALLOC flag is passed.
2101  * Found or allocated entry is returned in inodedeppp.
2102  * This routine must be called with splbio interrupts blocked.
2103  */
2104 static int
2105 inodedep_lookup(mp, inum, flags, inodedeppp)
2106         struct mount *mp;
2107         ino_t inum;
2108         int flags;
2109         struct inodedep **inodedeppp;
2110 {
2111         struct inodedep *inodedep;
2112         struct inodedep_hashhead *inodedephd;
2113         struct ufsmount *ump;
2114         struct fs *fs;
2115
2116         ump = VFSTOUFS(mp);
2117         LOCK_OWNED(ump);
2118         fs = ump->um_fs;
2119         inodedephd = INODEDEP_HASH(ump, inum);
2120
2121         if (inodedep_find(inodedephd, inum, inodedeppp))
2122                 return (1);
2123         if ((flags & DEPALLOC) == 0)
2124                 return (0);
2125         /*
2126          * If we are over our limit, try to improve the situation.
2127          */
2128         if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0)
2129                 request_cleanup(mp, FLUSH_INODES);
2130         FREE_LOCK(ump);
2131         inodedep = malloc(sizeof(struct inodedep),
2132                 M_INODEDEP, M_SOFTDEP_FLAGS);
2133         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2134         ACQUIRE_LOCK(ump);
2135         if (inodedep_find(inodedephd, inum, inodedeppp)) {
2136                 WORKITEM_FREE(inodedep, D_INODEDEP);
2137                 return (1);
2138         }
2139         inodedep->id_fs = fs;
2140         inodedep->id_ino = inum;
2141         inodedep->id_state = ALLCOMPLETE;
2142         inodedep->id_nlinkdelta = 0;
2143         inodedep->id_savedino1 = NULL;
2144         inodedep->id_savedsize = -1;
2145         inodedep->id_savedextsize = -1;
2146         inodedep->id_savednlink = -1;
2147         inodedep->id_bmsafemap = NULL;
2148         inodedep->id_mkdiradd = NULL;
2149         LIST_INIT(&inodedep->id_dirremhd);
2150         LIST_INIT(&inodedep->id_pendinghd);
2151         LIST_INIT(&inodedep->id_inowait);
2152         LIST_INIT(&inodedep->id_bufwait);
2153         TAILQ_INIT(&inodedep->id_inoreflst);
2154         TAILQ_INIT(&inodedep->id_inoupdt);
2155         TAILQ_INIT(&inodedep->id_newinoupdt);
2156         TAILQ_INIT(&inodedep->id_extupdt);
2157         TAILQ_INIT(&inodedep->id_newextupdt);
2158         TAILQ_INIT(&inodedep->id_freeblklst);
2159         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2160         *inodedeppp = inodedep;
2161         return (0);
2162 }
2163
/*
 * Structures and routines associated with newblk caching.
 *
 * NEWBLK_HASH evaluates to the address of the hash chain head selected
 * by masking its second argument with the mount's newblk_hash_size.
 */
#define NEWBLK_HASH(ump, inum)                                          \
        (&((ump)->newblk_hashtbl[(inum) & ((ump)->newblk_hash_size)]))
2169
2170 static int
2171 newblk_find(newblkhd, newblkno, flags, newblkpp)
2172         struct newblk_hashhead *newblkhd;
2173         ufs2_daddr_t newblkno;
2174         int flags;
2175         struct newblk **newblkpp;
2176 {
2177         struct newblk *newblk;
2178
2179         LIST_FOREACH(newblk, newblkhd, nb_hash) {
2180                 if (newblkno != newblk->nb_newblkno)
2181                         continue;
2182                 /*
2183                  * If we're creating a new dependency don't match those that
2184                  * have already been converted to allocdirects.  This is for
2185                  * a frag extend.
2186                  */
2187                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2188                         continue;
2189                 break;
2190         }
2191         if (newblk) {
2192                 *newblkpp = newblk;
2193                 return (1);
2194         }
2195         *newblkpp = NULL;
2196         return (0);
2197 }
2198
2199 /*
2200  * Look up a newblk. Return 1 if found, 0 if not found.
2201  * If not found, allocate if DEPALLOC flag is passed.
2202  * Found or allocated entry is returned in newblkpp.
2203  */
2204 static int
2205 newblk_lookup(mp, newblkno, flags, newblkpp)
2206         struct mount *mp;
2207         ufs2_daddr_t newblkno;
2208         int flags;
2209         struct newblk **newblkpp;
2210 {
2211         struct newblk *newblk;
2212         struct newblk_hashhead *newblkhd;
2213         struct ufsmount *ump;
2214
2215         ump = VFSTOUFS(mp);
2216         LOCK_OWNED(ump);
2217         newblkhd = NEWBLK_HASH(ump, newblkno);
2218         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2219                 return (1);
2220         if ((flags & DEPALLOC) == 0)
2221                 return (0);
2222         FREE_LOCK(ump);
2223         newblk = malloc(sizeof(union allblk), M_NEWBLK,
2224             M_SOFTDEP_FLAGS | M_ZERO);
2225         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2226         ACQUIRE_LOCK(ump);
2227         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2228                 WORKITEM_FREE(newblk, D_NEWBLK);
2229                 return (1);
2230         }
2231         newblk->nb_freefrag = NULL;
2232         LIST_INIT(&newblk->nb_indirdeps);
2233         LIST_INIT(&newblk->nb_newdirblk);
2234         LIST_INIT(&newblk->nb_jwork);
2235         newblk->nb_state = ATTACHED;
2236         newblk->nb_newblkno = newblkno;
2237         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2238         *newblkpp = newblk;
2239         return (0);
2240 }
2241
/*
 * Structures and routines associated with freed indirect block caching.
 *
 * INDIR_HASH evaluates to the address of the hash queue head for a
 * block number, selected by masking the number with the mount's
 * indir_hash_size.
 */
#define INDIR_HASH(ump, blkno)                                          \
        (&((ump)->indir_hashtbl[(blkno) & ((ump)->indir_hash_size)]))
2247
2248 /*
2249  * Lookup an indirect block in the indir hash table.  The freework is
2250  * removed and potentially freed.  The caller must do a blocking journal
2251  * write before writing to the blkno.
2252  */
2253 static int
2254 indirblk_lookup(mp, blkno)
2255         struct mount *mp;
2256         ufs2_daddr_t blkno;
2257 {
2258         struct freework *freework;
2259         struct indir_hashhead *wkhd;
2260         struct ufsmount *ump;
2261
2262         ump = VFSTOUFS(mp);
2263         wkhd = INDIR_HASH(ump, blkno);
2264         TAILQ_FOREACH(freework, wkhd, fw_next) {
2265                 if (freework->fw_blkno != blkno)
2266                         continue;
2267                 indirblk_remove(freework);
2268                 return (1);
2269         }
2270         return (0);
2271 }
2272
2273 /*
2274  * Insert an indirect block represented by freework into the indirblk
2275  * hash table so that it may prevent the block from being re-used prior
2276  * to the journal being written.
2277  */
2278 static void
2279 indirblk_insert(freework)
2280         struct freework *freework;
2281 {
2282         struct jblocks *jblocks;
2283         struct jseg *jseg;
2284         struct ufsmount *ump;
2285
2286         ump = VFSTOUFS(freework->fw_list.wk_mp);
2287         jblocks = ump->softdep_jblocks;
2288         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2289         if (jseg == NULL)
2290                 return;
2291
2292         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2293         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2294             fw_next);
2295         freework->fw_state &= ~DEPCOMPLETE;
2296 }
2297
2298 static void
2299 indirblk_remove(freework)
2300         struct freework *freework;
2301 {
2302         struct ufsmount *ump;
2303
2304         ump = VFSTOUFS(freework->fw_list.wk_mp);
2305         LIST_REMOVE(freework, fw_segs);
2306         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2307         freework->fw_state |= DEPCOMPLETE;
2308         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2309                 WORKITEM_FREE(freework, D_FREEWORK);
2310 }
2311
2312 /*
2313  * Executed during filesystem system initialization before
2314  * mounting any filesystems.
2315  */
2316 void
2317 softdep_initialize()
2318 {
2319
2320         max_softdeps = desiredvnodes * 4;
2321
2322         /* initialise bioops hack */
2323         bioops.io_start = softdep_disk_io_initiation;
2324         bioops.io_complete = softdep_disk_write_complete;
2325         bioops.io_deallocate = softdep_deallocate_dependencies;
2326         bioops.io_countdeps = softdep_count_dependencies;
2327
2328         /* Initialize the callout with an mtx. */
2329         callout_init_mtx(&softdep_callout, &lk, 0);
2330 }
2331
2332 /*
2333  * Executed after all filesystems have been unmounted during
2334  * filesystem module unload.
2335  */
2336 void
2337 softdep_uninitialize()
2338 {
2339
2340         /* clear bioops hack */
2341         bioops.io_start = NULL;
2342         bioops.io_complete = NULL;
2343         bioops.io_deallocate = NULL;
2344         bioops.io_countdeps = NULL;
2345
2346         callout_drain(&softdep_callout);
2347 }
2348
2349 /*
2350  * Called at mount time to notify the dependency code that a
2351  * filesystem wishes to use it.
2352  */
2353 int
2354 softdep_mount(devvp, mp, fs, cred)
2355         struct vnode *devvp;
2356         struct mount *mp;
2357         struct fs *fs;
2358         struct ucred *cred;
2359 {
2360         struct csum_total cstotal;
2361         struct mount_softdeps *sdp;
2362         struct ufsmount *ump;
2363         struct cg *cgp;
2364         struct buf *bp;
2365         int i, error, cyl;
2366
2367         sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2368             M_WAITOK | M_ZERO);
2369         MNT_ILOCK(mp);
2370         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2371         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2372                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2373                         MNTK_SOFTDEP | MNTK_NOASYNC;
2374         }
2375         ump = VFSTOUFS(mp);
2376         ump->um_softdep = sdp;
2377         MNT_IUNLOCK(mp);
2378         LOCK_PTR(ump) = &lk;
2379         LIST_INIT(&ump->softdep_workitem_pending);
2380         LIST_INIT(&ump->softdep_journal_pending);
2381         TAILQ_INIT(&ump->softdep_unlinked);
2382         LIST_INIT(&ump->softdep_dirtycg);
2383         ump->softdep_worklist_tail = NULL;
2384         ump->softdep_on_worklist = 0;
2385         ump->softdep_deps = 0;
2386         LIST_INIT(&ump->softdep_mkdirlisthd);
2387         ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2388             &ump->pagedep_hash_size);
2389         ump->pagedep_nextclean = 0;
2390         ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2391             &ump->inodedep_hash_size);
2392         ump->inodedep_nextclean = 0;
2393         ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2394             &ump->newblk_hash_size);
2395         ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2396             &ump->bmsafemap_hash_size);
2397         i = 1 << (ffs(desiredvnodes / 10) - 1);
2398         ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2399             M_FREEWORK, M_WAITOK);
2400         ump->indir_hash_size = i - 1;
2401         for (i = 0; i <= ump->indir_hash_size; i++)
2402                 TAILQ_INIT(&ump->indir_hashtbl[i]);
2403         if ((fs->fs_flags & FS_SUJ) &&
2404             (error = journal_mount(mp, fs, cred)) != 0) {
2405                 printf("Failed to start journal: %d\n", error);
2406                 softdep_unmount(mp);
2407                 return (error);
2408         }
2409         atomic_add_int(&stat_softdep_mounts, 1);
2410         /*
2411          * When doing soft updates, the counters in the
2412          * superblock may have gotten out of sync. Recomputation
2413          * can take a long time and can be deferred for background
2414          * fsck.  However, the old behavior of scanning the cylinder
2415          * groups and recalculating them at mount time is available
2416          * by setting vfs.ffs.compute_summary_at_mount to one.
2417          */
2418         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2419                 return (0);
2420         bzero(&cstotal, sizeof cstotal);
2421         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2422                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2423                     fs->fs_cgsize, cred, &bp)) != 0) {
2424                         brelse(bp);
2425                         softdep_unmount(mp);
2426                         return (error);
2427                 }
2428                 cgp = (struct cg *)bp->b_data;
2429                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2430                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2431                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2432                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2433                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
2434                 brelse(bp);
2435         }
2436 #ifdef DEBUG
2437         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2438                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2439 #endif
2440         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2441         return (0);
2442 }
2443
2444 void
2445 softdep_unmount(mp)
2446         struct mount *mp;
2447 {
2448         struct ufsmount *ump;
2449 #ifdef INVARIANTS
2450         int i;
2451 #endif
2452
2453         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2454             ("softdep_unmount called on non-softdep filesystem"));
2455         ump = VFSTOUFS(mp);
2456         MNT_ILOCK(mp);
2457         mp->mnt_flag &= ~MNT_SOFTDEP;
2458         if (MOUNTEDSUJ(mp) == 0) {
2459                 MNT_IUNLOCK(mp);
2460         } else {
2461                 mp->mnt_flag &= ~MNT_SUJ;
2462                 MNT_IUNLOCK(mp);
2463                 journal_unmount(ump);
2464         }
2465         atomic_subtract_int(&stat_softdep_mounts, 1);
2466         hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2467         hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2468         hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2469         hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2470             ump->bmsafemap_hash_size);
2471         free(ump->indir_hashtbl, M_FREEWORK);
2472 #ifdef INVARIANTS
2473         for (i = 0; i <= D_LAST; i++)
2474                 KASSERT(ump->softdep_curdeps[i] == 0,
2475                     ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2476                     TYPENAME(i), ump->softdep_curdeps[i]));
2477 #endif
2478         free(ump->um_softdep, M_MOUNTDATA);
2479 }
2480
2481 static struct jblocks *
2482 jblocks_create(void)
2483 {
2484         struct jblocks *jblocks;
2485
2486         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2487         TAILQ_INIT(&jblocks->jb_segs);
2488         jblocks->jb_avail = 10;
2489         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2490             M_JBLOCKS, M_WAITOK | M_ZERO);
2491
2492         return (jblocks);
2493 }
2494
2495 static ufs2_daddr_t
2496 jblocks_alloc(jblocks, bytes, actual)
2497         struct jblocks *jblocks;
2498         int bytes;
2499         int *actual;
2500 {
2501         ufs2_daddr_t daddr;
2502         struct jextent *jext;
2503         int freecnt;
2504         int blocks;
2505
2506         blocks = bytes / DEV_BSIZE;
2507         jext = &jblocks->jb_extent[jblocks->jb_head];
2508         freecnt = jext->je_blocks - jblocks->jb_off;
2509         if (freecnt == 0) {
2510                 jblocks->jb_off = 0;
2511                 if (++jblocks->jb_head > jblocks->jb_used)
2512                         jblocks->jb_head = 0;
2513                 jext = &jblocks->jb_extent[jblocks->jb_head];
2514                 freecnt = jext->je_blocks;
2515         }
2516         if (freecnt > blocks)
2517                 freecnt = blocks;
2518         *actual = freecnt * DEV_BSIZE;
2519         daddr = jext->je_daddr + jblocks->jb_off;
2520         jblocks->jb_off += freecnt;
2521         jblocks->jb_free -= freecnt;
2522
2523         return (daddr);
2524 }
2525
2526 static void
2527 jblocks_free(jblocks, mp, bytes)
2528         struct jblocks *jblocks;
2529         struct mount *mp;
2530         int bytes;
2531 {
2532
2533         LOCK_OWNED(VFSTOUFS(mp));
2534         jblocks->jb_free += bytes / DEV_BSIZE;
2535         if (jblocks->jb_suspended)
2536                 worklist_speedup(mp);
2537         wakeup(jblocks);
2538 }
2539
2540 static void
2541 jblocks_destroy(jblocks)
2542         struct jblocks *jblocks;
2543 {
2544
2545         if (jblocks->jb_extent)
2546                 free(jblocks->jb_extent, M_JBLOCKS);
2547         free(jblocks, M_JBLOCKS);
2548 }
2549
2550 static void
2551 jblocks_add(jblocks, daddr, blocks)
2552         struct jblocks *jblocks;
2553         ufs2_daddr_t daddr;
2554         int blocks;
2555 {
2556         struct jextent *jext;
2557
2558         jblocks->jb_blocks += blocks;
2559         jblocks->jb_free += blocks;
2560         jext = &jblocks->jb_extent[jblocks->jb_used];
2561         /* Adding the first block. */
2562         if (jext->je_daddr == 0) {
2563                 jext->je_daddr = daddr;
2564                 jext->je_blocks = blocks;
2565                 return;
2566         }
2567         /* Extending the last extent. */
2568         if (jext->je_daddr + jext->je_blocks == daddr) {
2569                 jext->je_blocks += blocks;
2570                 return;
2571         }
2572         /* Adding a new extent. */
2573         if (++jblocks->jb_used == jblocks->jb_avail) {
2574                 jblocks->jb_avail *= 2;
2575                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2576                     M_JBLOCKS, M_WAITOK | M_ZERO);
2577                 memcpy(jext, jblocks->jb_extent,
2578                     sizeof(struct jextent) * jblocks->jb_used);
2579                 free(jblocks->jb_extent, M_JBLOCKS);
2580                 jblocks->jb_extent = jext;
2581         }
2582         jext = &jblocks->jb_extent[jblocks->jb_used];
2583         jext->je_daddr = daddr;
2584         jext->je_blocks = blocks;
2585         return;
2586 }
2587
2588 int
2589 softdep_journal_lookup(mp, vpp)
2590         struct mount *mp;
2591         struct vnode **vpp;
2592 {
2593         struct componentname cnp;
2594         struct vnode *dvp;
2595         ino_t sujournal;
2596         int error;
2597
2598         error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2599         if (error)
2600                 return (error);
2601         bzero(&cnp, sizeof(cnp));
2602         cnp.cn_nameiop = LOOKUP;
2603         cnp.cn_flags = ISLASTCN;
2604         cnp.cn_thread = curthread;
2605         cnp.cn_cred = curthread->td_ucred;
2606         cnp.cn_pnbuf = SUJ_FILE;
2607         cnp.cn_nameptr = SUJ_FILE;
2608         cnp.cn_namelen = strlen(SUJ_FILE);
2609         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2610         vput(dvp);
2611         if (error != 0)
2612                 return (error);
2613         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2614         return (error);
2615 }
2616
2617 /*
2618  * Open and verify the journal file.
2619  */
2620 static int
2621 journal_mount(mp, fs, cred)
2622         struct mount *mp;
2623         struct fs *fs;
2624         struct ucred *cred;
2625 {
2626         struct jblocks *jblocks;
2627         struct ufsmount *ump;
2628         struct vnode *vp;
2629         struct inode *ip;
2630         ufs2_daddr_t blkno;
2631         int bcount;
2632         int error;
2633         int i;
2634
2635         ump = VFSTOUFS(mp);
2636         ump->softdep_journal_tail = NULL;
2637         ump->softdep_on_journal = 0;
2638         ump->softdep_accdeps = 0;
2639         ump->softdep_req = 0;
2640         ump->softdep_jblocks = NULL;
2641         error = softdep_journal_lookup(mp, &vp);
2642         if (error != 0) {
2643                 printf("Failed to find journal.  Use tunefs to create one\n");
2644                 return (error);
2645         }
2646         ip = VTOI(vp);
2647         if (ip->i_size < SUJ_MIN) {
2648                 error = ENOSPC;
2649                 goto out;
2650         }
2651         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
2652         jblocks = jblocks_create();
2653         for (i = 0; i < bcount; i++) {
2654                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2655                 if (error)
2656                         break;
2657                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2658         }
2659         if (error) {
2660                 jblocks_destroy(jblocks);
2661                 goto out;
2662         }
2663         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
2664         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2665         ump->softdep_jblocks = jblocks;
2666 out:
2667         if (error == 0) {
2668                 MNT_ILOCK(mp);
2669                 mp->mnt_flag |= MNT_SUJ;
2670                 mp->mnt_flag &= ~MNT_SOFTDEP;
2671                 MNT_IUNLOCK(mp);
2672                 /*
2673                  * Only validate the journal contents if the
2674                  * filesystem is clean, otherwise we write the logs
2675                  * but they'll never be used.  If the filesystem was
2676                  * still dirty when we mounted it the journal is
2677                  * invalid and a new journal can only be valid if it
2678                  * starts from a clean mount.
2679                  */
2680                 if (fs->fs_clean) {
2681                         DIP_SET(ip, i_modrev, fs->fs_mtime);
2682                         ip->i_flags |= IN_MODIFIED;
2683                         ffs_update(vp, 1);
2684                 }
2685         }
2686         vput(vp);
2687         return (error);
2688 }
2689
2690 static void
2691 journal_unmount(ump)
2692         struct ufsmount *ump;
2693 {
2694
2695         if (ump->softdep_jblocks)
2696                 jblocks_destroy(ump->softdep_jblocks);
2697         ump->softdep_jblocks = NULL;
2698 }
2699
2700 /*
2701  * Called when a journal record is ready to be written.  Space is allocated
2702  * and the journal entry is created when the journal is flushed to stable
2703  * store.
2704  */
2705 static void
2706 add_to_journal(wk)
2707         struct worklist *wk;
2708 {
2709         struct ufsmount *ump;
2710
2711         ump = VFSTOUFS(wk->wk_mp);
2712         LOCK_OWNED(ump);
2713         if (wk->wk_state & ONWORKLIST)
2714                 panic("add_to_journal: %s(0x%X) already on list",
2715                     TYPENAME(wk->wk_type), wk->wk_state);
2716         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2717         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2718                 ump->softdep_jblocks->jb_age = ticks;
2719                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2720         } else
2721                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2722         ump->softdep_journal_tail = wk;
2723         ump->softdep_on_journal += 1;
2724 }
2725
2726 /*
2727  * Remove an arbitrary item for the journal worklist maintain the tail
2728  * pointer.  This happens when a new operation obviates the need to
2729  * journal an old operation.
2730  */
2731 static void
2732 remove_from_journal(wk)
2733         struct worklist *wk;
2734 {
2735         struct ufsmount *ump;
2736
2737         ump = VFSTOUFS(wk->wk_mp);
2738         LOCK_OWNED(ump);
2739 #ifdef SUJ_DEBUG
2740         {
2741                 struct worklist *wkn;
2742
2743                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2744                         if (wkn == wk)
2745                                 break;
2746                 if (wkn == NULL)
2747                         panic("remove_from_journal: %p is not in journal", wk);
2748         }
2749 #endif
2750         /*
2751          * We emulate a TAILQ to save space in most structures which do not
2752          * require TAILQ semantics.  Here we must update the tail position
2753          * when removing the tail which is not the final entry. This works
2754          * only if the worklist linkage are at the beginning of the structure.
2755          */
2756         if (ump->softdep_journal_tail == wk)
2757                 ump->softdep_journal_tail =
2758                     (struct worklist *)wk->wk_list.le_prev;
2759
2760         WORKLIST_REMOVE(wk);
2761         ump->softdep_on_journal -= 1;
2762 }
2763
2764 /*
2765  * Check for journal space as well as dependency limits so the prelink
2766  * code can throttle both journaled and non-journaled filesystems.
2767  * Threshold is 0 for low and 1 for min.
2768  */
2769 static int
2770 journal_space(ump, thresh)
2771         struct ufsmount *ump;
2772         int thresh;
2773 {
2774         struct jblocks *jblocks;
2775         int limit, avail;
2776
2777         jblocks = ump->softdep_jblocks;
2778         if (jblocks == NULL)
2779                 return (1);
2780         /*
2781          * We use a tighter restriction here to prevent request_cleanup()
2782          * running in threads from running into locks we currently hold.
2783          * We have to be over the limit and our filesystem has to be
2784          * responsible for more than our share of that usage.
2785          */
2786         limit = (max_softdeps / 10) * 9;
2787         if (dep_current[D_INODEDEP] > limit &&
2788             ump->softdep_curdeps[D_INODEDEP] > limit / stat_softdep_mounts)
2789                 return (0);
2790         if (thresh)
2791                 thresh = jblocks->jb_min;
2792         else
2793                 thresh = jblocks->jb_low;
2794         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2795         avail = jblocks->jb_free - avail;
2796
2797         return (avail > thresh);
2798 }
2799
2800 static void
2801 journal_suspend(ump)
2802         struct ufsmount *ump;
2803 {
2804         struct jblocks *jblocks;
2805         struct mount *mp;
2806
2807         mp = UFSTOVFS(ump);
2808         jblocks = ump->softdep_jblocks;
2809         MNT_ILOCK(mp);
2810         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2811                 stat_journal_min++;
2812                 mp->mnt_kern_flag |= MNTK_SUSPEND;
2813                 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc);
2814         }
2815         jblocks->jb_suspended = 1;
2816         MNT_IUNLOCK(mp);
2817 }
2818
2819 static int
2820 journal_unsuspend(struct ufsmount *ump)
2821 {
2822         struct jblocks *jblocks;
2823         struct mount *mp;
2824
2825         mp = UFSTOVFS(ump);
2826         jblocks = ump->softdep_jblocks;
2827
2828         if (jblocks != NULL && jblocks->jb_suspended &&
2829             journal_space(ump, jblocks->jb_min)) {
2830                 jblocks->jb_suspended = 0;
2831                 FREE_LOCK(ump);
2832                 mp->mnt_susp_owner = curthread;
2833                 vfs_write_resume(mp, 0);
2834                 ACQUIRE_LOCK(ump);
2835                 return (1);
2836         }
2837         return (0);
2838 }
2839
2840 /*
2841  * Called before any allocation function to be certain that there is
2842  * sufficient space in the journal prior to creating any new records.
2843  * Since in the case of block allocation we may have multiple locked
2844  * buffers at the time of the actual allocation we can not block
2845  * when the journal records are created.  Doing so would create a deadlock
2846  * if any of these buffers needed to be flushed to reclaim space.  Instead
2847  * we require a sufficiently large amount of available space such that
2848  * each thread in the system could have passed this allocation check and
2849  * still have sufficient free space.  With 20% of a minimum journal size
2850  * of 1MB we have 6553 records available.
2851  */
2852 int
2853 softdep_prealloc(vp, waitok)
2854         struct vnode *vp;
2855         int waitok;
2856 {
2857         struct ufsmount *ump;
2858
2859         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2860             ("softdep_prealloc called on non-softdep filesystem"));
2861         /*
2862          * Nothing to do if we are not running journaled soft updates.
2863          * If we currently hold the snapshot lock, we must avoid handling
2864          * other resources that could cause deadlock.
2865          */
2866         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2867                 return (0);
2868         ump = VFSTOUFS(vp->v_mount);
2869         ACQUIRE_LOCK(ump);
2870         if (journal_space(ump, 0)) {
2871                 FREE_LOCK(ump);
2872                 return (0);
2873         }
2874         stat_journal_low++;
2875         FREE_LOCK(ump);
2876         if (waitok == MNT_NOWAIT)
2877                 return (ENOSPC);
2878         /*
2879          * Attempt to sync this vnode once to flush any journal
2880          * work attached to it.
2881          */
2882         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
2883                 ffs_syncvnode(vp, waitok, 0);
2884         ACQUIRE_LOCK(ump);
2885         process_removes(vp);
2886         process_truncates(vp);
2887         if (journal_space(ump, 0) == 0) {
2888                 softdep_speedup();
2889                 if (journal_space(ump, 1) == 0)
2890                         journal_suspend(ump);
2891         }
2892         FREE_LOCK(ump);
2893
2894         return (0);
2895 }
2896
2897 /*
2898  * Before adjusting a link count on a vnode verify that we have sufficient
2899  * journal space.  If not, process operations that depend on the currently
2900  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2901  * and softdep flush threads can not acquire these locks to reclaim space.
2902  */
2903 static void
2904 softdep_prelink(dvp, vp)
2905         struct vnode *dvp;
2906         struct vnode *vp;
2907 {
2908         struct ufsmount *ump;
2909
2910         ump = VFSTOUFS(dvp->v_mount);
2911         LOCK_OWNED(ump);
2912         /*
2913          * Nothing to do if we have sufficient journal space.
2914          * If we currently hold the snapshot lock, we must avoid
2915          * handling other resources that could cause deadlock.
2916          */
2917         if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
2918                 return;
2919         stat_journal_low++;
2920         FREE_LOCK(ump);
2921         if (vp)
2922                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
2923         ffs_syncvnode(dvp, MNT_WAIT, 0);
2924         ACQUIRE_LOCK(ump);
2925         /* Process vp before dvp as it may create .. removes. */
2926         if (vp) {
2927                 process_removes(vp);
2928                 process_truncates(vp);
2929         }
2930         process_removes(dvp);
2931         process_truncates(dvp);
2932         softdep_speedup();
2933         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
2934         if (journal_space(ump, 0) == 0) {
2935                 softdep_speedup();
2936                 if (journal_space(ump, 1) == 0)
2937                         journal_suspend(ump);
2938         }
2939 }
2940
2941 static void
2942 jseg_write(ump, jseg, data)
2943         struct ufsmount *ump;
2944         struct jseg *jseg;
2945         uint8_t *data;
2946 {
2947         struct jsegrec *rec;
2948
2949         rec = (struct jsegrec *)data;
2950         rec->jsr_seq = jseg->js_seq;
2951         rec->jsr_oldest = jseg->js_oldseq;
2952         rec->jsr_cnt = jseg->js_cnt;
2953         rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
2954         rec->jsr_crc = 0;
2955         rec->jsr_time = ump->um_fs->fs_mtime;
2956 }
2957
2958 static inline void
2959 inoref_write(inoref, jseg, rec)
2960         struct inoref *inoref;
2961         struct jseg *jseg;
2962         struct jrefrec *rec;
2963 {
2964
2965         inoref->if_jsegdep->jd_seg = jseg;
2966         rec->jr_ino = inoref->if_ino;
2967         rec->jr_parent = inoref->if_parent;
2968         rec->jr_nlink = inoref->if_nlink;
2969         rec->jr_mode = inoref->if_mode;
2970         rec->jr_diroff = inoref->if_diroff;
2971 }
2972
2973 static void
2974 jaddref_write(jaddref, jseg, data)
2975         struct jaddref *jaddref;
2976         struct jseg *jseg;
2977         uint8_t *data;
2978 {
2979         struct jrefrec *rec;
2980
2981         rec = (struct jrefrec *)data;
2982         rec->jr_op = JOP_ADDREF;
2983         inoref_write(&jaddref->ja_ref, jseg, rec);
2984 }
2985
2986 static void
2987 jremref_write(jremref, jseg, data)
2988         struct jremref *jremref;
2989         struct jseg *jseg;
2990         uint8_t *data;
2991 {
2992         struct jrefrec *rec;
2993
2994         rec = (struct jrefrec *)data;
2995         rec->jr_op = JOP_REMREF;
2996         inoref_write(&jremref->jr_ref, jseg, rec);
2997 }
2998
2999 static void
3000 jmvref_write(jmvref, jseg, data)
3001         struct jmvref *jmvref;
3002         struct jseg *jseg;
3003         uint8_t *data;
3004 {
3005         struct jmvrec *rec;
3006
3007         rec = (struct jmvrec *)data;
3008         rec->jm_op = JOP_MVREF;
3009         rec->jm_ino = jmvref->jm_ino;
3010         rec->jm_parent = jmvref->jm_parent;
3011         rec->jm_oldoff = jmvref->jm_oldoff;
3012         rec->jm_newoff = jmvref->jm_newoff;
3013 }
3014
3015 static void
3016 jnewblk_write(jnewblk, jseg, data)
3017         struct jnewblk *jnewblk;
3018         struct jseg *jseg;
3019         uint8_t *data;
3020 {
3021         struct jblkrec *rec;
3022
3023         jnewblk->jn_jsegdep->jd_seg = jseg;
3024         rec = (struct jblkrec *)data;
3025         rec->jb_op = JOP_NEWBLK;
3026         rec->jb_ino = jnewblk->jn_ino;
3027         rec->jb_blkno = jnewblk->jn_blkno;
3028         rec->jb_lbn = jnewblk->jn_lbn;
3029         rec->jb_frags = jnewblk->jn_frags;
3030         rec->jb_oldfrags = jnewblk->jn_oldfrags;
3031 }
3032
3033 static void
3034 jfreeblk_write(jfreeblk, jseg, data)
3035         struct jfreeblk *jfreeblk;
3036         struct jseg *jseg;
3037         uint8_t *data;
3038 {
3039         struct jblkrec *rec;
3040
3041         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3042         rec = (struct jblkrec *)data;
3043         rec->jb_op = JOP_FREEBLK;
3044         rec->jb_ino = jfreeblk->jf_ino;
3045         rec->jb_blkno = jfreeblk->jf_blkno;
3046         rec->jb_lbn = jfreeblk->jf_lbn;
3047         rec->jb_frags = jfreeblk->jf_frags;
3048         rec->jb_oldfrags = 0;
3049 }
3050
3051 static void
3052 jfreefrag_write(jfreefrag, jseg, data)
3053         struct jfreefrag *jfreefrag;
3054         struct jseg *jseg;
3055         uint8_t *data;
3056 {
3057         struct jblkrec *rec;
3058
3059         jfreefrag->fr_jsegdep->jd_seg = jseg;
3060         rec = (struct jblkrec *)data;
3061         rec->jb_op = JOP_FREEBLK;
3062         rec->jb_ino = jfreefrag->fr_ino;
3063         rec->jb_blkno = jfreefrag->fr_blkno;
3064         rec->jb_lbn = jfreefrag->fr_lbn;
3065         rec->jb_frags = jfreefrag->fr_frags;
3066         rec->jb_oldfrags = 0;
3067 }
3068
3069 static void
3070 jtrunc_write(jtrunc, jseg, data)
3071         struct jtrunc *jtrunc;
3072         struct jseg *jseg;
3073         uint8_t *data;
3074 {
3075         struct jtrncrec *rec;
3076
3077         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3078         rec = (struct jtrncrec *)data;
3079         rec->jt_op = JOP_TRUNC;
3080         rec->jt_ino = jtrunc->jt_ino;
3081         rec->jt_size = jtrunc->jt_size;
3082         rec->jt_extsize = jtrunc->jt_extsize;
3083 }
3084
3085 static void
3086 jfsync_write(jfsync, jseg, data)
3087         struct jfsync *jfsync;
3088         struct jseg *jseg;
3089         uint8_t *data;
3090 {
3091         struct jtrncrec *rec;
3092
3093         rec = (struct jtrncrec *)data;
3094         rec->jt_op = JOP_SYNC;
3095         rec->jt_ino = jfsync->jfs_ino;
3096         rec->jt_size = jfsync->jfs_size;
3097         rec->jt_extsize = jfsync->jfs_extsize;
3098 }
3099
3100 static void
3101 softdep_flushjournal(mp)
3102         struct mount *mp;
3103 {
3104         struct jblocks *jblocks;
3105         struct ufsmount *ump;
3106
3107         if (MOUNTEDSUJ(mp) == 0)
3108                 return;
3109         ump = VFSTOUFS(mp);
3110         jblocks = ump->softdep_jblocks;
3111         ACQUIRE_LOCK(ump);
3112         while (ump->softdep_on_journal) {
3113                 jblocks->jb_needseg = 1;
3114                 softdep_process_journal(mp, NULL, MNT_WAIT);
3115         }
3116         FREE_LOCK(ump);
3117 }
3118
3119 static void softdep_synchronize_completed(struct bio *);
3120 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3121
3122 static void
3123 softdep_synchronize_completed(bp)
3124         struct bio *bp;
3125 {
3126         struct jseg *oldest;
3127         struct jseg *jseg;
3128         struct ufsmount *ump;
3129
3130         /*
3131          * caller1 marks the last segment written before we issued the
3132          * synchronize cache.
3133          */
3134         jseg = bp->bio_caller1;
3135         if (jseg == NULL) {
3136                 g_destroy_bio(bp);
3137                 return;
3138         }
3139         ump = VFSTOUFS(jseg->js_list.wk_mp);
3140         ACQUIRE_LOCK(ump);
3141         oldest = NULL;
3142         /*
3143          * Mark all the journal entries waiting on the synchronize cache
3144          * as completed so they may continue on.
3145          */
3146         while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3147                 jseg->js_state |= COMPLETE;
3148                 oldest = jseg;
3149                 jseg = TAILQ_PREV(jseg, jseglst, js_next);
3150         }
3151         /*
3152          * Restart deferred journal entry processing from the oldest
3153          * completed jseg.
3154          */
3155         if (oldest)
3156                 complete_jsegs(oldest);
3157
3158         FREE_LOCK(ump);
3159         g_destroy_bio(bp);
3160 }
3161
3162 /*
3163  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3164  * barriers.  The journal must be written prior to any blocks that depend
3165  * on it and the journal can not be released until the blocks have be
3166  * written.  This code handles both barriers simultaneously.
3167  */
3168 static void
3169 softdep_synchronize(bp, ump, caller1)
3170         struct bio *bp;
3171         struct ufsmount *ump;
3172         void *caller1;
3173 {
3174
3175         bp->bio_cmd = BIO_FLUSH;
3176         bp->bio_flags |= BIO_ORDERED;
3177         bp->bio_data = NULL;
3178         bp->bio_offset = ump->um_cp->provider->mediasize;
3179         bp->bio_length = 0;
3180         bp->bio_done = softdep_synchronize_completed;
3181         bp->bio_caller1 = caller1;
3182         g_io_request(bp,
3183             (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3184 }
3185
3186 /*
3187  * Flush some journal records to disk.
3188  */
3189 static void
3190 softdep_process_journal(mp, needwk, flags)
3191         struct mount *mp;
3192         struct worklist *needwk;
3193         int flags;
3194 {
3195         struct jblocks *jblocks;
3196         struct ufsmount *ump;
3197         struct worklist *wk;
3198         struct jseg *jseg;
3199         struct buf *bp;
3200         struct bio *bio;
3201         uint8_t *data;
3202         struct fs *fs;
3203         int shouldflush;
3204         int segwritten;
3205         int jrecmin;    /* Minimum records per block. */
3206         int jrecmax;    /* Maximum records per block. */
3207         int size;
3208         int cnt;
3209         int off;
3210         int devbsize;
3211
3212         if (MOUNTEDSUJ(mp) == 0)
3213                 return;
3214         shouldflush = softdep_flushcache;
3215         bio = NULL;
3216         jseg = NULL;
3217         ump = VFSTOUFS(mp);
3218         LOCK_OWNED(ump);
3219         fs = ump->um_fs;
3220         jblocks = ump->softdep_jblocks;
3221         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3222         /*
3223          * We write anywhere between a disk block and fs block.  The upper
3224          * bound is picked to prevent buffer cache fragmentation and limit
3225          * processing time per I/O.
3226          */
3227         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3228         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3229         segwritten = 0;
3230         for (;;) {
3231                 cnt = ump->softdep_on_journal;
3232                 /*
3233                  * Criteria for writing a segment:
3234                  * 1) We have a full block.
3235                  * 2) We're called from jwait() and haven't found the
3236                  *    journal item yet.
3237                  * 3) Always write if needseg is set.
3238                  * 4) If we are called from process_worklist and have
3239                  *    not yet written anything we write a partial block
3240                  *    to enforce a 1 second maximum latency on journal
3241                  *    entries.
3242                  */
3243                 if (cnt < (jrecmax - 1) && needwk == NULL &&
3244                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3245                         break;
3246                 cnt++;
3247                 /*
3248                  * Verify some free journal space.  softdep_prealloc() should
3249                  * guarantee that we don't run out so this is indicative of
3250                  * a problem with the flow control.  Try to recover
3251                  * gracefully in any event.
3252                  */
3253                 while (jblocks->jb_free == 0) {
3254                         if (flags != MNT_WAIT)
3255                                 break;
3256                         printf("softdep: Out of journal space!\n");
3257                         softdep_speedup();
3258                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3259                 }
3260                 FREE_LOCK(ump);
3261                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3262                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
3263                 LIST_INIT(&jseg->js_entries);
3264                 LIST_INIT(&jseg->js_indirs);
3265                 jseg->js_state = ATTACHED;
3266                 if (shouldflush == 0)
3267                         jseg->js_state |= COMPLETE;
3268                 else if (bio == NULL)
3269                         bio = g_alloc_bio();
3270                 jseg->js_jblocks = jblocks;
3271                 bp = geteblk(fs->fs_bsize, 0);
3272                 ACQUIRE_LOCK(ump);
3273                 /*
3274                  * If there was a race while we were allocating the block
3275                  * and jseg the entry we care about was likely written.
3276                  * We bail out in both the WAIT and NOWAIT case and assume
3277                  * the caller will loop if the entry it cares about is
3278                  * not written.
3279                  */
3280                 cnt = ump->softdep_on_journal;
3281                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3282                         bp->b_flags |= B_INVAL | B_NOCACHE;
3283                         WORKITEM_FREE(jseg, D_JSEG);
3284                         FREE_LOCK(ump);
3285                         brelse(bp);
3286                         ACQUIRE_LOCK(ump);
3287                         break;
3288                 }
3289                 /*
3290                  * Calculate the disk block size required for the available
3291                  * records rounded to the min size.
3292                  */
3293                 if (cnt == 0)
3294                         size = devbsize;
3295                 else if (cnt < jrecmax)
3296                         size = howmany(cnt, jrecmin) * devbsize;
3297                 else
3298                         size = fs->fs_bsize;
3299                 /*
3300                  * Allocate a disk block for this journal data and account
3301                  * for truncation of the requested size if enough contiguous
3302                  * space was not available.
3303                  */
3304                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3305                 bp->b_lblkno = bp->b_blkno;
3306                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
3307                 bp->b_bcount = size;
3308                 bp->b_flags &= ~B_INVAL;
3309                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3310                 /*
3311                  * Initialize our jseg with cnt records.  Assign the next
3312                  * sequence number to it and link it in-order.
3313                  */
3314                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
3315                 jseg->js_buf = bp;
3316                 jseg->js_cnt = cnt;
3317                 jseg->js_refs = cnt + 1;        /* Self ref. */
3318                 jseg->js_size = size;
3319                 jseg->js_seq = jblocks->jb_nextseq++;
3320                 if (jblocks->jb_oldestseg == NULL)
3321                         jblocks->jb_oldestseg = jseg;
3322                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3323                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3324                 if (jblocks->jb_writeseg == NULL)
3325                         jblocks->jb_writeseg = jseg;
3326                 /*
3327                  * Start filling in records from the pending list.
3328                  */
3329                 data = bp->b_data;
3330                 off = 0;
3331                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3332                     != NULL) {
3333                         if (cnt == 0)
3334                                 break;
3335                         /* Place a segment header on every device block. */
3336                         if ((off % devbsize) == 0) {
3337                                 jseg_write(ump, jseg, data);
3338                                 off += JREC_SIZE;
3339                                 data = bp->b_data + off;
3340                         }
3341                         if (wk == needwk)
3342                                 needwk = NULL;
3343                         remove_from_journal(wk);
3344                         wk->wk_state |= INPROGRESS;
3345                         WORKLIST_INSERT(&jseg->js_entries, wk);
3346                         switch (wk->wk_type) {
3347                         case D_JADDREF:
3348                                 jaddref_write(WK_JADDREF(wk), jseg, data);
3349                                 break;
3350                         case D_JREMREF:
3351                                 jremref_write(WK_JREMREF(wk), jseg, data);
3352                                 break;
3353                         case D_JMVREF:
3354                                 jmvref_write(WK_JMVREF(wk), jseg, data);
3355                                 break;
3356                         case D_JNEWBLK:
3357                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3358                                 break;
3359                         case D_JFREEBLK:
3360                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3361                                 break;
3362                         case D_JFREEFRAG:
3363                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3364                                 break;
3365                         case D_JTRUNC:
3366                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
3367                                 break;
3368                         case D_JFSYNC:
3369                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
3370                                 break;
3371                         default:
3372                                 panic("process_journal: Unknown type %s",
3373                                     TYPENAME(wk->wk_type));
3374                                 /* NOTREACHED */
3375                         }
3376                         off += JREC_SIZE;
3377                         data = bp->b_data + off;
3378                         cnt--;
3379                 }
3380                 /*
3381                  * Write this one buffer and continue.
3382                  */
3383                 segwritten = 1;
3384                 jblocks->jb_needseg = 0;
3385                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3386                 FREE_LOCK(ump);
3387                 pbgetvp(ump->um_devvp, bp);
3388                 /*
3389                  * We only do the blocking wait once we find the journal
3390                  * entry we're looking for.
3391                  */
3392                 if (needwk == NULL && flags == MNT_WAIT)
3393                         bwrite(bp);
3394                 else
3395                         bawrite(bp);
3396                 ACQUIRE_LOCK(ump);
3397         }
3398         /*
3399          * If we wrote a segment issue a synchronize cache so the journal
3400          * is reflected on disk before the data is written.  Since reclaiming
3401          * journal space also requires writing a journal record this
3402          * process also enforces a barrier before reclamation.
3403          */
3404         if (segwritten && shouldflush) {
3405                 softdep_synchronize(bio, ump,
3406                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
3407         } else if (bio)
3408                 g_destroy_bio(bio);
3409         /*
3410          * If we've suspended the filesystem because we ran out of journal
3411          * space either try to sync it here to make some progress or
3412          * unsuspend it if we already have.
3413          */
3414         if (flags == 0 && jblocks->jb_suspended) {
3415                 if (journal_unsuspend(ump))
3416                         return;
3417                 FREE_LOCK(ump);
3418                 VFS_SYNC(mp, MNT_NOWAIT);
3419                 ffs_sbupdate(ump, MNT_WAIT, 0);
3420                 ACQUIRE_LOCK(ump);
3421         }
3422 }
3423
3424 /*
3425  * Complete a jseg, allowing all dependencies awaiting journal writes
3426  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3427  * structures so that the journal segment can be freed to reclaim space.
3428  */
3429 static void
3430 complete_jseg(jseg)
3431         struct jseg *jseg;
3432 {
3433         struct worklist *wk;
3434         struct jmvref *jmvref;
3435         int waiting;
3436 #ifdef INVARIANTS
3437         int i = 0;
3438 #endif
3439
3440         while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3441                 WORKLIST_REMOVE(wk);
3442                 waiting = wk->wk_state & IOWAITING;
3443                 wk->wk_state &= ~(INPROGRESS | IOWAITING);
3444                 wk->wk_state |= COMPLETE;
3445                 KASSERT(i++ < jseg->js_cnt,
3446                     ("handle_written_jseg: overflow %d >= %d",
3447                     i - 1, jseg->js_cnt));
3448                 switch (wk->wk_type) {
3449                 case D_JADDREF:
3450                         handle_written_jaddref(WK_JADDREF(wk));
3451                         break;
3452                 case D_JREMREF:
3453                         handle_written_jremref(WK_JREMREF(wk));
3454                         break;
3455                 case D_JMVREF:
3456                         rele_jseg(jseg);        /* No jsegdep. */
3457                         jmvref = WK_JMVREF(wk);
3458                         LIST_REMOVE(jmvref, jm_deps);
3459                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3460                                 free_pagedep(jmvref->jm_pagedep);
3461                         WORKITEM_FREE(jmvref, D_JMVREF);
3462                         break;
3463                 case D_JNEWBLK:
3464                         handle_written_jnewblk(WK_JNEWBLK(wk));
3465                         break;
3466                 case D_JFREEBLK:
3467                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3468                         break;
3469                 case D_JTRUNC:
3470                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3471                         break;
3472                 case D_JFSYNC:
3473                         rele_jseg(jseg);        /* No jsegdep. */
3474                         WORKITEM_FREE(wk, D_JFSYNC);
3475                         break;
3476                 case D_JFREEFRAG:
3477                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
3478                         break;
3479                 default:
3480                         panic("handle_written_jseg: Unknown type %s",
3481                             TYPENAME(wk->wk_type));
3482                         /* NOTREACHED */
3483                 }
3484                 if (waiting)
3485                         wakeup(wk);
3486         }
3487         /* Release the self reference so the structure may be freed. */
3488         rele_jseg(jseg);
3489 }
3490
3491 /*
3492  * Determine which jsegs are ready for completion processing.  Waits for
3493  * synchronize cache to complete as well as forcing in-order completion
3494  * of journal entries.
3495  */
3496 static void
3497 complete_jsegs(jseg)
3498         struct jseg *jseg;
3499 {
3500         struct jblocks *jblocks;
3501         struct jseg *jsegn;
3502
3503         jblocks = jseg->js_jblocks;
3504         /*
3505          * Don't allow out of order completions.  If this isn't the first
3506          * block wait for it to write before we're done.
3507          */
3508         if (jseg != jblocks->jb_writeseg)
3509                 return;
3510         /* Iterate through available jsegs processing their entries. */
3511         while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3512                 jblocks->jb_oldestwrseq = jseg->js_oldseq;
3513                 jsegn = TAILQ_NEXT(jseg, js_next);
3514                 complete_jseg(jseg);
3515                 jseg = jsegn;
3516         }
3517         jblocks->jb_writeseg = jseg;
3518         /*
3519          * Attempt to free jsegs now that oldestwrseq may have advanced.
3520          */
3521         free_jsegs(jblocks);
3522 }
3523
3524 /*
3525  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3526  * the final completions.
3527  */
3528 static void
3529 handle_written_jseg(jseg, bp)
3530         struct jseg *jseg;
3531         struct buf *bp;
3532 {
3533
3534         if (jseg->js_refs == 0)
3535                 panic("handle_written_jseg: No self-reference on %p", jseg);
3536         jseg->js_state |= DEPCOMPLETE;
3537         /*
3538          * We'll never need this buffer again, set flags so it will be
3539          * discarded.
3540          */
3541         bp->b_flags |= B_INVAL | B_NOCACHE;
3542         pbrelvp(bp);
3543         complete_jsegs(jseg);
3544 }
3545
3546 static inline struct jsegdep *
3547 inoref_jseg(inoref)
3548         struct inoref *inoref;
3549 {
3550         struct jsegdep *jsegdep;
3551
3552         jsegdep = inoref->if_jsegdep;
3553         inoref->if_jsegdep = NULL;
3554
3555         return (jsegdep);
3556 }
3557
3558 /*
3559  * Called once a jremref has made it to stable store.  The jremref is marked
3560  * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3561  * for the jremref to complete will be awoken by free_jremref.
3562  */
3563 static void
3564 handle_written_jremref(jremref)
3565         struct jremref *jremref;
3566 {
3567         struct inodedep *inodedep;
3568         struct jsegdep *jsegdep;
3569         struct dirrem *dirrem;
3570
3571         /* Grab the jsegdep. */
3572         jsegdep = inoref_jseg(&jremref->jr_ref);
3573         /*
3574          * Remove us from the inoref list.
3575          */
3576         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3577             0, &inodedep) == 0)
3578                 panic("handle_written_jremref: Lost inodedep");
3579         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3580         /*
3581          * Complete the dirrem.
3582          */
3583         dirrem = jremref->jr_dirrem;
3584         jremref->jr_dirrem = NULL;
3585         LIST_REMOVE(jremref, jr_deps);
3586         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3587         jwork_insert(&dirrem->dm_jwork, jsegdep);
3588         if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3589             (dirrem->dm_state & COMPLETE) != 0)
3590                 add_to_worklist(&dirrem->dm_list, 0);
3591         free_jremref(jremref);
3592 }
3593
3594 /*
3595  * Called once a jaddref has made it to stable store.  The dependency is
3596  * marked complete and any dependent structures are added to the inode
3597  * bufwait list to be completed as soon as it is written.  If a bitmap write
3598  * depends on this entry we move the inode into the inodedephd of the
3599  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3600  */
3601 static void
3602 handle_written_jaddref(jaddref)
3603         struct jaddref *jaddref;
3604 {
3605         struct jsegdep *jsegdep;
3606         struct inodedep *inodedep;
3607         struct diradd *diradd;
3608         struct mkdir *mkdir;
3609
3610         /* Grab the jsegdep. */
3611         jsegdep = inoref_jseg(&jaddref->ja_ref);
3612         mkdir = NULL;
3613         diradd = NULL;
3614         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3615             0, &inodedep) == 0)
3616                 panic("handle_written_jaddref: Lost inodedep.");
3617         if (jaddref->ja_diradd == NULL)
3618                 panic("handle_written_jaddref: No dependency");
3619         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3620                 diradd = jaddref->ja_diradd;
3621                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3622         } else if (jaddref->ja_state & MKDIR_PARENT) {
3623                 mkdir = jaddref->ja_mkdir;
3624                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3625         } else if (jaddref->ja_state & MKDIR_BODY)
3626                 mkdir = jaddref->ja_mkdir;
3627         else
3628                 panic("handle_written_jaddref: Unknown dependency %p",
3629                     jaddref->ja_diradd);
3630         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
3631         /*
3632          * Remove us from the inode list.
3633          */
3634         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3635         /*
3636          * The mkdir may be waiting on the jaddref to clear before freeing.
3637          */
3638         if (mkdir) {
3639                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3640                     ("handle_written_jaddref: Incorrect type for mkdir %s",
3641                     TYPENAME(mkdir->md_list.wk_type)));
3642                 mkdir->md_jaddref = NULL;
3643                 diradd = mkdir->md_diradd;
3644                 mkdir->md_state |= DEPCOMPLETE;
3645                 complete_mkdir(mkdir);
3646         }
3647         jwork_insert(&diradd->da_jwork, jsegdep);
3648         if (jaddref->ja_state & NEWBLOCK) {
3649                 inodedep->id_state |= ONDEPLIST;
3650                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3651                     inodedep, id_deps);
3652         }
3653         free_jaddref(jaddref);
3654 }
3655
3656 /*
3657  * Called once a jnewblk journal is written.  The allocdirect or allocindir
3658  * is placed in the bmsafemap to await notification of a written bitmap.  If
3659  * the operation was canceled we add the segdep to the appropriate
3660  * dependency to free the journal space once the canceling operation
3661  * completes.
3662  */
3663 static void
3664 handle_written_jnewblk(jnewblk)
3665         struct jnewblk *jnewblk;
3666 {
3667         struct bmsafemap *bmsafemap;
3668         struct freefrag *freefrag;
3669         struct freework *freework;
3670         struct jsegdep *jsegdep;
3671         struct newblk *newblk;
3672
3673         /* Grab the jsegdep. */
3674         jsegdep = jnewblk->jn_jsegdep;
3675         jnewblk->jn_jsegdep = NULL;
3676         if (jnewblk->jn_dep == NULL)
3677                 panic("handle_written_jnewblk: No dependency for the segdep.");
3678         switch (jnewblk->jn_dep->wk_type) {
3679         case D_NEWBLK:
3680         case D_ALLOCDIRECT:
3681         case D_ALLOCINDIR:
3682                 /*
3683                  * Add the written block to the bmsafemap so it can
3684                  * be notified when the bitmap is on disk.
3685                  */
3686                 newblk = WK_NEWBLK(jnewblk->jn_dep);
3687                 newblk->nb_jnewblk = NULL;
3688                 if ((newblk->nb_state & GOINGAWAY) == 0) {
3689                         bmsafemap = newblk->nb_bmsafemap;
3690                         newblk->nb_state |= ONDEPLIST;
3691                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3692                             nb_deps);
3693                 }
3694                 jwork_insert(&newblk->nb_jwork, jsegdep);
3695                 break;
3696         case D_FREEFRAG:
3697                 /*
3698                  * A newblock being removed by a freefrag when replaced by
3699                  * frag extension.
3700                  */
3701                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3702                 freefrag->ff_jdep = NULL;
3703                 jwork_insert(&freefrag->ff_jwork, jsegdep);
3704                 break;
3705         case D_FREEWORK:
3706                 /*
3707                  * A direct block was removed by truncate.
3708                  */
3709                 freework = WK_FREEWORK(jnewblk->jn_dep);
3710                 freework->fw_jnewblk = NULL;
3711                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3712                 break;
3713         default:
3714                 panic("handle_written_jnewblk: Unknown type %d.",
3715                     jnewblk->jn_dep->wk_type);
3716         }
3717         jnewblk->jn_dep = NULL;
3718         free_jnewblk(jnewblk);
3719 }
3720
3721 /*
3722  * Cancel a jfreefrag that won't be needed, probably due to colliding with
3723  * an in-flight allocation that has not yet been committed.  Divorce us
3724  * from the freefrag and mark it DEPCOMPLETE so that it may be added
3725  * to the worklist.
3726  */
3727 static void
3728 cancel_jfreefrag(jfreefrag)
3729         struct jfreefrag *jfreefrag;
3730 {
3731         struct freefrag *freefrag;
3732
3733         if (jfreefrag->fr_jsegdep) {
3734                 free_jsegdep(jfreefrag->fr_jsegdep);
3735                 jfreefrag->fr_jsegdep = NULL;
3736         }
3737         freefrag = jfreefrag->fr_freefrag;
3738         jfreefrag->fr_freefrag = NULL;
3739         free_jfreefrag(jfreefrag);
3740         freefrag->ff_state |= DEPCOMPLETE;
3741         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3742 }
3743
3744 /*
3745  * Free a jfreefrag when the parent freefrag is rendered obsolete.
3746  */
3747 static void
3748 free_jfreefrag(jfreefrag)
3749         struct jfreefrag *jfreefrag;
3750 {
3751
3752         if (jfreefrag->fr_state & INPROGRESS)
3753                 WORKLIST_REMOVE(&jfreefrag->fr_list);
3754         else if (jfreefrag->fr_state & ONWORKLIST)
3755                 remove_from_journal(&jfreefrag->fr_list);
3756         if (jfreefrag->fr_freefrag != NULL)
3757                 panic("free_jfreefrag:  Still attached to a freefrag.");
3758         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3759 }
3760
3761 /*
3762  * Called when the journal write for a jfreefrag completes.  The parent
3763  * freefrag is added to the worklist if this completes its dependencies.
3764  */
3765 static void
3766 handle_written_jfreefrag(jfreefrag)
3767         struct jfreefrag *jfreefrag;
3768 {
3769         struct jsegdep *jsegdep;
3770         struct freefrag *freefrag;
3771
3772         /* Grab the jsegdep. */
3773         jsegdep = jfreefrag->fr_jsegdep;
3774         jfreefrag->fr_jsegdep = NULL;
3775         freefrag = jfreefrag->fr_freefrag;
3776         if (freefrag == NULL)
3777                 panic("handle_written_jfreefrag: No freefrag.");
3778         freefrag->ff_state |= DEPCOMPLETE;
3779         freefrag->ff_jdep = NULL;
3780         jwork_insert(&freefrag->ff_jwork, jsegdep);
3781         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3782                 add_to_worklist(&freefrag->ff_list, 0);
3783         jfreefrag->fr_freefrag = NULL;
3784         free_jfreefrag(jfreefrag);
3785 }
3786
3787 /*
3788  * Called when the journal write for a jfreeblk completes.  The jfreeblk
3789  * is removed from the freeblks list of pending journal writes and the
3790  * jsegdep is moved to the freeblks jwork to be completed when all blocks
3791  * have been reclaimed.
3792  */
3793 static void
3794 handle_written_jblkdep(jblkdep)
3795         struct jblkdep *jblkdep;
3796 {
3797         struct freeblks *freeblks;
3798         struct jsegdep *jsegdep;
3799
3800         /* Grab the jsegdep. */
3801         jsegdep = jblkdep->jb_jsegdep;
3802         jblkdep->jb_jsegdep = NULL;
3803         freeblks = jblkdep->jb_freeblks;
3804         LIST_REMOVE(jblkdep, jb_deps);
3805         jwork_insert(&freeblks->fb_jwork, jsegdep);
3806         /*
3807          * If the freeblks is all journaled, we can add it to the worklist.
3808          */
3809         if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3810             (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3811                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3812
3813         free_jblkdep(jblkdep);
3814 }
3815
3816 static struct jsegdep *
3817 newjsegdep(struct worklist *wk)
3818 {
3819         struct jsegdep *jsegdep;
3820
3821         jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3822         workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3823         jsegdep->jd_seg = NULL;
3824
3825         return (jsegdep);
3826 }
3827
3828 static struct jmvref *
3829 newjmvref(dp, ino, oldoff, newoff)
3830         struct inode *dp;
3831         ino_t ino;
3832         off_t oldoff;
3833         off_t newoff;
3834 {
3835         struct jmvref *jmvref;
3836
3837         jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3838         workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3839         jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3840         jmvref->jm_parent = dp->i_number;
3841         jmvref->jm_ino = ino;
3842         jmvref->jm_oldoff = oldoff;
3843         jmvref->jm_newoff = newoff;
3844
3845         return (jmvref);
3846 }
3847
3848 /*
3849  * Allocate a new jremref that tracks the removal of ip from dp with the
3850  * directory entry offset of diroff.  Mark the entry as ATTACHED and
3851  * DEPCOMPLETE as we have all the information required for the journal write
3852  * and the directory has already been removed from the buffer.  The caller
3853  * is responsible for linking the jremref into the pagedep and adding it
3854  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
3855  * a DOTDOT addition so handle_workitem_remove() can properly assign
3856  * the jsegdep when we're done.
3857  */
3858 static struct jremref *
3859 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
3860     off_t diroff, nlink_t nlink)
3861 {
3862         struct jremref *jremref;
3863
3864         jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
3865         workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
3866         jremref->jr_state = ATTACHED;
3867         newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
3868            nlink, ip->i_mode);
3869         jremref->jr_dirrem = dirrem;
3870
3871         return (jremref);
3872 }
3873
3874 static inline void
3875 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
3876     nlink_t nlink, uint16_t mode)
3877 {
3878
3879         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
3880         inoref->if_diroff = diroff;
3881         inoref->if_ino = ino;
3882         inoref->if_parent = parent;
3883         inoref->if_nlink = nlink;
3884         inoref->if_mode = mode;
3885 }
3886
3887 /*
3888  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
3889  * directory offset may not be known until later.  The caller is responsible
3890  * adding the entry to the journal when this information is available.  nlink
3891  * should be the link count prior to the addition and mode is only required
3892  * to have the correct FMT.
3893  */
3894 static struct jaddref *
3895 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
3896     uint16_t mode)
3897 {
3898         struct jaddref *jaddref;
3899
3900         jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
3901         workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
3902         jaddref->ja_state = ATTACHED;
3903         jaddref->ja_mkdir = NULL;
3904         newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
3905
3906         return (jaddref);
3907 }
3908
3909 /*
3910  * Create a new free dependency for a freework.  The caller is responsible
3911  * for adjusting the reference count when it has the lock held.  The freedep
3912  * will track an outstanding bitmap write that will ultimately clear the
3913  * freework to continue.
3914  */
3915 static struct freedep *
3916 newfreedep(struct freework *freework)
3917 {
3918         struct freedep *freedep;
3919
3920         freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
3921         workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
3922         freedep->fd_freework = freework;
3923
3924         return (freedep);
3925 }
3926
3927 /*
3928  * Free a freedep structure once the buffer it is linked to is written.  If
3929  * this is the last reference to the freework schedule it for completion.
3930  */
3931 static void
3932 free_freedep(freedep)
3933         struct freedep *freedep;
3934 {
3935         struct freework *freework;
3936
3937         freework = freedep->fd_freework;
3938         freework->fw_freeblks->fb_cgwait--;
3939         if (--freework->fw_ref == 0)
3940                 freework_enqueue(freework);
3941         WORKITEM_FREE(freedep, D_FREEDEP);
3942 }
3943
3944 /*
3945  * Allocate a new freework structure that may be a level in an indirect
3946  * when parent is not NULL or a top level block when it is.  The top level
3947  * freework structures are allocated without the soft updates lock held
3948  * and before the freeblks is visible outside of softdep_setup_freeblocks().
3949  */
3950 static struct freework *
3951 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
3952         struct ufsmount *ump;
3953         struct freeblks *freeblks;
3954         struct freework *parent;
3955         ufs_lbn_t lbn;
3956         ufs2_daddr_t nb;
3957         int frags;
3958         int off;
3959         int journal;
3960 {
3961         struct freework *freework;
3962
3963         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
3964         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
3965         freework->fw_state = ATTACHED;
3966         freework->fw_jnewblk = NULL;
3967         freework->fw_freeblks = freeblks;
3968         freework->fw_parent = parent;
3969         freework->fw_lbn = lbn;
3970         freework->fw_blkno = nb;
3971         freework->fw_frags = frags;
3972         freework->fw_indir = NULL;
3973         freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
3974                 ? 0 : NINDIR(ump->um_fs) + 1;
3975         freework->fw_start = freework->fw_off = off;
3976         if (journal)
3977                 newjfreeblk(freeblks, lbn, nb, frags);
3978         if (parent == NULL) {
3979                 ACQUIRE_LOCK(ump);
3980                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
3981                 freeblks->fb_ref++;
3982                 FREE_LOCK(ump);
3983         }
3984
3985         return (freework);
3986 }
3987
3988 /*
3989  * Eliminate a jfreeblk for a block that does not need journaling.
3990  */
3991 static void
3992 cancel_jfreeblk(freeblks, blkno)
3993         struct freeblks *freeblks;
3994         ufs2_daddr_t blkno;
3995 {
3996         struct jfreeblk *jfreeblk;
3997         struct jblkdep *jblkdep;
3998
3999         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4000                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4001                         continue;
4002                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4003                 if (jfreeblk->jf_blkno == blkno)
4004                         break;
4005         }
4006         if (jblkdep == NULL)
4007                 return;
4008         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4009         free_jsegdep(jblkdep->jb_jsegdep);
4010         LIST_REMOVE(jblkdep, jb_deps);
4011         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4012 }
4013
4014 /*
4015  * Allocate a new jfreeblk to journal top level block pointer when truncating
4016  * a file.  The caller must add this to the worklist when the soft updates
4017  * lock is held.
4018  */
4019 static struct jfreeblk *
4020 newjfreeblk(freeblks, lbn, blkno, frags)
4021         struct freeblks *freeblks;
4022         ufs_lbn_t lbn;
4023         ufs2_daddr_t blkno;
4024         int frags;
4025 {
4026         struct jfreeblk *jfreeblk;
4027
4028         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4029         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4030             freeblks->fb_list.wk_mp);
4031         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4032         jfreeblk->jf_dep.jb_freeblks = freeblks;
4033         jfreeblk->jf_ino = freeblks->fb_inum;
4034         jfreeblk->jf_lbn = lbn;
4035         jfreeblk->jf_blkno = blkno;
4036         jfreeblk->jf_frags = frags;
4037         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4038
4039         return (jfreeblk);
4040 }
4041
4042 /*
4043  * Allocate a new jtrunc to track a partial truncation.
4044  */
4045 static struct jtrunc *
4046 newjtrunc(freeblks, size, extsize)
4047         struct freeblks *freeblks;
4048         off_t size;
4049         int extsize;
4050 {
4051         struct jtrunc *jtrunc;
4052
4053         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4054         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4055             freeblks->fb_list.wk_mp);
4056         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4057         jtrunc->jt_dep.jb_freeblks = freeblks;
4058         jtrunc->jt_ino = freeblks->fb_inum;
4059         jtrunc->jt_size = size;
4060         jtrunc->jt_extsize = extsize;
4061         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4062
4063         return (jtrunc);
4064 }
4065
4066 /*
4067  * If we're canceling a new bitmap we have to search for another ref
4068  * to move into the bmsafemap dep.  This might be better expressed
4069  * with another structure.
4070  */
4071 static void
4072 move_newblock_dep(jaddref, inodedep)
4073         struct jaddref *jaddref;
4074         struct inodedep *inodedep;
4075 {
4076         struct inoref *inoref;
4077         struct jaddref *jaddrefn;
4078
4079         jaddrefn = NULL;
4080         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4081             inoref = TAILQ_NEXT(inoref, if_deps)) {
4082                 if ((jaddref->ja_state & NEWBLOCK) &&
4083                     inoref->if_list.wk_type == D_JADDREF) {
4084                         jaddrefn = (struct jaddref *)inoref;
4085                         break;
4086                 }
4087         }
4088         if (jaddrefn == NULL)
4089                 return;
4090         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4091         jaddrefn->ja_state |= jaddref->ja_state &
4092             (ATTACHED | UNDONE | NEWBLOCK);
4093         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4094         jaddref->ja_state |= ATTACHED;
4095         LIST_REMOVE(jaddref, ja_bmdeps);
4096         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4097             ja_bmdeps);
4098 }
4099
4100 /*
4101  * Cancel a jaddref either before it has been written or while it is being
4102  * written.  This happens when a link is removed before the add reaches
4103  * the disk.  The jaddref dependency is kept linked into the bmsafemap
4104  * and inode to prevent the link count or bitmap from reaching the disk
4105  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4106  * required.
4107  *
4108  * Returns 1 if the canceled addref requires journaling of the remove and
4109  * 0 otherwise.
4110  */
4111 static int
4112 cancel_jaddref(jaddref, inodedep, wkhd)
4113         struct jaddref *jaddref;
4114         struct inodedep *inodedep;
4115         struct workhead *wkhd;
4116 {
4117         struct inoref *inoref;
4118         struct jsegdep *jsegdep;
4119         int needsj;
4120
4121         KASSERT((jaddref->ja_state & COMPLETE) == 0,
4122             ("cancel_jaddref: Canceling complete jaddref"));
4123         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4124                 needsj = 1;
4125         else
4126                 needsj = 0;
4127         if (inodedep == NULL)
4128                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4129                     0, &inodedep) == 0)
4130                         panic("cancel_jaddref: Lost inodedep");
4131         /*
4132          * We must adjust the nlink of any reference operation that follows
4133          * us so that it is consistent with the in-memory reference.  This
4134          * ensures that inode nlink rollbacks always have the correct link.
4135          */
4136         if (needsj == 0) {
4137                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4138                     inoref = TAILQ_NEXT(inoref, if_deps)) {
4139                         if (inoref->if_state & GOINGAWAY)
4140                                 break;
4141                         inoref->if_nlink--;
4142                 }
4143         }
4144         jsegdep = inoref_jseg(&jaddref->ja_ref);
4145         if (jaddref->ja_state & NEWBLOCK)
4146                 move_newblock_dep(jaddref, inodedep);
4147         wake_worklist(&jaddref->ja_list);
4148         jaddref->ja_mkdir = NULL;
4149         if (jaddref->ja_state & INPROGRESS) {
4150                 jaddref->ja_state &= ~INPROGRESS;
4151                 WORKLIST_REMOVE(&jaddref->ja_list);
4152                 jwork_insert(wkhd, jsegdep);
4153         } else {
4154                 free_jsegdep(jsegdep);
4155                 if (jaddref->ja_state & DEPCOMPLETE)
4156                         remove_from_journal(&jaddref->ja_list);
4157         }
4158         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4159         /*
4160          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4161          * can arrange for them to be freed with the bitmap.  Otherwise we
4162          * no longer need this addref attached to the inoreflst and it
4163          * will incorrectly adjust nlink if we leave it.
4164          */
4165         if ((jaddref->ja_state & NEWBLOCK) == 0) {
4166                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4167                     if_deps);
4168                 jaddref->ja_state |= COMPLETE;
4169                 free_jaddref(jaddref);
4170                 return (needsj);
4171         }
4172         /*
4173          * Leave the head of the list for jsegdeps for fast merging.
4174          */
4175         if (LIST_FIRST(wkhd) != NULL) {
4176                 jaddref->ja_state |= ONWORKLIST;
4177                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4178         } else
4179                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4180
4181         return (needsj);
4182 }
4183
4184 /*
4185  * Attempt to free a jaddref structure when some work completes.  This
4186  * should only succeed once the entry is written and all dependencies have
4187  * been notified.
      *
      * Nothing happens until every ALLCOMPLETE bit is set in ja_state, so a
      * call made before then is a harmless no-op.  When the jaddref is
      * actually freed it must no longer own a jsegdep, must not be INPROGRESS
      * or ONWORKLIST, and must have a NULL ja_mkdir; violating any of these
      * panics.
4188  */
4189 static void
4190 free_jaddref(jaddref)
4191         struct jaddref *jaddref;
4192 {
4193
             /* Some completion bit is still missing; leave the jaddref alone. */
4194         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4195                 return;
4196         if (jaddref->ja_ref.if_jsegdep)
4197                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4198                     jaddref, jaddref->ja_state);
             /* A NEWBLOCK jaddref is still linked through ja_bmdeps; unlink it. */
4199         if (jaddref->ja_state & NEWBLOCK)
4200                 LIST_REMOVE(jaddref, ja_bmdeps);
4201         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4202                 panic("free_jaddref: Bad state %p(0x%X)",
4203                     jaddref, jaddref->ja_state);
4204         if (jaddref->ja_mkdir != NULL)
4205                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4206         WORKITEM_FREE(jaddref, D_JADDREF);
4207 }
4208
4209 /*
4210  * Free a jremref structure once it has been written or discarded.
      *
      * Any jsegdep still attached through jr_ref.if_jsegdep is released with
      * free_jsegdep() first.  Freeing a jremref that is still INPROGRESS is a
      * fatal error.
4211  */
4212 static void
4213 free_jremref(jremref)
4214         struct jremref *jremref;
4215 {
4216
4217         if (jremref->jr_ref.if_jsegdep)
4218                 free_jsegdep(jremref->jr_ref.if_jsegdep);
4219         if (jremref->jr_state & INPROGRESS)
4220                 panic("free_jremref: IO still pending");
4221         WORKITEM_FREE(jremref, D_JREMREF);
4222 }
4223
4224 /*
4225  * Free a jnewblk structure.
      *
      * This is a no-op until every ALLCOMPLETE bit is set in jn_state.  Once
      * they are, the jnewblk is unlinked from its jn_deps list and released.
      * A jnewblk that still has a jn_dep pointer at that point panics.
4226  */
4227 static void
4228 free_jnewblk(jnewblk)
4229         struct jnewblk *jnewblk;
4230 {
4231
4232         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4233                 return;
4234         LIST_REMOVE(jnewblk, jn_deps);
4235         if (jnewblk->jn_dep != NULL)
4236                 panic("free_jnewblk: Dependency still attached.");
4237         WORKITEM_FREE(jnewblk, D_JNEWBLK);
4238 }
4239
4240 /*
4241  * Cancel a jnewblk which has been made redundant by frag extension.
      *
      * The jnewblk is detached from its jsegdep and from the dependency in
      * jn_dep and is marked GOINGAWAY.  If its journal write is INPROGRESS the
      * flag is cleared, the jnewblk is removed from the worklist it is on and
      * the jsegdep is handed to wkhd with jwork_insert(); otherwise the jsegdep
      * is freed and the jnewblk is taken out of the journal with
      * remove_from_journal().  In both cases waiters are woken and the jnewblk
      * itself ends up on wkhd.
      *
      * Panics if jn_jsegdep or jn_dep is already NULL on entry.
4242  */
4243 static void
4244 cancel_jnewblk(jnewblk, wkhd)
4245         struct jnewblk *jnewblk;
4246         struct workhead *wkhd;
4247 {
4248         struct jsegdep *jsegdep;
4249
4250         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
             /* Keep the jsegdep pointer; the field is cleared below. */
4251         jsegdep = jnewblk->jn_jsegdep;
4252         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4253                 panic("cancel_jnewblk: Invalid state");
4254         jnewblk->jn_jsegdep  = NULL;
4255         jnewblk->jn_dep = NULL;
4256         jnewblk->jn_state |= GOINGAWAY;
4257         if (jnewblk->jn_state & INPROGRESS) {
4258                 jnewblk->jn_state &= ~INPROGRESS;
4259                 WORKLIST_REMOVE(&jnewblk->jn_list);
4260                 jwork_insert(wkhd, jsegdep);
4261         } else {
4262                 free_jsegdep(jsegdep);
4263                 remove_from_journal(&jnewblk->jn_list);
4264         }
4265         wake_worklist(&jnewblk->jn_list);
4266         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4267 }
4268
     /*
      * Free a jblkdep.  The work item type recorded in jb_list.wk_type selects
      * the type passed to WORKITEM_FREE: D_JFREEBLK or D_JTRUNC.  Any other
      * type panics.
      */
4269 static void
4270 free_jblkdep(jblkdep)
4271         struct jblkdep *jblkdep;
4272 {
4273
4274         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4275                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
4276         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4277                 WORKITEM_FREE(jblkdep, D_JTRUNC);
4278         else
4279                 panic("free_jblkdep: Unexpected type %s",
4280                     TYPENAME(jblkdep->jb_list.wk_type));
4281 }
4282
4283 /*
4284  * Free a single jseg once it is no longer referenced in memory or on
4285  * disk.  Reclaim journal blocks and dependencies waiting for the segment
4286  * to disappear.
      *
      * jseg is unlinked from jblocks->jb_segs; if it was jb_oldestseg, the
      * following segment on the list takes over that role.  No eligibility
      * test is made here beyond the KASSERT that js_entries is already empty;
      * deciding when a segment may be freed is left to the caller.
4287  */
4288 static void
4289 free_jseg(jseg, jblocks)
4290         struct jseg *jseg;
4291         struct jblocks *jblocks;
4292 {
4293         struct freework *freework;
4294
4295         /*
4296          * Free freework structures that were lingering to indicate freed
4297          * indirect blocks that forced journal write ordering on reallocate.
4298          */
4299         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4300                 indirblk_remove(freework);
             /* Move jb_oldestseg forward before jseg leaves the list. */
4301         if (jblocks->jb_oldestseg == jseg)
4302                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4303         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
             /* Hand the segment's js_size back through jblocks_free(). */
4304         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4305         KASSERT(LIST_EMPTY(&jseg->js_entries),
4306             ("free_jseg: Freed jseg has valid entries."));
4307         WORKITEM_FREE(jseg, D_JSEG);
4308 }
4309
4310 /*
4311  * Free all jsegs that meet the criteria for being reclaimed and update
4312  * oldestseg.
      *
      * Segments are examined from the head of jb_segs only.  The scan ends in
      * one of three ways:
      *  - the head segment still has references: it becomes jb_oldestseg and
      *    the function returns immediately;
      *  - the head segment is not ALLCOMPLETE, or its js_seq is newer than
      *    jb_oldestwrseq, or equals it while js_cnt != 0: jb_oldestseg is
      *    recomputed as the first segment, starting at the current
      *    jb_oldestseg, that has references (NULL if there is none);
      *  - the list becomes empty: jb_oldestseg is set to NULL.
      * If no referenced segment remains but the list is not empty,
      * jb_needseg is set.
4313  */
4314 static void
4315 free_jsegs(jblocks)
4316         struct jblocks *jblocks;
4317 {
4318         struct jseg *jseg;
4319
4320         /*
4321          * Free only those jsegs which have none allocated before them to
4322          * preserve the journal space ordering.
4323          */
4324         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4325                 /*
4326                  * Only reclaim space when nothing depends on this journal
4327                  * set and another set has written that it is no longer
4328                  * valid.
4329                  */
4330                 if (jseg->js_refs != 0) {
4331                         jblocks->jb_oldestseg = jseg;
4332                         return;
4333                 }
4334                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4335                         break;
4336                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
4337                         break;
4338                 /*
4339                  * We can free jsegs that didn't write entries when
4340                  * oldestwrseq == js_seq.
4341                  */
4342                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4343                     jseg->js_cnt != 0)
4344                         break;
4345                 free_jseg(jseg, jblocks);
4346         }
4347         /*
4348          * If we exited the loop above we still must discover the
4349          * oldest valid segment.
              *
              * jseg is non-NULL here only after one of the breaks above; it is
              * NULL when the loop ran until the list was empty.
4350          */
4351         if (jseg)
4352                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4353                      jseg = TAILQ_NEXT(jseg, js_next))
4354                         if (jseg->js_refs != 0)
4355                                 break;
4356         jblocks->jb_oldestseg = jseg;
4357         /*
4358          * The journal has no valid records but some jsegs may still be
4359          * waiting on oldestwrseq to advance.  We force a small record
4360          * out to permit these lingering records to be reclaimed.
4361          */
4362         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4363                 jblocks->jb_needseg = 1;
4364 }
4365
4366 /*
4367  * Release one reference to a jseg and free it if the count reaches 0.  This
4368  * should eventually reclaim journal space as well.
      *
      * Dropping the last reference does not free the jseg directly; it runs
      * free_jsegs() on the owning jblocks (js_jblocks), which applies its own
      * eligibility checks to the segments at the head of the list.
4369  */
4370 static void
4371 rele_jseg(jseg)
4372         struct jseg *jseg;
4373 {
4374
             /* The assertion message names this function, not free_jseg. */
4375         KASSERT(jseg->js_refs > 0,
4376             ("rele_jseg: Invalid refcnt %d", jseg->js_refs));
4377         if (--jseg->js_refs != 0)
4378                 return;
4379         free_jsegs(jseg->js_jblocks);
4380 }
4381
4382 /*
4383  * Release a jsegdep and decrement the jseg count.
      *
      * A jsegdep whose jd_seg is NULL holds no segment reference, so only the
      * jsegdep itself is freed in that case.
4384  */
4385 static void
4386 free_jsegdep(jsegdep)
4387         struct jsegdep *jsegdep;
4388 {
4389
4390         if (jsegdep->jd_seg)
4391                 rele_jseg(jsegdep->jd_seg);
4392         WORKITEM_FREE(jsegdep, D_JSEGDEP);
4393 }
4394
4395 /*
4396  * Wait for a journal item to make it to disk.  Initiate journal processing
4397  * if required.
      *
      * wk is the journal work item; waitfor is compared against MNT_WAIT to
      * decide whether blocking is allowed.  The softdep lock for wk's mount
      * must be held (checked by LOCK_OWNED).
      *
      * Returns 0 when waitfor is MNT_WAIT and EBUSY otherwise.  When the item
      * was not yet INPROGRESS, a 0 return only means that the journal was
      * processed, not that this function slept on wk.
4398  */
4399 static int
4400 jwait(wk, waitfor)
4401         struct worklist *wk;
4402         int waitfor;
4403 {
4404
4405         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4406         /*
4407          * Blocking journal waits cause slow synchronous behavior.  Record
4408          * stats on the frequency of these blocking operations.
4409          */
4410         if (waitfor == MNT_WAIT) {
4411                 stat_journal_wait++;
4412                 switch (wk->wk_type) {
4413                 case D_JREMREF:
4414                 case D_JMVREF:
4415                         stat_jwait_filepage++;
4416                         break;
4417                 case D_JTRUNC:
4418                 case D_JFREEBLK:
4419                         stat_jwait_freeblks++;
4420                         break;
4421                 case D_JNEWBLK:
4422                         stat_jwait_newblk++;
4423                         break;
4424                 case D_JADDREF:
4425                         stat_jwait_inode++;
4426                         break;
4427                 default:
4428                         break;
4429                 }
4430         }
4431         /*
4432          * If IO has not started we process the journal.  We can't mark the
4433          * worklist item as IOWAITING because we drop the lock while
4434          * processing the journal and the worklist entry may be freed after
4435          * this point.  The caller may call back in and re-issue the request.
4436          */
4437         if ((wk->wk_state & INPROGRESS) == 0) {
4438                 softdep_process_journal(wk->wk_mp, wk, waitfor);
4439                 if (waitfor != MNT_WAIT)
4440                         return (EBUSY);
4441                 return (0);
4442         }
             /* The write is already in flight: sleep on it only for MNT_WAIT. */
4443         if (waitfor != MNT_WAIT)
4444                 return (EBUSY);
4445         wait_worklist(wk, "jwait");
4446         return (0);
4447 }
4448
4449 /*
4450  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4451  * appropriate.  This is a convenience function to reduce duplicate code
4452  * for the setup and revert functions below.
      *
      * The lookup uses DEPALLOC, plus NODELAY when ip is a snapshot.  The
      * returned inodedep has id_nlinkdelta set to i_nlink - i_effnlink.
      * Asserts that i_nlink >= i_effnlink and that the inodedep is not
      * marked UNLINKED.
4453  */
4454 static struct inodedep *
4455 inodedep_lookup_ip(ip)
4456         struct inode *ip;
4457 {
4458         struct inodedep *inodedep;
4459         int dflags;
4460
4461         KASSERT(ip->i_nlink >= ip->i_effnlink,
4462             ("inodedep_lookup_ip: bad delta"));
4463         dflags = DEPALLOC;
4464         if (IS_SNAPSHOT(ip))
4465                 dflags |= NODELAY;
             /*
              * The return value is ignored and inodedep is dereferenced
              * unconditionally below, so this relies on a DEPALLOC lookup
              * always producing an inodedep -- presumably guaranteed by
              * inodedep_lookup(); confirm there.
              */
4466         (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4467             &inodedep);
4468         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4469         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4470
4471         return (inodedep);
4472 }
4473
4474 /*
4475  * Called prior to creating a new inode and linking it to a directory.  The
4476  * jaddref structure must already be allocated by softdep_setup_inomapdep
4477  * and it is discovered here so we can initialize the mode and update
4478  * nlinkdelta.
      *
      * dp is the directory and ip the new inode, which must have i_nlink == 1.
      * ip's inodedep is looked up with inodedep_lookup_ip(), which refreshes
      * id_nlinkdelta.  When journaling, the last entry on id_inoreflst is
      * expected to be a jaddref whose ja_parent is dp.
      *
      * NOTE(review): the code below only asserts on that jaddref; it does not
      * store a mode into it.
4479  */
4480 void
4481 softdep_setup_create(dp, ip)
4482         struct inode *dp;
4483         struct inode *ip;
4484 {
4485         struct inodedep *inodedep;
4486         struct jaddref *jaddref;
4487         struct vnode *dvp;
4488
4489         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4490             ("softdep_setup_create called on non-softdep filesystem"));
4491         KASSERT(ip->i_nlink == 1,
4492             ("softdep_setup_create: Invalid link count."));
4493         dvp = ITOV(dp);
4494         ACQUIRE_LOCK(dp->i_ump);
4495         inodedep = inodedep_lookup_ip(ip);
4496         if (DOINGSUJ(dvp)) {
                     /* jaddref is read only by the KASSERT that follows. */
4497                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4498                     inoreflst);
4499                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4500                     ("softdep_setup_create: No addref structure present."));
4501         }
4502         softdep_prelink(dvp, NULL);
4503         FREE_LOCK(dp->i_ump);
4504 }
4505
4506 /*
4507  * Create a jaddref structure to track the addition of a DOTDOT link when
4508  * we are reparenting an inode as part of a rename.  This jaddref will be
4509  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4510  * non-journaling softdep.
      *
      * dp is the directory gaining the ".." reference and ip the directory
      * being reparented.  When journaling, the jaddref is created before the
      * softdep lock is taken and is then appended to the id_inoreflst of dp's
      * inodedep.  In every case dp's id_nlinkdelta is refreshed through
      * inodedep_lookup_ip() and softdep_prelink() is called for both vnodes.
4511  */
4512 void
4513 softdep_setup_dotdot_link(dp, ip)
4514         struct inode *dp;
4515         struct inode *ip;
4516 {
4517         struct inodedep *inodedep;
4518         struct jaddref *jaddref;
4519         struct vnode *dvp;
4520         struct vnode *vp;
4521
4522         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4523             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4524         dvp = ITOV(dp);
4525         vp = ITOV(ip);
4526         jaddref = NULL;
4527         /*
4528          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4529          * is used as a normal link would be.
4530          */
4531         if (DOINGSUJ(dvp))
4532                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4533                     dp->i_effnlink - 1, dp->i_mode);
4534         ACQUIRE_LOCK(dp->i_ump);
4535         inodedep = inodedep_lookup_ip(dp);
4536         if (jaddref)
4537                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4538                     if_deps);
             /* Use the vnode fetched above rather than recomputing ITOV(ip). */
4539         softdep_prelink(dvp, vp);
4540         FREE_LOCK(dp->i_ump);
4541 }
4542
4543 /*
4544  * Create a jaddref structure to track a new link to an inode.  The directory
4545  * offset is not known until softdep_setup_directory_add or
4546  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4547  * softdep.
      *
      * dp is the directory receiving the new entry and ip the inode being
      * linked.  When journaling, the jaddref is created with offset 0 before
      * the softdep lock is taken and is then appended to the id_inoreflst of
      * ip's inodedep.
4548  */
4549 void
4550 softdep_setup_link(dp, ip)
4551         struct inode *dp;
4552         struct inode *ip;
4553 {
4554         struct inodedep *inodedep;
4555         struct jaddref *jaddref;
4556         struct vnode *dvp;
4557
4558         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4559             ("softdep_setup_link called on non-softdep filesystem"));
4560         dvp = ITOV(dp);
4561         jaddref = NULL;
4562         if (DOINGSUJ(dvp))
4563                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4564                     ip->i_mode);
4565         ACQUIRE_LOCK(dp->i_ump);
             /* Refreshes ip's id_nlinkdelta whether or not we are journaling. */
4566         inodedep = inodedep_lookup_ip(ip);
4567         if (jaddref)
4568                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4569                     if_deps);
4570         softdep_prelink(dvp, ITOV(ip));
4571         FREE_LOCK(dp->i_ump);
4572 }
4573
4574 /*
4575  * Called to create the jaddref structures to track . and .. references as
4576  * well as lookup and further initialize the incomplete jaddref created
4577  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4578  * nlinkdelta for non-journaling softdep.
      *
      * dp is the parent directory and ip the directory being created.  When
      * journaling, two jaddrefs are created before the softdep lock is taken:
      * one for "." flagged MKDIR_BODY and one for ".." flagged MKDIR_PARENT.
      * The "." jaddref is inserted on ip's id_inoreflst immediately before the
      * list's last entry, which must have dp as its parent; the ".." jaddref
      * is appended to dp's id_inoreflst.
4579  */
4580 void
4581 softdep_setup_mkdir(dp, ip)
4582         struct inode *dp;
4583         struct inode *ip;
4584 {
4585         struct inodedep *inodedep;
4586         struct jaddref *dotdotaddref;
4587         struct jaddref *dotaddref;
4588         struct jaddref *jaddref;
4589         struct vnode *dvp;
4590
4591         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4592             ("softdep_setup_mkdir called on non-softdep filesystem"));
4593         dvp = ITOV(dp);
4594         dotaddref = dotdotaddref = NULL;
4595         if (DOINGSUJ(dvp)) {
4596                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4597                     ip->i_mode);
4598                 dotaddref->ja_state |= MKDIR_BODY;
4599                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4600                     dp->i_effnlink - 1, dp->i_mode);
4601                 dotdotaddref->ja_state |= MKDIR_PARENT;
4602         }
4603         ACQUIRE_LOCK(dp->i_ump);
             /* First the new directory: place "." ahead of its last jaddref. */
4604         inodedep = inodedep_lookup_ip(ip);
4605         if (DOINGSUJ(dvp)) {
4606                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4607                     inoreflst);
4608                 KASSERT(jaddref != NULL,
4609                     ("softdep_setup_mkdir: No addref structure present."));
4610                 KASSERT(jaddref->ja_parent == dp->i_number,
4611                     ("softdep_setup_mkdir: bad parent %ju",
4612                     (uintmax_t)jaddref->ja_parent));
4613                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4614                     if_deps);
4615         }
             /* Then the parent: ".." goes at the tail of its reference list. */
4616         inodedep = inodedep_lookup_ip(dp);
4617         if (DOINGSUJ(dvp))
4618                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4619                     &dotdotaddref->ja_ref, if_deps);
             /* ITOV(dp) is the same vnode that dvp was set from above. */
4620         softdep_prelink(ITOV(dp), NULL);
4621         FREE_LOCK(dp->i_ump);
4622 }
4623
4624 /*
4625  * Called to track nlinkdelta of the inode and parent directories prior to
4626  * unlinking a directory.
      *
      * dp is the parent directory and ip the directory being removed.  Both
      * inodedeps are looked up only for the id_nlinkdelta refresh done by
      * inodedep_lookup_ip(); no journal structures are created here.
4627  */
4628 void
4629 softdep_setup_rmdir(dp, ip)
4630         struct inode *dp;
4631         struct inode *ip;
4632 {
4633         struct vnode *dvp;
4634
4635         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4636             ("softdep_setup_rmdir called on non-softdep filesystem"));
4637         dvp = ITOV(dp);
4638         ACQUIRE_LOCK(dp->i_ump);
4639         (void) inodedep_lookup_ip(ip);
4640         (void) inodedep_lookup_ip(dp);
4641         softdep_prelink(dvp, ITOV(ip));
4642         FREE_LOCK(dp->i_ump);
4643 }
4644
4645 /*
4646  * Called to track nlinkdelta of the inode and parent directories prior to
4647  * unlink.
      *
      * dp is the directory and ip the inode being unlinked.  Both inodedeps
      * are looked up only for the id_nlinkdelta refresh done by
      * inodedep_lookup_ip(); no journal structures are created here.
4648  */
4649 void
4650 softdep_setup_unlink(dp, ip)
4651         struct inode *dp;
4652         struct inode *ip;
4653 {
4654         struct vnode *dvp;
4655
4656         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4657             ("softdep_setup_unlink called on non-softdep filesystem"));
4658         dvp = ITOV(dp);
4659         ACQUIRE_LOCK(dp->i_ump);
4660         (void) inodedep_lookup_ip(ip);
4661         (void) inodedep_lookup_ip(dp);
4662         softdep_prelink(dvp, ITOV(ip));
4663         FREE_LOCK(dp->i_ump);
4664 }
4665
4666 /*
4667  * Called to release the journal structures created by a failed non-directory
4668  * creation.  Adjusts nlinkdelta for non-journaling softdep.
      *
      * dp is the directory and ip the inode whose creation failed.  When
      * journaling, the last jaddref on ip's id_inoreflst, which must have dp
      * as its parent, is cancelled with cancel_jaddref() using the inodedep's
      * id_inowait list as the work list.
4669  */
4670 void
4671 softdep_revert_create(dp, ip)
4672         struct inode *dp;
4673         struct inode *ip;
4674 {
4675         struct inodedep *inodedep;
4676         struct jaddref *jaddref;
4677         struct vnode *dvp;
4678
4679         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4680             ("softdep_revert_create called on non-softdep filesystem"));
4681         dvp = ITOV(dp);
4682         ACQUIRE_LOCK(dp->i_ump);
4683         inodedep = inodedep_lookup_ip(ip);
4684         if (DOINGSUJ(dvp)) {
4685                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4686                     inoreflst);
4687                 KASSERT(jaddref->ja_parent == dp->i_number,
4688                     ("softdep_revert_create: addref parent mismatch"));
4689                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4690         }
4691         FREE_LOCK(dp->i_ump);
4692 }
4693
4694 /*
4695  * Called to release the journal structures created by a failed link
4696  * addition.  Adjusts nlinkdelta for non-journaling softdep.
      *
      * dp is the directory and ip the inode whose new link failed.  When
      * journaling, the last jaddref on ip's id_inoreflst, which must have dp
      * as its parent, is cancelled with cancel_jaddref() using the inodedep's
      * id_inowait list as the work list.
4697  */
4698 void
4699 softdep_revert_link(dp, ip)
4700         struct inode *dp;
4701         struct inode *ip;
4702 {
4703         struct inodedep *inodedep;
4704         struct jaddref *jaddref;
4705         struct vnode *dvp;
4706
4707         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4708             ("softdep_revert_link called on non-softdep filesystem"));
4709         dvp = ITOV(dp);
4710         ACQUIRE_LOCK(dp->i_ump);
4711         inodedep = inodedep_lookup_ip(ip);
4712         if (DOINGSUJ(dvp)) {
4713                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4714                     inoreflst);
4715                 KASSERT(jaddref->ja_parent == dp->i_number,
4716                     ("softdep_revert_link: addref parent mismatch"));
4717                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4718         }
4719         FREE_LOCK(dp->i_ump);
4720 }
4721
4722 /*
4723  * Called to release the journal structures created by a failed mkdir
4724  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
      *
      * dp is the parent directory and ip the directory whose creation failed.
      * When journaling, three jaddrefs are cancelled: the last entry on dp's
      * id_inoreflst (the ".." reference, whose parent is ip), the last entry
      * on ip's id_inoreflst (the link from dp) and the entry just before it
      * (the "." reference, whose parent is ip itself).
4725  */
4726 void
4727 softdep_revert_mkdir(dp, ip)
4728         struct inode *dp;
4729         struct inode *ip;
4730 {
4731         struct inodedep *inodedep;
4732         struct jaddref *jaddref;
4733         struct jaddref *dotaddref;
4734         struct vnode *dvp;
4735
4736         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4737             ("softdep_revert_mkdir called on non-softdep filesystem"));
4738         dvp = ITOV(dp);
4739
4740         ACQUIRE_LOCK(dp->i_ump);
4741         inodedep = inodedep_lookup_ip(dp);
4742         if (DOINGSUJ(dvp)) {
4743                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4744                     inoreflst);
4745                 KASSERT(jaddref->ja_parent == ip->i_number,
4746                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4747                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4748         }
4749         inodedep = inodedep_lookup_ip(ip);
4750         if (DOINGSUJ(dvp)) {
4751                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4752                     inoreflst);
4753                 KASSERT(jaddref->ja_parent == dp->i_number,
4754                     ("softdep_revert_mkdir: addref parent mismatch"));
                     /*
                      * Locate the "." jaddref before cancelling its successor:
                      * cancel_jaddref() may unlink jaddref from the list.
                      */
4755                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4756                     inoreflst, if_deps);
4757                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4758                 KASSERT(dotaddref->ja_parent == ip->i_number,
4759                     ("softdep_revert_mkdir: dot addref parent mismatch"));
4760                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4761         }
4762         FREE_LOCK(dp->i_ump);
4763 }
4764
4765 /*
4766  * Called to correct nlinkdelta after a failed rmdir.
      *
      * dp is the parent directory and ip the directory that was to be removed.
      * Both inodedeps are looked up under the softdep lock purely for the
      * id_nlinkdelta refresh performed by inodedep_lookup_ip().
4767  */
4768 void
4769 softdep_revert_rmdir(dp, ip)
4770         struct inode *dp;
4771         struct inode *ip;
4772 {
4773
4774         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4775             ("softdep_revert_rmdir called on non-softdep filesystem"));
4776         ACQUIRE_LOCK(dp->i_ump);
4777         (void) inodedep_lookup_ip(ip);
4778         (void) inodedep_lookup_ip(dp);
4779         FREE_LOCK(dp->i_ump);
4780 }
4781
4782 /*
4783  * Protecting the freemaps (or bitmaps).
4784  *
4785  * To eliminate the need to execute fsck before mounting a filesystem
4786  * after a power failure, one must (conservatively) guarantee that the
4787  * on-disk copy of the bitmaps never indicates that a live inode or block is
4788  * free.  So, when a block or inode is allocated, the bitmap should be
4789  * updated (on disk) before any new pointers.  When a block or inode is
4790  * freed, the bitmap should not be updated until all pointers have been
4791  * reset.  The latter dependency is handled by the delayed de-allocation
4792  * approach described below for block and inode de-allocation.  The former
4793  * dependency is handled by calling the following procedure when a block or
4794  * inode is allocated. When an inode is allocated an "inodedep" is created
4795  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4796  * Each "inodedep" is also inserted into the hash indexing structure so
4797  * that any additional link additions can be made dependent on the inode
4798  * allocation.
4799  *
4800  * The ufs filesystem maintains a number of free block counts (e.g., per
4801  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4802  * in addition to the bitmaps.  These counts are used to improve efficiency
4803  * during allocation and therefore must be consistent with the bitmaps.
4804  * There is no convenient way to guarantee post-crash consistency of these
4805  * counts with simple update ordering, for two main reasons: (1) The counts
4806  * and bitmaps for a single cylinder group block are not in the same disk
4807  * sector.  If a disk write is interrupted (e.g., by power failure), one may
4808  * be written and the other not.  (2) Some of the counts are located in the
4809  * superblock rather than the cylinder group block. So, we focus our soft
4810  * updates implementation on protecting the bitmaps. When mounting a
4811  * filesystem, we recompute the auxiliary counts from the bitmaps.
4812  */
4813
4814 /*
4815  * Called just after updating the cylinder group block to allocate an inode.
      *
      * Creates the inodedep for the new inode, ties it to the bmsafemap of the
      * cylinder group buffer and clears DEPCOMPLETE on it.  On a journaled
      * mount a NEWBLOCK jaddref is created as well and is linked on both the
      * bmsafemap's sm_jaddrefhd list and the inodedep's id_inoreflst; without
      * journaling the inodedep itself goes on sm_inodedephd and is marked
      * ONDEPLIST.
      *
      * Panics if an inodedep for newinum already exists.
4816  */
4817 void
4818 softdep_setup_inomapdep(bp, ip, newinum, mode)
4819         struct buf *bp;         /* buffer for cylgroup block with inode map */
4820         struct inode *ip;       /* inode related to allocation */
4821         ino_t newinum;          /* new inode number being allocated */
4822         int mode;               /* mode passed on to newjaddref() */
4823 {
4824         struct inodedep *inodedep;
4825         struct bmsafemap *bmsafemap;
4826         struct jaddref *jaddref;
4827         struct mount *mp;
4828         struct fs *fs;
4829
4830         mp = UFSTOVFS(ip->i_ump);
4831         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
4832             ("softdep_setup_inomapdep called on non-softdep filesystem"));
4833         fs = ip->i_ump->um_fs;
4834         jaddref = NULL;
4835
4836         /*
4837          * Allocate the journal reference add structure so that the bitmap
4838          * can be dependent on it.
4839          */
4840         if (MOUNTEDSUJ(mp)) {
4841                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
4842                 jaddref->ja_state |= NEWBLOCK;
4843         }
4844
4845         /*
4846          * Create a dependency for the newly allocated inode.
4847          * Panic if it already exists as something is seriously wrong.
4848          * Otherwise add it to the dependency list for the buffer holding
4849          * the cylinder group map from which it was allocated.
4850          *
4851          * We have to preallocate a bmsafemap entry in case it is needed
4852          * in bmsafemap_lookup since once we allocate the inodedep, we
4853          * have to finish initializing it before we can FREE_LOCK().
4854          * By preallocating, we avoid FREE_LOCK() while doing a malloc
4855          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
4856          * creating the inodedep as it can be freed during the time
4857          * that we FREE_LOCK() while allocating the inodedep. We must
4858          * call workitem_alloc() before entering the locked section as
4859          * it also acquires the lock and we must avoid trying to do so
4860          * recursively.
4861          */
4862         bmsafemap = malloc(sizeof(struct bmsafemap),
4863             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
4864         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
4865         ACQUIRE_LOCK(ip->i_ump);
             /*
              * The two string literals below are concatenated, so the first
              * must end in a space to keep "new inode" as two words.
              */
4866         if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
4867                 panic("softdep_setup_inomapdep: dependency %p for new "
4868                     "inode already exists", inodedep);
             /* From here on bmsafemap is whatever bmsafemap_lookup() returned. */
4869         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
4870         if (jaddref) {
4871                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
4872                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4873                     if_deps);
4874         } else {
4875                 inodedep->id_state |= ONDEPLIST;
4876                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
4877         }
4878         inodedep->id_bmsafemap = bmsafemap;
4879         inodedep->id_state &= ~DEPCOMPLETE;
4880         FREE_LOCK(ip->i_ump);
4881 }
4882
4883 /*
4884  * Called just after updating the cylinder group block to
4885  * allocate block or fragment.
      *
      * Creates the newblk for newblkno and ties it to the bmsafemap of the
      * cylinder group buffer.  On a journaled mount a jnewblk recording the
      * block number and the old and new fragment counts is created as well
      * and linked on the bmsafemap's sm_jnewblkhd list; without journaling the
      * newblk itself goes on sm_newblkhd and is marked ONDEPLIST.
      *
      * Panics if a newblk for newblkno already exists.
4886  */
4887 void
4888 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
4889         struct buf *bp;         /* buffer for cylgroup block with block map */
4890         struct mount *mp;       /* filesystem doing allocation */
4891         ufs2_daddr_t newblkno;  /* number of newly allocated block */
4892         int frags;              /* Number of fragments. */
4893         int oldfrags;           /* Previous number of fragments for extend. */
4894 {
4895         struct newblk *newblk;
4896         struct bmsafemap *bmsafemap;
4897         struct jnewblk *jnewblk;
4898         struct ufsmount *ump;
4899         struct fs *fs;
4900
4901         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
4902             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
4903         ump = VFSTOUFS(mp);
4904         fs = ump->um_fs;
4905         jnewblk = NULL;
4906         /*
4907          * Create a dependency for the newly allocated block.
4908          * Add it to the dependency list for the buffer holding
4909          * the cylinder group map from which it was allocated.
4910          */
4911         if (MOUNTEDSUJ(mp)) {
4912                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
4913                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
4914                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
4915                 jnewblk->jn_state = ATTACHED;
4916                 jnewblk->jn_blkno = newblkno;
4917                 jnewblk->jn_frags = frags;
4918                 jnewblk->jn_oldfrags = oldfrags;
4919 #ifdef SUJ_DEBUG
                     /*
                      * Debug check: none of the fragments oldfrags..frags-1 may
                      * still be set in the cylinder group's free-block map.
                      *
                      * NOTE(review): jn_dep is not assigned in this function
                      * until after the lock is taken below, so the value
                      * printed by this panic is whatever malloc() left there --
                      * confirm whether M_SOFTDEP_FLAGS zeroes the allocation.
                      */
4920                 {
4921                         struct cg *cgp;
4922                         uint8_t *blksfree;
4923                         long bno;
4924                         int i;
4925
4926                         cgp = (struct cg *)bp->b_data;
4927                         blksfree = cg_blksfree(cgp);
4928                         bno = dtogd(fs, jnewblk->jn_blkno);
4929                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
4930                             i++) {
4931                                 if (isset(blksfree, bno + i))
4932                                         panic("softdep_setup_blkmapdep: "
4933                                             "free fragment %d from %d-%d "
4934                                             "state 0x%X dep %p", i,
4935                                             jnewblk->jn_oldfrags,
4936                                             jnewblk->jn_frags,
4937                                             jnewblk->jn_state,
4938                                             jnewblk->jn_dep);
4939                         }
4940                 }
4941 #endif
4942         }
4943
4944         CTR3(KTR_SUJ,
4945             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
4946             newblkno, frags, oldfrags);
4947         ACQUIRE_LOCK(ump);
4948         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
4949                 panic("softdep_setup_blkmapdep: found block");
             /*
              * NOTE(review): nb_bmsafemap is stored here and again, with the
              * same value, just before the lock is released; one of the two
              * stores is redundant.
              */
4950         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
4951             dtog(fs, newblkno), NULL);
4952         if (jnewblk) {
4953                 jnewblk->jn_dep = (struct worklist *)newblk;
4954                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
4955         } else {
4956                 newblk->nb_state |= ONDEPLIST;
4957                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
4958         }
4959         newblk->nb_bmsafemap = bmsafemap;
4960         newblk->nb_jnewblk = jnewblk;
4961         FREE_LOCK(ump);
4962 }
4963
/*
 * Map a cylinder group number to the head of its bmsafemap hash chain.
 * The bucket index is (cg & bmsafemap_hash_size), so bmsafemap_hash_size
 * is presumably a power-of-two-minus-one mask rather than an element
 * count (TODO confirm where bmsafemap_hashtbl is created).
 */
4964 #define BMSAFEMAP_HASH(ump, cg) \
4965       (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
4966
/*
 * Search a single bmsafemap hash chain for the entry belonging to
 * cylinder group cg.  Returns 1 and stores the entry in *bmsafemapp
 * when one is found; otherwise returns 0 and stores NULL.
 */
4967 static int
4968 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
4969         struct bmsafemap_hashhead *bmsafemaphd; /* hash chain to search */
4970         int cg;                                 /* cylinder group wanted */
4971         struct bmsafemap **bmsafemapp;          /* out: match or NULL */
4972 {
4973         struct bmsafemap *bmsafemap;
4974
             /*
              * The test after the loop relies on LIST_FOREACH leaving the
              * cursor NULL when the chain is exhausted without a match.
              */
4975         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
4976                 if (bmsafemap->sm_cg == cg)
4977                         break;
4978         if (bmsafemap) {
4979                 *bmsafemapp = bmsafemap;
4980                 return (1);
4981         }
4982         *bmsafemapp = NULL;
4983
4984         return (0);
4985 }
4986
4987 /*
4988  * Find the bmsafemap associated with a cylinder group buffer.
4989  * If none exists, create one. The buffer must be locked when
4990  * this routine is called and this routine must be called with
4991  * the softdep lock held. To avoid giving up the lock while
4992  * allocating a new bmsafemap, a preallocated bmsafemap may be
4993  * provided. If it is provided but not needed, it is freed.
      *
      * If no preallocated bmsafemap is passed and one has to be created,
      * the softdep lock is released around the allocation and reacquired
      * afterwards, so callers must tolerate the lock being dropped.
      *
      * Returns the existing or newly inserted bmsafemap.
4994  */
4995 static struct bmsafemap *
4996 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
4997         struct mount *mp;               /* mount being operated on */
4998         struct buf *bp;                 /* cylinder group buffer */
4999         int cg;                         /* cylinder group number */
5000         struct bmsafemap *newbmsafemap; /* optional preallocated entry */
5001 {
5002         struct bmsafemap_hashhead *bmsafemaphd;
5003         struct bmsafemap *bmsafemap, *collision;
5004         struct worklist *wk;
5005         struct ufsmount *ump;
5006
5007         ump = VFSTOUFS(mp);
5008         LOCK_OWNED(ump);
5009         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
             /* First choice: a bmsafemap already hanging off the buffer. */
5010         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5011                 if (wk->wk_type == D_BMSAFEMAP) {
5012                         if (newbmsafemap)
5013                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5014                         return (WK_BMSAFEMAP(wk));
5015                 }
5016         }
             /* Second choice: an entry for this cg already in the hash table. */
5017         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5018         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5019                 if (newbmsafemap)
5020                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5021                 return (bmsafemap);
5022         }
             /*
              * Nothing exists yet.  Use the caller's preallocated entry if
              * there is one; otherwise release the lock around the
              * allocation and take it again afterwards.
              */
5023         if (newbmsafemap) {
5024                 bmsafemap = newbmsafemap;
5025         } else {
5026                 FREE_LOCK(ump);
5027                 bmsafemap = malloc(sizeof(struct bmsafemap),
5028                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5029                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5030                 ACQUIRE_LOCK(ump);
5031         }
5032         bmsafemap->sm_buf = bp;
5033         LIST_INIT(&bmsafemap->sm_inodedephd);
5034         LIST_INIT(&bmsafemap->sm_inodedepwr);
5035         LIST_INIT(&bmsafemap->sm_newblkhd);
5036         LIST_INIT(&bmsafemap->sm_newblkwr);
5037         LIST_INIT(&bmsafemap->sm_jaddrefhd);
5038         LIST_INIT(&bmsafemap->sm_jnewblkhd);
5039         LIST_INIT(&bmsafemap->sm_freehd);
5040         LIST_INIT(&bmsafemap->sm_freewr);
             /*
              * Search the chain again before inserting; presumably this
              * catches an entry added for the same cg while the lock was
              * released above.  The check is unconditional, so it also runs
              * when a preallocated entry was used.  On a hit the entry
              * prepared here is discarded in favor of the existing one.
              */
5041         if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5042                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5043                 return (collision);
5044         }
             /* Link the new entry to the hash chain, dirty-cg list and buffer. */
5045         bmsafemap->sm_cg = cg;
5046         LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5047         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5048         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5049         return (bmsafemap);
5050 }
5051
5052 /*
5053  * Direct block allocation dependencies.
5054  *
5055  * When a new block is allocated, the corresponding disk locations must be
5056  * initialized (with zeros or new data) before the on-disk inode points to
5057  * them.  Also, the freemap from which the block was allocated must be
5058  * updated (on disk) before the inode's pointer. These two dependencies are
5059  * independent of each other and are needed for all file blocks and indirect
5060  * blocks that are pointed to directly by the inode.  Just before the
5061  * "in-core" version of the inode is updated with a newly allocated block
5062  * number, a procedure (below) is called to setup allocation dependency
5063  * structures.  These structures are removed when the corresponding
5064  * dependencies are satisfied or when the block allocation becomes obsolete
5065  * (i.e., the file is deleted, the block is de-allocated, or the block is a
5066  * fragment that gets upgraded).  All of these cases are handled in
5067  * procedures described later.
5068  *
5069  * When a file extension causes a fragment to be upgraded, either to a larger
5070  * fragment or to a full block, the on-disk location may change (if the
5071  * previous fragment could not simply be extended). In this case, the old
5072  * fragment must be de-allocated, but not until after the inode's pointer has
5073  * been updated. In most cases, this is handled by later procedures, which
5074  * will construct a "freefrag" structure to be added to the workitem queue
5075  * when the inode update is complete (or obsolete).  The main exception to
5076  * this is when an allocation occurs while a pending allocation dependency
5077  * (for the same block pointer) remains.  This case is handled in the main
5078  * allocation dependency setup procedure by immediately freeing the
5079  * unreferenced fragments.
5080  */
5081 void
5082 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5083         struct inode *ip;       /* inode to which block is being added */
5084         ufs_lbn_t off;          /* block pointer within inode */
5085         ufs2_daddr_t newblkno;  /* disk block number being added */
5086         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
5087         long newsize;           /* size of new block */
5088         long oldsize;           /* size of previous block (see oldblkno) */
5089         struct buf *bp;         /* bp for allocated block */
5090 {
5091         struct allocdirect *adp, *oldadp;
5092         struct allocdirectlst *adphead;
5093         struct freefrag *freefrag;
5094         struct inodedep *inodedep;
5095         struct pagedep *pagedep;
5096         struct jnewblk *jnewblk;
5097         struct newblk *newblk;
5098         struct mount *mp;
5099         ufs_lbn_t lbn;
5100
5101         lbn = bp->b_lblkno;
5102         mp = UFSTOVFS(ip->i_ump);
5103         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5104             ("softdep_setup_allocdirect called on non-softdep filesystem"));
             /*
              * A previous block at a different address is released through
              * a freefrag.  It is built here, before the softdep lock is
              * taken.
              */
5105         if (oldblkno && oldblkno != newblkno)
5106                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5107         else
5108                 freefrag = NULL;
5109
             /* newsize and oldsize are both long, hence %ld for each. */
5110         CTR6(KTR_SUJ,
5111             "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5112             "off %jd newsize %ld oldsize %ld",
5113             ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5114         ACQUIRE_LOCK(ip->i_ump);
5115         if (off >= NDADDR) {
5116                 if (lbn > 0)
5117                         panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5118                             lbn, off);
5119                 /* allocating an indirect block */
5120                 if (oldblkno != 0)
5121                         panic("softdep_setup_allocdirect: non-zero indir");
5122         } else {
5123                 if (off != lbn)
5124                         panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5125                             lbn, off);
5126                 /*
5127                  * Allocating a direct block.
5128                  *
5129                  * If we are allocating a directory block, then we must
5130                  * allocate an associated pagedep to track additions and
5131                  * deletions.
5132                  */
5133                 if ((ip->i_mode & IFMT) == IFDIR)
5134                         pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5135                             &pagedep);
5136         }
             /*
              * The newblk for this block must already exist: the lookup is
              * done without DEPALLOC and a miss is fatal.
              */
5137         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5138                 panic("softdep_setup_allocdirect: lost block");
5139         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5140             ("softdep_setup_allocdirect: newblk already initialized"));
5141         /*
5142          * Convert the newblk to an allocdirect.
5143          */
5144         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5145         adp = (struct allocdirect *)newblk;
5146         newblk->nb_freefrag = freefrag;
5147         adp->ad_offset = off;
5148         adp->ad_oldblkno = oldblkno;
5149         adp->ad_newsize = newsize;
5150         adp->ad_oldsize = oldsize;
5151
5152         /*
5153          * Finish initializing the journal.
5154          */
5155         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5156                 jnewblk->jn_ino = ip->i_number;
5157                 jnewblk->jn_lbn = lbn;
5158                 add_to_journal(&jnewblk->jn_list);
5159         }
5160         if (freefrag && freefrag->ff_jdep != NULL &&
5161             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5162                 add_to_journal(freefrag->ff_jdep);
5163         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5164         adp->ad_inodedep = inodedep;
5165
5166         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5167         /*
5168          * The list of allocdirects must be kept in sorted and ascending
5169          * order so that the rollback routines can quickly determine the
5170          * first uncommitted block (the size of the file stored on disk
5171          * ends at the end of the lowest committed fragment, or if there
5172          * are no fragments, at the end of the highest committed block).
5173          * Since files generally grow, the typical case is that the new
5174          * block is to be added at the end of the list. We speed this
5175          * special case by checking against the last allocdirect in the
5176          * list before laboriously traversing the list looking for the
5177          * insertion point.
5178          */
5179         adphead = &inodedep->id_newinoupdt;
5180         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5181         if (oldadp == NULL || oldadp->ad_offset <= off) {
5182                 /* insert at end of list */
5183                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
                     /* An existing entry for the same offset is superseded. */
5184                 if (oldadp != NULL && oldadp->ad_offset == off)
5185                         allocdirect_merge(adphead, adp, oldadp);
5186                 FREE_LOCK(ip->i_ump);
5187                 return;
5188         }
             /* Find the first entry whose offset is not below the new one. */
5189         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5190                 if (oldadp->ad_offset >= off)
5191                         break;
5192         }
5193         if (oldadp == NULL)
5194                 panic("softdep_setup_allocdirect: lost entry");
5195         /* insert in middle of list */
5196         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5197         if (oldadp->ad_offset == off)
5198                 allocdirect_merge(adphead, adp, oldadp);
5199
5200         FREE_LOCK(ip->i_ump);
5201 }
5202
5203 /*
5204  * Merge a newer and older journal record to be stored either in a
5205  * newblock or freefrag.  This handles aggregating journal records for
5206  * fragment allocation into a second record as well as replacing a
5207  * journal free with an aborted journal allocation.  A segment for the
5208  * oldest record will be placed on wkhd if it has been written.  If not
5209  * the segment for the newer record will suffice.
      *
      * Either record may be NULL, in which case the other one is returned
      * unchanged.  Otherwise the return value is the record that survives:
      * 'old' when 'new' is a jfreefrag (which is cancelled), or 'new' after
      * 'old' has been cancelled and freed.  Any other combination of
      * worklist types panics.
5210  */
5211 static struct worklist *
5212 jnewblk_merge(new, old, wkhd)
5213         struct worklist *new;   /* newer journal record, may be NULL */
5214         struct worklist *old;   /* older journal record, may be NULL */
5215         struct workhead *wkhd;  /* list passed on to cancel_jnewblk() */
5216 {
5217         struct jnewblk *njnewblk;
5218         struct jnewblk *jnewblk;
5219
5220         /* Handle NULLs to simplify callers. */
5221         if (new == NULL)
5222                 return (old);
5223         if (old == NULL)
5224                 return (new);
5225         /* Replace a jfreefrag with a jnewblk. */
5226         if (new->wk_type == D_JFREEFRAG) {
5227                 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5228                         panic("jnewblk_merge: blkno mismatch: %p, %p",
5229                             old, new);
5230                 cancel_jfreefrag(WK_JFREEFRAG(new));
5231                 return (old);
5232         }
5233         if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5234                 panic("jnewblk_merge: Bad type: old %d new %d\n",
5235                     old->wk_type, new->wk_type);
5236         /*
5237          * Handle merging of two jnewblk records that describe
5238          * different sets of fragments in the same block.
5239          */
5240         jnewblk = WK_JNEWBLK(old);
5241         njnewblk = WK_JNEWBLK(new);
5242         if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5243                 panic("jnewblk_merge: Merging disparate blocks.");
5244         /*
5245          * The record may be rolled back in the cg.
              * If so, the UNDONE state moves from the older record to the
              * newer one, which also loses ATTACHED.
5246          */
5247         if (jnewblk->jn_state & UNDONE) {
5248                 jnewblk->jn_state &= ~UNDONE;
5249                 njnewblk->jn_state |= UNDONE;
5250                 njnewblk->jn_state &= ~ATTACHED;
5251         }
5252         /*
5253          * We modify the newer jnewblk and free the older so that if neither
5254          * has been written the most up-to-date copy will be on disk.  If
5255          * both have been written but rolled back we only temporarily need
5256          * one of them to fix the bits when the cg write completes.
              * The newer record inherits jn_oldfrags from the older one so
              * that it describes the fragments of both.
5257          */
5258         jnewblk->jn_state |= ATTACHED | COMPLETE;
5259         njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5260         cancel_jnewblk(jnewblk, wkhd);
5261         WORKLIST_REMOVE(&jnewblk->jn_list);
5262         free_jnewblk(jnewblk);
5263         return (new);
5264 }
5265
5266 /*
5267  * Replace an old allocdirect dependency with a newer one.
5268  * The caller must hold the softdep lock of the mount that owns newadp
      * (see the LOCK_OWNED() check below).  oldadp is removed from adphead
      * and handed to free_newblk() before this routine returns.
5269  */
5270 static void
5271 allocdirect_merge(adphead, newadp, oldadp)
5272         struct allocdirectlst *adphead; /* head of list holding allocdirects */
5273         struct allocdirect *newadp;     /* allocdirect being added */
5274         struct allocdirect *oldadp;     /* existing allocdirect being checked */
5275 {
5276         struct worklist *wk;
5277         struct freefrag *freefrag;
5278
5279         freefrag = NULL;
5280         LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
             /*
              * The new dependency must start from exactly the block and size
              * the old one produced, and must be for a direct block.  Note
              * that the panic message does not mention the NDADDR case.
              */
5281         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5282             newadp->ad_oldsize != oldadp->ad_newsize ||
5283             newadp->ad_offset >= NDADDR)
5284                 panic("%s %jd != new %jd || old size %ld != new %ld",
5285                     "allocdirect_merge: old blkno",
5286                     (intmax_t)newadp->ad_oldblkno,
5287                     (intmax_t)oldadp->ad_newblkno,
5288                     newadp->ad_oldsize, oldadp->ad_newsize);
             /* The merged dependency now spans from the old one's origin. */
5289         newadp->ad_oldblkno = oldadp->ad_oldblkno;
5290         newadp->ad_oldsize = oldadp->ad_oldsize;
5291         /*
5292          * If the old dependency had a fragment to free or had never
5293          * previously had a block allocated, then the new dependency
5294          * can immediately post its freefrag and adopt the old freefrag.
5295          * This action is done by swapping the freefrag dependencies.
5296          * The new dependency gains the old one's freefrag, and the
5297          * old one gets the new one and then immediately puts it on
5298          * the worklist when it is freed by free_newblk. It is
5299          * not possible to do this swap when the old dependency had a
5300          * non-zero size but no previous fragment to free. This condition
5301          * arises when the new block is an extension of the old block.
5302          * Here, the first part of the fragment allocated to the new
5303          * dependency is part of the block currently claimed on disk by
5304          * the old dependency, so cannot legitimately be freed until the
5305          * conditions for the new dependency are fulfilled.
5306          */
5307         freefrag = newadp->ad_freefrag;
5308         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5309                 newadp->ad_freefrag = oldadp->ad_freefrag;
5310                 oldadp->ad_freefrag = freefrag;
5311         }
5312         /*
5313          * If we are tracking a new directory-block allocation,
5314          * move it from the old allocdirect to the new allocdirect.
5315          */
5316         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5317                 WORKLIST_REMOVE(wk);
5318                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5319                         panic("allocdirect_merge: extra newdirblk");
5320                 WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5321         }
5322         TAILQ_REMOVE(adphead, oldadp, ad_next);
5323         /*
5324          * We need to move any journal dependencies over to the freefrag
5325          * that releases this block if it exists.  Otherwise we are
5326          * extending an existing block and we'll wait until that is
5327          * complete to release the journal space and extend the
5328          * new journal to cover this old space as well.
              *
              * 'freefrag' here is the value newadp held on entry, taken
              * before any swap above.
5329          */
5330         if (freefrag == NULL) {
5331                 if (oldadp->ad_newblkno != newadp->ad_newblkno)
5332                         panic("allocdirect_merge: %jd != %jd",
5333                             oldadp->ad_newblkno, newadp->ad_newblkno);
                     /*
                      * NOTE(review): if either nb_jnewblk is NULL, the
                      * &...->jn_list expressions below only yield the NULL
                      * that jnewblk_merge() accepts when jn_list is the
                      * first member of struct jnewblk -- confirm.
                      */
5334                 newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5335                     jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5336                     &oldadp->ad_block.nb_jnewblk->jn_list,
5337                     &newadp->ad_block.nb_jwork);
5338                 oldadp->ad_block.nb_jnewblk = NULL;
5339                 cancel_newblk(&oldadp->ad_block, NULL,
5340                     &newadp->ad_block.nb_jwork);
5341         } else {
5342                 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5343                     &freefrag->ff_list, &freefrag->ff_jwork);
5344                 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5345                     &freefrag->ff_jwork);
5346         }
5347         free_newblk(&oldadp->ad_block);
5348 }
5349
5350 /*
5351  * Allocate a jfreefrag structure to journal a single block free.
      * The record starts out ATTACHED | DEPCOMPLETE, gets its own jsegdep,
      * and points back at the freefrag it belongs to.  The size in bytes is
      * converted to a fragment count with numfrags().  Returns the new
      * record; nothing here adds it to the journal.
5352  */
5353 static struct jfreefrag *
5354 newjfreefrag(freefrag, ip, blkno, size, lbn)
5355         struct freefrag *freefrag;      /* freefrag this record belongs to */
5356         struct inode *ip;               /* inode that owned the block */
5357         ufs2_daddr_t blkno;             /* disk block number being freed */
5358         long size;                      /* size in bytes being freed */
5359         ufs_lbn_t lbn;                  /* logical block number recorded */
5360 {
5361         struct jfreefrag *jfreefrag;
5362         struct fs *fs;
5363
5364         fs = ip->i_fs;
5365         jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5366             M_SOFTDEP_FLAGS);
5367         workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5368         jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5369         jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5370         jfreefrag->fr_ino = ip->i_number;
5371         jfreefrag->fr_lbn = lbn;
5372         jfreefrag->fr_blkno = blkno;
5373         jfreefrag->fr_frags = numfrags(fs, size);
5374         jfreefrag->fr_freefrag = freefrag;
5375
5376         return (jfreefrag);
5377 }
5378
5379 /*
5380  * Allocate a new freefrag structure.
      * The freefrag records the inode number, vnode type, block number and
      * size of the fragment to release.  On a journaled (SUJ) mount it is
      * paired with a jfreefrag stored in ff_jdep; otherwise it is marked
      * DEPCOMPLETE at once and ff_jdep is NULL.  Panics if the fragment run
      * would extend past the end of its block.  The callers visible in this
      * file invoke it before acquiring the softdep lock.
5381  */
5382 static struct freefrag *
5383 newfreefrag(ip, blkno, size, lbn)
5384         struct inode *ip;       /* inode that owned the fragment */
5385         ufs2_daddr_t blkno;     /* disk block number of the fragment */
5386         long size;              /* size in bytes of the fragment */
5387         ufs_lbn_t lbn;          /* logical block number, for the journal */
5388 {
5389         struct freefrag *freefrag;
5390         struct fs *fs;
5391
5392         CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5393             ip->i_number, blkno, size, lbn);
5394         fs = ip->i_fs;
             /* The fragments must all lie within a single block. */
5395         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5396                 panic("newfreefrag: frag size");
5397         freefrag = malloc(sizeof(struct freefrag),
5398             M_FREEFRAG, M_SOFTDEP_FLAGS);
5399         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5400         freefrag->ff_state = ATTACHED;
5401         LIST_INIT(&freefrag->ff_jwork);
5402         freefrag->ff_inum = ip->i_number;
5403         freefrag->ff_vtype = ITOV(ip)->v_type;
5404         freefrag->ff_blkno = blkno;
5405         freefrag->ff_fragsize = size;
5406
5407         if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5408                 freefrag->ff_jdep = (struct worklist *)
5409                     newjfreefrag(freefrag, ip, blkno, size, lbn);
5410         } else {
5411                 freefrag->ff_state |= DEPCOMPLETE;
5412                 freefrag->ff_jdep = NULL;
5413         }
5414
5415         return (freefrag);
5416 }
5417
5418 /*
5419  * This workitem de-allocates fragments that were replaced during
5420  * file block allocation.
      *
      * The softdep lock is acquired and released internally, and is not
      * held across the call to ffs_blkfree().  The freefrag itself is
      * freed before returning.
5421  */
5422 static void
5423 handle_workitem_freefrag(freefrag)
5424         struct freefrag *freefrag;      /* fragment to release; consumed */
5425 {
5426         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5427         struct workhead wkhd;
5428
5429         CTR3(KTR_SUJ,
5430             "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5431             freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5432         /*
5433          * It would be illegal to add new completion items to the
5434          * freefrag after it was scheduled to be done so it must be
5435          * safe to modify the list head here.
5436          */
5437         LIST_INIT(&wkhd);
5438         ACQUIRE_LOCK(ump);
             /* Take over the pending journal work on a local list. */
5439         LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5440         /*
5441          * If the journal has not been written we must cancel it here.
              * The only dependency type accepted at this point is a
              * jnewblk; anything else is a panic.
5442          */
5443         if (freefrag->ff_jdep) {
5444                 if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5445                         panic("handle_workitem_freefrag: Unexpected type %d\n",
5446                             freefrag->ff_jdep->wk_type);
5447                 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5448         }
5449         FREE_LOCK(ump);
             /* Free the fragment with the lock released; wkhd goes along. */
5450         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5451            freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5452         ACQUIRE_LOCK(ump);
5453         WORKITEM_FREE(freefrag, D_FREEFRAG);
5454         FREE_LOCK(ump);
5455 }
5456
5457 /*
5458  * Set up a dependency structure for an external attributes data block.
5459  * This routine follows much of the structure of softdep_setup_allocdirect.
5460  * See the description of softdep_setup_allocdirect above for details.
      *
      * The differences visible here: the offset must be below NXADDR, the
      * allocdirect is flagged EXTDATA, no pagedep is created, and the
      * dependency is queued on the inodedep's id_newextupdt list.
5461  */
5462 void
5463 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5464         struct inode *ip;       /* inode to which block is being added */
5465         ufs_lbn_t off;          /* block pointer index, below NXADDR */
5466         ufs2_daddr_t newblkno;  /* disk block number being added */
5467         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5468         long newsize;           /* size of new block */
5469         long oldsize;           /* size of previous block */
5470         struct buf *bp;         /* bp for allocated block */
5471 {
5472         struct allocdirect *adp, *oldadp;
5473         struct allocdirectlst *adphead;
5474         struct freefrag *freefrag;
5475         struct inodedep *inodedep;
5476         struct jnewblk *jnewblk;
5477         struct newblk *newblk;
5478         struct mount *mp;
5479         ufs_lbn_t lbn;
5480
5481         mp = UFSTOVFS(ip->i_ump);
5482         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5483             ("softdep_setup_allocext called on non-softdep filesystem"));
             /* The assertion fails for off >= NXADDR; report it that way. */
5484         KASSERT(off < NXADDR, ("softdep_setup_allocext: off %lld >= NXADDR",
5485                     (long long)off));
5486
5487         lbn = bp->b_lblkno;
             /* A relocated previous block is released through a freefrag. */
5488         if (oldblkno && oldblkno != newblkno)
5489                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5490         else
5491                 freefrag = NULL;
5492
5493         ACQUIRE_LOCK(ip->i_ump);
             /* The newblk must already exist; no DEPALLOC is requested. */
5494         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5495                 panic("softdep_setup_allocext: lost block");
5496         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5497             ("softdep_setup_allocext: newblk already initialized"));
5498         /*
5499          * Convert the newblk to an allocdirect.
5500          */
5501         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5502         adp = (struct allocdirect *)newblk;
5503         newblk->nb_freefrag = freefrag;
5504         adp->ad_offset = off;
5505         adp->ad_oldblkno = oldblkno;
5506         adp->ad_newsize = newsize;
5507         adp->ad_oldsize = oldsize;
5508         adp->ad_state |=  EXTDATA;
5509
5510         /*
5511          * Finish initializing the journal.
5512          */
5513         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5514                 jnewblk->jn_ino = ip->i_number;
5515                 jnewblk->jn_lbn = lbn;
5516                 add_to_journal(&jnewblk->jn_list);
5517         }
5518         if (freefrag && freefrag->ff_jdep != NULL &&
5519             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5520                 add_to_journal(freefrag->ff_jdep);
5521         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5522         adp->ad_inodedep = inodedep;
5523
5524         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5525         /*
5526          * The list of allocdirects must be kept in sorted and ascending
5527          * order so that the rollback routines can quickly determine the
5528          * first uncommitted block (the size of the file stored on disk
5529          * ends at the end of the lowest committed fragment, or if there
5530          * are no fragments, at the end of the highest committed block).
5531          * Since files generally grow, the typical case is that the new
5532          * block is to be added at the end of the list. We speed this
5533          * special case by checking against the last allocdirect in the
5534          * list before laboriously traversing the list looking for the
5535          * insertion point.
5536          */
5537         adphead = &inodedep->id_newextupdt;
5538         oldadp = TAILQ_LAST(adphead, allocdirectlst);
5539         if (oldadp == NULL || oldadp->ad_offset <= off) {
5540                 /* insert at end of list */
5541                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5542                 if (oldadp != NULL && oldadp->ad_offset == off)
5543                         allocdirect_merge(adphead, adp, oldadp);
5544                 FREE_LOCK(ip->i_ump);
5545                 return;
5546         }
5547         TAILQ_FOREACH(oldadp, adphead, ad_next) {
5548                 if (oldadp->ad_offset >= off)
5549                         break;
5550         }
5551         if (oldadp == NULL)
5552                 panic("softdep_setup_allocext: lost entry");
5553         /* insert in middle of list */
5554         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5555         if (oldadp->ad_offset == off)
5556                 allocdirect_merge(adphead, adp, oldadp);
5557         FREE_LOCK(ip->i_ump);
5558 }
5559
5560 /*
5561  * Indirect block allocation dependencies.
5562  *
5563  * The same dependencies that exist for a direct block also exist when
5564  * a new block is allocated and pointed to by an entry in a block of
5565  * indirect pointers. The undo/redo states described above are also
5566  * used here. Because an indirect block contains many pointers that
5567  * may have dependencies, a second copy of the entire in-memory indirect
5568  * block is kept. The buffer cache copy is always completely up-to-date.
5569  * The second copy, which is used only as a source for disk writes,
5570  * contains only the safe pointers (i.e., those that have no remaining
5571  * update dependencies). The second copy is freed when all pointers
5572  * are safe. The cache is not allowed to replace indirect blocks with
5573  * pending update dependencies. If a buffer containing an indirect
5574  * block with dependencies is written, these routines will mark it
5575  * dirty again. It can only be successfully written once all the
5576  * dependencies are removed. The ffs_fsync routine in conjunction with
5577  * softdep_sync_metadata work together to get all the dependencies
5578  * removed so that a file can be successfully written to disk. Three
5579  * procedures are used when setting up indirect block pointer
5580  * dependencies. The division is necessary because of the organization
5581  * of the "balloc" routine and because of the distinction between file
5582  * pages and file metadata blocks.
5583  */
5584
5585 /*
5586  * Allocate a new allocindir structure.
      *
      * The existing newblk for newblkno is converted in place into an
      * allocindir; if no newblk is found the routine panics.  When a
      * previous block is being replaced, a freefrag covering a full
      * block (fs_bsize) is attached to it.
      *
      * Locking: the softdep lock is acquired here and is still held on
      * return; the caller is responsible for releasing it.
5587  */
5588 static struct allocindir *
5589 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5590         struct inode *ip;       /* inode for file being extended */
5591         int ptrno;              /* offset of pointer in indirect block */
5592         ufs2_daddr_t newblkno;  /* disk block number being added */
5593         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5594         ufs_lbn_t lbn;          /* logical block number recorded */
5595 {
5596         struct newblk *newblk;
5597         struct allocindir *aip;
5598         struct freefrag *freefrag;
5599         struct jnewblk *jnewblk;
5600
             /* Built before the lock is taken, since newfreefrag allocates. */
5601         if (oldblkno)
5602                 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5603         else
5604                 freefrag = NULL;
5605         ACQUIRE_LOCK(ip->i_ump);
             /*
              * NOTE(review): the panic text says "new_allocindir" although
              * the function is named newallocindir.
              */
5606         if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5607                 panic("new_allocindir: lost block");
5608         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5609             ("newallocindir: newblk already initialized"));
5610         WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5611         newblk->nb_freefrag = freefrag;
5612         aip = (struct allocindir *)newblk;
5613         aip->ai_offset = ptrno;
5614         aip->ai_oldblkno = oldblkno;
5615         aip->ai_lbn = lbn;
             /* Complete and queue any journal records for this allocation. */
5616         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5617                 jnewblk->jn_ino = ip->i_number;
5618                 jnewblk->jn_lbn = lbn;
5619                 add_to_journal(&jnewblk->jn_list);
5620         }
5621         if (freefrag && freefrag->ff_jdep != NULL &&
5622             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5623                 add_to_journal(freefrag->ff_jdep);
5624         return (aip);
5625 }
5626
5627 /*
5628  * Called just before setting an indirect block pointer
5629  * to a newly allocated file page.
      *
      * The softdep lock is taken inside newallocindir() and released here
      * after setup_allocindir_phase2().  Any freefrag returned by phase 2
      * is processed once the lock has been dropped.
5630  */
5631 void
5632 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5633         struct inode *ip;       /* inode for file being extended */
5634         ufs_lbn_t lbn;          /* allocated block number within file */
5635         struct buf *bp;         /* buffer with indirect blk referencing page */
5636         int ptrno;              /* offset of pointer in indirect block */
5637         ufs2_daddr_t newblkno;  /* disk block number being added */
5638         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
5639         struct buf *nbp;        /* buffer holding allocated page */
5640 {
5641         struct inodedep *inodedep;
5642         struct freefrag *freefrag;
5643         struct allocindir *aip;
5644         struct pagedep *pagedep;
5645         struct mount *mp;
5646         int dflags;
5647
5648         mp = UFSTOVFS(ip->i_ump);
5649         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5650             ("softdep_setup_allocindir_page called on non-softdep filesystem"));
             /* Report the lblkno of nbp, the buffer the check is made on. */
5651         KASSERT(lbn == nbp->b_lblkno,
5652             ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5653             lbn, nbp->b_lblkno));
5654         CTR4(KTR_SUJ,
5655             "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5656             "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5657         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
             /* newallocindir() returns with the softdep lock held. */
5658         aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5659         dflags = DEPALLOC;
5660         if (IS_SNAPSHOT(ip))
5661                 dflags |= NODELAY;
5662         (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5663         /*
5664          * If we are allocating a directory page, then we must
5665          * allocate an associated pagedep to track additions and
5666          * deletions.
5667          */
5668         if ((ip->i_mode & IFMT) == IFDIR)
5669                 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5670         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5671         freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5672         FREE_LOCK(ip->i_ump);
             /* handle_workitem_freefrag() takes the softdep lock itself. */
5673         if (freefrag)
5674                 handle_workitem_freefrag(freefrag);
5675 }
5676
5677 /*
5678  * Called just before setting an indirect block pointer to a
5679  * newly allocated indirect block.
5680  */
5681 void
5682 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5683         struct buf *nbp;        /* newly allocated indirect block */
5684         struct inode *ip;       /* inode for file being extended */
5685         struct buf *bp;         /* indirect block referencing allocated block */
5686         int ptrno;              /* offset of pointer in indirect block */
5687         ufs2_daddr_t newblkno;  /* disk block number being added */
5688 {
5689         struct inodedep *inodedep;
5690         struct allocindir *aip;
5691         ufs_lbn_t lbn;
5692         int dflags;
5693
5694         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5695             ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5696         CTR3(KTR_SUJ,
5697             "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5698             ip->i_number, newblkno, ptrno);
5699         lbn = nbp->b_lblkno;
5700         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5701         aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5702         dflags = DEPALLOC;
5703         if (IS_SNAPSHOT(ip))
5704                 dflags |= NODELAY;
5705         inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5706         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5707         if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5708                 panic("softdep_setup_allocindir_meta: Block already existed");
5709         FREE_LOCK(ip->i_ump);
5710 }
5711
5712 static void
5713 indirdep_complete(indirdep)
5714         struct indirdep *indirdep;
5715 {
5716         struct allocindir *aip;
5717
5718         LIST_REMOVE(indirdep, ir_next);
5719         indirdep->ir_state |= DEPCOMPLETE;
5720
5721         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5722                 LIST_REMOVE(aip, ai_next);
5723                 free_newblk(&aip->ai_block);
5724         }
5725         /*
5726          * If this indirdep is not attached to a buf it was simply waiting
5727          * on completion to clear completehd.  free_indirdep() asserts
5728          * that nothing is dangling.
5729          */
5730         if ((indirdep->ir_state & ONWORKLIST) == 0)
5731                 free_indirdep(indirdep);
5732 }
5733
     /*
      * Return the indirdep attached to the indirect block buffer "bp",
      * creating and attaching a new one when the buffer has none.
      *
      * The caller holds the softdep lock for "mp" (checked by LOCK_OWNED
      * below) and it is held again on return.  The lock is released while a
      * new indirdep and its ir_savebp copy of the block are prepared, after
      * which bp's dependency list is scanned a second time; finding an
      * indirdep on that second pass panics.
      */
5734 static struct indirdep *
5735 indirdep_lookup(mp, ip, bp)
5736         struct mount *mp;
5737         struct inode *ip;
5738         struct buf *bp;
5739 {
5740         struct indirdep *indirdep, *newindirdep;
5741         struct newblk *newblk;
5742         struct ufsmount *ump;
5743         struct worklist *wk;
5744         struct fs *fs;
5745         ufs2_daddr_t blkno;
5746
5747         ump = VFSTOUFS(mp);
5748         LOCK_OWNED(ump);
5749         indirdep = NULL;
5750         newindirdep = NULL;
5751         fs = ip->i_fs;
5752         for (;;) {
             /* Look for an existing indirdep among bp's dependencies. */
5753                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5754                         if (wk->wk_type != D_INDIRDEP)
5755                                 continue;
5756                         indirdep = WK_INDIRDEP(wk);
5757                         break;
5758                 }
5759                 /* Found on the buffer worklist, no new structure to free. */
5760                 if (indirdep != NULL && newindirdep == NULL)
5761                         return (indirdep);
5762                 if (indirdep != NULL && newindirdep != NULL)
5763                         panic("indirdep_lookup: simultaneous create");
5764                 /* None found on the buffer and a new structure is ready. */
5765                 if (indirdep == NULL && newindirdep != NULL)
5766                         break;
5767                 /* None found and no new structure available. */
             /*
              * Build the new indirdep with the softdep lock dropped; the
              * lock is retaken at the bottom of the loop before rescanning.
              */
5768                 FREE_LOCK(ump);
5769                 newindirdep = malloc(sizeof(struct indirdep),
5770                     M_INDIRDEP, M_SOFTDEP_FLAGS);
5771                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5772                 newindirdep->ir_state = ATTACHED;
             /* Remember the on-disk pointer width for later users. */
5773                 if (ip->i_ump->um_fstype == UFS1)
5774                         newindirdep->ir_state |= UFS1FMT;
5775                 TAILQ_INIT(&newindirdep->ir_trunc);
5776                 newindirdep->ir_saveddata = NULL;
5777                 LIST_INIT(&newindirdep->ir_deplisthd);
5778                 LIST_INIT(&newindirdep->ir_donehd);
5779                 LIST_INIT(&newindirdep->ir_writehd);
5780                 LIST_INIT(&newindirdep->ir_completehd);
             /*
              * b_blkno equal to b_lblkno presumably means the buffer has
              * not been mapped to a disk address yet, so resolve it now.
              * NOTE(review): the ufs_bmaparray() result is not checked
              * before blkno is stored -- confirm it cannot fail here.
              */
5781                 if (bp->b_blkno == bp->b_lblkno) {
5782                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5783                             NULL, NULL);
5784                         bp->b_blkno = blkno;
5785                 }
5786                 newindirdep->ir_freeblks = NULL;
             /*
              * ir_savebp is a buffer on the device vnode at the same disk
              * address and size; it receives a copy of bp's current data.
              */
5787                 newindirdep->ir_savebp =
5788                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5789                 newindirdep->ir_bp = bp;
5790                 BUF_KERNPROC(newindirdep->ir_savebp);
5791                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5792                 ACQUIRE_LOCK(ump);
5793         }
         /* Attach the freshly built indirdep to the buffer. */
5794         indirdep = newindirdep;
5795         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5796         /*
5797          * If the block is not yet allocated we don't set DEPCOMPLETE so
5798          * that we don't free dependencies until the pointers are valid.
5799          * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5800          * than using the hash.
5801          */
5802         if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5803                 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5804         else
5805                 indirdep->ir_state |= DEPCOMPLETE;
5806         return (indirdep);
5807 }
5808
5809 /*
5810  * Called to finish the allocation of the "aip" allocated
5811  * by one of the two routines above.
5812  */
5813 static struct freefrag *
5814 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5815         struct buf *bp;         /* in-memory copy of the indirect block */
5816         struct inode *ip;       /* inode for file being extended */
5817         struct inodedep *inodedep; /* Inodedep for ip */
5818         struct allocindir *aip; /* allocindir allocated by the above routines */
5819         ufs_lbn_t lbn;          /* Logical block number for this block. */
5820 {
5821         struct fs *fs;
5822         struct indirdep *indirdep;
5823         struct allocindir *oldaip;
5824         struct freefrag *freefrag;
5825         struct mount *mp;
5826
5827         LOCK_OWNED(ip->i_ump);
5828         mp = UFSTOVFS(ip->i_ump);
5829         fs = ip->i_fs;
5830         if (bp->b_lblkno >= 0)
5831                 panic("setup_allocindir_phase2: not indir blk");
5832         KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
5833             ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
5834         indirdep = indirdep_lookup(mp, ip, bp);
5835         KASSERT(indirdep->ir_savebp != NULL,
5836             ("setup_allocindir_phase2 NULL ir_savebp"));
5837         aip->ai_indirdep = indirdep;
5838         /*
5839          * Check for an unwritten dependency for this indirect offset.  If
5840          * there is, merge the old dependency into the new one.  This happens
5841          * as a result of reallocblk only.
5842          */
5843         freefrag = NULL;
5844         if (aip->ai_oldblkno != 0) {
5845                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
5846                         if (oldaip->ai_offset == aip->ai_offset) {
5847                                 freefrag = allocindir_merge(aip, oldaip);
5848                                 goto done;
5849                         }
5850                 }
5851                 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
5852                         if (oldaip->ai_offset == aip->ai_offset) {
5853                                 freefrag = allocindir_merge(aip, oldaip);
5854                                 goto done;
5855                         }
5856                 }
5857         }
5858 done:
5859         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
5860         return (freefrag);
5861 }
5862
5863 /*
5864  * Merge two allocindirs which refer to the same block.  Move newblock
5865  * dependencies and setup the freefrags appropriately.
5866  */
5867 static struct freefrag *
5868 allocindir_merge(aip, oldaip)
5869         struct allocindir *aip;
5870         struct allocindir *oldaip;
5871 {
5872         struct freefrag *freefrag;
5873         struct worklist *wk;
5874
5875         if (oldaip->ai_newblkno != aip->ai_oldblkno)
5876                 panic("allocindir_merge: blkno");
5877         aip->ai_oldblkno = oldaip->ai_oldblkno;
5878         freefrag = aip->ai_freefrag;
5879         aip->ai_freefrag = oldaip->ai_freefrag;
5880         oldaip->ai_freefrag = NULL;
5881         KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
5882         /*
5883          * If we are tracking a new directory-block allocation,
5884          * move it from the old allocindir to the new allocindir.
5885          */
5886         if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
5887                 WORKLIST_REMOVE(wk);
5888                 if (!LIST_EMPTY(&oldaip->ai_newdirblk))
5889                         panic("allocindir_merge: extra newdirblk");
5890                 WORKLIST_INSERT(&aip->ai_newdirblk, wk);
5891         }
5892         /*
5893          * We can skip journaling for this freefrag and just complete
5894          * any pending journal work for the allocindir that is being
5895          * removed after the freefrag completes.
5896          */
5897         if (freefrag->ff_jdep)
5898                 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
5899         LIST_REMOVE(oldaip, ai_next);
5900         freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
5901             &freefrag->ff_list, &freefrag->ff_jwork);
5902         free_newblk(&oldaip->ai_block);
5903
5904         return (freefrag);
5905 }
5906
5907 static inline void
5908 setup_freedirect(freeblks, ip, i, needj)
5909         struct freeblks *freeblks;
5910         struct inode *ip;
5911         int i;
5912         int needj;
5913 {
5914         ufs2_daddr_t blkno;
5915         int frags;
5916
5917         blkno = DIP(ip, i_db[i]);
5918         if (blkno == 0)
5919                 return;
5920         DIP_SET(ip, i_db[i], 0);
5921         frags = sblksize(ip->i_fs, ip->i_size, i);
5922         frags = numfrags(ip->i_fs, frags);
5923         newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
5924 }
5925
5926 static inline void
5927 setup_freeext(freeblks, ip, i, needj)
5928         struct freeblks *freeblks;
5929         struct inode *ip;
5930         int i;
5931         int needj;
5932 {
5933         ufs2_daddr_t blkno;
5934         int frags;
5935
5936         blkno = ip->i_din2->di_extb[i];
5937         if (blkno == 0)
5938                 return;
5939         ip->i_din2->di_extb[i] = 0;
5940         frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
5941         frags = numfrags(ip->i_fs, frags);
5942         newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
5943 }
5944
5945 static inline void
5946 setup_freeindir(freeblks, ip, i, lbn, needj)
5947         struct freeblks *freeblks;
5948         struct inode *ip;
5949         int i;
5950         ufs_lbn_t lbn;
5951         int needj;
5952 {
5953         ufs2_daddr_t blkno;
5954
5955         blkno = DIP(ip, i_ib[i]);
5956         if (blkno == 0)
5957                 return;
5958         DIP_SET(ip, i_ib[i], 0);
5959         newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
5960             0, needj);
5961 }
5962
5963 static inline struct freeblks *
5964 newfreeblks(mp, ip)
5965         struct mount *mp;
5966         struct inode *ip;
5967 {
5968         struct freeblks *freeblks;
5969
5970         freeblks = malloc(sizeof(struct freeblks),
5971                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
5972         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
5973         LIST_INIT(&freeblks->fb_jblkdephd);
5974         LIST_INIT(&freeblks->fb_jwork);
5975         freeblks->fb_ref = 0;
5976         freeblks->fb_cgwait = 0;
5977         freeblks->fb_state = ATTACHED;
5978         freeblks->fb_uid = ip->i_uid;
5979         freeblks->fb_inum = ip->i_number;
5980         freeblks->fb_vtype = ITOV(ip)->v_type;
5981         freeblks->fb_modrev = DIP(ip, i_modrev);
5982         freeblks->fb_devvp = ip->i_devvp;
5983         freeblks->fb_chkcnt = 0;
5984         freeblks->fb_len = 0;
5985
5986         return (freeblks);
5987 }
5988
5989 static void
5990 trunc_indirdep(indirdep, freeblks, bp, off)
5991         struct indirdep *indirdep;
5992         struct freeblks *freeblks;
5993         struct buf *bp;
5994         int off;
5995 {
5996         struct allocindir *aip, *aipn;
5997
5998         /*
5999          * The first set of allocindirs won't be in savedbp.
6000          */
6001         LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6002                 if (aip->ai_offset > off)
6003                         cancel_allocindir(aip, bp, freeblks, 1);
6004         LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6005                 if (aip->ai_offset > off)
6006                         cancel_allocindir(aip, bp, freeblks, 1);
6007         /*
6008          * These will exist in savedbp.
6009          */
6010         LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6011                 if (aip->ai_offset > off)
6012                         cancel_allocindir(aip, NULL, freeblks, 0);
6013         LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6014                 if (aip->ai_offset > off)
6015                         cancel_allocindir(aip, NULL, freeblks, 0);
6016 }
6017
6018 /*
6019  * Follow the chain of indirects down to lastlbn creating a freework
6020  * structure for each.  This will be used to start indir_trunc() at
6021  * the right offset and create the journal records for the partial
6022  * truncation.  A second step will handle the truncated dependencies.
      *
      * "lbn" is the logical block number of the indirect block stored at
      * disk block "blkno"; "lastlbn" is the last data block being kept.  A
      * blkno of 0 means there is no indirect here and nothing is done.  The
      * function recurses into the child indirect that covers lastlbn until
      * level 0 is reached.  Returns 0 on success or the bufwait() error if
      * the indirect block could not be read.
6023  */
6024 static int
6025 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6026         struct freeblks *freeblks;
6027         struct inode *ip;
6028         ufs_lbn_t lbn;
6029         ufs_lbn_t lastlbn;
6030         ufs2_daddr_t blkno;
6031 {
6032         struct indirdep *indirdep;
6033         struct indirdep *indirn;
6034         struct freework *freework;
6035         struct newblk *newblk;
6036         struct mount *mp;
6037         struct buf *bp;
6038         uint8_t *start;
6039         uint8_t *end;
6040         ufs_lbn_t lbnadd;
6041         int level;
6042         int error;
6043         int off;
6044
6045
6046         freework = NULL;
6047         if (blkno == 0)
6048                 return (0);
6049         mp = freeblks->fb_list.wk_mp;
6050         bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
         /*
          * Contents not valid in the cache: aim the buffer at the disk
          * block supplied by the caller and read it synchronously.
          */
6051         if ((bp->b_flags & B_CACHE) == 0) {
6052                 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6053                 bp->b_iocmd = BIO_READ;
6054                 bp->b_flags &= ~B_INVAL;
6055                 bp->b_ioflags &= ~BIO_ERROR;
6056                 vfs_busy_pages(bp, 0);
6057                 bp->b_iooffset = dbtob(bp->b_blkno);
6058                 bstrategy(bp);
6059                 curthread->td_ru.ru_inblock++;
6060                 error = bufwait(bp);
6061                 if (error) {
6062                         brelse(bp);
6063                         return (error);
6064                 }
6065         }
6066         level = lbn_level(lbn);
6067         lbnadd = lbn_offset(ip->i_fs, level);
6068         /*
6069          * Compute the offset of the last block we want to keep.  Store
6070          * in the freework the first block we want to completely free.
          *
          * -(lbn + level) undoes the negative encoding of indirect lbns,
          * presumably yielding the first data lbn this block maps (TODO
          * confirm against lbn_level()); lbnadd is what lbn_offset()
          * reports for one pointer at this level.
6071          */
6072         off = (lastlbn - -(lbn + level)) / lbnadd;
         /* The kept pointer is the last slot: nothing to free here. */
6073         if (off + 1 == NINDIR(ip->i_fs))
6074                 goto nowork;
6075         freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6076             0);
6077         /*
6078          * Link the freework into the indirdep.  This will prevent any new
6079          * allocations from proceeding until we are finished with the
6080          * truncate and the block is written.
6081          */
6082         ACQUIRE_LOCK(ip->i_ump);
6083         indirdep = indirdep_lookup(mp, ip, bp);
6084         if (indirdep->ir_freeblks)
6085                 panic("setup_trunc_indir: indirdep already truncated.");
6086         TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6087         freework->fw_indir = indirdep;
6088         /*
6089          * Cancel any allocindirs that will not make it to disk.
6090          * We have to do this for all copies of the indirdep that
6091          * live on this newblk.
          *
          * NOTE(review): the newblk_lookup() result is not checked before
          * newblk is used; this relies on an indirdep without DEPCOMPLETE
          * always having its newblk present -- confirm.
6092          */
6093         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6094                 newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6095                 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6096                         trunc_indirdep(indirn, freeblks, bp, off);
6097         } else
6098                 trunc_indirdep(indirdep, freeblks, bp, off);
6099         FREE_LOCK(ip->i_ump);
6100         /*
6101          * Creation is protected by the buf lock. The saveddata is only
6102          * needed if a full truncation follows a partial truncation but it
6103          * is difficult to allocate in that case so we fetch it anyway.
6104          */
6105         if (indirdep->ir_saveddata == NULL)
6106                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6107                     M_SOFTDEP_FLAGS);
6108 nowork:
6109         /* Fetch the blkno of the child and the zero start offset. */
6110         if (ip->i_ump->um_fstype == UFS1) {
6111                 blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6112                 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6113         } else {
6114                 blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6115                 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6116         }
         /*
          * With a freework the block was modified and is written back
          * with a delayed write; otherwise it is released untouched.
          */
6117         if (freework) {
6118                 /* Zero the truncated pointers. */
6119                 end = bp->b_data + bp->b_bcount;
6120                 bzero(start, end - start);
6121                 bdwrite(bp);
6122         } else
6123                 bqrelse(bp);
         /* Level 0 pointers name data blocks; there is nothing below. */
6124         if (level == 0)
6125                 return (0);
         /* Derive the child's lbn and descend into the kept subtree. */
6126         lbn++; /* adjust level */
6127         lbn -= (off * lbnadd);
6128         return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6129 }
6130
6131 /*
6132  * Complete the partial truncation of an indirect block setup by
6133  * setup_trunc_indir().  This zeros the truncated pointers in the saved
6134  * copy and writes them to disk before the freeblks is allowed to complete.
      *
      * Called with the softdep lock of the freework's mount held (checked by
      * LOCK_OWNED below).  "freework" must be linked on the ir_trunc queue of
      * the indirdep recorded in its fw_indir field.
6135  */
6136 static void
6137 complete_trunc_indir(freework)
6138         struct freework *freework;
6139 {
6140         struct freework *fwn;
6141         struct indirdep *indirdep;
6142         struct ufsmount *ump;
6143         struct buf *bp;
6144         uintptr_t start;
6145         int count;
6146
6147         ump = VFSTOUFS(freework->fw_list.wk_mp);
6148         LOCK_OWNED(ump);
6149         indirdep = freework->fw_indir;
         /*
          * Obtain the lock on the live buffer, if there still is one.  A
          * non-blocking attempt is made first; when that fails the thread
          * waits on the buffer lock with the softdep lock passed as the
          * interlock (presumably released while sleeping -- it is
          * reacquired right after), drops the buffer lock if the wait
          * happened to obtain it, and starts over with ir_bp re-read.
          */
6150         for (;;) {
6151                 bp = indirdep->ir_bp;
6152                 /* See if the block was discarded. */
6153                 if (bp == NULL)
6154                         break;
6155                 /* Inline part of getdirtybuf().  We don't want bremfree. */
6156                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6157                         break;
6158                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6159                     LOCK_PTR(ump)) == 0)
6160                         BUF_UNLOCK(bp);
6161                 ACQUIRE_LOCK(ump);
6162         }
6163         freework->fw_state |= DEPCOMPLETE;
6164         TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6165         /*
6166          * Zero the pointers in the saved copy.
          *
          * The byte offset is fw_start pointers of the on-disk width
          * recorded in ir_state; everything from there to the end of the
          * saved buffer is cleared.
6167          */
6168         if (indirdep->ir_state & UFS1FMT)
6169                 start = sizeof(ufs1_daddr_t);
6170         else
6171                 start = sizeof(ufs2_daddr_t);
6172         start *= freework->fw_start;
6173         count = indirdep->ir_savebp->b_bcount - start;
6174         start += (uintptr_t)indirdep->ir_savebp->b_data;
6175         bzero((char *)start, count);
6176         /*
6177          * We need to start the next truncation in the list if it has not
6178          * been started yet.
6179          */
6180         fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6181         if (fwn != NULL) {
6182                 if (fwn->fw_freeblks == indirdep->ir_freeblks)
6183                         TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6184                 if ((fwn->fw_state & ONWORKLIST) == 0)
6185                         freework_enqueue(fwn);
6186         }
6187         /*
6188          * If bp is NULL the block was fully truncated, restore
6189          * the saved block list otherwise free it if it is no
6190          * longer needed.
6191          */
6192         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6193                 if (bp == NULL)
6194                         bcopy(indirdep->ir_saveddata,
6195                             indirdep->ir_savebp->b_data,
6196                             indirdep->ir_savebp->b_bcount);
6197                 free(indirdep->ir_saveddata, M_INDIRDEP);
6198                 indirdep->ir_saveddata = NULL;
6199         }
6200         /*
6201          * When bp is NULL there is a full truncation pending.  We
6202          * must wait for this full truncation to be journaled before
6203          * we can release this freework because the disk pointers will
6204          * never be written as zero.
6205          */
6206         if (bp == NULL)  {
6207                 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6208                         handle_written_freework(freework);
6209                 else
6210                         WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6211                            &freework->fw_list);
6212         } else {
6213                 /* Complete when the real copy is written. */
6214                 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6215                 BUF_UNLOCK(bp);
6216         }
6217 }
6218
6219 /*
6220  * Calculate the number of blocks we are going to release where datablocks
6221  * is the current total and length is the new file size.
6222  */
6223 static ufs2_daddr_t
6224 blkcount(fs, datablocks, length)
6225         struct fs *fs;
6226         ufs2_daddr_t datablocks;
6227         off_t length;
6228 {
6229         off_t totblks, numblks;
6230
6231         totblks = 0;
6232         numblks = howmany(length, fs->fs_bsize);
6233         if (numblks <= NDADDR) {
6234                 totblks = howmany(length, fs->fs_fsize);
6235                 goto out;
6236         }
6237         totblks = blkstofrags(fs, numblks);
6238         numblks -= NDADDR;
6239         /*
6240          * Count all single, then double, then triple indirects required.
6241          * Subtracting one indirects worth of blocks for each pass
6242          * acknowledges one of each pointed to by the inode.
6243          */
6244         for (;;) {
6245                 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6246                 numblks -= NINDIR(fs);
6247                 if (numblks <= 0)
6248                         break;
6249                 numblks = howmany(numblks, NINDIR(fs));
6250         }
6251 out:
6252         totblks = fsbtodb(fs, totblks);
6253         /*
6254          * Handle sparse files.  We can't reclaim more blocks than the inode
6255          * references.  We will correct it later in handle_complete_freeblks()
6256          * when we know the real count.
6257          */
6258         if (totblks > datablocks)
6259                 return (0);
6260         return (datablocks - totblks);
6261 }
6262
6263 /*
6264  * Handle freeblocks for journaled softupdate filesystems.
6265  *
6266  * Contrary to normal softupdates, we must preserve the block pointers in
6267  * indirects until their subordinates are free.  This is to avoid journaling
6268  * every block that is freed which may consume more space than the journal
6269  * itself.  The recovery program will see the free block journals at the
6270  * base of the truncated area and traverse them to reclaim space.  The
6271  * pointers in the inode may be cleared immediately after the journal
6272  * records are written because each direct and indirect pointer in the
6273  * inode is recorded in a journal.  This permits full truncation to proceed
6274  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6275  *
6276  * The algorithm is as follows:
6277  * 1) Traverse the in-memory state and create journal entries to release
6278  *    the relevant blocks and full indirect trees.
6279  * 2) Traverse the indirect block chain adding partial truncation freework
6280  *    records to indirects in the path to lastlbn.  The freework will
6281  *    prevent new allocation dependencies from being satisfied in this
6282  *    indirect until the truncation completes.
6283  * 3) Read and lock the inode block, performing an update with the new size
6284  *    and pointers.  This prevents truncated data from becoming valid on
6285  *    disk through step 4.
6286  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6287  *    eliminate journal work for those records that do not require it.
6288  * 5) Schedule the journal records to be written followed by the inode block.
6289  * 6) Allocate any necessary frags for the end of file.
6290  * 7) Zero any partially truncated blocks.
6291  *
6292  * From this truncation proceeds asynchronously using the freework and
6293  * indir_trunc machinery.  The file will not be extended again into a
6294  * partially truncated indirect block until all work is completed but
6295  * the normal dependency mechanism ensures that it is rolled back/forward
6296  * as appropriate.  Further truncation may occur without delay and is
6297  * serialized in indir_trunc().
6298  */
6299 void
6300 softdep_journal_freeblocks(ip, cred, length, flags)
6301         struct inode *ip;       /* The inode whose length is to be reduced */
6302         struct ucred *cred;
6303         off_t length;           /* The new length for the file */
6304         int flags;              /* IO_EXT and/or IO_NORMAL */
6305 {
6306         struct freeblks *freeblks, *fbn;
6307         struct worklist *wk, *wkn;
6308         struct inodedep *inodedep;
6309         struct jblkdep *jblkdep;
6310         struct allocdirect *adp, *adpn;
6311         struct ufsmount *ump;
6312         struct fs *fs;
6313         struct buf *bp;
6314         struct vnode *vp;
6315         struct mount *mp;
6316         ufs2_daddr_t extblocks, datablocks;
6317         ufs_lbn_t tmpval, lbn, lastlbn;
6318         int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6319
6320         fs = ip->i_fs;
6321         ump = ip->i_ump;
6322         mp = UFSTOVFS(ump);
6323         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6324             ("softdep_journal_freeblocks called on non-softdep filesystem"));
6325         vp = ITOV(ip);
6326         needj = 1;
6327         iboff = -1;
6328         allocblock = 0;
6329         extblocks = 0;
6330         datablocks = 0;
6331         frags = 0;
6332         freeblks = newfreeblks(mp, ip);
6333         ACQUIRE_LOCK(ump);
6334         /*
6335          * If we're truncating a removed file that will never be written
6336          * we don't need to journal the block frees.  The canceled journals
6337          * for the allocations will suffice.
6338          */
6339         dflags = DEPALLOC;
6340         if (IS_SNAPSHOT(ip))
6341                 dflags |= NODELAY;
6342         inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6343         if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6344             length == 0)
6345                 needj = 0;
6346         CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6347             ip->i_number, length, needj);
6348         FREE_LOCK(ump);
6349         /*
6350          * Calculate the lbn that we are truncating to.  This results in -1
6351          * if we're truncating the 0 bytes.  So it is the last lbn we want
6352          * to keep, not the first lbn we want to truncate.
6353          */
6354         lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6355         lastoff = blkoff(fs, length);
6356         /*
6357          * Compute frags we are keeping in lastlbn.  0 means all.
6358          */
6359         if (lastlbn >= 0 && lastlbn < NDADDR) {
6360                 frags = fragroundup(fs, lastoff);
6361                 /* adp offset of last valid allocdirect. */
6362                 iboff = lastlbn;
6363         } else if (lastlbn > 0)
6364                 iboff = NDADDR;
6365         if (fs->fs_magic == FS_UFS2_MAGIC)
6366                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6367         /*
6368          * Handle normal data blocks and indirects.  This section saves
6369          * values used after the inode update to complete frag and indirect
6370          * truncation.
6371          */
6372         if ((flags & IO_NORMAL) != 0) {
6373                 /*
6374                  * Handle truncation of whole direct and indirect blocks.
6375                  */
6376                 for (i = iboff + 1; i < NDADDR; i++)
6377                         setup_freedirect(freeblks, ip, i, needj);
6378                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6379                     i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6380                         /* Release a whole indirect tree. */
6381                         if (lbn > lastlbn) {
6382                                 setup_freeindir(freeblks, ip, i, -lbn -i,
6383                                     needj);
6384                                 continue;
6385                         }
6386                         iboff = i + NDADDR;
6387                         /*
6388                          * Traverse partially truncated indirect tree.
6389                          */
6390                         if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6391                                 setup_trunc_indir(freeblks, ip, -lbn - i,
6392                                     lastlbn, DIP(ip, i_ib[i]));
6393                 }
6394                 /*
6395                  * Handle partial truncation to a frag boundary.
6396                  */
6397                 if (frags) {
6398                         ufs2_daddr_t blkno;
6399                         long oldfrags;
6400
6401                         oldfrags = blksize(fs, ip, lastlbn);
6402                         blkno = DIP(ip, i_db[lastlbn]);
6403                         if (blkno && oldfrags != frags) {
6404                                 oldfrags -= frags;
6405                                 oldfrags = numfrags(ip->i_fs, oldfrags);
6406                                 blkno += numfrags(ip->i_fs, frags);
6407                                 newfreework(ump, freeblks, NULL, lastlbn,
6408                                     blkno, oldfrags, 0, needj);
6409                         } else if (blkno == 0)
6410                                 allocblock = 1;
6411                 }
6412                 /*
6413                  * Add a journal record for partial truncate if we are
6414                  * handling indirect blocks.  Non-indirects need no extra
6415                  * journaling.
6416                  */
6417                 if (length != 0 && lastlbn >= NDADDR) {
6418                         ip->i_flag |= IN_TRUNCATED;
6419                         newjtrunc(freeblks, length, 0);
6420                 }
6421                 ip->i_size = length;
6422                 DIP_SET(ip, i_size, ip->i_size);
6423                 datablocks = DIP(ip, i_blocks) - extblocks;
6424                 if (length != 0)
6425                         datablocks = blkcount(ip->i_fs, datablocks, length);
6426                 freeblks->fb_len = length;
6427         }
6428         if ((flags & IO_EXT) != 0) {
6429                 for (i = 0; i < NXADDR; i++)
6430                         setup_freeext(freeblks, ip, i, needj);
6431                 ip->i_din2->di_extsize = 0;
6432                 datablocks += extblocks;
6433         }
6434 #ifdef QUOTA
6435         /* Reference the quotas in case the block count is wrong in the end. */
6436         quotaref(vp, freeblks->fb_quota);
6437         (void) chkdq(ip, -datablocks, NOCRED, 0);
6438 #endif
6439         freeblks->fb_chkcnt = -datablocks;
6440         UFS_LOCK(ump);
6441         fs->fs_pendingblocks += datablocks;
6442         UFS_UNLOCK(ump);
6443         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6444         /*
6445          * Handle truncation of incomplete alloc direct dependencies.  We
6446          * hold the inode block locked to prevent incomplete dependencies
6447          * from reaching the disk while we are eliminating those that
6448          * have been truncated.  This is a partially inlined ffs_update().
6449          */
6450         ufs_itimes(vp);
6451         ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6452         error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6453             (int)fs->fs_bsize, cred, &bp);
6454         if (error) {
6455                 brelse(bp);
6456                 softdep_error("softdep_journal_freeblocks", error);
6457                 return;
6458         }
6459         if (bp->b_bufsize == fs->fs_bsize)
6460                 bp->b_flags |= B_CLUSTEROK;
6461         softdep_update_inodeblock(ip, bp, 0);
6462         if (ump->um_fstype == UFS1)
6463                 *((struct ufs1_dinode *)bp->b_data +
6464                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6465         else
6466                 *((struct ufs2_dinode *)bp->b_data +
6467                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6468         ACQUIRE_LOCK(ump);
6469         (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6470         if ((inodedep->id_state & IOSTARTED) != 0)
6471                 panic("softdep_setup_freeblocks: inode busy");
6472         /*
6473          * Add the freeblks structure to the list of operations that
6474          * must await the zero'ed inode being written to disk. If we
6475          * still have a bitmap dependency (needj), then the inode
6476          * has never been written to disk, so we can process the
6477          * freeblks below once we have deleted the dependencies.
6478          */
6479         if (needj)
6480                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6481         else
6482                 freeblks->fb_state |= COMPLETE;
6483         if ((flags & IO_NORMAL) != 0) {
6484                 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6485                         if (adp->ad_offset > iboff)
6486                                 cancel_allocdirect(&inodedep->id_inoupdt, adp,
6487                                     freeblks);
6488                         /*
6489                          * Truncate the allocdirect.  We could eliminate
6490                          * or modify journal records as well.
6491                          */
6492                         else if (adp->ad_offset == iboff && frags)
6493                                 adp->ad_newsize = frags;
6494                 }
6495         }
6496         if ((flags & IO_EXT) != 0)
6497                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6498                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6499                             freeblks);
6500         /*
6501          * Scan the bufwait list for newblock dependencies that will never
6502          * make it to disk.
6503          */
6504         LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6505                 if (wk->wk_type != D_ALLOCDIRECT)
6506                         continue;
6507                 adp = WK_ALLOCDIRECT(wk);
6508                 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6509                     ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6510                         cancel_jfreeblk(freeblks, adp->ad_newblkno);
6511                         cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6512                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6513                 }
6514         }
6515         /*
6516          * Add journal work.
6517          */
6518         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6519                 add_to_journal(&jblkdep->jb_list);
6520         FREE_LOCK(ump);
6521         bdwrite(bp);
6522         /*
6523          * Truncate dependency structures beyond length.
6524          */
6525         trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6526         /*
6527          * This is only set when we need to allocate a fragment because
6528          * none existed at the end of a frag-sized file.  It handles only
6529          * allocating a new, zero filled block.
6530          */
6531         if (allocblock) {
6532                 ip->i_size = length - lastoff;
6533                 DIP_SET(ip, i_size, ip->i_size);
6534                 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6535                 if (error != 0) {
6536                         softdep_error("softdep_journal_freeblks", error);
6537                         return;
6538                 }
6539                 ip->i_size = length;
6540                 DIP_SET(ip, i_size, length);
6541                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
6542                 allocbuf(bp, frags);
6543                 ffs_update(vp, 0);
6544                 bawrite(bp);
6545         } else if (lastoff != 0 && vp->v_type != VDIR) {
6546                 int size;
6547
6548                 /*
6549                  * Zero the end of a truncated frag or block.
6550                  */
6551                 size = sblksize(fs, length, lastlbn);
6552                 error = bread(vp, lastlbn, size, cred, &bp);
6553                 if (error) {
6554                         softdep_error("softdep_journal_freeblks", error);
6555                         return;
6556                 }
6557                 bzero((char *)bp->b_data + lastoff, size - lastoff);
6558                 bawrite(bp);
6559
6560         }
6561         ACQUIRE_LOCK(ump);
6562         inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6563         TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6564         freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6565         /*
6566          * We zero earlier truncations so they don't erroneously
6567          * update i_blocks.
6568          */
6569         if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6570                 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6571                         fbn->fb_len = 0;
6572         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6573             LIST_EMPTY(&freeblks->fb_jblkdephd))
6574                 freeblks->fb_state |= INPROGRESS;
6575         else
6576                 freeblks = NULL;
6577         FREE_LOCK(ump);
6578         if (freeblks)
6579                 handle_workitem_freeblocks(freeblks, 0);
6580         trunc_pages(ip, length, extblocks, flags);
6581
6582 }
6583
6584 /*
      * Journal a JOP_SYNC record for an inode that carries IN_TRUNCATED.
      * The flag is cleared, a jfsync work item holding the inode number and
      * current size is passed to add_to_journal(), and jwait() is then
      * called on it with MNT_WAIT.  Inodes without the flag are ignored.
6586  */
6587 void
6588 softdep_journal_fsync(ip)
6589         struct inode *ip;
6590 {
             struct ufsmount *ump;
             struct jfsync *rec;
6592
             ump = ip->i_ump;
             KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6594             ("softdep_journal_fsync called on non-softdep filesystem"));
             if ((ip->i_flag & IN_TRUNCATED) != 0) {
                     ip->i_flag &= ~IN_TRUNCATED;
                     rec = malloc(sizeof(*rec), M_JFSYNC,
                         M_SOFTDEP_FLAGS | M_ZERO);
                     workitem_alloc(&rec->jfs_list, D_JFSYNC, UFSTOVFS(ump));
                     rec->jfs_size = ip->i_size;
                     rec->jfs_ino = ip->i_number;
                     /* Both journal calls sit between ACQUIRE_LOCK/FREE_LOCK. */
                     ACQUIRE_LOCK(ump);
                     add_to_journal(&rec->jfs_list);
                     jwait(&rec->jfs_list, MNT_WAIT);
                     FREE_LOCK(ump);
             }
6606 }
6607
6608 /*
6609  * Block de-allocation dependencies.
6610  *
6611  * When blocks are de-allocated, the on-disk pointers must be nullified before
6612  * the blocks are made available for use by other files.  (The true
6613  * requirement is that old pointers must be nullified before new on-disk
6614  * pointers are set.  We chose this slightly more stringent requirement to
6615  * reduce complexity.) Our implementation handles this dependency by updating
6616  * the inode (or indirect block) appropriately but delaying the actual block
6617  * de-allocation (i.e., freemap and free space count manipulation) until
6618  * after the updated versions reach stable storage.  After the disk is
6619  * updated, the blocks can be safely de-allocated whenever it is convenient.
6620  * This implementation handles only the common case of reducing a file's
6621  * length to zero. Other cases are handled by the conventional synchronous
6622  * write approach.
6623  *
6624  * The ffs implementation with which we worked double-checks
6625  * the state of the block pointers and file size as it reduces
6626  * a file's length.  Some of this code is replicated here in our
6627  * soft updates implementation.  The freeblks->fb_chkcnt field is
6628  * used to transfer a part of this information to the procedure
6629  * that eventually de-allocates the blocks.
6630  *
6631  * This routine should be called from the routine that shortens
6632  * a file's length, before the inode's size or block pointers
6633  * are modified. It will save the block pointer information for
6634  * later release and zero the inode so that the calling routine
6635  * can release it.
6636  */
6637 void
6638 softdep_setup_freeblocks(ip, length, flags)
6639         struct inode *ip;       /* The inode whose length is to be reduced */
6640         off_t length;           /* The new length for the file */
6641         int flags;              /* IO_EXT and/or IO_NORMAL */
6642 {
6643         struct ufs1_dinode *dp1;
6644         struct ufs2_dinode *dp2;
6645         struct freeblks *freeblks;
6646         struct inodedep *inodedep;
6647         struct allocdirect *adp;
6648         struct ufsmount *ump;
6649         struct buf *bp;
6650         struct fs *fs;
6651         ufs2_daddr_t extblocks, datablocks;
6652         struct mount *mp;
6653         int i, delay, error, dflags;
6654         ufs_lbn_t tmpval;
6655         ufs_lbn_t lbn;
6656
6657         ump = ip->i_ump;
6658         mp = UFSTOVFS(ump);
6659         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6660             ("softdep_setup_freeblocks called on non-softdep filesystem"));
6661         CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6662             ip->i_number, length);
6663         KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6664         fs = ip->i_fs;
6665         freeblks = newfreeblks(mp, ip);
6666         extblocks = 0;
6667         datablocks = 0;
6668         if (fs->fs_magic == FS_UFS2_MAGIC)
6669                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6670         if ((flags & IO_NORMAL) != 0) {
6671                 for (i = 0; i < NDADDR; i++)
6672                         setup_freedirect(freeblks, ip, i, 0);
6673                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6674                     i++, lbn += tmpval, tmpval *= NINDIR(fs))
6675                         setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6676                 ip->i_size = 0;
6677                 DIP_SET(ip, i_size, 0);
6678                 datablocks = DIP(ip, i_blocks) - extblocks;
6679         }
6680         if ((flags & IO_EXT) != 0) {
6681                 for (i = 0; i < NXADDR; i++)
6682                         setup_freeext(freeblks, ip, i, 0);
6683                 ip->i_din2->di_extsize = 0;
6684                 datablocks += extblocks;
6685         }
6686 #ifdef QUOTA
6687         /* Reference the quotas in case the block count is wrong in the end. */
6688         quotaref(ITOV(ip), freeblks->fb_quota);
6689         (void) chkdq(ip, -datablocks, NOCRED, 0);
6690 #endif
6691         freeblks->fb_chkcnt = -datablocks;
6692         UFS_LOCK(ump);
6693         fs->fs_pendingblocks += datablocks;
6694         UFS_UNLOCK(ump);
6695         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6696         /*
6697          * Push the zero'ed inode to to its disk buffer so that we are free
6698          * to delete its dependencies below. Once the dependencies are gone
6699          * the buffer can be safely released.
6700          */
6701         if ((error = bread(ip->i_devvp,
6702             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6703             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6704                 brelse(bp);
6705                 softdep_error("softdep_setup_freeblocks", error);
6706         }
6707         if (ump->um_fstype == UFS1) {
6708                 dp1 = ((struct ufs1_dinode *)bp->b_data +
6709                     ino_to_fsbo(fs, ip->i_number));
6710                 ip->i_din1->di_freelink = dp1->di_freelink;
6711                 *dp1 = *ip->i_din1;
6712         } else {
6713                 dp2 = ((struct ufs2_dinode *)bp->b_data +
6714                     ino_to_fsbo(fs, ip->i_number));
6715                 ip->i_din2->di_freelink = dp2->di_freelink;
6716                 *dp2 = *ip->i_din2;
6717         }
6718         /*
6719          * Find and eliminate any inode dependencies.
6720          */
6721         ACQUIRE_LOCK(ump);
6722         dflags = DEPALLOC;
6723         if (IS_SNAPSHOT(ip))
6724                 dflags |= NODELAY;
6725         (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6726         if ((inodedep->id_state & IOSTARTED) != 0)
6727                 panic("softdep_setup_freeblocks: inode busy");
6728         /*
6729          * Add the freeblks structure to the list of operations that
6730          * must await the zero'ed inode being written to disk. If we
6731          * still have a bitmap dependency (delay == 0), then the inode
6732          * has never been written to disk, so we can process the
6733          * freeblks below once we have deleted the dependencies.
6734          */
6735         delay = (inodedep->id_state & DEPCOMPLETE);
6736         if (delay)
6737                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6738         else
6739                 freeblks->fb_state |= COMPLETE;
6740         /*
6741          * Because the file length has been truncated to zero, any
6742          * pending block allocation dependency structures associated
6743          * with this inode are obsolete and can simply be de-allocated.
6744          * We must first merge the two dependency lists to get rid of
6745          * any duplicate freefrag structures, then purge the merged list.
6746          * If we still have a bitmap dependency, then the inode has never
6747          * been written to disk, so we can free any fragments without delay.
6748          */
6749         if (flags & IO_NORMAL) {
6750                 merge_inode_lists(&inodedep->id_newinoupdt,
6751                     &inodedep->id_inoupdt);
6752                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6753                         cancel_allocdirect(&inodedep->id_inoupdt, adp,
6754                             freeblks);
6755         }
6756         if (flags & IO_EXT) {
6757                 merge_inode_lists(&inodedep->id_newextupdt,
6758                     &inodedep->id_extupdt);
6759                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6760                         cancel_allocdirect(&inodedep->id_extupdt, adp,
6761                             freeblks);
6762         }
6763         FREE_LOCK(ump);
6764         bdwrite(bp);
6765         trunc_dependencies(ip, freeblks, -1, 0, flags);
6766         ACQUIRE_LOCK(ump);
6767         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6768                 (void) free_inodedep(inodedep);
6769         freeblks->fb_state |= DEPCOMPLETE;
6770         /*
6771          * If the inode with zeroed block pointers is now on disk
6772          * we can start freeing blocks.
6773          */
6774         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6775                 freeblks->fb_state |= INPROGRESS;
6776         else
6777                 freeblks = NULL;
6778         FREE_LOCK(ump);
6779         if (freeblks)
6780                 handle_workitem_freeblocks(freeblks, 0);
6781         trunc_pages(ip, length, extblocks, flags);
6782 }
6783
6784 /*
6785  * Eliminate pages from the page cache that back parts of this inode and
6786  * adjust the vnode pager's idea of our size.  This prevents stale data
6787  * from hanging around in the page cache.
      *
      * ip is the inode being truncated and length its new size.  extblocks
      * is the value the caller derived from di_extsize, and flags holds
      * IO_EXT and/or IO_NORMAL to select which areas are affected.
6788  */
6789 static void
6790 trunc_pages(ip, length, extblocks, flags)
6791         struct inode *ip;
6792         off_t length;
6793         ufs2_daddr_t extblocks;
6794         int flags;
6795 {
6796         struct vnode *vp;
6797         struct fs *fs;
6798         ufs_lbn_t lbn;
6799         off_t end, extend;
6800
6801         vp = ITOV(ip);
6802         fs = ip->i_fs;
             /*
              * Page index computed from -extblocks.  It is the start index of
              * the IO_EXT removal and the default end index used below.
              */
6803         extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6804         if ((flags & IO_EXT) != 0)
6805                 vn_pages_remove(vp, extend, 0);
             /* Nothing more to do unless the normal data area is truncated. */
6806         if ((flags & IO_NORMAL) == 0)
6807                 return;
6808         BO_LOCK(&vp->v_bufobj);
6809         drain_output(vp);
6810         BO_UNLOCK(&vp->v_bufobj);
6811         /*
6812          * The vnode pager eliminates file pages; we eliminate indirects
6813          * below.
6814          */
6815         vnode_pager_setsize(vp, length);
6816         /*
6817          * Calculate the end based on the last indirect we want to keep.  If
6818          * the block extends into indirects we can just use the negative of
6819          * its lbn.  Doubles and triples exist at lower numbers so we must
6820          * be careful not to remove those, if they exist.  double and triple
6821          * indirect lbns do not overlap with others so it is not important
6822          * to verify how many levels are required.
6823          */
6824         lbn = lblkno(fs, length);
6825         if (lbn >= NDADDR) {
6826                 /* Calculate the virtual lbn of the triple indirect. */
6827                 lbn = -lbn - (NIADDR - 1);
6828                 end = OFF_TO_IDX(lblktosize(fs, lbn));
6829         } else
                     /* The new end is within the direct blocks. */
6830                 end = extend;
6831         vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
6832 }
6833
6834 /*
      * Report whether buf bp falls within the range removed by a truncation.
      * Returns 1 when bp is affected and 0 when it is not.  *blkoffp is set
      * to lastoff when bp is block lastlbn itself and lastoff is non-zero
      * (a partial truncation of that block); otherwise it is left at 0.
6836  */
6837 static int
6838 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
6839         struct buf *bp;
6840         int *blkoffp;
6841         ufs_lbn_t lastlbn;
6842         int lastoff;
6843         int flags;
6844 {
             ufs_lbn_t blk;
6846
6847         *blkoffp = 0;
             /*
              * BX_ALTDATA buffers are matched only by IO_EXT requests and
              * are then always truncated in full.
              */
             if ((bp->b_xflags & BX_ALTDATA) != 0)
                     return ((flags & IO_EXT) != 0);
             /* All other buffers are matched only by IO_NORMAL requests. */
             if ((flags & IO_NORMAL) == 0)
                     return (0);
             /* A lastlbn of -1 requests a full truncation. */
             if (lastlbn == -1)
                     return (1);
             /*
              * Partial truncation: compare against lastlbn, first mapping a
              * negative b_lblkno through lbn_level() to the lbn used for
              * the comparison.
              */
             blk = bp->b_lblkno;
             if (blk < 0)
                     blk = -(blk + lbn_level(blk));
             if (blk > lastlbn)
                     return (1);
             /* Blocks before lastlbn, or lastlbn kept whole, are untouched. */
             if (blk < lastlbn || lastoff == 0)
                     return (0);
             /* Block lastlbn itself is cut at lastoff. */
             *blkoffp = lastoff;
6874         return (1);
6875 }
6876
6877 /*
6878  * Eliminate any dependencies that exist in memory beyond lblkno:off
      *
      * Walks the vnode's dirty and then clean buffer lists, using
      * trunc_check_buf() to pick the buffers affected by the truncation
      * described by lastlbn, lastoff and flags.  Dirty buffers are passed
      * to deallocate_dependencies(); clean buffers are either resized to
      * the kept offset or invalidated.  BV_SCANNED marks buffers that need
      * no further attention, which lets each walk restart from the head of
      * its list after the bufobj lock has been given up.
6879  */
6880 static void
6881 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
6882         struct inode *ip;
6883         struct freeblks *freeblks;
6884         ufs_lbn_t lastlbn;
6885         int lastoff;
6886         int flags;
6887 {
6888         struct bufobj *bo;
6889         struct vnode *vp;
6890         struct buf *bp;
6891         struct fs *fs;
6892         int blkoff;
6893
6894         /*
6895          * We must wait for any I/O in progress to finish so that
6896          * all potential buffers on the dirty list will be visible.
6897          * Once they are all there, walk the list and get rid of
6898          * any dependencies.
6899          */
             /* NOTE(review): fs is assigned here but not read again below. */
6900         fs = ip->i_fs;
6901         vp = ITOV(ip);
6902         bo = &vp->v_bufobj;
6903         BO_LOCK(bo);
6904         drain_output(vp);
             /* Start with every dirty buffer unmarked. */
6905         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
6906                 bp->b_vflags &= ~BV_SCANNED;
6907 restart:
6908         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
6909                 if (bp->b_vflags & BV_SCANNED)
6910                         continue;
                     /* Buffers outside the truncated range are only marked. */
6911                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6912                         bp->b_vflags |= BV_SCANNED;
6913                         continue;
6914                 }
6915                 KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
                     /* A NULL return means the buffer was not obtained; rescan. */
6916                 if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
6917                         goto restart;
6918                 BO_UNLOCK(bo);
                     /*
                      * A non-zero return (ERESTART or EBUSY) leaves the buffer
                      * valid and it is released with bqrelse().  On 0 the
                      * buffer has been marked B_INVAL | B_NOCACHE and is
                      * released with brelse().
                      */
6919                 if (deallocate_dependencies(bp, freeblks, blkoff))
6920                         bqrelse(bp);
6921                 else
6922                         brelse(bp);
                     /* The list may have changed while unlocked; start over. */
6923                 BO_LOCK(bo);
6924                 goto restart;
6925         }
6926         /*
6927          * Now do the work of vtruncbuf while also matching indirect blocks.
6928          */
6929         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
6930                 bp->b_vflags &= ~BV_SCANNED;
6931 cleanrestart:
6932         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
6933                 if (bp->b_vflags & BV_SCANNED)
6934                         continue;
6935                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
6936                         bp->b_vflags |= BV_SCANNED;
6937                         continue;
6938                 }
                     /*
                      * The bufobj lock is handed to BUF_LOCK() as the
                      * interlock and is re-taken by hand afterwards, both on
                      * the ENOLCK retry path and after the buffer has been
                      * released.
                      */
6939                 if (BUF_LOCK(bp,
6940                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6941                     BO_LOCKPTR(bo)) == ENOLCK) {
6942                         BO_LOCK(bo);
6943                         goto cleanrestart;
6944                 }
6945                 bp->b_vflags |= BV_SCANNED;
6946                 bremfree(bp);
                     /*
                      * A partially truncated block is resized to blkoff and
                      * kept; anything else is invalidated and discarded.
                      */
6947                 if (blkoff != 0) {
6948                         allocbuf(bp, blkoff);
6949                         bqrelse(bp);
6950                 } else {
6951                         bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
6952                         brelse(bp);
6953                 }
6954                 BO_LOCK(bo);
6955                 goto cleanrestart;
6956         }
6957         drain_output(vp);
6958         BO_UNLOCK(bo);
6959 }
6960
6961 static int
6962 cancel_pagedep(pagedep, freeblks, blkoff)
6963         struct pagedep *pagedep;
6964         struct freeblks *freeblks;
6965         int blkoff;
6966 {
6967         struct jremref *jremref;
6968         struct jmvref *jmvref;
6969         struct dirrem *dirrem, *tmp;
6970         int i;
6971
6972         /*
6973          * Copy any directory remove dependencies to the list
6974          * to be processed after the freeblks proceeds.  If
6975          * directory entry never made it to disk they
6976          * can be dumped directly onto the work list.
6977          */
6978         LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
6979                 /* Skip this directory removal if it is intended to remain. */
6980                 if (dirrem->dm_offset < blkoff)
6981                         continue;
6982                 /*
6983                  * If there are any dirrems we wait for the journal write
6984                  * to complete and then restart the buf scan as the lock
6985                  * has been dropped.
6986                  */
6987                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
6988                         jwait(&jremref->jr_list, MNT_WAIT);
6989                         return (ERESTART);
6990                 }
6991                 LIST_REMOVE(dirrem, dm_next);
6992                 dirrem->dm_dirinum = pagedep->pd_ino;
6993                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
6994         }
6995         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
6996                 jwait(&jmvref->jm_list, MNT_WAIT);
6997                 return (ERESTART);
6998         }
6999         /*
7000          * When we're partially truncating a pagedep we just want to flush
7001          * journal entries and return.  There can not be any adds in the
7002          * truncated portion of the directory and newblk must remain if
7003          * part of the block remains.
7004          */
7005         if (blkoff != 0) {
7006                 struct diradd *dap;
7007
7008                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7009                         if (dap->da_offset > blkoff)
7010                                 panic("cancel_pagedep: diradd %p off %d > %d",
7011                                     dap, dap->da_offset, blkoff);
7012                 for (i = 0; i < DAHASHSZ; i++)
7013                         LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7014                                 if (dap->da_offset > blkoff)
7015                                         panic("cancel_pagedep: diradd %p off %d > %d",
7016                                             dap, dap->da_offset, blkoff);
7017                 return (0);
7018         }
7019         /*
7020          * There should be no directory add dependencies present
7021          * as the directory could not be truncated until all
7022          * children were removed.
7023          */
7024         KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7025             ("deallocate_dependencies: pendinghd != NULL"));
7026         for (i = 0; i < DAHASHSZ; i++)
7027                 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7028                     ("deallocate_dependencies: diraddhd != NULL"));
7029         if ((pagedep->pd_state & NEWBLOCK) != 0)
7030                 free_newdirblk(pagedep->pd_newdirblk);
7031         if (free_pagedep(pagedep) == 0)
7032                 panic("Failed to free pagedep %p", pagedep);
7033         return (0);
7034 }
7035
7036 /*
7037  * Reclaim any dependency structures from a buffer that is about to
7038  * be reallocated to a new vnode. The buffer must be locked, thus,
7039  * no I/O completion operations can occur while we are manipulating
7040  * its associated dependencies. The mutex is held so that other I/O's
7041  * associated with related dependencies do not occur.
      *
      * off is the offset kept in the buffer; 0 means it is truncated in
      * full.  Returns ERESTART when cancel_pagedep() asks for a restart,
      * EBUSY when the buffer is only partially truncated (it is resized
      * to off and marked BV_SCANNED), and 0 once the buffer has been
      * marked B_INVAL | B_NOCACHE.
7042  */
7043 static int
7044 deallocate_dependencies(bp, freeblks, off)
7045         struct buf *bp;
7046         struct freeblks *freeblks;
7047         int off;
7048 {
7049         struct indirdep *indirdep;
7050         struct pagedep *pagedep;
7051         struct allocdirect *adp;
7052         struct worklist *wk, *wkn;
7053         struct ufsmount *ump;
7054
             /* With no dependencies attached there is nothing to lock. */
7055         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
7056                 goto done;
             /* The mount to lock is taken from the first work item. */
7057         ump = VFSTOUFS(wk->wk_mp);
7058         ACQUIRE_LOCK(ump);
7059         LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7060                 switch (wk->wk_type) {
7061                 case D_INDIRDEP:
7062                         indirdep = WK_INDIRDEP(wk);
                             /* Sanity check before cancelling the indirdep. */
7063                         if (bp->b_lblkno >= 0 ||
7064                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7065                                 panic("deallocate_dependencies: not indir");
7066                         cancel_indirdep(indirdep, bp, freeblks);
7067                         continue;
7068
7069                 case D_PAGEDEP:
7070                         pagedep = WK_PAGEDEP(wk);
                             /* Non-zero means the caller must rescan. */
7071                         if (cancel_pagedep(pagedep, freeblks, off)) {
7072                                 FREE_LOCK(ump);
7073                                 return (ERESTART);
7074                         }
7075                         continue;
7076
7077                 case D_ALLOCINDIR:
7078                         /*
7079                          * Simply remove the allocindir, we'll find it via
7080                          * the indirdep where we can clear pointers if
7081                          * needed.
7082                          */
7083                         WORKLIST_REMOVE(wk);
7084                         continue;
7085
7086                 case D_FREEWORK:
7087                         /*
7088                          * A truncation is waiting for the zero'd pointers
7089                          * to be written.  It can be freed when the freeblks
7090                          * is journaled.
7091                          */
7092                         WORKLIST_REMOVE(wk);
7093                         wk->wk_state |= ONDEPLIST;
7094                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
                             /* Leaves the switch; the list walk continues. */
7095                         break;
7096
7097                 case D_ALLOCDIRECT:
                             /* NOTE(review): adp is assigned but not read. */
7098                         adp = WK_ALLOCDIRECT(wk);
                             /*
                              * Tolerated only on a partial truncation; on a
                              * full one this falls into the panic below.
                              */
7099                         if (off != 0)
7100                                 continue;
7101                         /* FALLTHROUGH */
7102                 default:
7103                         panic("deallocate_dependencies: Unexpected type %s",
7104                             TYPENAME(wk->wk_type));
7105                         /* NOTREACHED */
7106                 }
7107         }
7108         FREE_LOCK(ump);
7109 done:
7110         /*
7111          * Don't throw away this buf, we were partially truncating and
7112          * some deps may always remain.
7113          */
7114         if (off) {
7115                 allocbuf(bp, off);
7116                 bp->b_vflags |= BV_SCANNED;
7117                 return (EBUSY);
7118         }
             /* Full truncation: mark the buffer invalid and uncacheable. */
7119         bp->b_flags |= B_INVAL | B_NOCACHE;
7120
7121         return (0);
7122 }
7123
7124 /*
7125  * An allocdirect is being canceled due to a truncate.  We must make sure
7126  * the journal entry is released in concert with the blkfree that releases
7127  * the storage.  Completed journal entries must not be released until the
7128  * space is no longer pointed to by the inode or in the bitmap.
      *
      * adphead is the list adp is linked on through ad_next; adp is unlinked
      * from it here.  freeblks is the truncation whose fb_freeworkhd list is
      * searched for the freework covering adp's block, and onto which the
      * canceled newblk is finally inserted.
7129  */
7130 static void
7131 cancel_allocdirect(adphead, adp, freeblks)
7132         struct allocdirectlst *adphead;
7133         struct allocdirect *adp;
7134         struct freeblks *freeblks;
7135 {
7136         struct freework *freework;
7137         struct newblk *newblk;
7138         struct worklist *wk;
7139
7140         TAILQ_REMOVE(adphead, adp, ad_next);
             /* The allocdirect is accessed through its newblk view from here on. */
7141         newblk = (struct newblk *)adp;
7142         freework = NULL;
7143         /*
7144          * Find the correct freework structure.
7145          */
7146         LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7147                 if (wk->wk_type != D_FREEWORK)
7148                         continue;
7149                 freework = WK_FREEWORK(wk);
7150                 if (freework->fw_blkno == newblk->nb_newblkno)
7151                         break;
7152         }
             /*
              * NOTE(review): freework is NULL here only if the list held no
              * D_FREEWORK item at all.  If the loop ran to completion without
              * a block number match, freework still points at the last
              * freework examined and this check does not fire.  Confirm
              * whether a test of wk == NULL was intended.
              */
7153         if (freework == NULL)
7154                 panic("cancel_allocdirect: Freework not found");
7155         /*
7156          * If a newblk exists at all we still have the journal entry that
7157          * initiated the allocation so we do not need to journal the free.
7158          */
7159         cancel_jfreeblk(freeblks, freework->fw_blkno);
7160         /*
7161          * If the journal hasn't been written the jnewblk must be passed
7162          * to the call to ffs_blkfree that reclaims the space.  We accomplish
7163          * this by linking the journal dependency into the freework to be
7164          * freed when freework_freeblock() is called.  If the journal has
7165          * been written we can simply reclaim the journal space when the
7166          * freeblks work is complete.
7167          */
7168         freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7169             &freeblks->fb_jwork);
7170         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7171 }
7172
7173
7174 /*
7175  * Cancel a new block allocation.  May be an indirect or direct block.  We
7176  * remove it from various lists and return any journal record that needs to
7177  * be resolved by the caller.
7178  *
7179  * A special consideration is made for indirects which were never pointed
7180  * at on disk and will never be found once this block is released.
      *
      * wk, when non-NULL, becomes the jn_dep of an attached jnewblk, and the
      * jnewblk is then detached from the newblk.  wkhd receives any entries
      * found on nb_jwork.  The return value is nb_jnewblk as it was on entry,
      * which may be NULL.
7181  */
7182 static struct jnewblk *
7183 cancel_newblk(newblk, wk, wkhd)
7184         struct newblk *newblk;
7185         struct worklist *wk;
7186         struct workhead *wkhd;
7187 {
7188         struct jnewblk *jnewblk;
7189
7190         CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7191
7192         newblk->nb_state |= GOINGAWAY;
7193         /*
7194          * Previously we traversed the completedhd on each indirdep
7195          * attached to this newblk to cancel them and gather journal
7196          * work.  Since we need only the oldest journal segment and
7197          * the lowest point on the tree will always have the oldest
7198          * journal segment we are free to release the segments
7199          * of any subordinates and may leave the indirdep list to
7200          * indirdep_complete() when this newblk is freed.
7201          */
7202         if (newblk->nb_state & ONDEPLIST) {
7203                 newblk->nb_state &= ~ONDEPLIST;
7204                 LIST_REMOVE(newblk, nb_deps);
7205         }
7206         if (newblk->nb_state & ONWORKLIST)
7207                 WORKLIST_REMOVE(&newblk->nb_list);
7208         /*
7209          * If the journal entry hasn't been written we save a pointer to
7210          * the dependency that frees it until it is written or the
7211          * superseding operation completes.
7212          */
7213         jnewblk = newblk->nb_jnewblk;
7214         if (jnewblk != NULL && wk != NULL) {
7215                 newblk->nb_jnewblk = NULL;
7216                 jnewblk->jn_dep = wk;
7217         }
7218         if (!LIST_EMPTY(&newblk->nb_jwork))
7219                 jwork_move(wkhd, &newblk->nb_jwork);
7220         /*
7221          * When truncating we must free the newdirblk early to remove
7222          * the pagedep from the hash before returning.
7223          */
             /* The wk parameter is reused below as a scratch list cursor. */
7224         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7225                 free_newdirblk(WK_NEWDIRBLK(wk));
             /* Only the first newdirblk is freed; any others are fatal. */
7226         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7227                 panic("cancel_newblk: extra newdirblk");
7228
7229         return (jnewblk);
7230 }
7231
7232 /*
7233  * Schedule the freefrag associated with a newblk to be released once
7234  * the pointers are written and the previous block is no longer needed.
      *
      * Does nothing when the newblk has no freefrag.  Otherwise the freefrag
      * is detached from the newblk, marked COMPLETE, and handed to
      * add_to_worklist() only if that makes it ALLCOMPLETE.
7235  */
7236 static void
7237 newblk_freefrag(newblk)
7238         struct newblk *newblk;
7239 {
7240         struct freefrag *freefrag;
7241
7242         if (newblk->nb_freefrag == NULL)
7243                 return;
7244         freefrag = newblk->nb_freefrag;
             /* Detach first so the freefrag cannot be scheduled twice from here. */
7245         newblk->nb_freefrag = NULL;
7246         freefrag->ff_state |= COMPLETE;
7247         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7248                 add_to_worklist(&freefrag->ff_list, 0);
7249 }
7250
7251 /*
7252  * Free a newblk. Generate a new freefrag work request if appropriate.
7253  * This must be called after the inode pointer and any direct block pointers
7254  * are valid or fully removed via truncate or frag extension.
      *
      * The caller must hold the softdep lock (checked with LOCK_OWNED) and
      * must already have detached any jnewblk (checked with KASSERT).
7255  */
7256 static void
7257 free_newblk(newblk)
7258         struct newblk *newblk;
7259 {
7260         struct indirdep *indirdep;
7261         struct worklist *wk;
7262
7263         KASSERT(newblk->nb_jnewblk == NULL,
7264             ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
             /* A work item still typed D_NEWBLK is treated as unclaimed. */
7265         KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7266             ("free_newblk: unclaimed newblk"));
7267         LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7268         newblk_freefrag(newblk);
             /* Unlink from every list the state flags say we are still on. */
7269         if (newblk->nb_state & ONDEPLIST)
7270                 LIST_REMOVE(newblk, nb_deps);
7271         if (newblk->nb_state & ONWORKLIST)
7272                 WORKLIST_REMOVE(&newblk->nb_list);
7273         LIST_REMOVE(newblk, nb_hash);
             /* Only the first newdirblk is freed; any others are fatal. */
7274         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7275                 free_newdirblk(WK_NEWDIRBLK(wk));
7276         if (!LIST_EMPTY(&newblk->nb_newdirblk))
7277                 panic("free_newblk: extra newdirblk");
             /*
              * Loop until the list is empty; this relies on
              * indirdep_complete() taking the indirdep off nb_indirdeps.
              */
7278         while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7279                 indirdep_complete(indirdep);
7280         handle_jwork(&newblk->nb_jwork);
7281         WORKITEM_FREE(newblk, D_NEWBLK);
7282 }
7283
7284 /*
7285  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7286  * This routine must be called with splbio interrupts blocked.
      *
      * The softdep lock must be held by the caller (checked with LOCK_OWNED).
      * Any mkdir items waiting on db_mkdir are completed with MKDIR_BODY
      * before the newdirblk itself is released.
7287  */
7288 static void
7289 free_newdirblk(newdirblk)
7290         struct newdirblk *newdirblk;
7291 {
7292         struct pagedep *pagedep;
7293         struct diradd *dap;
7294         struct worklist *wk;
7295
7296         LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7297         WORKLIST_REMOVE(&newdirblk->db_list);
7298         /*
7299          * If the pagedep is still linked onto the directory buffer
7300          * dependency chain, then some of the entries on the
7301          * pd_pendinghd list may not be committed to disk yet. In
7302          * this case, we will simply clear the NEWBLOCK flag and
7303          * let the pd_pendinghd list be processed when the pagedep
7304          * is next written. If the pagedep is no longer on the buffer
7305          * dependency chain, then all the entries on the pd_pending
7306          * list are committed to disk and we can free them here.
7307          */
7308         pagedep = newdirblk->db_pagedep;
7309         pagedep->pd_state &= ~NEWBLOCK;
7310         if ((pagedep->pd_state & ONWORKLIST) == 0) {
                     /* Relies on free_diradd() unlinking dap from the list. */
7311                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7312                         free_diradd(dap, NULL);
7313                 /*
7314                  * If no dependencies remain, the pagedep will be freed.
7315                  */
7316                 free_pagedep(pagedep);
7317         }
7318         /* Should only ever be one item in the list. */
7319         while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7320                 WORKLIST_REMOVE(wk);
7321                 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7322         }
7323         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7324 }
7325
7326 /*
7327  * Prepare an inode to be freed. The actual free operation is not
7328  * done until the zero'ed inode has been written to disk.
      *
      * pvp supplies the mount, the device vnode and the in-core inode used
      * for locking; ino is the number of the inode being freed and mode is
      * recorded in the freefile as fx_mode.  The freefile is either handled
      * immediately or queued on the inodedep's id_inowait list.
7329  */
7330 void
7331 softdep_freefile(pvp, ino, mode)
7332         struct vnode *pvp;
7333         ino_t ino;
7334         int mode;
7335 {
7336         struct inode *ip = VTOI(pvp);
7337         struct inodedep *inodedep;
7338         struct freefile *freefile;
7339         struct freeblks *freeblks;
7340         struct ufsmount *ump;
7341
7342         ump = ip->i_ump;
7343         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7344             ("softdep_freefile called on non-softdep filesystem"));
7345         /*
7346          * This sets up the inode de-allocation dependency.
7347          */
7348         freefile = malloc(sizeof(struct freefile),
7349                 M_FREEFILE, M_SOFTDEP_FLAGS);
7350         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7351         freefile->fx_mode = mode;
7352         freefile->fx_oldinum = ino;
7353         freefile->fx_devvp = ip->i_devvp;
7354         LIST_INIT(&freefile->fx_jwork);
             /* The pending inode count is updated under the UFS lock. */
7355         UFS_LOCK(ump);
7356         ip->i_fs->fs_pendinginodes += 1;
7357         UFS_UNLOCK(ump);
7358
7359         /*
7360          * If the inodedep does not exist, then the zero'ed inode has
7361          * been written to disk. If the allocated inode has never been
7362          * written to disk, then the on-disk inode is zero'ed. In either
7363          * case we can free the file immediately.  If the journal was
7364          * canceled before being written the inode will never make it to
7365          * disk and we must send the canceled journal entries to
7366          * ffs_freefile() to be cleared in conjunction with the bitmap.
7367          * Any blocks waiting on the inode to write can be safely freed
7368          * here as it will never be written.
7369          */
7370         ACQUIRE_LOCK(ump);
             /* Flags of 0: look up only; inodedep may come back NULL. */
7371         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7372         if (inodedep) {
7373                 /*
7374                  * Clear out freeblks that no longer need to reference
7375                  * this inode.
7376                  */
7377                 while ((freeblks =
7378                     TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7379                         TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7380                             fb_next);
7381                         freeblks->fb_state &= ~ONDEPLIST;
7382                 }
7383                 /*
7384                  * Remove this inode from the unlinked list.
7385                  */
7386                 if (inodedep->id_state & UNLINKED) {
7387                         /*
7388                          * Save the journal work to be freed with the bitmap
7389                          * before we clear UNLINKED.  Otherwise it can be lost
7390                          * if the inode block is written.
7391                          */
7392                         handle_bufwait(inodedep, &freefile->fx_jwork);
7393                         clear_unlinked_inodedep(inodedep);
7394                         /*
7395                          * Re-acquire inodedep as we've dropped the
7396                          * soft updates lock in clear_unlinked_inodedep().
7397                          */
7398                         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7399                 }
7400         }
             /*
              * Immediate path: no inodedep, or check_inode_unwritten()
              * freed it.  The freefile is processed without the lock held.
              */
7401         if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7402                 FREE_LOCK(ump);
7403                 handle_workitem_freefile(freefile);
7404                 return;
7405         }
             /* Deferred path: park the freefile on the inodedep. */
7406         if ((inodedep->id_state & DEPCOMPLETE) == 0)
7407                 inodedep->id_state |= GOINGAWAY;
7408         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7409         FREE_LOCK(ump);
             /* pvp's inode is flagged only when it is the inode being freed. */
7410         if (ip->i_number == ino)
7411                 ip->i_flag |= IN_MODIFIED;
7412 }
7413
7414 /*
7415  * Check to see if an inode has never been written to disk. If
7416  * so free the inodedep and return success, otherwise return failure.
7417  * This routine must be called with splbio interrupts blocked.
7418  *
7419  * If we still have a bitmap dependency, then the inode has never
7420  * been written to disk. Drop the dependency as it is no longer
7421  * necessary since the inode is being deallocated. We set the
7422  * ALLCOMPLETE flags since the bitmap now properly shows that the
7423  * inode is not allocated. Even if the inode is actively being
7424  * written, it has been rolled back to its zero'ed state, so we
7425  * are ensured that a zero inode is what is on the disk. For short
7426  * lived files, this change will usually result in removing all the
7427  * dependencies from the inode so that it can be freed immediately.
      *
      * Returns 1 when the inodedep was freed and 0 when it was left alone.
      * The softdep lock must be held (checked with LOCK_OWNED).
7428  */
7429 static int
7430 check_inode_unwritten(inodedep)
7431         struct inodedep *inodedep;
7432 {
7433
7434         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7435
             /*
              * Refuse if DEPCOMPLETE or UNLINKED is set, or if any dependency
              * list, mkdir reference or link count delta is still outstanding.
              */
7436         if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7437             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7438             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7439             !LIST_EMPTY(&inodedep->id_bufwait) ||
7440             !LIST_EMPTY(&inodedep->id_inowait) ||
7441             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7442             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7443             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7444             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7445             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7446             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7447             inodedep->id_mkdiradd != NULL ||
7448             inodedep->id_nlinkdelta != 0)
7449                 return (0);
7450         /*
7451          * Another process might be in initiate_write_inodeblock_ufs[12]
7452          * trying to allocate memory without holding "Softdep Lock".
7453          */
7454         if ((inodedep->id_state & IOSTARTED) != 0 &&
7455             inodedep->id_savedino1 == NULL)
7456                 return (0);
7457
             /*
              * Strip every remaining reference so that free_inodedep()
              * below has no reason to refuse.
              */
7458         if (inodedep->id_state & ONDEPLIST)
7459                 LIST_REMOVE(inodedep, id_deps);
7460         inodedep->id_state &= ~ONDEPLIST;
7461         inodedep->id_state |= ALLCOMPLETE;
7462         inodedep->id_bmsafemap = NULL;
7463         if (inodedep->id_state & ONWORKLIST)
7464                 WORKLIST_REMOVE(&inodedep->id_list);
7465         if (inodedep->id_savedino1 != NULL) {
7466                 free(inodedep->id_savedino1, M_SAVEDINO);
7467                 inodedep->id_savedino1 = NULL;
7468         }
             /* A refusal here means the checks above missed something. */
7469         if (free_inodedep(inodedep) == 0)
7470                 panic("check_inode_unwritten: busy inode");
7471         return (1);
7472 }
7473
7474 /*
7475  * Try to free an inodedep structure. Return 1 if it could be freed.
      *
      * Returns 0, leaving the inodedep untouched, when it is still on a
      * worklist, is UNLINKED, is not ALLCOMPLETE, or has any dependency
      * list, mkdir reference, link count delta or saved inode copy
      * outstanding.  The softdep lock must be held (checked with
      * LOCK_OWNED).
7476  */
7477 static int
7478 free_inodedep(inodedep)
7479         struct inodedep *inodedep;
7480 {
7481
7482         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7483         if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7484             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7485             !LIST_EMPTY(&inodedep->id_dirremhd) ||
7486             !LIST_EMPTY(&inodedep->id_pendinghd) ||
7487             !LIST_EMPTY(&inodedep->id_bufwait) ||
7488             !LIST_EMPTY(&inodedep->id_inowait) ||
7489             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7490             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7491             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7492             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7493             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7494             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7495             inodedep->id_mkdiradd != NULL ||
7496             inodedep->id_nlinkdelta != 0 ||
7497             inodedep->id_savedino1 != NULL)
7498                 return (0);
             /* Nothing references it any more: unlink from the lists and free. */
7499         if (inodedep->id_state & ONDEPLIST)
7500                 LIST_REMOVE(inodedep, id_deps);
7501         LIST_REMOVE(inodedep, id_hash);
7502         WORKITEM_FREE(inodedep, D_INODEDEP);
7503         return (1);
7504 }
7505
7506 /*
7507  * Free the block referenced by a freework structure.  The parent freeblks
7508  * structure is released and completed when the final cg bitmap reaches
7509  * the disk.  This routine may be freeing a jnewblk which never made it to
7510  * disk in which case we do not have to wait as the operation is undone
7511  * in memory immediately.
      *
      * Called with the softdep lock held (checked with LOCK_OWNED).  The
      * lock is dropped around freeblks_free() and ffs_blkfree() and is held
      * again on return.
7512  */
7513 static void
7514 freework_freeblock(freework)
7515         struct freework *freework;
7516 {
7517         struct freeblks *freeblks;
7518         struct jnewblk *jnewblk;
7519         struct ufsmount *ump;
7520         struct workhead wkhd;
7521         struct fs *fs;
7522         int bsize;
7523         int needj;
7524
7525         ump = VFSTOUFS(freework->fw_list.wk_mp);
7526         LOCK_OWNED(ump);
7527         /*
7528          * Handle partial truncate separately.
7529          */
7530         if (freework->fw_indir) {
7531                 complete_trunc_indir(freework);
7532                 return;
7533         }
7534         freeblks = freework->fw_freeblks;
7535         fs = ump->um_fs;
             /* needj: nonzero when the mount is journaled (MOUNTEDSUJ). */
7536         needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7537         bsize = lfragtosize(fs, freework->fw_frags);
             /* wkhd collects the work items handed to ffs_blkfree(). */
7538         LIST_INIT(&wkhd);
7539         /*
7540          * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7541          * on the indirblk hashtable and prevents premature freeing.
7542          */
7543         freework->fw_state |= DEPCOMPLETE;
7544         /*
7545          * SUJ needs to wait for the segment referencing freed indirect
7546          * blocks to expire so that we know the checker will not confuse
7547          * a re-allocated indirect block with its old contents.
7548          */
7549         if (needj && freework->fw_lbn <= -NDADDR)
7550                 indirblk_insert(freework);
7551         /*
7552          * If we are canceling an existing jnewblk pass it to the free
7553          * routine, otherwise pass the freeblk which will ultimately
7554          * release the freeblks.  If we're not journaling, we can just
7555          * free the freeblks immediately.
7556          */
7557         jnewblk = freework->fw_jnewblk;
7558         if (jnewblk != NULL) {
7559                 cancel_jnewblk(jnewblk, &wkhd);
                     /* Clearing needj selects the immediate completion below. */
7560                 needj = 0;
7561         } else if (needj) {
7562                 freework->fw_state |= DELAYEDFREE;
7563                 freeblks->fb_cgwait++;
7564                 WORKLIST_INSERT(&wkhd, &freework->fw_list);
7565         }
             /* The accounting and block free below run without the lock. */
7566         FREE_LOCK(ump);
7567         freeblks_free(ump, freeblks, btodb(bsize));
7568         CTR4(KTR_SUJ,
7569             "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7570             freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7571         ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7572             freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7573         ACQUIRE_LOCK(ump);
7574         /*
7575          * The jnewblk will be discarded and the bits in the map never
7576          * made it to disk.  We can immediately free the freeblk.
7577          */
7578         if (needj == 0)
7579                 handle_written_freework(freework);
7580 }
7581
7582 /*
7583  * We enqueue freework items that need processing back on the freeblks and
7584  * add the freeblks to the worklist.  This makes it easier to find all work
7585  * required to flush a truncation in process_truncates().
      *
      * The freework is re-inserted only when it is not marked INPROGRESS.
      * The freeblks is queued only when it is ALLCOMPLETE, is neither
      * ONWORKLIST nor INPROGRESS, and has an empty fb_jblkdephd list.
7586  */
7587 static void
7588 freework_enqueue(freework)
7589         struct freework *freework;
7590 {
7591         struct freeblks *freeblks;
7592
7593         freeblks = freework->fw_freeblks;
7594         if ((freework->fw_state & INPROGRESS) == 0)
7595                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
             /*
              * The mask test passes only if all ALLCOMPLETE bits are set and
              * both ONWORKLIST and INPROGRESS are clear.
              */
7596         if ((freeblks->fb_state &
7597             (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7598             LIST_EMPTY(&freeblks->fb_jblkdephd))
7599                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7600 }
7601
7602 /*
7603  * Start, continue, or finish the process of freeing an indirect block tree.
7604  * The free operation may be paused at any point with fw_off containing the
7605  * offset to restart from.  This enables us to implement some flow control
7606  * for large truncates which may fan out and generate a huge number of
7607  * dependencies.
      *
      * Three outcomes, tested in order: a freework already DEPCOMPLETE is
      * completed; one whose fw_off equals NINDIR(fs) has its own block
      * freed; anything else is marked INPROGRESS and passed to
      * indir_trunc() with the softdep lock released for that call.
7608  */
7609 static void
7610 handle_workitem_indirblk(freework)
7611         struct freework *freework;
7612 {
7613         struct freeblks *freeblks;
7614         struct ufsmount *ump;
7615         struct fs *fs;
7616
7617         freeblks = freework->fw_freeblks;
7618         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7619         fs = ump->um_fs;
7620         if (freework->fw_state & DEPCOMPLETE) {
7621                 handle_written_freework(freework);
7622                 return;
7623         }
             /* fw_off == NINDIR(fs): presumably every slot was processed. */
7624         if (freework->fw_off == NINDIR(fs)) {
7625                 freework_freeblock(freework);
7626                 return;
7627         }
7628         freework->fw_state |= INPROGRESS;
             /* indir_trunc() is called without the softdep lock held. */
7629         FREE_LOCK(ump);
7630         indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7631             freework->fw_lbn);
7632         ACQUIRE_LOCK(ump);
7633 }
7634
7635 /*
7636  * Called when a freework structure attached to a cg buf is written.  The
7637  * ref on either the parent or the freeblks structure is released and
7638  * the freeblks is added back to the worklist if there is more work to do.
      *
      * The freework itself is freed here once it becomes ALLCOMPLETE.
7639  */
7640 static void
7641 handle_written_freework(freework)
7642         struct freework *freework;
7643 {
7644         struct freeblks *freeblks;
7645         struct freework *parent;
7646
             /*
              * Cache both pointers now: the freework may be freed below
              * and must not be dereferenced after that.
              */
7647         freeblks = freework->fw_freeblks;
7648         parent = freework->fw_parent;
             /* Undo the fb_cgwait count that goes with DELAYEDFREE. */
7649         if (freework->fw_state & DELAYEDFREE)
7650                 freeblks->fb_cgwait--;
7651         freework->fw_state |= COMPLETE;
7652         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7653                 WORKITEM_FREE(freework, D_FREEWORK);
             /* With a parent, the reference dropped is the parent's. */
7654         if (parent) {
7655                 if (--parent->fw_ref == 0)
7656                         freework_enqueue(parent);
7657                 return;
7658         }
             /* No parent: drop the reference held on the freeblks. */
7659         if (--freeblks->fb_ref != 0)
7660                 return;
7661         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7662             ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7663                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7664 }
7665
7666 /*
7667  * This workitem routine performs the block de-allocation.
7668  * The workitem is added to the pending list after the updated
7669  * inode block has been written to disk.  As mentioned above,
7670  * checks regarding the number of blocks de-allocated (compared
7671  * to the number of blocks allocated for the file) are also
7672  * performed in this function.
      *
      * Drains fb_freeworkhd, dispatching each item by type.  Returns 0 if
      * references to the freeblks remain afterwards; otherwise returns the
      * result of handle_complete_freeblocks(freeblks, flags).
7673  */
7674 static int
7675 handle_workitem_freeblocks(freeblks, flags)
7676         struct freeblks *freeblks;
7677         int flags;
7678 {
7679         struct freework *freework;
7680         struct newblk *newblk;
7681         struct allocindir *aip;
7682         struct ufsmount *ump;
7683         struct worklist *wk;
7684
7685         KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7686             ("handle_workitem_freeblocks: Journal entries not written."));
7687         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7688         ACQUIRE_LOCK(ump);
             /*
              * Re-read the list head each pass: the handlers below may
              * drop the lock and may add to or remove from the list.
              */
7689         while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7690                 WORKLIST_REMOVE(wk);
7691                 switch (wk->wk_type) {
7692                 case D_DIRREM:
7693                         wk->wk_state |= COMPLETE;
7694                         add_to_worklist(wk, 0);
7695                         continue;
7696
7697                 case D_ALLOCDIRECT:
7698                         free_newblk(WK_NEWBLK(wk));
7699                         continue;
7700
7701                 case D_ALLOCINDIR:
7702                         aip = WK_ALLOCINDIR(wk);
7703                         freework = NULL;
7704                         if (aip->ai_state & DELAYEDFREE) {
                                     /* newfreework() is called unlocked. */
7705                                 FREE_LOCK(ump);
7706                                 freework = newfreework(ump, freeblks, NULL,
7707                                     aip->ai_lbn, aip->ai_newblkno,
7708                                     ump->um_fs->fs_frag, 0, 0);
7709                                 ACQUIRE_LOCK(ump);
7710                         }
7711                         newblk = WK_NEWBLK(wk);
                             /*
                              * NOTE(review): freework stays NULL unless
                              * DELAYEDFREE was set above, yet it is
                              * dereferenced here whenever nb_jnewblk is
                              * non-NULL.  Presumably a pending jnewblk
                              * implies DELAYEDFREE; confirm.
                              */
7712                         if (newblk->nb_jnewblk) {
7713                                 freework->fw_jnewblk = newblk->nb_jnewblk;
7714                                 newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7715                                 newblk->nb_jnewblk = NULL;
7716                         }
7717                         free_newblk(newblk);
7718                         continue;
7719
7720                 case D_FREEWORK:
7721                         freework = WK_FREEWORK(wk);
                             /* lbn <= -NDADDR takes the indirect path. */
7722                         if (freework->fw_lbn <= -NDADDR)
7723                                 handle_workitem_indirblk(freework);
7724                         else
7725                                 freework_freeblock(freework);
7726                         continue;
7727                 default:
7728                         panic("handle_workitem_freeblocks: Unknown type %s",
7729                             TYPENAME(wk->wk_type));
7730                 }
7731         }
             /*
              * Work is still outstanding: clear INPROGRESS, wake any
              * waiters, and skip the completion step below.
              */
7732         if (freeblks->fb_ref != 0) {
7733                 freeblks->fb_state &= ~INPROGRESS;
7734                 wake_worklist(&freeblks->fb_list);
7735                 freeblks = NULL;
7736         }
7737         FREE_LOCK(ump);
7738         if (freeblks)
7739                 return handle_complete_freeblocks(freeblks, flags);
7740         return (0);
7741 }
7742
7743 /*
7744  * Handle completion of block free via truncate.  This allows fs_pending
7745  * to track the actual free block count more closely than if we only updated
7746  * it at the end.  We must be careful to handle cases where the block count
7747  * on free was incorrect.
      *
      * blocks is added to fb_chkcnt.  fs_pendingblocks is reduced by at most
      * the amount by which fb_chkcnt was still negative before this call.
      * Both updates are made under the UFS lock.
7748  */
7749 static void
7750 freeblks_free(ump, freeblks, blocks)
7751         struct ufsmount *ump;
7752         struct freeblks *freeblks;
7753         int blocks;
7754 {
7755         struct fs *fs;
7756         ufs2_daddr_t remain;
7757
7758         UFS_LOCK(ump);
             /* remain is positive while fb_chkcnt is still negative. */
7759         remain = -freeblks->fb_chkcnt;
7760         freeblks->fb_chkcnt += blocks;
7761         if (remain > 0) {
                     /* Never take more off the pending count than remains. */
7762                 if (remain < blocks)
7763                         blocks = remain;
7764                 fs = ump->um_fs;
7765                 fs->fs_pendingblocks -= blocks;
7766         }
7767         UFS_UNLOCK(ump);
7768 }
7769
7770 /*
7771  * Once all of the freework workitems are complete we can retire the
7772  * freeblocks dependency and any journal work awaiting completion.  This
7773  * can not be called until all other dependencies are stable on disk.
      *
      * flags is OR-ed with LK_EXCLUSIVE and passed to ffs_vgetf().  Returns
      * EBUSY, with the freeblks left intact, if the vnode cannot be
      * obtained; otherwise frees the freeblks and returns 0.
7774  */
7775 static int
7776 handle_complete_freeblocks(freeblks, flags)
7777         struct freeblks *freeblks;
7778         int flags;
7779 {
7780         struct inodedep *inodedep;
7781         struct inode *ip;
7782         struct vnode *vp;
7783         struct fs *fs;
7784         struct ufsmount *ump;
7785         ufs2_daddr_t spare;
7786
7787         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7788         fs = ump->um_fs;
7789         flags = LK_EXCLUSIVE | flags;
             /* A nonzero fb_chkcnt is the block count discrepancy. */
7790         spare = freeblks->fb_chkcnt;
7791
7792         /*
7793          * If we did not release the expected number of blocks we may have
7794          * to adjust the inode block count here.  Only do so if it wasn't
7795          * a truncation to zero and the modrev still matches.
7796          */
7797         if (spare && freeblks->fb_len != 0) {
7798                 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7799                     flags, &vp, FFSV_FORCEINSMQ) != 0)
7800                         return (EBUSY);
7801                 ip = VTOI(vp);
7802                 if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7803                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7804                         ip->i_flag |= IN_CHANGE;
7805                         /*
7806                          * We must wait so this happens before the
7807                          * journal is reclaimed.
7808                          */
7809                         ffs_update(vp, 1);
7810                 }
7811                 vput(vp);
7812         }
             /* spare is negative here, so this lowers fs_pendingblocks. */
7813         if (spare < 0) {
7814                 UFS_LOCK(ump);
7815                 fs->fs_pendingblocks += spare;
7816                 UFS_UNLOCK(ump);
7817         }
7818 #ifdef QUOTA
7819         /* Handle spare. */
7820         if (spare)
7821                 quotaadj(freeblks->fb_quota, ump, -spare);
7822         quotarele(freeblks->fb_quota);
7823 #endif
7824         ACQUIRE_LOCK(ump);
7825         if (freeblks->fb_state & ONDEPLIST) {
                     /*
                      * NOTE(review): the lookup result is used without a
                      * NULL check; presumably ONDEPLIST guarantees that
                      * the inodedep exists.  Confirm.
                      */
7826                 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7827                     0, &inodedep);
7828                 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
7829                 freeblks->fb_state &= ~ONDEPLIST;
7830                 if (TAILQ_EMPTY(&inodedep->id_freeblklst))
7831                         free_inodedep(inodedep);
7832         }
7833         /*
7834          * All of the freeblock deps must be complete prior to this call
7835          * so it's now safe to complete earlier outstanding journal entries.
7836          */
7837         handle_jwork(&freeblks->fb_jwork);
7838         WORKITEM_FREE(freeblks, D_FREEBLKS);
7839         FREE_LOCK(ump);
7840         return (0);
7841 }
7842
7843 /*
7844  * Release blocks associated with the freeblks and stored in the indirect
7845  * block dbn. If level is greater than SINGLE, the block is an indirect block
7846  * and recursive calls to indir_trunc() must be used to cleanse other indirect
7847  * blocks.
7848  *
7849  * This handles partial and complete truncation of blocks.  Partial is noted
7850  * with goingaway == 0.  In this case the freework is completed after the
7851  * zero'd indirects are written to disk.  For full truncation the freework
7852  * is completed after the block is freed.
      *
      * freework is the work item for this indirect; its fw_off gives the first
      * pointer slot to process and fw_indir, when set, marks a partial
      * truncate.  dbn is the device block number of the indirect block and
      * lbn its logical block number, from which the indirection level is
      * derived.  Nothing is freed if the indirect cannot be read from disk.
7853  */
7854 static void
7855 indir_trunc(freework, dbn, lbn)
7856         struct freework *freework;
7857         ufs2_daddr_t dbn;
7858         ufs_lbn_t lbn;
7859 {
7860         struct freework *nfreework;
7861         struct workhead wkhd;
7862         struct freeblks *freeblks;
7863         struct buf *bp;
7864         struct fs *fs;
7865         struct indirdep *indirdep;
7866         struct ufsmount *ump;
7867         ufs1_daddr_t *bap1 = 0;
7868         ufs2_daddr_t nb, nnb, *bap2 = 0;
7869         ufs_lbn_t lbnadd, nlbn;
7870         int i, nblocks, ufs1fmt;
7871         int freedblocks;
7872         int goingaway;
7873         int freedeps;
7874         int needj;
7875         int level;
7876         int cnt;
7877
7878         freeblks = freework->fw_freeblks;
7879         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7880         fs = ump->um_fs;
7881         /*
7882          * Get buffer of block pointers to be freed.  There are three cases:
7883          *
7884          * 1) Partial truncate caches the indirdep pointer in the freework
7885          *    which provides us a back copy to the save bp which holds the
7886          *    pointers we want to clear.  When this completes the zero
7887          *    pointers are written to the real copy.
7888          * 2) The indirect is being completely truncated, cancel_indirdep()
7889          *    eliminated the real copy and placed the indirdep on the saved
7890          *    copy.  The indirdep and buf are discarded when this completes.
7891          * 3) The indirect was not in memory, we read a copy off of the disk
7892          *    using the devvp and drop and invalidate the buffer when we're
7893          *    done.
7894          */
7895         goingaway = 1;
7896         indirdep = NULL;
7897         if (freework->fw_indir != NULL) {
7898                 goingaway = 0;
7899                 indirdep = freework->fw_indir;
7900                 bp = indirdep->ir_savebp;
7901                 if (bp == NULL || bp->b_blkno != dbn)
7902                         panic("indir_trunc: Bad saved buf %p blkno %jd",
7903                             bp, (intmax_t)dbn);
7904         } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
7905                 /*
7906                  * The lock prevents the buf dep list from changing and
7907                  * indirects on devvp should only ever have one dependency.
7908                  */
7909                 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
7910                 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
7911                         panic("indir_trunc: Bad indirdep %p from buf %p",
7912                             indirdep, bp);
7913         } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
7914             NOCRED, &bp) != 0) {
                     /* Read failed: drop the buffer and give up on this block. */
7915                 brelse(bp);
7916                 return;
7917         }
7918         ACQUIRE_LOCK(ump);
7919         /* Protects against a race with complete_trunc_indir(). */
7920         freework->fw_state &= ~INPROGRESS;
7921         /*
7922          * If we have an indirdep we need to enforce the truncation order
7923          * and discard it when it is complete.
7924          */
7925         if (indirdep) {
7926                 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
7927                     !TAILQ_EMPTY(&indirdep->ir_trunc)) {
7928                         /*
7929                          * Add the complete truncate to the list on the
7930                          * indirdep to enforce in-order processing.
7931                          */
7932                         if (freework->fw_indir == NULL)
7933                                 TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
7934                                     freework, fw_next);
7935                         FREE_LOCK(ump);
7936                         return;
7937                 }
7938                 /*
7939                  * If we're goingaway, free the indirdep.  Otherwise it will
7940                  * linger until the write completes.
7941                  */
7942                 if (goingaway) {
7943                         free_indirdep(indirdep);
7944                         ump->softdep_numindirdeps -= 1;
7945                 }
7946         }
7947         FREE_LOCK(ump);
7948         /* Initialize pointers depending on block size. */
7949         if (ump->um_fstype == UFS1) {
7950                 bap1 = (ufs1_daddr_t *)bp->b_data;
7951                 nb = bap1[freework->fw_off];
7952                 ufs1fmt = 1;
7953         } else {
7954                 bap2 = (ufs2_daddr_t *)bp->b_data;
7955                 nb = bap2[freework->fw_off];
7956                 ufs1fmt = 0;
7957         }
7958         level = lbn_level(lbn);
7959         needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
7960         lbnadd = lbn_offset(fs, level);
7961         nblocks = btodb(fs->fs_bsize);
             /* Without journaling the recursion below reuses this freework. */
7962         nfreework = freework;
7963         freedeps = 0;
7964         cnt = 0;
7965         /*
7966          * Reclaim blocks.  Traverses into nested indirect levels and
7967          * arranges for the current level to be freed when subordinates
7968          * are free when journaling.
7969          */
7970         for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
                     /*
                      * Fetch the pointer of the following slot up front (0
                      * past the last slot); the leaf case below compares its
                      * cylinder group with that of nb.
                      */
7971                 if (i != NINDIR(fs) - 1) {
7972                         if (ufs1fmt)
7973                                 nnb = bap1[i+1];
7974                         else
7975                                 nnb = bap2[i+1];
7976                 } else
7977                         nnb = 0;
                     /* A zero pointer leaves nothing to free for this slot. */
7978                 if (nb == 0)
7979                         continue;
7980                 cnt++;
7981                 if (level != 0) {
                             /* Logical block number handed to the child. */
7982                         nlbn = (lbn + 1) - (i * lbnadd);
                             /* When journaling each child gets its own freework. */
7983                         if (needj != 0) {
7984                                 nfreework = newfreework(ump, freeblks, freework,
7985                                     nlbn, nb, fs->fs_frag, 0, 0);
7986                                 freedeps++;
7987                         }
7988                         indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
7989                 } else {
7990                         struct freedep *freedep;
7991
7992                         /*
7993                          * Attempt to aggregate freedep dependencies for
7994                          * all blocks being released to the same CG.
7995                          */
7996                         LIST_INIT(&wkhd);
7997                         if (needj != 0 &&
7998                             (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
7999                                 freedep = newfreedep(freework);
8000                                 WORKLIST_INSERT_UNLOCKED(&wkhd,
8001                                     &freedep->fd_list);
8002                                 freedeps++;
8003                         }
8004                         CTR3(KTR_SUJ,
8005                             "indir_trunc: ino %d blkno %jd size %ld",
8006                             freeblks->fb_inum, nb, fs->fs_bsize);
8007                         ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8008                             fs->fs_bsize, freeblks->fb_inum,
8009                             freeblks->fb_vtype, &wkhd);
8010                 }
8011         }
             /* A buffer that is not a partial-truncate save copy is discarded. */
8012         if (goingaway) {
8013                 bp->b_flags |= B_INVAL | B_NOCACHE;
8014                 brelse(bp);
8015         }
             /*
              * Account for the blocks released here: the data blocks at the
              * leaf level plus, when not journaling, this indirect itself,
              * which is freed further down.
              */
8016         freedblocks = 0;
8017         if (level == 0)
8018                 freedblocks = (nblocks * cnt);
8019         if (needj == 0)
8020                 freedblocks += nblocks;
8021         freeblks_free(ump, freeblks, freedblocks);
8022         /*
8023          * If we are journaling set up the ref counts and offset so this
8024          * indirect can be completed when its children are free.
8025          */
8026         if (needj) {
8027                 ACQUIRE_LOCK(ump);
8028                 freework->fw_off = i;
8029                 freework->fw_ref += freedeps;
                     /*
                      * NOTE(review): presumably drops the NINDIR(fs) + 1
                      * placeholder references set up when the freework was
                      * created -- confirm against newfreework().
                      */
8030                 freework->fw_ref -= NINDIR(fs) + 1;
8031                 if (level == 0)
8032                         freeblks->fb_cgwait += freedeps;
8033                 if (freework->fw_ref == 0)
8034                         freework_freeblock(freework);
8035                 FREE_LOCK(ump);
8036                 return;
8037         }
8038         /*
8039          * If we're not journaling we can free the indirect now.
8040          */
             /* ffs_blkfree() is given a filesystem block, not a device block. */
8041         dbn = dbtofsb(fs, dbn);
8042         CTR3(KTR_SUJ,
8043             "indir_trunc 2: ino %d blkno %jd size %ld",
8044             freeblks->fb_inum, dbn, fs->fs_bsize);
8045         ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8046             freeblks->fb_inum, freeblks->fb_vtype, NULL);
8047         /* Non SUJ softdep does single-threaded truncations. */
             /*
              * NOTE(review): the recursive calls above share this freework
              * when not journaling, so this test presumably singles out the
              * call made for the freework's own block -- confirm.
              */
8048         if (freework->fw_blkno == dbn) {
8049                 freework->fw_state |= ALLCOMPLETE;
8050                 ACQUIRE_LOCK(ump);
8051                 handle_written_freework(freework);
8052                 FREE_LOCK(ump);
8053         }
8054         return;
8055 }
8056
8057 /*
8058  * Tear down an allocindir whose block is being removed by a truncation.
8059  * A non-NULL bp means the indirect never reached the disk; the block is
8060  * then freed separately from the indir so journal work is easier to track.
8061  */
8062 static void
8063 cancel_allocindir(aip, bp, freeblks, trunc)
8064         struct allocindir *aip;
8065         struct buf *bp;
8066         struct freeblks *freeblks;
8067         int trunc;
8068 {
8069         struct newblk *newblk;
8070         struct freefrag *oldfrag;
8071         struct indirdep *ir;
8072
8073         /* Unlink the aip from the list it sits on via ai_next. */
8074         LIST_REMOVE(aip, ai_next);
8075         newblk = (struct newblk *)aip;
8076         /*
8077          * A pointer that has to be released on its own, because of a
8078          * partial truncate or pending journal work, is zeroed in bp and
8079          * the aip is flagged DELAYEDFREE so that it is freed directly.
8080          */
8081         if (bp != NULL && (trunc != 0 || newblk->nb_jnewblk != NULL)) {
8082                 aip->ai_state |= DELAYEDFREE;
8083                 ir = aip->ai_indirdep;
8084                 /* Clear the slot using the indirect's pointer width. */
8085                 if ((ir->ir_state & UFS1FMT) != 0)
8086                         ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8087                 else
8088                         ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8089         }
8090         /*
8091          * On truncation the old pointer is released through savedbp, so
8092          * drop the freefrag that would otherwise free it a second time.
8093          */
8094         oldfrag = trunc != 0 ? newblk->nb_freefrag : NULL;
8095         if (oldfrag != NULL) {
8096                 newblk->nb_freefrag = NULL;
8097                 if (oldfrag->ff_jdep != NULL)
8098                         cancel_jfreefrag(WK_JFREEFRAG(oldfrag->ff_jdep));
8099                 /* Its journal work moves onto the freeblks. */
8100                 jwork_move(&freeblks->fb_jwork, &oldfrag->ff_jwork);
8101                 WORKITEM_FREE(oldfrag, D_FREEFRAG);
8102         }
8103         /*
8104          * With the journal entry not yet written the jnewblk has to reach
8105          * the ffs_blkfree() call that reclaims the space.  It is left on
8106          * the newblk for that purpose and is released once a freework is
8107          * created in handle_workitem_freeblocks().
8108          */
8109         cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8110         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8111 }
8112
8113 /*
8114  * Create the mkdir dependencies for . and .. in a new directory.  Link them
8115  * in to a newdirblk so any subsequent additions are tracked properly.  The
8116  * caller is responsible for adding the mkdir1 dependency to the journal
8117  * and updating id_mkdiradd.  This function returns with the soft updates
8118  * lock held.
      *
      * dap is the diradd for the new directory's entry; newinum and dinum are
      * the inode numbers of the new directory and of its parent; newdirbp is
      * the buffer holding the new directory's initial block.  The MKDIR_BODY
      * mkdir is returned.  The MKDIR_PARENT mkdir is passed back through
      * mkdirp, or NULL when it was freed because, without journaling, the
      * parent has no inodedep or its inodedep is ALLCOMPLETE.
8119  */
8120 static struct mkdir *
8121 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8122         struct diradd *dap;
8123         ino_t newinum;
8124         ino_t dinum;
8125         struct buf *newdirbp;
8126         struct mkdir **mkdirp;
8127 {
8128         struct newblk *newblk;
8129         struct pagedep *pagedep;
8130         struct inodedep *inodedep;
8131         struct newdirblk *newdirblk = 0;
8132         struct mkdir *mkdir1, *mkdir2;
8133         struct worklist *wk;
8134         struct jaddref *jaddref;
8135         struct ufsmount *ump;
8136         struct mount *mp;
8137
8138         mp = dap->da_list.wk_mp;
8139         ump = VFSTOUFS(mp);
             /* All three work items are allocated before the lock is taken. */
8140         newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8141             M_SOFTDEP_FLAGS);
8142         workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8143         LIST_INIT(&newdirblk->db_mkdir);
8144         mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8145         workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8146         mkdir1->md_state = ATTACHED | MKDIR_BODY;
8147         mkdir1->md_diradd = dap;
8148         mkdir1->md_jaddref = NULL;
8149         mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8150         workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8151         mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8152         mkdir2->md_diradd = dap;
8153         mkdir2->md_jaddref = NULL;
             /* Without journaling both mkdirs start out DEPCOMPLETE. */
8154         if (MOUNTEDSUJ(mp) == 0) {
8155                 mkdir1->md_state |= DEPCOMPLETE;
8156                 mkdir2->md_state |= DEPCOMPLETE;
8157         }
8158         /*
8159          * Dependency on "." and ".." being written to disk.
8160          */
8161         mkdir1->md_buf = newdirbp;
             /* The lock taken here is still held when the function returns. */
8162         ACQUIRE_LOCK(VFSTOUFS(mp));
8163         LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8164         /*
8165          * We must link the pagedep, allocdirect, and newdirblk for
8166          * the initial file page so the pointer to the new directory
8167          * is not written until the directory contents are live and
8168          * any subsequent additions are not marked live until the
8169          * block is reachable via the inode.
8170          */
8171         if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8172                 panic("setup_newdir: lost pagedep");
             /* Find the allocdirect among the buffer's dependencies. */
8173         LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8174                 if (wk->wk_type == D_ALLOCDIRECT)
8175                         break;
8176         if (wk == NULL)
8177                 panic("setup_newdir: lost allocdirect");
8178         if (pagedep->pd_state & NEWBLOCK)
8179                 panic("setup_newdir: NEWBLOCK already set");
8180         newblk = WK_NEWBLK(wk);
8181         pagedep->pd_state |= NEWBLOCK;
8182         pagedep->pd_newdirblk = newdirblk;
8183         newdirblk->db_pagedep = pagedep;
8184         WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8185         WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8186         /*
8187          * Look up the inodedep for the parent directory so that we
8188          * can link mkdir2 into the pending dotdot jaddref or
8189          * the inode write if there is none.  If the inode is
8190          * ALLCOMPLETE and no jaddref is present all dependencies have
8191          * been satisfied and mkdir2 can be freed.
8192          */
8193         inodedep_lookup(mp, dinum, 0, &inodedep);
8194         if (MOUNTEDSUJ(mp)) {
8195                 if (inodedep == NULL)
8196                         panic("setup_newdir: Lost parent.");
                     /* The dotdot jaddref is expected last on the list. */
8197                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8198                     inoreflst);
8199                 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8200                     (jaddref->ja_state & MKDIR_PARENT),
8201                     ("setup_newdir: bad dotdot jaddref %p", jaddref));
8202                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8203                 mkdir2->md_jaddref = jaddref;
8204                 jaddref->ja_mkdir = mkdir2;
8205         } else if (inodedep == NULL ||
8206             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
                     /* Nothing left to wait for: drop the MKDIR_PARENT wait. */
8207                 dap->da_state &= ~MKDIR_PARENT;
8208                 WORKITEM_FREE(mkdir2, D_MKDIR);
8209                 mkdir2 = NULL;
8210         } else {
                     /* Queue mkdir2 on the parent inodedep's id_bufwait list. */
8211                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8212                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8213         }
8214         *mkdirp = mkdir2;
8215
8216         return (mkdir1);
8217 }
8218
8219 /*
8220  * Directory entry addition dependencies.
8221  *
8222  * When adding a new directory entry, the inode (with its incremented link
8223  * count) must be written to disk before the directory entry's pointer to it.
8224  * Also, if the inode is newly allocated, the corresponding freemap must be
8225  * updated (on disk) before the directory entry's pointer. These requirements
8226  * are met via undo/redo on the directory entry's pointer, which consists
8227  * simply of the inode number.
8228  *
8229  * As directory entries are added and deleted, the free space within a
8230  * directory block can become fragmented.  The ufs filesystem will compact
8231  * a fragmented directory block to make space for a new entry. When this
8232  * occurs, the offsets of previously added entries change. Any "diradd"
8233  * dependency structures corresponding to these entries must be updated with
8234  * the new offsets.
8235  */
8236
8237 /*
8238  * This routine is called after the in-memory inode's link
8239  * count has been incremented, but before the directory entry's
8240  * pointer to the inode has been set.
      *
      * Returns 1 when a newdirblk was set up for a block at or beyond NDADDR
      * (an indirect), which signals the caller to sync; returns 0 otherwise.
      * The soft updates lock is not held on return.
8241  */
8242 int
8243 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8244         struct buf *bp;         /* buffer containing directory block */
8245         struct inode *dp;       /* inode for directory */
8246         off_t diroffset;        /* offset of new entry in directory */
8247         ino_t newinum;          /* inode referenced by new directory entry */
8248         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
8249         int isnewblk;           /* entry is in a newly allocated block */
8250 {
8251         int offset;             /* offset of new entry within directory block */
8252         ufs_lbn_t lbn;          /* block in directory containing new entry */
8253         struct fs *fs;
8254         struct diradd *dap;
8255         struct newblk *newblk;
8256         struct pagedep *pagedep;
8257         struct inodedep *inodedep;
8258         struct newdirblk *newdirblk = 0;
8259         struct mkdir *mkdir1, *mkdir2;
8260         struct jaddref *jaddref;
8261         struct ufsmount *ump;
8262         struct mount *mp;
8263         int isindir;
8264
8265         ump = dp->i_ump;
8266         mp = UFSTOVFS(ump);
8267         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8268             ("softdep_setup_directory_add called on non-softdep filesystem"));
8269         /*
8270          * Whiteouts have no dependencies.
8271          */
8272         if (newinum == WINO) {
                     /* Any new directory buffer is still handed to bdwrite(). */
8273                 if (newdirbp != NULL)
8274                         bdwrite(newdirbp);
8275                 return (0);
8276         }
8277         jaddref = NULL;
8278         mkdir1 = mkdir2 = NULL;
8279         fs = dp->i_fs;
8280         lbn = lblkno(fs, diroffset);
8281         offset = blkoff(fs, diroffset);
8282         dap = malloc(sizeof(struct diradd), M_DIRADD,
8283                 M_SOFTDEP_FLAGS|M_ZERO);
8284         workitem_alloc(&dap->da_list, D_DIRADD, mp);
8285         dap->da_offset = offset;
8286         dap->da_newinum = newinum;
8287         dap->da_state = ATTACHED;
8288         LIST_INIT(&dap->da_jwork);
             /* Logical blocks at or past NDADDR are treated as indirect. */
8289         isindir = bp->b_lblkno >= NDADDR;
             /*
              * A newdirblk is only prepared for an entry in a new block that
              * sits at the very start of the block, or of the fragment when
              * the block is a direct one.
              */
8290         if (isnewblk &&
8291             (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8292                 newdirblk = malloc(sizeof(struct newdirblk),
8293                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8294                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8295                 LIST_INIT(&newdirblk->db_mkdir);
8296         }
8297         /*
8298          * If we're creating a new directory setup the dependencies and set
8299          * the dap state to wait for them.  Otherwise it's COMPLETE and
8300          * we can move on.
8301          */
8302         if (newdirbp == NULL) {
8303                 dap->da_state |= DEPCOMPLETE;
8304                 ACQUIRE_LOCK(ump);
8305         } else {
8306                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
                     /* setup_newdir() returns with the soft updates lock held. */
8307                 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8308                     &mkdir2);
8309         }
8310         /*
8311          * Link into parent directory pagedep to await its being written.
8312          */
8313         pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8314 #ifdef DEBUG
8315         if (diradd_lookup(pagedep, offset) != NULL)
8316                 panic("softdep_setup_directory_add: %p already at off %d\n",
8317                     diradd_lookup(pagedep, offset), offset);
8318 #endif
8319         dap->da_pagedep = pagedep;
8320         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8321             da_pdlist);
8322         inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8323         /*
8324          * If we're journaling, link the diradd into the jaddref so it
8325          * may be completed after the journal entry is written.  Otherwise,
8326          * link the diradd into its inodedep.  If the inode is not yet
8327          * written place it on the bufwait list, otherwise do the post-inode
8328          * write processing to put it on the id_pendinghd list.
8329          */
8330         if (MOUNTEDSUJ(mp)) {
8331                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8332                     inoreflst);
8333                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8334                     ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8335                 jaddref->ja_diroff = diroffset;
8336                 jaddref->ja_diradd = dap;
8337                 add_to_journal(&jaddref->ja_list);
8338         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8339                 diradd_inode_written(dap, inodedep);
8340         else
8341                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8342         /*
8343          * Add the journal entries for . and .. links now that the primary
8344          * link is written.
8345          */
8346         if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
                     /*
                      * The dot jaddref is expected directly before the primary
                      * link's jaddref on the inode reference list.
                      */
8347                 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8348                     inoreflst, if_deps);
8349                 KASSERT(jaddref != NULL &&
8350                     jaddref->ja_ino == jaddref->ja_parent &&
8351                     (jaddref->ja_state & MKDIR_BODY),
8352                     ("softdep_setup_directory_add: bad dot jaddref %p",
8353                     jaddref));
8354                 mkdir1->md_jaddref = jaddref;
8355                 jaddref->ja_mkdir = mkdir1;
8356                 /*
8357                  * It is important that the dotdot journal entry
8358                  * is added prior to the dot entry since dot writes
8359                  * both the dot and dotdot links.  These both must
8360                  * be added after the primary link for the journal
8361                  * to remain consistent.
8362                  */
8363                 add_to_journal(&mkdir2->md_jaddref->ja_list);
8364                 add_to_journal(&jaddref->ja_list);
8365         }
8366         /*
8367          * If we are adding a new directory remember this diradd so that if
8368          * we rename it we can keep the dot and dotdot dependencies.  If
8369          * we are adding a new name for an inode that has a mkdiradd we
8370          * must be in rename and we have to move the dot and dotdot
8371          * dependencies to this new name.  The old name is being orphaned
8372          * soon.
8373          */
8374         if (mkdir1 != NULL) {
8375                 if (inodedep->id_mkdiradd != NULL)
8376                         panic("softdep_setup_directory_add: Existing mkdir");
8377                 inodedep->id_mkdiradd = dap;
8378         } else if (inodedep->id_mkdiradd)
8379                 merge_diradd(inodedep, dap);
8380         if (newdirblk) {
8381                 /*
8382                  * There is nothing to do if we are already tracking
8383                  * this block.
8384                  */
8385                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
8386                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8387                         FREE_LOCK(ump);
8388                         return (0);
8389                 }
                     /* Hang the newdirblk off the newblk for this disk block. */
8390                 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8391                     == 0)
8392                         panic("softdep_setup_directory_add: lost entry");
8393                 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8394                 pagedep->pd_state |= NEWBLOCK;
8395                 pagedep->pd_newdirblk = newdirblk;
8396                 newdirblk->db_pagedep = pagedep;
8397                 FREE_LOCK(ump);
8398                 /*
8399                  * If we extended into an indirect signal direnter to sync.
8400                  */
8401                 if (isindir)
8402                         return (1);
8403                 return (0);
8404         }
8405         FREE_LOCK(ump);
8406         return (0);
8407 }
8408
8409 /*
8410  * Record that a directory entry is moving within its block while the
8411  * block, which the caller must own exclusively, is being compacted.
8412  * Any diradd tracking the entry is switched to the new offset.  The
8413  * entry itself is copied here, with the soft updates lock held, so that
8414  * no I/O completion can run while the move is in progress.
8415  */
8416 void
8417 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8418         struct buf *bp;         /* Buffer holding directory block. */
8419         struct inode *dp;       /* inode for directory */
8420         caddr_t base;           /* address of dp->i_offset */
8421         caddr_t oldloc;         /* address of old directory location */
8422         caddr_t newloc;         /* address of new directory location */
8423         int entrysize;          /* size of directory entry */
8424 {
8425         int offset, oldoffset, newoffset;
8426         int oldhash, newhash;
8427         struct pagedep *pagedep;
8428         struct jmvref *jmvref;
8429         struct diradd *dap;
8430         struct mount *mp;
8431         ufs_lbn_t lbn;
8432         int flags;
8433
8434         mp = UFSTOVFS(dp->i_ump);
8435         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8436             ("softdep_change_directoryentry_offset called on "
8437              "non-softdep filesystem"));
8438         /*
8439          * Every move is journaled; working out whether an affected add
8440          * or remove is already in the journal would be too complex.
8441          */
8442         if (MOUNTEDSUJ(mp)) {
8443                 flags = DEPALLOC;
8444                 jmvref = newjmvref(dp, ((struct direct *)oldloc)->d_ino,
8445                     dp->i_offset + (oldloc - base),
8446                     dp->i_offset + (newloc - base));
8447         } else {
8448                 flags = 0;
8449                 jmvref = NULL;
8450         }
8451         /* Express both locations as offsets within the directory block. */
8452         lbn = lblkno(dp->i_fs, dp->i_offset);
8453         offset = blkoff(dp->i_fs, dp->i_offset);
8454         oldoffset = offset + (oldloc - base);
8455         newoffset = offset + (newloc - base);
8456         ACQUIRE_LOCK(dp->i_ump);
8457         if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) != 0 &&
8458             (dap = diradd_lookup(pagedep, oldoffset)) != NULL) {
8459                 dap->da_offset = newoffset;
8460                 oldhash = DIRADDHASH(oldoffset);
8461                 newhash = DIRADDHASH(newoffset);
8462                 /* Rehash a not yet ALLCOMPLETE diradd whose bucket changed. */
8463                 if (newhash != oldhash &&
8464                     (dap->da_state & ALLCOMPLETE) != ALLCOMPLETE) {
8465                         LIST_REMOVE(dap, da_pdlist);
8466                         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newhash],
8467                             dap, da_pdlist);
8468                 }
8469         }
8470         /* Attach the move record to the pagedep and journal it. */
8471         if (jmvref != NULL) {
8472                 jmvref->jm_pagedep = pagedep;
8473                 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8474                 add_to_journal(&jmvref->jm_list);
8475         }
8476         /* The copy itself happens while the lock is still held. */
8477         bcopy(oldloc, newloc, entrysize);
8478         FREE_LOCK(dp->i_ump);
8479 }
8480
8481 /*
8482  * Hand the mkdir dependencies and journal work of the diradd recorded in
8483  * inodedep->id_mkdiradd over to newdap, which becomes the canonical name
8484  * for the directory (used when a directory is renamed).  The old diradd
8485  * is marked DEPCOMPLETE and passed to complete_diradd() afterwards.
8486  */
8487 static void
8488 merge_diradd(inodedep, newdap)
8489         struct inodedep *inodedep;
8490         struct diradd *newdap;
8491 {
8492         struct diradd *olddap;
8493         struct mkdir *md, *following;
8494         struct ufsmount *ump;
8495         short moved;
8496
8497         olddap = inodedep->id_mkdiradd;
8498         inodedep->id_mkdiradd = newdap;
8499         if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8500                 newdap->da_state &= ~DEPCOMPLETE;
8501                 ump = VFSTOUFS(inodedep->id_list.wk_mp);
8502                 md = LIST_FIRST(&ump->softdep_mkdirlisthd);
8503                 while (md != NULL) {
8504                         following = LIST_NEXT(md, md_mkdirs);
8505                         if (md->md_diradd == olddap) {
8506                                 /* Retarget this mkdir and move its bit. */
8507                                 md->md_diradd = newdap;
8508                                 moved = md->md_state &
8509                                     (MKDIR_PARENT | MKDIR_BODY);
8510                                 newdap->da_state |= moved;
8511                                 olddap->da_state &= ~moved;
8512                                 if ((olddap->da_state &
8513                                     (MKDIR_PARENT | MKDIR_BODY)) == 0)
8514                                         break;
8515                         }
8516                         md = following;
8517                 }
8518                 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8519                         panic("merge_diradd: unfound ref");
8520         }
8521         /* Journal work may not be freed until the new name is stable. */
8522         jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8523         olddap->da_state |= DEPCOMPLETE;
8524         complete_diradd(olddap);
8525 }
8526
8527 /*
8528  * Once every dependency of a diradd has been satisfied (ALLCOMPLETE), shift
8529  * it onto the pd_pendinghd list of the pagedep it belongs to.
8530  */
8531 static void
8532 complete_diradd(dap)
8533         struct diradd *dap;
8534 {
8535         struct pagedep *owner;
8536
8537         /* Nothing to do while any dependency is still outstanding. */
8538         if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE)
8539                 return;
8540         /* With DIRCHG set the pagedep is reached through da_previous. */
8541         owner = (dap->da_state & DIRCHG) != 0 ?
8542             dap->da_previous->dm_pagedep : dap->da_pagedep;
8543         LIST_REMOVE(dap, da_pdlist);
8544         LIST_INSERT_HEAD(&owner->pd_pendinghd, dap, da_pdlist);
8545 }
8546
8547 /*
8548  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8549  * add entries and conditonally journal the remove.
8550  */
8551 static void
8552 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8553         struct diradd *dap;
8554         struct dirrem *dirrem;
8555         struct jremref *jremref;
8556         struct jremref *dotremref;
8557         struct jremref *dotdotremref;
8558 {
8559         struct inodedep *inodedep;
8560         struct jaddref *jaddref;
8561         struct inoref *inoref;
8562         struct ufsmount *ump;
8563         struct mkdir *mkdir;
8564
8565         /*
8566          * If no remove references were allocated we're on a non-journaled
8567          * filesystem and can skip the cancel step.
8568          */
8569         if (jremref == NULL) {
8570                 free_diradd(dap, NULL);
8571                 return;
8572         }
8573         /*
8574          * Cancel the primary name an free it if it does not require
8575          * journaling.
8576          */
8577         if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8578             0, &inodedep) != 0) {
8579                 /* Abort the addref that reference this diradd.  */
8580                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8581                         if (inoref->if_list.wk_type != D_JADDREF)
8582                                 continue;
8583                         jaddref = (struct jaddref *)inoref;
8584                         if (jaddref->ja_diradd != dap)
8585                                 continue;
8586                         if (cancel_jaddref(jaddref, inodedep,
8587                             &dirrem->dm_jwork) == 0) {
8588                                 free_jremref(jremref);
8589                                 jremref = NULL;
8590                         }
8591                         break;
8592                 }
8593         }
8594         /*
8595          * Cancel subordinate names and free them if they do not require
8596          * journaling.
8597          */
8598         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8599                 ump = VFSTOUFS(dap->da_list.wk_mp);
8600                 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8601                         if (mkdir->md_diradd != dap)
8602                                 continue;
8603                         if ((jaddref = mkdir->md_jaddref) == NULL)
8604                                 continue;
8605                         mkdir->md_jaddref = NULL;
8606                         if (mkdir->md_state & MKDIR_PARENT) {
8607                                 if (cancel_jaddref(jaddref, NULL,
8608                                     &dirrem->dm_jwork) == 0) {
8609                                         free_jremref(dotdotremref);
8610                                         dotdotremref = NULL;
8611                                 }
8612                         } else {
8613                                 if (cancel_jaddref(jaddref, inodedep,
8614                                     &dirrem->dm_jwork) == 0) {
8615                                         free_jremref(dotremref);
8616                                         dotremref = NULL;
8617                                 }
8618                         }
8619                 }
8620         }
8621
8622         if (jremref)
8623                 journal_jremref(dirrem, jremref, inodedep);
8624         if (dotremref)
8625                 journal_jremref(dirrem, dotremref, inodedep);
8626         if (dotdotremref)
8627                 journal_jremref(dirrem, dotdotremref, NULL);
8628         jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8629         free_diradd(dap, &dirrem->dm_jwork);
8630 }
8631
8632 /*
8633  * Free a diradd dependency structure. This routine must be called
8634  * with splbio interrupts blocked.
8635  */
8636 static void
8637 free_diradd(dap, wkhd)
8638         struct diradd *dap;
8639         struct workhead *wkhd;
8640 {
8641         struct dirrem *dirrem;
8642         struct pagedep *pagedep;
8643         struct inodedep *inodedep;
8644         struct mkdir *mkdir, *nextmd;
8645         struct ufsmount *ump;
8646
8647         ump = VFSTOUFS(dap->da_list.wk_mp);
8648         LOCK_OWNED(ump);
8649         LIST_REMOVE(dap, da_pdlist);
8650         if (dap->da_state & ONWORKLIST)
8651                 WORKLIST_REMOVE(&dap->da_list);
8652         if ((dap->da_state & DIRCHG) == 0) {
8653                 pagedep = dap->da_pagedep;
8654         } else {
8655                 dirrem = dap->da_previous;
8656                 pagedep = dirrem->dm_pagedep;
8657                 dirrem->dm_dirinum = pagedep->pd_ino;
8658                 dirrem->dm_state |= COMPLETE;
8659                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8660                         add_to_worklist(&dirrem->dm_list, 0);
8661         }
8662         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8663             0, &inodedep) != 0)
8664                 if (inodedep->id_mkdiradd == dap)
8665                         inodedep->id_mkdiradd = NULL;
8666         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8667                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8668                      mkdir = nextmd) {
8669                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
8670                         if (mkdir->md_diradd != dap)
8671                                 continue;
8672                         dap->da_state &=
8673                             ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8674                         LIST_REMOVE(mkdir, md_mkdirs);
8675                         if (mkdir->md_state & ONWORKLIST)
8676                                 WORKLIST_REMOVE(&mkdir->md_list);
8677                         if (mkdir->md_jaddref != NULL)
8678                                 panic("free_diradd: Unexpected jaddref");
8679                         WORKITEM_FREE(mkdir, D_MKDIR);
8680                         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8681                                 break;
8682                 }
8683                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8684                         panic("free_diradd: unfound ref");
8685         }
8686         if (inodedep)
8687                 free_inodedep(inodedep);
8688         /*
8689          * Free any journal segments waiting for the directory write.
8690          */
8691         handle_jwork(&dap->da_jwork);
8692         WORKITEM_FREE(dap, D_DIRADD);
8693 }
8694
8695 /*
8696  * Directory entry removal dependencies.
8697  *
8698  * When removing a directory entry, the entry's inode pointer must be
8699  * zero'ed on disk before the corresponding inode's link count is decremented
8700  * (possibly freeing the inode for re-use). This dependency is handled by
8701  * updating the directory entry but delaying the inode count reduction until
8702  * after the directory block has been written to disk. After this point, the
8703  * inode count can be decremented whenever it is convenient.
8704  */
8705
8706 /*
8707  * This routine should be called immediately after removing
8708  * a directory entry.  The inode's link count should not be
8709  * decremented by the calling procedure -- the soft updates
8710  * code will do this task when it is safe.
8711  */
8712 void
8713 softdep_setup_remove(bp, dp, ip, isrmdir)
8714         struct buf *bp;         /* buffer containing directory block */
8715         struct inode *dp;       /* inode for the directory being modified */
8716         struct inode *ip;       /* inode for directory entry being removed */
8717         int isrmdir;            /* indicates if doing RMDIR */
8718 {
8719         struct dirrem *dirrem, *prevdirrem;
8720         struct inodedep *inodedep;
8721         int direct;
8722
8723         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8724             ("softdep_setup_remove called on non-softdep filesystem"));
8725         /*
8726          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8727          * newdirrem() to setup the full directory remove which requires
8728          * isrmdir > 1.
8729          */
8730         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8731         /*
8732          * Add the dirrem to the inodedep's pending remove list for quick
8733          * discovery later.
8734          */
8735         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8736             &inodedep) == 0)
8737                 panic("softdep_setup_remove: Lost inodedep.");
8738         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8739         dirrem->dm_state |= ONDEPLIST;
8740         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8741
8742         /*
8743          * If the COMPLETE flag is clear, then there were no active
8744          * entries and we want to roll back to a zeroed entry until
8745          * the new inode is committed to disk. If the COMPLETE flag is
8746          * set then we have deleted an entry that never made it to
8747          * disk. If the entry we deleted resulted from a name change,
8748          * then the old name still resides on disk. We cannot delete
8749          * its inode (returned to us in prevdirrem) until the zeroed
8750          * directory entry gets to disk. The new inode has never been
8751          * referenced on the disk, so can be deleted immediately.
8752          */
8753         if ((dirrem->dm_state & COMPLETE) == 0) {
8754                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8755                     dm_next);
8756                 FREE_LOCK(ip->i_ump);
8757         } else {
8758                 if (prevdirrem != NULL)
8759                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8760                             prevdirrem, dm_next);
8761                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8762                 direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8763                 FREE_LOCK(ip->i_ump);
8764                 if (direct)
8765                         handle_workitem_remove(dirrem, 0);
8766         }
8767 }
8768
8769 /*
8770  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8771  * pd_pendinghd list of a pagedep.
8772  */
8773 static struct diradd *
8774 diradd_lookup(pagedep, offset)
8775         struct pagedep *pagedep;
8776         int offset;
8777 {
8778         struct diradd *dap;
8779
8780         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8781                 if (dap->da_offset == offset)
8782                         return (dap);
8783         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8784                 if (dap->da_offset == offset)
8785                         return (dap);
8786         return (NULL);
8787 }
8788
8789 /*
8790  * Search for a .. diradd dependency in a directory that is being removed.
8791  * If the directory was renamed to a new parent we have a diradd rather
8792  * than a mkdir for the .. entry.  We need to cancel it now before
8793  * it is found in truncate().
8794  */
8795 static struct jremref *
8796 cancel_diradd_dotdot(ip, dirrem, jremref)
8797         struct inode *ip;
8798         struct dirrem *dirrem;
8799         struct jremref *jremref;
8800 {
8801         struct pagedep *pagedep;
8802         struct diradd *dap;
8803         struct worklist *wk;
8804
8805         if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8806             &pagedep) == 0)
8807                 return (jremref);
8808         dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8809         if (dap == NULL)
8810                 return (jremref);
8811         cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8812         /*
8813          * Mark any journal work as belonging to the parent so it is freed
8814          * with the .. reference.
8815          */
8816         LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8817                 wk->wk_state |= MKDIR_PARENT;
8818         return (NULL);
8819 }
8820
8821 /*
8822  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
8823  * replace it with a dirrem/diradd pair as a result of re-parenting a
8824  * directory.  This ensures that we don't simultaneously have a mkdir and
8825  * a diradd for the same .. entry.
8826  */
8827 static struct jremref *
8828 cancel_mkdir_dotdot(ip, dirrem, jremref)
8829         struct inode *ip;
8830         struct dirrem *dirrem;
8831         struct jremref *jremref;
8832 {
8833         struct inodedep *inodedep;
8834         struct jaddref *jaddref;
8835         struct ufsmount *ump;
8836         struct mkdir *mkdir;
8837         struct diradd *dap;
8838
8839         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8840             &inodedep) == 0)
8841                 return (jremref);
8842         dap = inodedep->id_mkdiradd;
8843         if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
8844                 return (jremref);
8845         ump = VFSTOUFS(inodedep->id_list.wk_mp);
8846         for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8847             mkdir = LIST_NEXT(mkdir, md_mkdirs))
8848                 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
8849                         break;
8850         if (mkdir == NULL)
8851                 panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
8852         if ((jaddref = mkdir->md_jaddref) != NULL) {
8853                 mkdir->md_jaddref = NULL;
8854                 jaddref->ja_state &= ~MKDIR_PARENT;
8855                 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
8856                     &inodedep) == 0)
8857                         panic("cancel_mkdir_dotdot: Lost parent inodedep");
8858                 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
8859                         journal_jremref(dirrem, jremref, inodedep);
8860                         jremref = NULL;
8861                 }
8862         }
8863         if (mkdir->md_state & ONWORKLIST)
8864                 WORKLIST_REMOVE(&mkdir->md_list);
8865         mkdir->md_state |= ALLCOMPLETE;
8866         complete_mkdir(mkdir);
8867         return (jremref);
8868 }
8869
8870 static void
8871 journal_jremref(dirrem, jremref, inodedep)
8872         struct dirrem *dirrem;
8873         struct jremref *jremref;
8874         struct inodedep *inodedep;
8875 {
8876
8877         if (inodedep == NULL)
8878                 if (inodedep_lookup(jremref->jr_list.wk_mp,
8879                     jremref->jr_ref.if_ino, 0, &inodedep) == 0)
8880                         panic("journal_jremref: Lost inodedep");
8881         LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
8882         TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
8883         add_to_journal(&jremref->jr_list);
8884 }
8885
8886 static void
8887 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
8888         struct dirrem *dirrem;
8889         struct jremref *jremref;
8890         struct jremref *dotremref;
8891         struct jremref *dotdotremref;
8892 {
8893         struct inodedep *inodedep;
8894
8895
8896         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
8897             &inodedep) == 0)
8898                 panic("dirrem_journal: Lost inodedep");
8899         journal_jremref(dirrem, jremref, inodedep);
8900         if (dotremref)
8901                 journal_jremref(dirrem, dotremref, inodedep);
8902         if (dotdotremref)
8903                 journal_jremref(dirrem, dotdotremref, NULL);
8904 }
8905
8906 /*
8907  * Allocate a new dirrem if appropriate and return it along with
8908  * its associated pagedep. Called without a lock, returns with lock.
8909  */
8910 static struct dirrem *
8911 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
8912         struct buf *bp;         /* buffer containing directory block */
8913         struct inode *dp;       /* inode for the directory being modified */
8914         struct inode *ip;       /* inode for directory entry being removed */
8915         int isrmdir;            /* indicates if doing RMDIR */
8916         struct dirrem **prevdirremp; /* previously referenced inode, if any */
8917 {
8918         int offset;
8919         ufs_lbn_t lbn;
8920         struct diradd *dap;
8921         struct dirrem *dirrem;
8922         struct pagedep *pagedep;
8923         struct jremref *jremref;
8924         struct jremref *dotremref;
8925         struct jremref *dotdotremref;
8926         struct vnode *dvp;
8927
8928         /*
8929          * Whiteouts have no deletion dependencies.
8930          */
8931         if (ip == NULL)
8932                 panic("newdirrem: whiteout");
8933         dvp = ITOV(dp);
8934         /*
8935          * If we are over our limit, try to improve the situation.
8936          * Limiting the number of dirrem structures will also limit
8937          * the number of freefile and freeblks structures.
8938          */
8939         ACQUIRE_LOCK(ip->i_ump);
8940         if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
8941                 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
8942         FREE_LOCK(ip->i_ump);
8943         dirrem = malloc(sizeof(struct dirrem),
8944                 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
8945         workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
8946         LIST_INIT(&dirrem->dm_jremrefhd);
8947         LIST_INIT(&dirrem->dm_jwork);
8948         dirrem->dm_state = isrmdir ? RMDIR : 0;
8949         dirrem->dm_oldinum = ip->i_number;
8950         *prevdirremp = NULL;
8951         /*
8952          * Allocate remove reference structures to track journal write
8953          * dependencies.  We will always have one for the link and
8954          * when doing directories we will always have one more for dot.
8955          * When renaming a directory we skip the dotdot link change so
8956          * this is not needed.
8957          */
8958         jremref = dotremref = dotdotremref = NULL;
8959         if (DOINGSUJ(dvp)) {
8960                 if (isrmdir) {
8961                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8962                             ip->i_effnlink + 2);
8963                         dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
8964                             ip->i_effnlink + 1);
8965                         dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
8966                             dp->i_effnlink + 1);
8967                         dotdotremref->jr_state |= MKDIR_PARENT;
8968                 } else
8969                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
8970                             ip->i_effnlink + 1);
8971         }
8972         ACQUIRE_LOCK(ip->i_ump);
8973         lbn = lblkno(dp->i_fs, dp->i_offset);
8974         offset = blkoff(dp->i_fs, dp->i_offset);
8975         pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
8976             &pagedep);
8977         dirrem->dm_pagedep = pagedep;
8978         dirrem->dm_offset = offset;
8979         /*
8980          * If we're renaming a .. link to a new directory, cancel any
8981          * existing MKDIR_PARENT mkdir.  If it has already been canceled
8982          * the jremref is preserved for any potential diradd in this
8983          * location.  This can not coincide with a rmdir.
8984          */
8985         if (dp->i_offset == DOTDOT_OFFSET) {
8986                 if (isrmdir)
8987                         panic("newdirrem: .. directory change during remove?");
8988                 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
8989         }
8990         /*
8991          * If we're removing a directory search for the .. dependency now and
8992          * cancel it.  Any pending journal work will be added to the dirrem
8993          * to be completed when the workitem remove completes.
8994          */
8995         if (isrmdir)
8996                 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
8997         /*
8998          * Check for a diradd dependency for the same directory entry.
8999          * If present, then both dependencies become obsolete and can
9000          * be de-allocated.
9001          */
9002         dap = diradd_lookup(pagedep, offset);
9003         if (dap == NULL) {
9004                 /*
9005                  * Link the jremref structures into the dirrem so they are
9006                  * written prior to the pagedep.
9007                  */
9008                 if (jremref)
9009                         dirrem_journal(dirrem, jremref, dotremref,
9010                             dotdotremref);
9011                 return (dirrem);
9012         }
9013         /*
9014          * Must be ATTACHED at this point.
9015          */
9016         if ((dap->da_state & ATTACHED) == 0)
9017                 panic("newdirrem: not ATTACHED");
9018         if (dap->da_newinum != ip->i_number)
9019                 panic("newdirrem: inum %ju should be %ju",
9020                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9021         /*
9022          * If we are deleting a changed name that never made it to disk,
9023          * then return the dirrem describing the previous inode (which
9024          * represents the inode currently referenced from this entry on disk).
9025          */
9026         if ((dap->da_state & DIRCHG) != 0) {
9027                 *prevdirremp = dap->da_previous;
9028                 dap->da_state &= ~DIRCHG;
9029                 dap->da_pagedep = pagedep;
9030         }
9031         /*
9032          * We are deleting an entry that never made it to disk.
9033          * Mark it COMPLETE so we can delete its inode immediately.
9034          */
9035         dirrem->dm_state |= COMPLETE;
9036         cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9037 #ifdef SUJ_DEBUG
9038         if (isrmdir == 0) {
9039                 struct worklist *wk;
9040
9041                 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9042                         if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9043                                 panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9044         }
9045 #endif
9046
9047         return (dirrem);
9048 }
9049
9050 /*
9051  * Directory entry change dependencies.
9052  *
9053  * Changing an existing directory entry requires that an add operation
9054  * be completed first followed by a deletion. The semantics for the addition
9055  * are identical to the description of adding a new entry above except
9056  * that the rollback is to the old inode number rather than zero. Once
9057  * the addition dependency is completed, the removal is done as described
9058  * in the removal routine above.
9059  */
9060
9061 /*
9062  * This routine should be called immediately after changing
9063  * a directory entry.  The inode's link count should not be
9064  * decremented by the calling procedure -- the soft updates
9065  * code will perform this task when it is safe.
9066  */
9067 void
9068 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9069         struct buf *bp;         /* buffer containing directory block */
9070         struct inode *dp;       /* inode for the directory being modified */
9071         struct inode *ip;       /* inode for directory entry being removed */
9072         ino_t newinum;          /* new inode number for changed entry */
9073         int isrmdir;            /* indicates if doing RMDIR */
9074 {
9075         int offset;
9076         struct diradd *dap = NULL;
9077         struct dirrem *dirrem, *prevdirrem;
9078         struct pagedep *pagedep;
9079         struct inodedep *inodedep;
9080         struct jaddref *jaddref;
9081         struct mount *mp;
9082
9083         offset = blkoff(dp->i_fs, dp->i_offset);
9084         mp = UFSTOVFS(dp->i_ump);
9085         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9086            ("softdep_setup_directory_change called on non-softdep filesystem"));
9087
9088         /*
9089          * Whiteouts do not need diradd dependencies.
9090          */
9091         if (newinum != WINO) {
9092                 dap = malloc(sizeof(struct diradd),
9093                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9094                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
9095                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9096                 dap->da_offset = offset;
9097                 dap->da_newinum = newinum;
9098                 LIST_INIT(&dap->da_jwork);
9099         }
9100
9101         /*
9102          * Allocate a new dirrem and ACQUIRE_LOCK.
9103          */
9104         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9105         pagedep = dirrem->dm_pagedep;
9106         /*
9107          * The possible values for isrmdir:
9108          *      0 - non-directory file rename
9109          *      1 - directory rename within same directory
9110          *   inum - directory rename to new directory of given inode number
9111          * When renaming to a new directory, we are both deleting and
9112          * creating a new directory entry, so the link count on the new
9113          * directory should not change. Thus we do not need the followup
9114          * dirrem which is usually done in handle_workitem_remove. We set
9115          * the DIRCHG flag to tell handle_workitem_remove to skip the
9116          * followup dirrem.
9117          */
9118         if (isrmdir > 1)
9119                 dirrem->dm_state |= DIRCHG;
9120
9121         /*
9122          * Whiteouts have no additional dependencies,
9123          * so just put the dirrem on the correct list.
9124          */
9125         if (newinum == WINO) {
9126                 if ((dirrem->dm_state & COMPLETE) == 0) {
9127                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9128                             dm_next);
9129                 } else {
9130                         dirrem->dm_dirinum = pagedep->pd_ino;
9131                         if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9132                                 add_to_worklist(&dirrem->dm_list, 0);
9133                 }
9134                 FREE_LOCK(dp->i_ump);
9135                 return;
9136         }
9137         /*
9138          * Add the dirrem to the inodedep's pending remove list for quick
9139          * discovery later.  A valid nlinkdelta ensures that this lookup
9140          * will not fail.
9141          */
9142         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9143                 panic("softdep_setup_directory_change: Lost inodedep.");
9144         dirrem->dm_state |= ONDEPLIST;
9145         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9146
9147         /*
9148          * If the COMPLETE flag is clear, then there were no active
9149          * entries and we want to roll back to the previous inode until
9150          * the new inode is committed to disk. If the COMPLETE flag is
9151          * set, then we have deleted an entry that never made it to disk.
9152          * If the entry we deleted resulted from a name change, then the old
9153          * inode reference still resides on disk. Any rollback that we do
9154          * needs to be to that old inode (returned to us in prevdirrem). If
9155          * the entry we deleted resulted from a create, then there is
9156          * no entry on the disk, so we want to roll back to zero rather
9157          * than the uncommitted inode. In either of the COMPLETE cases we
9158          * want to immediately free the unwritten and unreferenced inode.
9159          */
9160         if ((dirrem->dm_state & COMPLETE) == 0) {
9161                 dap->da_previous = dirrem;
9162         } else {
9163                 if (prevdirrem != NULL) {
9164                         dap->da_previous = prevdirrem;
9165                 } else {
9166                         dap->da_state &= ~DIRCHG;
9167                         dap->da_pagedep = pagedep;
9168                 }
9169                 dirrem->dm_dirinum = pagedep->pd_ino;
9170                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9171                         add_to_worklist(&dirrem->dm_list, 0);
9172         }
9173         /*
9174          * Lookup the jaddref for this journal entry.  We must finish
9175          * initializing it and make the diradd write dependent on it.
9176          * If we're not journaling, put it on the id_bufwait list if the
9177          * inode is not yet written. If it is written, do the post-inode
9178          * write processing to put it on the id_pendinghd list.
9179          */
9180         inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9181         if (MOUNTEDSUJ(mp)) {
9182                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9183                     inoreflst);
9184                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9185                     ("softdep_setup_directory_change: bad jaddref %p",
9186                     jaddref));
9187                 jaddref->ja_diroff = dp->i_offset;
9188                 jaddref->ja_diradd = dap;
9189                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9190                     dap, da_pdlist);
9191                 add_to_journal(&jaddref->ja_list);
9192         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9193                 dap->da_state |= COMPLETE;
9194                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9195                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9196         } else {
9197                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9198                     dap, da_pdlist);
9199                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9200         }
9201         /*
9202          * If we're making a new name for a directory that has not been
9203          * committed when need to move the dot and dotdot references to
9204          * this new name.
9205          */
9206         if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9207                 merge_diradd(inodedep, dap);
9208         FREE_LOCK(dp->i_ump);
9209 }
9210
9211 /*
9212  * Called whenever the link count on an inode is changed.
9213  * It creates an inode dependency so that the new reference(s)
9214  * to the inode cannot be committed to disk until the updated
9215  * inode has been written.
9216  */
9217 void
9218 softdep_change_linkcnt(ip)
9219         struct inode *ip;       /* the inode with the increased link count */
9220 {
9221         struct inodedep *inodedep;
9222         int dflags;
9223
9224         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9225             ("softdep_change_linkcnt called on non-softdep filesystem"));
9226         ACQUIRE_LOCK(ip->i_ump);
9227         dflags = DEPALLOC;
9228         if (IS_SNAPSHOT(ip))
9229                 dflags |= NODELAY;
9230         inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9231         if (ip->i_nlink < ip->i_effnlink)
9232                 panic("softdep_change_linkcnt: bad delta");
9233         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9234         FREE_LOCK(ip->i_ump);
9235 }
9236
9237 /*
9238  * Attach a sbdep dependency to the superblock buf so that we can keep
9239  * track of the head of the linked list of referenced but unlinked inodes.
9240  */
9241 void
9242 softdep_setup_sbupdate(ump, fs, bp)
9243         struct ufsmount *ump;
9244         struct fs *fs;
9245         struct buf *bp;
9246 {
9247         struct sbdep *sbdep;
9248         struct worklist *wk;
9249
9250         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9251             ("softdep_setup_sbupdate called on non-softdep filesystem"));
9252         LIST_FOREACH(wk, &bp->b_dep, wk_list)
9253                 if (wk->wk_type == D_SBDEP)
9254                         break;
9255         if (wk != NULL)
9256                 return;
9257         sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9258         workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9259         sbdep->sb_fs = fs;
9260         sbdep->sb_ump = ump;
9261         ACQUIRE_LOCK(ump);
9262         WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9263         FREE_LOCK(ump);
9264 }
9265
9266 /*
9267  * Return the first unlinked inodedep which is ready to be the head of the
9268  * list.  The inodedep and all those after it must have valid next pointers.
9269  */
9270 static struct inodedep *
9271 first_unlinked_inodedep(ump)
9272         struct ufsmount *ump;
9273 {
9274         struct inodedep *inodedep;
9275         struct inodedep *idp;
9276
9277         LOCK_OWNED(ump);
9278         for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9279             inodedep; inodedep = idp) {
9280                 if ((inodedep->id_state & UNLINKNEXT) == 0)
9281                         return (NULL);
9282                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9283                 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9284                         break;
9285                 if ((inodedep->id_state & UNLINKPREV) == 0)
9286                         break;
9287         }
9288         return (inodedep);
9289 }
9290
9291 /*
9292  * Set the sujfree unlinked head pointer prior to writing a superblock.
9293  */
9294 static void
9295 initiate_write_sbdep(sbdep)
9296         struct sbdep *sbdep;
9297 {
9298         struct inodedep *inodedep;
9299         struct fs *bpfs;
9300         struct fs *fs;
9301
9302         bpfs = sbdep->sb_fs;
9303         fs = sbdep->sb_ump->um_fs;
9304         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9305         if (inodedep) {
9306                 fs->fs_sujfree = inodedep->id_ino;
9307                 inodedep->id_state |= UNLINKPREV;
9308         } else
9309                 fs->fs_sujfree = 0;
9310         bpfs->fs_sujfree = fs->fs_sujfree;
9311 }
9312
9313 /*
9314  * After a superblock is written determine whether it must be written again
9315  * due to a changing unlinked list head.
9316  */
9317 static int
9318 handle_written_sbdep(sbdep, bp)
9319         struct sbdep *sbdep;
9320         struct buf *bp;
9321 {
9322         struct inodedep *inodedep;
9323         struct mount *mp;
9324         struct fs *fs;
9325
9326         LOCK_OWNED(sbdep->sb_ump);
9327         fs = sbdep->sb_fs;
9328         mp = UFSTOVFS(sbdep->sb_ump);
9329         /*
9330          * If the superblock doesn't match the in-memory list start over.
9331          */
9332         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9333         if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9334             (inodedep == NULL && fs->fs_sujfree != 0)) {
9335                 bdirty(bp);
9336                 return (1);
9337         }
9338         WORKITEM_FREE(sbdep, D_SBDEP);
9339         if (fs->fs_sujfree == 0)
9340                 return (0);
9341         /*
9342          * Now that we have a record of this inode in stable store allow it
9343          * to be written to free up pending work.  Inodes may see a lot of
9344          * write activity after they are unlinked which we must not hold up.
9345          */
9346         for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9347                 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9348                         panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9349                             inodedep, inodedep->id_state);
9350                 if (inodedep->id_state & UNLINKONLIST)
9351                         break;
9352                 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9353         }
9354
9355         return (0);
9356 }
9357
9358 /*
9359  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9360  */
9361 static void
9362 unlinked_inodedep(mp, inodedep)
9363         struct mount *mp;
9364         struct inodedep *inodedep;
9365 {
9366         struct ufsmount *ump;
9367
9368         ump = VFSTOUFS(mp);
9369         LOCK_OWNED(ump);
9370         if (MOUNTEDSUJ(mp) == 0)
9371                 return;
9372         ump->um_fs->fs_fmod = 1;
9373         if (inodedep->id_state & UNLINKED)
9374                 panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9375         inodedep->id_state |= UNLINKED;
9376         TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9377 }
9378
9379 /*
9380  * Remove an inodedep from the unlinked inodedep list.  This may require
9381  * disk writes if the inode has made it that far.
9382  */
9383 static void
9384 clear_unlinked_inodedep(inodedep)
9385         struct inodedep *inodedep;
9386 {
9387         struct ufsmount *ump;
9388         struct inodedep *idp;
9389         struct inodedep *idn;
9390         struct fs *fs;
9391         struct buf *bp;
9392         ino_t ino;
9393         ino_t nino;
9394         ino_t pino;
9395         int error;
9396
9397         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9398         fs = ump->um_fs;
9399         ino = inodedep->id_ino;
9400         error = 0;
9401         for (;;) {
9402                 LOCK_OWNED(ump);
9403                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9404                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9405                     inodedep));
9406                 /*
9407                  * If nothing has yet been written simply remove us from
9408                  * the in memory list and return.  This is the most common
9409                  * case where handle_workitem_remove() loses the final
9410                  * reference.
9411                  */
9412                 if ((inodedep->id_state & UNLINKLINKS) == 0)
9413                         break;
9414                 /*
9415                  * If we have a NEXT pointer and no PREV pointer we can simply
9416                  * clear NEXT's PREV and remove ourselves from the list.  Be
9417                  * careful not to clear PREV if the superblock points at
9418                  * next as well.
9419                  */
9420                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9421                 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9422                         if (idn && fs->fs_sujfree != idn->id_ino)
9423                                 idn->id_state &= ~UNLINKPREV;
9424                         break;
9425                 }
9426                 /*
9427                  * Here we have an inodedep which is actually linked into
9428                  * the list.  We must remove it by forcing a write to the
9429                  * link before us, whether it be the superblock or an inode.
9430                  * Unfortunately the list may change while we're waiting
9431                  * on the buf lock for either resource so we must loop until
9432                  * we lock the right one.  If both the superblock and an
9433                  * inode point to this inode we must clear the inode first
9434                  * followed by the superblock.
9435                  */
9436                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9437                 pino = 0;
9438                 if (idp && (idp->id_state & UNLINKNEXT))
9439                         pino = idp->id_ino;
9440                 FREE_LOCK(ump);
9441                 if (pino == 0) {
9442                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9443                             (int)fs->fs_sbsize, 0, 0, 0);
9444                 } else {
9445                         error = bread(ump->um_devvp,
9446                             fsbtodb(fs, ino_to_fsba(fs, pino)),
9447                             (int)fs->fs_bsize, NOCRED, &bp);
9448                         if (error)
9449                                 brelse(bp);
9450                 }
9451                 ACQUIRE_LOCK(ump);
9452                 if (error)
9453                         break;
9454                 /* If the list has changed restart the loop. */
9455                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9456                 nino = 0;
9457                 if (idp && (idp->id_state & UNLINKNEXT))
9458                         nino = idp->id_ino;
9459                 if (nino != pino ||
9460                     (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9461                         FREE_LOCK(ump);
9462                         brelse(bp);
9463                         ACQUIRE_LOCK(ump);
9464                         continue;
9465                 }
9466                 nino = 0;
9467                 idn = TAILQ_NEXT(inodedep, id_unlinked);
9468                 if (idn)
9469                         nino = idn->id_ino;
9470                 /*
9471                  * Remove us from the in memory list.  After this we cannot
9472                  * access the inodedep.
9473                  */
9474                 KASSERT((inodedep->id_state & UNLINKED) != 0,
9475                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
9476                     inodedep));
9477                 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9478                 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9479                 FREE_LOCK(ump);
9480                 /*
9481                  * The predecessor's next pointer is manually updated here
9482                  * so that the NEXT flag is never cleared for an element
9483                  * that is in the list.
9484                  */
9485                 if (pino == 0) {
9486                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9487                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9488                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9489                             bp);
9490                 } else if (fs->fs_magic == FS_UFS1_MAGIC)
9491                         ((struct ufs1_dinode *)bp->b_data +
9492                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9493                 else
9494                         ((struct ufs2_dinode *)bp->b_data +
9495                             ino_to_fsbo(fs, pino))->di_freelink = nino;
9496                 /*
9497                  * If the bwrite fails we have no recourse to recover.  The
9498                  * filesystem is corrupted already.
9499                  */
9500                 bwrite(bp);
9501                 ACQUIRE_LOCK(ump);
9502                 /*
9503                  * If the superblock pointer still needs to be cleared force
9504                  * a write here.
9505                  */
9506                 if (fs->fs_sujfree == ino) {
9507                         FREE_LOCK(ump);
9508                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9509                             (int)fs->fs_sbsize, 0, 0, 0);
9510                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9511                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9512                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9513                             bp);
9514                         bwrite(bp);
9515                         ACQUIRE_LOCK(ump);
9516                 }
9517
9518                 if (fs->fs_sujfree != ino)
9519                         return;
9520                 panic("clear_unlinked_inodedep: Failed to clear free head");
9521         }
9522         if (inodedep->id_ino == fs->fs_sujfree)
9523                 panic("clear_unlinked_inodedep: Freeing head of free list");
9524         inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9525         TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9526         return;
9527 }
9528
9529 /*
9530  * This workitem decrements the inode's link count.
9531  * If the link count reaches zero, the file is removed.
9532  */
9533 static int
9534 handle_workitem_remove(dirrem, flags)
9535         struct dirrem *dirrem;
9536         int flags;
9537 {
9538         struct inodedep *inodedep;
9539         struct workhead dotdotwk;
9540         struct worklist *wk;
9541         struct ufsmount *ump;
9542         struct mount *mp;
9543         struct vnode *vp;
9544         struct inode *ip;
9545         ino_t oldinum;
9546
9547         if (dirrem->dm_state & ONWORKLIST)
9548                 panic("handle_workitem_remove: dirrem %p still on worklist",
9549                     dirrem);
9550         oldinum = dirrem->dm_oldinum;
9551         mp = dirrem->dm_list.wk_mp;
9552         ump = VFSTOUFS(mp);
9553         flags |= LK_EXCLUSIVE;
9554         if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9555                 return (EBUSY);
9556         ip = VTOI(vp);
9557         ACQUIRE_LOCK(ump);
9558         if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9559                 panic("handle_workitem_remove: lost inodedep");
9560         if (dirrem->dm_state & ONDEPLIST)
9561                 LIST_REMOVE(dirrem, dm_inonext);
9562         KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9563             ("handle_workitem_remove:  Journal entries not written."));
9564
9565         /*
9566          * Move all dependencies waiting on the remove to complete
9567          * from the dirrem to the inode inowait list to be completed
9568          * after the inode has been updated and written to disk.  Any
9569          * marked MKDIR_PARENT are saved to be completed when the .. ref
9570          * is removed.
9571          */
9572         LIST_INIT(&dotdotwk);
9573         while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9574                 WORKLIST_REMOVE(wk);
9575                 if (wk->wk_state & MKDIR_PARENT) {
9576                         wk->wk_state &= ~MKDIR_PARENT;
9577                         WORKLIST_INSERT(&dotdotwk, wk);
9578                         continue;
9579                 }
9580                 WORKLIST_INSERT(&inodedep->id_inowait, wk);
9581         }
9582         LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9583         /*
9584          * Normal file deletion.
9585          */
9586         if ((dirrem->dm_state & RMDIR) == 0) {
9587                 ip->i_nlink--;
9588                 DIP_SET(ip, i_nlink, ip->i_nlink);
9589                 ip->i_flag |= IN_CHANGE;
9590                 if (ip->i_nlink < ip->i_effnlink)
9591                         panic("handle_workitem_remove: bad file delta");
9592                 if (ip->i_nlink == 0)
9593                         unlinked_inodedep(mp, inodedep);
9594                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9595                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9596                     ("handle_workitem_remove: worklist not empty. %s",
9597                     TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9598                 WORKITEM_FREE(dirrem, D_DIRREM);
9599                 FREE_LOCK(ump);
9600                 goto out;
9601         }
9602         /*
9603          * Directory deletion. Decrement reference count for both the
9604          * just deleted parent directory entry and the reference for ".".
9605          * Arrange to have the reference count on the parent decremented
9606          * to account for the loss of "..".
9607          */
9608         ip->i_nlink -= 2;
9609         DIP_SET(ip, i_nlink, ip->i_nlink);
9610         ip->i_flag |= IN_CHANGE;
9611         if (ip->i_nlink < ip->i_effnlink)
9612                 panic("handle_workitem_remove: bad dir delta");
9613         if (ip->i_nlink == 0)
9614                 unlinked_inodedep(mp, inodedep);
9615         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9616         /*
9617          * Rename a directory to a new parent. Since, we are both deleting
9618          * and creating a new directory entry, the link count on the new
9619          * directory should not change. Thus we skip the followup dirrem.
9620          */
9621         if (dirrem->dm_state & DIRCHG) {
9622                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9623                     ("handle_workitem_remove: DIRCHG and worklist not empty."));
9624                 WORKITEM_FREE(dirrem, D_DIRREM);
9625                 FREE_LOCK(ump);
9626                 goto out;
9627         }
9628         dirrem->dm_state = ONDEPLIST;
9629         dirrem->dm_oldinum = dirrem->dm_dirinum;
9630         /*
9631          * Place the dirrem on the parent's diremhd list.
9632          */
9633         if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9634                 panic("handle_workitem_remove: lost dir inodedep");
9635         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9636         /*
9637          * If the allocated inode has never been written to disk, then
9638          * the on-disk inode is zero'ed and we can remove the file
9639          * immediately.  When journaling if the inode has been marked
9640          * unlinked and not DEPCOMPLETE we know it can never be written.
9641          */
9642         inodedep_lookup(mp, oldinum, 0, &inodedep);
9643         if (inodedep == NULL ||
9644             (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9645             check_inode_unwritten(inodedep)) {
9646                 FREE_LOCK(ump);
9647                 vput(vp);
9648                 return handle_workitem_remove(dirrem, flags);
9649         }
9650         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9651         FREE_LOCK(ump);
9652         ip->i_flag |= IN_CHANGE;
9653 out:
9654         ffs_update(vp, 0);
9655         vput(vp);
9656         return (0);
9657 }
9658
9659 /*
9660  * Inode de-allocation dependencies.
9661  *
9662  * When an inode's link count is reduced to zero, it can be de-allocated. We
9663  * found it convenient to postpone de-allocation until after the inode is
9664  * written to disk with its new link count (zero).  At this point, all of the
9665  * on-disk inode's block pointers are nullified and, with careful dependency
9666  * list ordering, all dependencies related to the inode will be satisfied and
9667  * the corresponding dependency structures de-allocated.  So, if/when the
9668  * inode is reused, there will be no mixing of old dependencies with new
9669  * ones.  This artificial dependency is set up by the block de-allocation
9670  * procedure above (softdep_setup_freeblocks) and completed by the
9671  * following procedure.
9672  */
9673 static void
9674 handle_workitem_freefile(freefile)
9675         struct freefile *freefile;
9676 {
9677         struct workhead wkhd;
9678         struct fs *fs;
9679         struct inodedep *idp;
9680         struct ufsmount *ump;
9681         int error;
9682
9683         ump = VFSTOUFS(freefile->fx_list.wk_mp);
9684         fs = ump->um_fs;
9685 #ifdef DEBUG
9686         ACQUIRE_LOCK(ump);
9687         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9688         FREE_LOCK(ump);
9689         if (error)
9690                 panic("handle_workitem_freefile: inodedep %p survived", idp);
9691 #endif
9692         UFS_LOCK(ump);
9693         fs->fs_pendinginodes -= 1;
9694         UFS_UNLOCK(ump);
9695         LIST_INIT(&wkhd);
9696         LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9697         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9698             freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9699                 softdep_error("handle_workitem_freefile", error);
9700         ACQUIRE_LOCK(ump);
9701         WORKITEM_FREE(freefile, D_FREEFILE);
9702         FREE_LOCK(ump);
9703 }
9704
9705
9706 /*
9707  * Helper function which unlinks marker element from work list and returns
9708  * the next element on the list.
9709  */
9710 static __inline struct worklist *
9711 markernext(struct worklist *marker)
9712 {
9713         struct worklist *next;
9714
9715         next = LIST_NEXT(marker, wk_list);
9716         LIST_REMOVE(marker, wk_list);
9717         return next;
9718 }
9719
9720 /*
9721  * Disk writes.
9722  *
9723  * The dependency structures constructed above are most actively used when file
9724  * system blocks are written to disk.  No constraints are placed on when a
9725  * block can be written, but unsatisfied update dependencies are made safe by
9726  * modifying (or replacing) the source memory for the duration of the disk
9727  * write.  When the disk write completes, the memory block is again brought
9728  * up-to-date.
9729  *
9730  * In-core inode structure reclamation.
9731  *
9732  * Because there are a finite number of "in-core" inode structures, they are
9733  * reused regularly.  By transferring all inode-related dependencies to the
9734  * in-memory inode block and indexing them separately (via "inodedep"s), we
9735  * can allow "in-core" inode structures to be reused at any time and avoid
9736  * any increase in contention.
9737  *
9738  * Called just before entering the device driver to initiate a new disk I/O.
9739  * The buffer must be locked, thus, no I/O completion operations can occur
9740  * while we are manipulating its associated dependencies.
9741  */
9742 static void
9743 softdep_disk_io_initiation(bp)
9744         struct buf *bp;         /* structure describing disk write to occur */
9745 {
9746         struct worklist *wk;
9747         struct worklist marker;
9748         struct inodedep *inodedep;
9749         struct freeblks *freeblks;
9750         struct jblkdep *jblkdep;
9751         struct newblk *newblk;
9752         struct ufsmount *ump;
9753
9754         /*
9755          * We only care about write operations. There should never
9756          * be dependencies for reads.
9757          */
9758         if (bp->b_iocmd != BIO_WRITE)
9759                 panic("softdep_disk_io_initiation: not write");
9760
9761         if (bp->b_vflags & BV_BKGRDINPROG)
9762                 panic("softdep_disk_io_initiation: Writing buffer with "
9763                     "background write in progress: %p", bp);
9764
9765         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9766                 return;
9767         ump = VFSTOUFS(wk->wk_mp);
9768
9769         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
9770         PHOLD(curproc);                 /* Don't swap out kernel stack */
9771         ACQUIRE_LOCK(ump);
9772         /*
9773          * Do any necessary pre-I/O processing.
9774          */
9775         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9776              wk = markernext(&marker)) {
9777                 LIST_INSERT_AFTER(wk, &marker, wk_list);
9778                 switch (wk->wk_type) {
9779
9780                 case D_PAGEDEP:
9781                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
9782                         continue;
9783
9784                 case D_INODEDEP:
9785                         inodedep = WK_INODEDEP(wk);
9786                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9787                                 initiate_write_inodeblock_ufs1(inodedep, bp);
9788                         else
9789                                 initiate_write_inodeblock_ufs2(inodedep, bp);
9790                         continue;
9791
9792                 case D_INDIRDEP:
9793                         initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9794                         continue;
9795
9796                 case D_BMSAFEMAP:
9797                         initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9798                         continue;
9799
9800                 case D_JSEG:
9801                         WK_JSEG(wk)->js_buf = NULL;
9802                         continue;
9803
9804                 case D_FREEBLKS:
9805                         freeblks = WK_FREEBLKS(wk);
9806                         jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9807                         /*
9808                          * We have to wait for the freeblks to be journaled
9809                          * before we can write an inodeblock with updated
9810                          * pointers.  Be careful to arrange the marker so
9811                          * we revisit the freeblks if it's not removed by
9812                          * the first jwait().
9813                          */
9814                         if (jblkdep != NULL) {
9815                                 LIST_REMOVE(&marker, wk_list);
9816                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
9817                                 jwait(&jblkdep->jb_list, MNT_WAIT);
9818                         }
9819                         continue;
9820                 case D_ALLOCDIRECT:
9821                 case D_ALLOCINDIR:
9822                         /*
9823                          * We have to wait for the jnewblk to be journaled
9824                          * before we can write to a block if the contents
9825                          * may be confused with an earlier file's indirect
9826                          * at recovery time.  Handle the marker as described
9827                          * above.
9828                          */
9829                         newblk = WK_NEWBLK(wk);
9830                         if (newblk->nb_jnewblk != NULL &&
9831                             indirblk_lookup(newblk->nb_list.wk_mp,
9832                             newblk->nb_newblkno)) {
9833                                 LIST_REMOVE(&marker, wk_list);
9834                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
9835                                 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
9836                         }
9837                         continue;
9838
9839                 case D_SBDEP:
9840                         initiate_write_sbdep(WK_SBDEP(wk));
9841                         continue;
9842
9843                 case D_MKDIR:
9844                 case D_FREEWORK:
9845                 case D_FREEDEP:
9846                 case D_JSEGDEP:
9847                         continue;
9848
9849                 default:
9850                         panic("handle_disk_io_initiation: Unexpected type %s",
9851                             TYPENAME(wk->wk_type));
9852                         /* NOTREACHED */
9853                 }
9854         }
9855         FREE_LOCK(ump);
9856         PRELE(curproc);                 /* Allow swapout of kernel stack */
9857 }
9858
9859 /*
9860  * Called from within the procedure above to deal with unsatisfied
9861  * allocation dependencies in a directory. The buffer must be locked,
9862  * thus, no I/O completion operations can occur while we are
9863  * manipulating its associated dependencies.
9864  */
9865 static void
9866 initiate_write_filepage(pagedep, bp)
9867         struct pagedep *pagedep;
9868         struct buf *bp;
9869 {
9870         struct jremref *jremref;
9871         struct jmvref *jmvref;
9872         struct dirrem *dirrem;
9873         struct diradd *dap;
9874         struct direct *ep;
9875         int i;
9876
9877         if (pagedep->pd_state & IOSTARTED) {
9878                 /*
9879                  * This can only happen if there is a driver that does not
9880                  * understand chaining. Here biodone will reissue the call
9881                  * to strategy for the incomplete buffers.
9882                  */
9883                 printf("initiate_write_filepage: already started\n");
9884                 return;
9885         }
9886         pagedep->pd_state |= IOSTARTED;
9887         /*
9888          * Wait for all journal remove dependencies to hit the disk.
9889          * We can not allow any potentially conflicting directory adds
9890          * to be visible before removes and rollback is too difficult.
9891          * The soft updates lock may be dropped and re-acquired, however
9892          * we hold the buf locked so the dependency can not go away.
9893          */
9894         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
9895                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
9896                         jwait(&jremref->jr_list, MNT_WAIT);
9897         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
9898                 jwait(&jmvref->jm_list, MNT_WAIT);
9899         for (i = 0; i < DAHASHSZ; i++) {
9900                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
9901                         ep = (struct direct *)
9902                             ((char *)bp->b_data + dap->da_offset);
9903                         if (ep->d_ino != dap->da_newinum)
9904                                 panic("%s: dir inum %ju != new %ju",
9905                                     "initiate_write_filepage",
9906                                     (uintmax_t)ep->d_ino,
9907                                     (uintmax_t)dap->da_newinum);
9908                         if (dap->da_state & DIRCHG)
9909                                 ep->d_ino = dap->da_previous->dm_oldinum;
9910                         else
9911                                 ep->d_ino = 0;
9912                         dap->da_state &= ~ATTACHED;
9913                         dap->da_state |= UNDONE;
9914                 }
9915         }
9916 }
9917
9918 /*
9919  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
9920  * Note that any bug fixes made to this routine must be done in the
9921  * version found below.
9922  *
9923  * Called from within the procedure above to deal with unsatisfied
9924  * allocation dependencies in an inodeblock. The buffer must be
9925  * locked, thus, no I/O completion operations can occur while we
9926  * are manipulating its associated dependencies.
9927  */
9928 static void
9929 initiate_write_inodeblock_ufs1(inodedep, bp)
9930         struct inodedep *inodedep;
9931         struct buf *bp;                 /* The inode block */
9932 {
9933         struct allocdirect *adp, *lastadp;
9934         struct ufs1_dinode *dp;
9935         struct ufs1_dinode *sip;
9936         struct inoref *inoref;
9937         struct ufsmount *ump;
9938         struct fs *fs;
9939         ufs_lbn_t i;
9940 #ifdef INVARIANTS
9941         ufs_lbn_t prevlbn = 0;
9942 #endif
9943         int deplist;
9944
9945         if (inodedep->id_state & IOSTARTED)
9946                 panic("initiate_write_inodeblock_ufs1: already started");
9947         inodedep->id_state |= IOSTARTED;
9948         fs = inodedep->id_fs;
9949         ump = VFSTOUFS(inodedep->id_list.wk_mp);
9950         LOCK_OWNED(ump);
9951         dp = (struct ufs1_dinode *)bp->b_data +
9952             ino_to_fsbo(fs, inodedep->id_ino);
9953
9954         /*
9955          * If we're on the unlinked list but have not yet written our
9956          * next pointer initialize it here.
9957          */
9958         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
9959                 struct inodedep *inon;
9960
9961                 inon = TAILQ_NEXT(inodedep, id_unlinked);
9962                 dp->di_freelink = inon ? inon->id_ino : 0;
9963         }
9964         /*
9965          * If the bitmap is not yet written, then the allocated
9966          * inode cannot be written to disk.
9967          */
9968         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
9969                 if (inodedep->id_savedino1 != NULL)
9970                         panic("initiate_write_inodeblock_ufs1: I/O underway");
9971                 FREE_LOCK(ump);
9972                 sip = malloc(sizeof(struct ufs1_dinode),
9973                     M_SAVEDINO, M_SOFTDEP_FLAGS);
9974                 ACQUIRE_LOCK(ump);
9975                 inodedep->id_savedino1 = sip;
9976                 *inodedep->id_savedino1 = *dp;
9977                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
9978                 dp->di_gen = inodedep->id_savedino1->di_gen;
9979                 dp->di_freelink = inodedep->id_savedino1->di_freelink;
9980                 return;
9981         }
9982         /*
9983          * If no dependencies, then there is nothing to roll back.
9984          */
9985         inodedep->id_savedsize = dp->di_size;
9986         inodedep->id_savedextsize = 0;
9987         inodedep->id_savednlink = dp->di_nlink;
9988         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
9989             TAILQ_EMPTY(&inodedep->id_inoreflst))
9990                 return;
9991         /*
9992          * Revert the link count to that of the first unwritten journal entry.
9993          */
9994         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
9995         if (inoref)
9996                 dp->di_nlink = inoref->if_nlink;
9997         /*
9998          * Set the dependencies to busy.
9999          */
10000         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10001              adp = TAILQ_NEXT(adp, ad_next)) {
10002 #ifdef INVARIANTS
10003                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10004                         panic("softdep_write_inodeblock: lbn order");
10005                 prevlbn = adp->ad_offset;
10006                 if (adp->ad_offset < NDADDR &&
10007                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10008                         panic("%s: direct pointer #%jd mismatch %d != %jd",
10009                             "softdep_write_inodeblock",
10010                             (intmax_t)adp->ad_offset,
10011                             dp->di_db[adp->ad_offset],
10012                             (intmax_t)adp->ad_newblkno);
10013                 if (adp->ad_offset >= NDADDR &&
10014                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10015                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
10016                             "softdep_write_inodeblock",
10017                             (intmax_t)adp->ad_offset - NDADDR,
10018                             dp->di_ib[adp->ad_offset - NDADDR],
10019                             (intmax_t)adp->ad_newblkno);
10020                 deplist |= 1 << adp->ad_offset;
10021                 if ((adp->ad_state & ATTACHED) == 0)
10022                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10023                             adp->ad_state);
10024 #endif /* INVARIANTS */
10025                 adp->ad_state &= ~ATTACHED;
10026                 adp->ad_state |= UNDONE;
10027         }
10028         /*
10029          * The on-disk inode cannot claim to be any larger than the last
10030          * fragment that has been written. Otherwise, the on-disk inode
10031          * might have fragments that were not the last block in the file
10032          * which would corrupt the filesystem.
10033          */
10034         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10035              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10036                 if (adp->ad_offset >= NDADDR)
10037                         break;
10038                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10039                 /* keep going until hitting a rollback to a frag */
10040                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10041                         continue;
10042                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10043                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10044 #ifdef INVARIANTS
10045                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10046                                 panic("softdep_write_inodeblock: lost dep1");
10047 #endif /* INVARIANTS */
10048                         dp->di_db[i] = 0;
10049                 }
10050                 for (i = 0; i < NIADDR; i++) {
10051 #ifdef INVARIANTS
10052                         if (dp->di_ib[i] != 0 &&
10053                             (deplist & ((1 << NDADDR) << i)) == 0)
10054                                 panic("softdep_write_inodeblock: lost dep2");
10055 #endif /* INVARIANTS */
10056                         dp->di_ib[i] = 0;
10057                 }
10058                 return;
10059         }
10060         /*
10061          * If we have zero'ed out the last allocated block of the file,
10062          * roll back the size to the last currently allocated block.
10063          * We know that this last allocated block is a full-sized as
10064          * we already checked for fragments in the loop above.
10065          */
10066         if (lastadp != NULL &&
10067             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10068                 for (i = lastadp->ad_offset; i >= 0; i--)
10069                         if (dp->di_db[i] != 0)
10070                                 break;
10071                 dp->di_size = (i + 1) * fs->fs_bsize;
10072         }
10073         /*
10074          * The only dependencies are for indirect blocks.
10075          *
10076          * The file size for indirect block additions is not guaranteed.
10077          * Such a guarantee would be non-trivial to achieve. The conventional
10078          * synchronous write implementation also does not make this guarantee.
10079          * Fsck should catch and fix discrepancies. Arguably, the file size
10080          * can be over-estimated without destroying integrity when the file
10081          * moves into the indirect blocks (i.e., is large). If we want to
10082          * postpone fsck, we are stuck with this argument.
10083          */
10084         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10085                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10086 }
10087
10088 /*
10089  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10090  * Note that any bug fixes made to this routine must be done in the
10091  * version found above.
10092  *
10093  * Called from within the procedure above to deal with unsatisfied
10094  * allocation dependencies in an inodeblock. The buffer must be
10095  * locked, thus, no I/O completion operations can occur while we
10096  * are manipulating its associated dependencies.
10097  */
10098 static void
10099 initiate_write_inodeblock_ufs2(inodedep, bp)
10100         struct inodedep *inodedep;
10101         struct buf *bp;                 /* The inode block */
10102 {
10103         struct allocdirect *adp, *lastadp;
10104         struct ufs2_dinode *dp;
10105         struct ufs2_dinode *sip;
10106         struct inoref *inoref;
10107         struct ufsmount *ump;
10108         struct fs *fs;
10109         ufs_lbn_t i;
10110 #ifdef INVARIANTS
10111         ufs_lbn_t prevlbn = 0;
10112 #endif
10113         int deplist;
10114
10115         if (inodedep->id_state & IOSTARTED)
10116                 panic("initiate_write_inodeblock_ufs2: already started");
10117         inodedep->id_state |= IOSTARTED;
10118         fs = inodedep->id_fs;
10119         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10120         LOCK_OWNED(ump);
10121         dp = (struct ufs2_dinode *)bp->b_data +
10122             ino_to_fsbo(fs, inodedep->id_ino);
10123
10124         /*
10125          * If we're on the unlinked list but have not yet written our
10126          * next pointer initialize it here.
10127          */
10128         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10129                 struct inodedep *inon;
10130
10131                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10132                 dp->di_freelink = inon ? inon->id_ino : 0;
10133         }
10134         /*
10135          * If the bitmap is not yet written, then the allocated
10136          * inode cannot be written to disk.
10137          */
10138         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10139                 if (inodedep->id_savedino2 != NULL)
10140                         panic("initiate_write_inodeblock_ufs2: I/O underway");
10141                 FREE_LOCK(ump);
10142                 sip = malloc(sizeof(struct ufs2_dinode),
10143                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10144                 ACQUIRE_LOCK(ump);
10145                 inodedep->id_savedino2 = sip;
10146                 *inodedep->id_savedino2 = *dp;
10147                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10148                 dp->di_gen = inodedep->id_savedino2->di_gen;
10149                 dp->di_freelink = inodedep->id_savedino2->di_freelink;
10150                 return;
10151         }
10152         /*
10153          * If no dependencies, then there is nothing to roll back.
10154          */
10155         inodedep->id_savedsize = dp->di_size;
10156         inodedep->id_savedextsize = dp->di_extsize;
10157         inodedep->id_savednlink = dp->di_nlink;
10158         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10159             TAILQ_EMPTY(&inodedep->id_extupdt) &&
10160             TAILQ_EMPTY(&inodedep->id_inoreflst))
10161                 return;
10162         /*
10163          * Revert the link count to that of the first unwritten journal entry.
10164          */
10165         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10166         if (inoref)
10167                 dp->di_nlink = inoref->if_nlink;
10168
10169         /*
10170          * Set the ext data dependencies to busy.
10171          */
10172         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10173              adp = TAILQ_NEXT(adp, ad_next)) {
10174 #ifdef INVARIANTS
10175                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10176                         panic("softdep_write_inodeblock: lbn order");
10177                 prevlbn = adp->ad_offset;
10178                 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10179                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
10180                             "softdep_write_inodeblock",
10181                             (intmax_t)adp->ad_offset,
10182                             (intmax_t)dp->di_extb[adp->ad_offset],
10183                             (intmax_t)adp->ad_newblkno);
10184                 deplist |= 1 << adp->ad_offset;
10185                 if ((adp->ad_state & ATTACHED) == 0)
10186                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10187                             adp->ad_state);
10188 #endif /* INVARIANTS */
10189                 adp->ad_state &= ~ATTACHED;
10190                 adp->ad_state |= UNDONE;
10191         }
10192         /*
10193          * The on-disk inode cannot claim to be any larger than the last
10194          * fragment that has been written. Otherwise, the on-disk inode
10195          * might have fragments that were not the last block in the ext
10196          * data which would corrupt the filesystem.
10197          */
10198         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10199              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10200                 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10201                 /* keep going until hitting a rollback to a frag */
10202                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10203                         continue;
10204                 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10205                 for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10206 #ifdef INVARIANTS
10207                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10208                                 panic("softdep_write_inodeblock: lost dep1");
10209 #endif /* INVARIANTS */
10210                         dp->di_extb[i] = 0;
10211                 }
10212                 lastadp = NULL;
10213                 break;
10214         }
10215         /*
10216          * If we have zero'ed out the last allocated block of the ext
10217          * data, roll back the size to the last currently allocated block.
10218          * We know that this last allocated block is a full-sized as
10219          * we already checked for fragments in the loop above.
10220          */
10221         if (lastadp != NULL &&
10222             dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10223                 for (i = lastadp->ad_offset; i >= 0; i--)
10224                         if (dp->di_extb[i] != 0)
10225                                 break;
10226                 dp->di_extsize = (i + 1) * fs->fs_bsize;
10227         }
10228         /*
10229          * Set the file data dependencies to busy.
10230          */
10231         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10232              adp = TAILQ_NEXT(adp, ad_next)) {
10233 #ifdef INVARIANTS
10234                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10235                         panic("softdep_write_inodeblock: lbn order");
10236                 if ((adp->ad_state & ATTACHED) == 0)
10237                         panic("inodedep %p and adp %p not attached", inodedep, adp);
10238                 prevlbn = adp->ad_offset;
10239                 if (adp->ad_offset < NDADDR &&
10240                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10241                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
10242                             "softdep_write_inodeblock",
10243                             (intmax_t)adp->ad_offset,
10244                             (intmax_t)dp->di_db[adp->ad_offset],
10245                             (intmax_t)adp->ad_newblkno);
10246                 if (adp->ad_offset >= NDADDR &&
10247                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10248                         panic("%s indirect pointer #%jd mismatch %jd != %jd",
10249                             "softdep_write_inodeblock:",
10250                             (intmax_t)adp->ad_offset - NDADDR,
10251                             (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10252                             (intmax_t)adp->ad_newblkno);
10253                 deplist |= 1 << adp->ad_offset;
10254                 if ((adp->ad_state & ATTACHED) == 0)
10255                         panic("softdep_write_inodeblock: Unknown state 0x%x",
10256                             adp->ad_state);
10257 #endif /* INVARIANTS */
10258                 adp->ad_state &= ~ATTACHED;
10259                 adp->ad_state |= UNDONE;
10260         }
10261         /*
10262          * The on-disk inode cannot claim to be any larger than the last
10263          * fragment that has been written. Otherwise, the on-disk inode
10264          * might have fragments that were not the last block in the file
10265          * which would corrupt the filesystem.
10266          */
10267         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10268              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10269                 if (adp->ad_offset >= NDADDR)
10270                         break;
10271                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10272                 /* keep going until hitting a rollback to a frag */
10273                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10274                         continue;
10275                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10276                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10277 #ifdef INVARIANTS
10278                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10279                                 panic("softdep_write_inodeblock: lost dep2");
10280 #endif /* INVARIANTS */
10281                         dp->di_db[i] = 0;
10282                 }
10283                 for (i = 0; i < NIADDR; i++) {
10284 #ifdef INVARIANTS
10285                         if (dp->di_ib[i] != 0 &&
10286                             (deplist & ((1 << NDADDR) << i)) == 0)
10287                                 panic("softdep_write_inodeblock: lost dep3");
10288 #endif /* INVARIANTS */
10289                         dp->di_ib[i] = 0;
10290                 }
10291                 return;
10292         }
10293         /*
10294          * If we have zero'ed out the last allocated block of the file,
10295          * roll back the size to the last currently allocated block.
10296          * We know that this last allocated block is a full-sized as
10297          * we already checked for fragments in the loop above.
10298          */
10299         if (lastadp != NULL &&
10300             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10301                 for (i = lastadp->ad_offset; i >= 0; i--)
10302                         if (dp->di_db[i] != 0)
10303                                 break;
10304                 dp->di_size = (i + 1) * fs->fs_bsize;
10305         }
10306         /*
10307          * The only dependencies are for indirect blocks.
10308          *
10309          * The file size for indirect block additions is not guaranteed.
10310          * Such a guarantee would be non-trivial to achieve. The conventional
10311          * synchronous write implementation also does not make this guarantee.
10312          * Fsck should catch and fix discrepancies. Arguably, the file size
10313          * can be over-estimated without destroying integrity when the file
10314          * moves into the indirect blocks (i.e., is large). If we want to
10315          * postpone fsck, we are stuck with this argument.
10316          */
10317         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10318                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10319 }
10320
10321 /*
10322  * Cancel an indirdep as a result of truncation.  Release all of the
10323  * children allocindirs and place their journal work on the appropriate
10324  * list.
10325  */
10326 static void
10327 cancel_indirdep(indirdep, bp, freeblks)
10328         struct indirdep *indirdep;
10329         struct buf *bp;
10330         struct freeblks *freeblks;
10331 {
10332         struct allocindir *aip;
10333
10334         /*
10335          * None of the indirect pointers will ever be visible,
10336          * so they can simply be tossed. GOINGAWAY ensures
10337          * that allocated pointers will be saved in the buffer
10338          * cache until they are freed. Note that they will
10339          * only be able to be found by their physical address
10340          * since the inode mapping the logical address will
10341          * be gone. The save buffer used for the safe copy
10342          * was allocated in setup_allocindir_phase2 using
10343          * the physical address so it could be used for this
10344          * purpose. Hence we swap the safe copy with the real
10345          * copy, allowing the safe copy to be freed and holding
10346          * on to the real copy for later use in indir_trunc.
10347          */
10348         if (indirdep->ir_state & GOINGAWAY)
10349                 panic("cancel_indirdep: already gone");
10350         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10351                 indirdep->ir_state |= DEPCOMPLETE;
10352                 LIST_REMOVE(indirdep, ir_next);
10353         }
10354         indirdep->ir_state |= GOINGAWAY;
10355         VFSTOUFS(indirdep->ir_list.wk_mp)->softdep_numindirdeps += 1;
10356         /*
10357          * Pass in bp for blocks still have journal writes
10358          * pending so we can cancel them on their own.
10359          */
10360         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10361                 cancel_allocindir(aip, bp, freeblks, 0);
10362         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10363                 cancel_allocindir(aip, NULL, freeblks, 0);
10364         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10365                 cancel_allocindir(aip, NULL, freeblks, 0);
10366         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10367                 cancel_allocindir(aip, NULL, freeblks, 0);
10368         /*
10369          * If there are pending partial truncations we need to keep the
10370          * old block copy around until they complete.  This is because
10371          * the current b_data is not a perfect superset of the available
10372          * blocks.
10373          */
10374         if (TAILQ_EMPTY(&indirdep->ir_trunc))
10375                 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10376         else
10377                 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10378         WORKLIST_REMOVE(&indirdep->ir_list);
10379         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10380         indirdep->ir_bp = NULL;
10381         indirdep->ir_freeblks = freeblks;
10382 }
10383
10384 /*
10385  * Free an indirdep once it no longer has new pointers to track.
10386  */
10387 static void
10388 free_indirdep(indirdep)
10389         struct indirdep *indirdep;
10390 {
10391
10392         KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10393             ("free_indirdep: Indir trunc list not empty."));
10394         KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10395             ("free_indirdep: Complete head not empty."));
10396         KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10397             ("free_indirdep: write head not empty."));
10398         KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10399             ("free_indirdep: done head not empty."));
10400         KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10401             ("free_indirdep: deplist head not empty."));
10402         KASSERT((indirdep->ir_state & DEPCOMPLETE),
10403             ("free_indirdep: %p still on newblk list.", indirdep));
10404         KASSERT(indirdep->ir_saveddata == NULL,
10405             ("free_indirdep: %p still has saved data.", indirdep));
10406         if (indirdep->ir_state & ONWORKLIST)
10407                 WORKLIST_REMOVE(&indirdep->ir_list);
10408         WORKITEM_FREE(indirdep, D_INDIRDEP);
10409 }
10410
10411 /*
10412  * Called before a write to an indirdep.  This routine is responsible for
10413  * rolling back pointers to a safe state which includes only those
10414  * allocindirs which have been completed.
10415  */
10416 static void
10417 initiate_write_indirdep(indirdep, bp)
10418         struct indirdep *indirdep;
10419         struct buf *bp;
10420 {
10421         struct ufsmount *ump;
10422
10423         indirdep->ir_state |= IOSTARTED;
10424         if (indirdep->ir_state & GOINGAWAY)
10425                 panic("disk_io_initiation: indirdep gone");
10426         /*
10427          * If there are no remaining dependencies, this will be writing
10428          * the real pointers.
10429          */
10430         if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10431             TAILQ_EMPTY(&indirdep->ir_trunc))
10432                 return;
10433         /*
10434          * Replace up-to-date version with safe version.
10435          */
10436         if (indirdep->ir_saveddata == NULL) {
10437                 ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10438                 LOCK_OWNED(ump);
10439                 FREE_LOCK(ump);
10440                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10441                     M_SOFTDEP_FLAGS);
10442                 ACQUIRE_LOCK(ump);
10443         }
10444         indirdep->ir_state &= ~ATTACHED;
10445         indirdep->ir_state |= UNDONE;
10446         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10447         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10448             bp->b_bcount);
10449 }
10450
10451 /*
10452  * Called when an inode has been cleared in a cg bitmap.  This finally
10453  * eliminates any canceled jaddrefs
10454  */
10455 void
10456 softdep_setup_inofree(mp, bp, ino, wkhd)
10457         struct mount *mp;
10458         struct buf *bp;
10459         ino_t ino;
10460         struct workhead *wkhd;
10461 {
10462         struct worklist *wk, *wkn;
10463         struct inodedep *inodedep;
10464         struct ufsmount *ump;
10465         uint8_t *inosused;
10466         struct cg *cgp;
10467         struct fs *fs;
10468
10469         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10470             ("softdep_setup_inofree called on non-softdep filesystem"));
10471         ump = VFSTOUFS(mp);
10472         ACQUIRE_LOCK(ump);
10473         fs = ump->um_fs;
10474         cgp = (struct cg *)bp->b_data;
10475         inosused = cg_inosused(cgp);
10476         if (isset(inosused, ino % fs->fs_ipg))
10477                 panic("softdep_setup_inofree: inode %ju not freed.",
10478                     (uintmax_t)ino);
10479         if (inodedep_lookup(mp, ino, 0, &inodedep))
10480                 panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10481                     (uintmax_t)ino, inodedep);
10482         if (wkhd) {
10483                 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10484                         if (wk->wk_type != D_JADDREF)
10485                                 continue;
10486                         WORKLIST_REMOVE(wk);
10487                         /*
10488                          * We can free immediately even if the jaddref
10489                          * isn't attached in a background write as now
10490                          * the bitmaps are reconciled.
10491                          */
10492                         wk->wk_state |= COMPLETE | ATTACHED;
10493                         free_jaddref(WK_JADDREF(wk));
10494                 }
10495                 jwork_move(&bp->b_dep, wkhd);
10496         }
10497         FREE_LOCK(ump);
10498 }
10499
10500
10501 /*
10502  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10503  * map.  Any dependencies waiting for the write to clear are added to the
10504  * buf's list and any jnewblks that are being canceled are discarded
10505  * immediately.
10506  */
10507 void
10508 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10509         struct mount *mp;
10510         struct buf *bp;
10511         ufs2_daddr_t blkno;
10512         int frags;
10513         struct workhead *wkhd;
10514 {
10515         struct bmsafemap *bmsafemap;
10516         struct jnewblk *jnewblk;
10517         struct ufsmount *ump;
10518         struct worklist *wk;
10519         struct fs *fs;
10520 #ifdef SUJ_DEBUG
10521         uint8_t *blksfree;
10522         struct cg *cgp;
10523         ufs2_daddr_t jstart;
10524         ufs2_daddr_t jend;
10525         ufs2_daddr_t end;
10526         long bno;
10527         int i;
10528 #endif
10529
10530         CTR3(KTR_SUJ,
10531             "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10532             blkno, frags, wkhd);
10533
10534         ump = VFSTOUFS(mp);
10535         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10536             ("softdep_setup_blkfree called on non-softdep filesystem"));
10537         ACQUIRE_LOCK(ump);
10538         /* Lookup the bmsafemap so we track when it is dirty. */
10539         fs = ump->um_fs;
10540         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10541         /*
10542          * Detach any jnewblks which have been canceled.  They must linger
10543          * until the bitmap is cleared again by ffs_blkfree() to prevent
10544          * an unjournaled allocation from hitting the disk.
10545          */
10546         if (wkhd) {
10547                 while ((wk = LIST_FIRST(wkhd)) != NULL) {
10548                         CTR2(KTR_SUJ,
10549                             "softdep_setup_blkfree: blkno %jd wk type %d",
10550                             blkno, wk->wk_type);
10551                         WORKLIST_REMOVE(wk);
10552                         if (wk->wk_type != D_JNEWBLK) {
10553                                 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10554                                 continue;
10555                         }
10556                         jnewblk = WK_JNEWBLK(wk);
10557                         KASSERT(jnewblk->jn_state & GOINGAWAY,
10558                             ("softdep_setup_blkfree: jnewblk not canceled."));
10559 #ifdef SUJ_DEBUG
10560                         /*
10561                          * Assert that this block is free in the bitmap
10562                          * before we discard the jnewblk.
10563                          */
10564                         cgp = (struct cg *)bp->b_data;
10565                         blksfree = cg_blksfree(cgp);
10566                         bno = dtogd(fs, jnewblk->jn_blkno);
10567                         for (i = jnewblk->jn_oldfrags;
10568                             i < jnewblk->jn_frags; i++) {
10569                                 if (isset(blksfree, bno + i))
10570                                         continue;
10571                                 panic("softdep_setup_blkfree: not free");
10572                         }
10573 #endif
10574                         /*
10575                          * Even if it's not attached we can free immediately
10576                          * as the new bitmap is correct.
10577                          */
10578                         wk->wk_state |= COMPLETE | ATTACHED;
10579                         free_jnewblk(jnewblk);
10580                 }
10581         }
10582
10583 #ifdef SUJ_DEBUG
10584         /*
10585          * Assert that we are not freeing a block which has an outstanding
10586          * allocation dependency.
10587          */
10588         fs = VFSTOUFS(mp)->um_fs;
10589         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10590         end = blkno + frags;
10591         LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10592                 /*
10593                  * Don't match against blocks that will be freed when the
10594                  * background write is done.
10595                  */
10596                 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10597                     (COMPLETE | DEPCOMPLETE))
10598                         continue;
10599                 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10600                 jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10601                 if ((blkno >= jstart && blkno < jend) ||
10602                     (end > jstart && end <= jend)) {
10603                         printf("state 0x%X %jd - %d %d dep %p\n",
10604                             jnewblk->jn_state, jnewblk->jn_blkno,
10605                             jnewblk->jn_oldfrags, jnewblk->jn_frags,
10606                             jnewblk->jn_dep);
10607                         panic("softdep_setup_blkfree: "
10608                             "%jd-%jd(%d) overlaps with %jd-%jd",
10609                             blkno, end, frags, jstart, jend);
10610                 }
10611         }
10612 #endif
10613         FREE_LOCK(ump);
10614 }
10615
10616 /*
10617  * Revert a block allocation when the journal record that describes it
10618  * is not yet written.
10619  */
10620 static int
10621 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10622         struct jnewblk *jnewblk;
10623         struct fs *fs;
10624         struct cg *cgp;
10625         uint8_t *blksfree;
10626 {
10627         ufs1_daddr_t fragno;
10628         long cgbno, bbase;
10629         int frags, blk;
10630         int i;
10631
10632         frags = 0;
10633         cgbno = dtogd(fs, jnewblk->jn_blkno);
10634         /*
10635          * We have to test which frags need to be rolled back.  We may
10636          * be operating on a stale copy when doing background writes.
10637          */
10638         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10639                 if (isclr(blksfree, cgbno + i))
10640                         frags++;
10641         if (frags == 0)
10642                 return (0);
10643         /*
10644          * This is mostly ffs_blkfree() sans some validation and
10645          * superblock updates.
10646          */
10647         if (frags == fs->fs_frag) {
10648                 fragno = fragstoblks(fs, cgbno);
10649                 ffs_setblock(fs, blksfree, fragno);
10650                 ffs_clusteracct(fs, cgp, fragno, 1);
10651                 cgp->cg_cs.cs_nbfree++;
10652         } else {
10653                 cgbno += jnewblk->jn_oldfrags;
10654                 bbase = cgbno - fragnum(fs, cgbno);
10655                 /* Decrement the old frags.  */
10656                 blk = blkmap(fs, blksfree, bbase);
10657                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10658                 /* Deallocate the fragment */
10659                 for (i = 0; i < frags; i++)
10660                         setbit(blksfree, cgbno + i);
10661                 cgp->cg_cs.cs_nffree += frags;
10662                 /* Add back in counts associated with the new frags */
10663                 blk = blkmap(fs, blksfree, bbase);
10664                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10665                 /* If a complete block has been reassembled, account for it. */
10666                 fragno = fragstoblks(fs, bbase);
10667                 if (ffs_isblock(fs, blksfree, fragno)) {
10668                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
10669                         ffs_clusteracct(fs, cgp, fragno, 1);
10670                         cgp->cg_cs.cs_nbfree++;
10671                 }
10672         }
10673         stat_jnewblk++;
10674         jnewblk->jn_state &= ~ATTACHED;
10675         jnewblk->jn_state |= UNDONE;
10676
10677         return (frags);
10678 }
10679
10680 static void
10681 initiate_write_bmsafemap(bmsafemap, bp)
10682         struct bmsafemap *bmsafemap;
10683         struct buf *bp;                 /* The cg block. */
10684 {
10685         struct jaddref *jaddref;
10686         struct jnewblk *jnewblk;
10687         uint8_t *inosused;
10688         uint8_t *blksfree;
10689         struct cg *cgp;
10690         struct fs *fs;
10691         ino_t ino;
10692
10693         if (bmsafemap->sm_state & IOSTARTED)
10694                 return;
10695         bmsafemap->sm_state |= IOSTARTED;
10696         /*
10697          * Clear any inode allocations which are pending journal writes.
10698          */
10699         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10700                 cgp = (struct cg *)bp->b_data;
10701                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10702                 inosused = cg_inosused(cgp);
10703                 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10704                         ino = jaddref->ja_ino % fs->fs_ipg;
10705                         if (isset(inosused, ino)) {
10706                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
10707                                         cgp->cg_cs.cs_ndir--;
10708                                 cgp->cg_cs.cs_nifree++;
10709                                 clrbit(inosused, ino);
10710                                 jaddref->ja_state &= ~ATTACHED;
10711                                 jaddref->ja_state |= UNDONE;
10712                                 stat_jaddref++;
10713                         } else
10714                                 panic("initiate_write_bmsafemap: inode %ju "
10715                                     "marked free", (uintmax_t)jaddref->ja_ino);
10716                 }
10717         }
10718         /*
10719          * Clear any block allocations which are pending journal writes.
10720          */
10721         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10722                 cgp = (struct cg *)bp->b_data;
10723                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10724                 blksfree = cg_blksfree(cgp);
10725                 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10726                         if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10727                                 continue;
10728                         panic("initiate_write_bmsafemap: block %jd "
10729                             "marked free", jnewblk->jn_blkno);
10730                 }
10731         }
10732         /*
10733          * Move allocation lists to the written lists so they can be
10734          * cleared once the block write is complete.
10735          */
10736         LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10737             inodedep, id_deps);
10738         LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10739             newblk, nb_deps);
10740         LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10741             wk_list);
10742 }
10743
10744 /*
10745  * This routine is called during the completion interrupt
10746  * service routine for a disk write (from the procedure called
10747  * by the device driver to inform the filesystem caches of
10748  * a request completion).  It should be called early in this
10749  * procedure, before the block is made available to other
10750  * processes or other routines are called.
10751  *
10752  */
10753 static void
10754 softdep_disk_write_complete(bp)
10755         struct buf *bp;         /* describes the completed disk write */
10756 {
10757         struct worklist *wk;
10758         struct worklist *owk;
10759         struct ufsmount *ump;
10760         struct workhead reattach;
10761         struct freeblks *freeblks;
10762         struct buf *sbp;
10763
10764         /*
10765          * If an error occurred while doing the write, then the data
10766          * has not hit the disk and the dependencies cannot be unrolled.
10767          */
10768         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10769                 return;
10770         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
10771                 return;
10772         ump = VFSTOUFS(wk->wk_mp);
10773         LIST_INIT(&reattach);
10774         /*
10775          * This lock must not be released anywhere in this code segment.
10776          */
10777         sbp = NULL;
10778         owk = NULL;
10779         ACQUIRE_LOCK(ump);
10780         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10781                 WORKLIST_REMOVE(wk);
10782                 dep_write[wk->wk_type]++;
10783                 if (wk == owk)
10784                         panic("duplicate worklist: %p\n", wk);
10785                 owk = wk;
10786                 switch (wk->wk_type) {
10787
10788                 case D_PAGEDEP:
10789                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10790                                 WORKLIST_INSERT(&reattach, wk);
10791                         continue;
10792
10793                 case D_INODEDEP:
10794                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10795                                 WORKLIST_INSERT(&reattach, wk);
10796                         continue;
10797
10798                 case D_BMSAFEMAP:
10799                         if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10800                                 WORKLIST_INSERT(&reattach, wk);
10801                         continue;
10802
10803                 case D_MKDIR:
10804                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10805                         continue;
10806
10807                 case D_ALLOCDIRECT:
10808                         wk->wk_state |= COMPLETE;
10809                         handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10810                         continue;
10811
10812                 case D_ALLOCINDIR:
10813                         wk->wk_state |= COMPLETE;
10814                         handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10815                         continue;
10816
10817                 case D_INDIRDEP:
10818                         if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10819                                 WORKLIST_INSERT(&reattach, wk);
10820                         continue;
10821
10822                 case D_FREEBLKS:
10823                         wk->wk_state |= COMPLETE;
10824                         freeblks = WK_FREEBLKS(wk);
10825                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
10826                             LIST_EMPTY(&freeblks->fb_jblkdephd))
10827                                 add_to_worklist(wk, WK_NODELAY);
10828                         continue;
10829
10830                 case D_FREEWORK:
10831                         handle_written_freework(WK_FREEWORK(wk));
10832                         break;
10833
10834                 case D_JSEGDEP:
10835                         free_jsegdep(WK_JSEGDEP(wk));
10836                         continue;
10837
10838                 case D_JSEG:
10839                         handle_written_jseg(WK_JSEG(wk), bp);
10840                         continue;
10841
10842                 case D_SBDEP:
10843                         if (handle_written_sbdep(WK_SBDEP(wk), bp))
10844                                 WORKLIST_INSERT(&reattach, wk);
10845                         continue;
10846
10847                 case D_FREEDEP:
10848                         free_freedep(WK_FREEDEP(wk));
10849                         continue;
10850
10851                 default:
10852                         panic("handle_disk_write_complete: Unknown type %s",
10853                             TYPENAME(wk->wk_type));
10854                         /* NOTREACHED */
10855                 }
10856         }
10857         /*
10858          * Reattach any requests that must be redone.
10859          */
10860         while ((wk = LIST_FIRST(&reattach)) != NULL) {
10861                 WORKLIST_REMOVE(wk);
10862                 WORKLIST_INSERT(&bp->b_dep, wk);
10863         }
10864         FREE_LOCK(ump);
10865         if (sbp)
10866                 brelse(sbp);
10867 }
10868
10869 /*
10870  * Called from within softdep_disk_write_complete above. Note that
10871  * this routine is always called from interrupt level with further
10872  * splbio interrupts blocked.
10873  */
10874 static void
10875 handle_allocdirect_partdone(adp, wkhd)
10876         struct allocdirect *adp;        /* the completed allocdirect */
10877         struct workhead *wkhd;          /* Work to do when inode is writtne. */
10878 {
10879         struct allocdirectlst *listhead;
10880         struct allocdirect *listadp;
10881         struct inodedep *inodedep;
10882         long bsize;
10883
10884         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10885                 return;
10886         /*
10887          * The on-disk inode cannot claim to be any larger than the last
10888          * fragment that has been written. Otherwise, the on-disk inode
10889          * might have fragments that were not the last block in the file
10890          * which would corrupt the filesystem. Thus, we cannot free any
10891          * allocdirects after one whose ad_oldblkno claims a fragment as
10892          * these blocks must be rolled back to zero before writing the inode.
10893          * We check the currently active set of allocdirects in id_inoupdt
10894          * or id_extupdt as appropriate.
10895          */
10896         inodedep = adp->ad_inodedep;
10897         bsize = inodedep->id_fs->fs_bsize;
10898         if (adp->ad_state & EXTDATA)
10899                 listhead = &inodedep->id_extupdt;
10900         else
10901                 listhead = &inodedep->id_inoupdt;
10902         TAILQ_FOREACH(listadp, listhead, ad_next) {
10903                 /* found our block */
10904                 if (listadp == adp)
10905                         break;
10906                 /* continue if ad_oldlbn is not a fragment */
10907                 if (listadp->ad_oldsize == 0 ||
10908                     listadp->ad_oldsize == bsize)
10909                         continue;
10910                 /* hit a fragment */
10911                 return;
10912         }
10913         /*
10914          * If we have reached the end of the current list without
10915          * finding the just finished dependency, then it must be
10916          * on the future dependency list. Future dependencies cannot
10917          * be freed until they are moved to the current list.
10918          */
10919         if (listadp == NULL) {
10920 #ifdef DEBUG
10921                 if (adp->ad_state & EXTDATA)
10922                         listhead = &inodedep->id_newextupdt;
10923                 else
10924                         listhead = &inodedep->id_newinoupdt;
10925                 TAILQ_FOREACH(listadp, listhead, ad_next)
10926                         /* found our block */
10927                         if (listadp == adp)
10928                                 break;
10929                 if (listadp == NULL)
10930                         panic("handle_allocdirect_partdone: lost dep");
10931 #endif /* DEBUG */
10932                 return;
10933         }
10934         /*
10935          * If we have found the just finished dependency, then queue
10936          * it along with anything that follows it that is complete.
10937          * Since the pointer has not yet been written in the inode
10938          * as the dependency prevents it, place the allocdirect on the
10939          * bufwait list where it will be freed once the pointer is
10940          * valid.
10941          */
10942         if (wkhd == NULL)
10943                 wkhd = &inodedep->id_bufwait;
10944         for (; adp; adp = listadp) {
10945                 listadp = TAILQ_NEXT(adp, ad_next);
10946                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
10947                         return;
10948                 TAILQ_REMOVE(listhead, adp, ad_next);
10949                 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
10950         }
10951 }
10952
10953 /*
10954  * Called from within softdep_disk_write_complete above.  This routine
10955  * completes successfully written allocindirs.
10956  */
10957 static void
10958 handle_allocindir_partdone(aip)
10959         struct allocindir *aip;         /* the completed allocindir */
10960 {
10961         struct indirdep *indirdep;
10962
10963         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
10964                 return;
10965         indirdep = aip->ai_indirdep;
10966         LIST_REMOVE(aip, ai_next);
10967         /*
10968          * Don't set a pointer while the buffer is undergoing IO or while
10969          * we have active truncations.
10970          */
10971         if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
10972                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
10973                 return;
10974         }
10975         if (indirdep->ir_state & UFS1FMT)
10976                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10977                     aip->ai_newblkno;
10978         else
10979                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
10980                     aip->ai_newblkno;
10981         /*
10982          * Await the pointer write before freeing the allocindir.
10983          */
10984         LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
10985 }
10986
10987 /*
10988  * Release segments held on a jwork list.
10989  */
10990 static void
10991 handle_jwork(wkhd)
10992         struct workhead *wkhd;
10993 {
10994         struct worklist *wk;
10995
10996         while ((wk = LIST_FIRST(wkhd)) != NULL) {
10997                 WORKLIST_REMOVE(wk);
10998                 switch (wk->wk_type) {
10999                 case D_JSEGDEP:
11000                         free_jsegdep(WK_JSEGDEP(wk));
11001                         continue;
11002                 case D_FREEDEP:
11003                         free_freedep(WK_FREEDEP(wk));
11004                         continue;
11005                 case D_FREEFRAG:
11006                         rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11007                         WORKITEM_FREE(wk, D_FREEFRAG);
11008                         continue;
11009                 case D_FREEWORK:
11010                         handle_written_freework(WK_FREEWORK(wk));
11011                         continue;
11012                 default:
11013                         panic("handle_jwork: Unknown type %s\n",
11014                             TYPENAME(wk->wk_type));
11015                 }
11016         }
11017 }
11018
11019 /*
11020  * Handle the bufwait list on an inode when it is safe to release items
11021  * held there.  This normally happens after an inode block is written but
11022  * may be delayed and handled later if there are pending journal items that
11023  * are not yet safe to be released.
11024  */
11025 static struct freefile *
11026 handle_bufwait(inodedep, refhd)
11027         struct inodedep *inodedep;
11028         struct workhead *refhd;
11029 {
11030         struct jaddref *jaddref;
11031         struct freefile *freefile;
11032         struct worklist *wk;
11033
11034         freefile = NULL;
11035         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11036                 WORKLIST_REMOVE(wk);
11037                 switch (wk->wk_type) {
11038                 case D_FREEFILE:
11039                         /*
11040                          * We defer adding freefile to the worklist
11041                          * until all other additions have been made to
11042                          * ensure that it will be done after all the
11043                          * old blocks have been freed.
11044                          */
11045                         if (freefile != NULL)
11046                                 panic("handle_bufwait: freefile");
11047                         freefile = WK_FREEFILE(wk);
11048                         continue;
11049
11050                 case D_MKDIR:
11051                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11052                         continue;
11053
11054                 case D_DIRADD:
11055                         diradd_inode_written(WK_DIRADD(wk), inodedep);
11056                         continue;
11057
11058                 case D_FREEFRAG:
11059                         wk->wk_state |= COMPLETE;
11060                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11061                                 add_to_worklist(wk, 0);
11062                         continue;
11063
11064                 case D_DIRREM:
11065                         wk->wk_state |= COMPLETE;
11066                         add_to_worklist(wk, 0);
11067                         continue;
11068
11069                 case D_ALLOCDIRECT:
11070                 case D_ALLOCINDIR:
11071                         free_newblk(WK_NEWBLK(wk));
11072                         continue;
11073
11074                 case D_JNEWBLK:
11075                         wk->wk_state |= COMPLETE;
11076                         free_jnewblk(WK_JNEWBLK(wk));
11077                         continue;
11078
11079                 /*
11080                  * Save freed journal segments and add references on
11081                  * the supplied list which will delay their release
11082                  * until the cg bitmap is cleared on disk.
11083                  */
11084                 case D_JSEGDEP:
11085                         if (refhd == NULL)
11086                                 free_jsegdep(WK_JSEGDEP(wk));
11087                         else
11088                                 WORKLIST_INSERT(refhd, wk);
11089                         continue;
11090
11091                 case D_JADDREF:
11092                         jaddref = WK_JADDREF(wk);
11093                         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11094                             if_deps);
11095                         /*
11096                          * Transfer any jaddrefs to the list to be freed with
11097                          * the bitmap if we're handling a removed file.
11098                          */
11099                         if (refhd == NULL) {
11100                                 wk->wk_state |= COMPLETE;
11101                                 free_jaddref(jaddref);
11102                         } else
11103                                 WORKLIST_INSERT(refhd, wk);
11104                         continue;
11105
11106                 default:
11107                         panic("handle_bufwait: Unknown type %p(%s)",
11108                             wk, TYPENAME(wk->wk_type));
11109                         /* NOTREACHED */
11110                 }
11111         }
11112         return (freefile);
11113 }
11114 /*
11115  * Called from within softdep_disk_write_complete above to restore
11116  * in-memory inode block contents to their most up-to-date state. Note
11117  * that this routine is always called from interrupt level with further
11118  * splbio interrupts blocked.
11119  */
11120 static int
11121 handle_written_inodeblock(inodedep, bp)
11122         struct inodedep *inodedep;
11123         struct buf *bp;         /* buffer containing the inode block */
11124 {
11125         struct freefile *freefile;
11126         struct allocdirect *adp, *nextadp;
11127         struct ufs1_dinode *dp1 = NULL;
11128         struct ufs2_dinode *dp2 = NULL;
11129         struct workhead wkhd;
11130         int hadchanges, fstype;
11131         ino_t freelink;
11132
11133         LIST_INIT(&wkhd);
11134         hadchanges = 0;
11135         freefile = NULL;
11136         if ((inodedep->id_state & IOSTARTED) == 0)
11137                 panic("handle_written_inodeblock: not started");
11138         inodedep->id_state &= ~IOSTARTED;
11139         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11140                 fstype = UFS1;
11141                 dp1 = (struct ufs1_dinode *)bp->b_data +
11142                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11143                 freelink = dp1->di_freelink;
11144         } else {
11145                 fstype = UFS2;
11146                 dp2 = (struct ufs2_dinode *)bp->b_data +
11147                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11148                 freelink = dp2->di_freelink;
11149         }
11150         /*
11151          * Leave this inodeblock dirty until it's in the list.
11152          */
11153         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11154                 struct inodedep *inon;
11155
11156                 inon = TAILQ_NEXT(inodedep, id_unlinked);
11157                 if ((inon == NULL && freelink == 0) ||
11158                     (inon && inon->id_ino == freelink)) {
11159                         if (inon)
11160                                 inon->id_state |= UNLINKPREV;
11161                         inodedep->id_state |= UNLINKNEXT;
11162                 }
11163                 hadchanges = 1;
11164         }
11165         /*
11166          * If we had to rollback the inode allocation because of
11167          * bitmaps being incomplete, then simply restore it.
11168          * Keep the block dirty so that it will not be reclaimed until
11169          * all associated dependencies have been cleared and the
11170          * corresponding updates written to disk.
11171          */
11172         if (inodedep->id_savedino1 != NULL) {
11173                 hadchanges = 1;
11174                 if (fstype == UFS1)
11175                         *dp1 = *inodedep->id_savedino1;
11176                 else
11177                         *dp2 = *inodedep->id_savedino2;
11178                 free(inodedep->id_savedino1, M_SAVEDINO);
11179                 inodedep->id_savedino1 = NULL;
11180                 if ((bp->b_flags & B_DELWRI) == 0)
11181                         stat_inode_bitmap++;
11182                 bdirty(bp);
11183                 /*
11184                  * If the inode is clear here and GOINGAWAY it will never
11185                  * be written.  Process the bufwait and clear any pending
11186                  * work which may include the freefile.
11187                  */
11188                 if (inodedep->id_state & GOINGAWAY)
11189                         goto bufwait;
11190                 return (1);
11191         }
11192         inodedep->id_state |= COMPLETE;
11193         /*
11194          * Roll forward anything that had to be rolled back before
11195          * the inode could be updated.
11196          */
11197         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11198                 nextadp = TAILQ_NEXT(adp, ad_next);
11199                 if (adp->ad_state & ATTACHED)
11200                         panic("handle_written_inodeblock: new entry");
11201                 if (fstype == UFS1) {
11202                         if (adp->ad_offset < NDADDR) {
11203                                 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11204                                         panic("%s %s #%jd mismatch %d != %jd",
11205                                             "handle_written_inodeblock:",
11206                                             "direct pointer",
11207                                             (intmax_t)adp->ad_offset,
11208                                             dp1->di_db[adp->ad_offset],
11209                                             (intmax_t)adp->ad_oldblkno);
11210                                 dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11211                         } else {
11212                                 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11213                                         panic("%s: %s #%jd allocated as %d",
11214                                             "handle_written_inodeblock",
11215                                             "indirect pointer",
11216                                             (intmax_t)adp->ad_offset - NDADDR,
11217                                             dp1->di_ib[adp->ad_offset - NDADDR]);
11218                                 dp1->di_ib[adp->ad_offset - NDADDR] =
11219                                     adp->ad_newblkno;
11220                         }
11221                 } else {
11222                         if (adp->ad_offset < NDADDR) {
11223                                 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11224                                         panic("%s: %s #%jd %s %jd != %jd",
11225                                             "handle_written_inodeblock",
11226                                             "direct pointer",
11227                                             (intmax_t)adp->ad_offset, "mismatch",
11228                                             (intmax_t)dp2->di_db[adp->ad_offset],
11229                                             (intmax_t)adp->ad_oldblkno);
11230                                 dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11231                         } else {
11232                                 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11233                                         panic("%s: %s #%jd allocated as %jd",
11234                                             "handle_written_inodeblock",
11235                                             "indirect pointer",
11236                                             (intmax_t)adp->ad_offset - NDADDR,
11237                                             (intmax_t)
11238                                             dp2->di_ib[adp->ad_offset - NDADDR]);
11239                                 dp2->di_ib[adp->ad_offset - NDADDR] =
11240                                     adp->ad_newblkno;
11241                         }
11242                 }
11243                 adp->ad_state &= ~UNDONE;
11244                 adp->ad_state |= ATTACHED;
11245                 hadchanges = 1;
11246         }
11247         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11248                 nextadp = TAILQ_NEXT(adp, ad_next);
11249                 if (adp->ad_state & ATTACHED)
11250                         panic("handle_written_inodeblock: new entry");
11251                 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11252                         panic("%s: direct pointers #%jd %s %jd != %jd",
11253                             "handle_written_inodeblock",
11254                             (intmax_t)adp->ad_offset, "mismatch",
11255                             (intmax_t)dp2->di_extb[adp->ad_offset],
11256                             (intmax_t)adp->ad_oldblkno);
11257                 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11258                 adp->ad_state &= ~UNDONE;
11259                 adp->ad_state |= ATTACHED;
11260                 hadchanges = 1;
11261         }
11262         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11263                 stat_direct_blk_ptrs++;
11264         /*
11265          * Reset the file size to its most up-to-date value.
11266          */
11267         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11268                 panic("handle_written_inodeblock: bad size");
11269         if (inodedep->id_savednlink > LINK_MAX)
11270                 panic("handle_written_inodeblock: Invalid link count "
11271                     "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11272         if (fstype == UFS1) {
11273                 if (dp1->di_nlink != inodedep->id_savednlink) {
11274                         dp1->di_nlink = inodedep->id_savednlink;
11275                         hadchanges = 1;
11276                 }
11277                 if (dp1->di_size != inodedep->id_savedsize) {
11278                         dp1->di_size = inodedep->id_savedsize;
11279                         hadchanges = 1;
11280                 }
11281         } else {
11282                 if (dp2->di_nlink != inodedep->id_savednlink) {
11283                         dp2->di_nlink = inodedep->id_savednlink;
11284                         hadchanges = 1;
11285                 }
11286                 if (dp2->di_size != inodedep->id_savedsize) {
11287                         dp2->di_size = inodedep->id_savedsize;
11288                         hadchanges = 1;
11289                 }
11290                 if (dp2->di_extsize != inodedep->id_savedextsize) {
11291                         dp2->di_extsize = inodedep->id_savedextsize;
11292                         hadchanges = 1;
11293                 }
11294         }
11295         inodedep->id_savedsize = -1;
11296         inodedep->id_savedextsize = -1;
11297         inodedep->id_savednlink = -1;
11298         /*
11299          * If there were any rollbacks in the inode block, then it must be
11300          * marked dirty so that its will eventually get written back in
11301          * its correct form.
11302          */
11303         if (hadchanges)
11304                 bdirty(bp);
11305 bufwait:
11306         /*
11307          * Process any allocdirects that completed during the update.
11308          */
11309         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11310                 handle_allocdirect_partdone(adp, &wkhd);
11311         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11312                 handle_allocdirect_partdone(adp, &wkhd);
11313         /*
11314          * Process deallocations that were held pending until the
11315          * inode had been written to disk. Freeing of the inode
11316          * is delayed until after all blocks have been freed to
11317          * avoid creation of new <vfsid, inum, lbn> triples
11318          * before the old ones have been deleted.  Completely
11319          * unlinked inodes are not processed until the unlinked
11320          * inode list is written or the last reference is removed.
11321          */
11322         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11323                 freefile = handle_bufwait(inodedep, NULL);
11324                 if (freefile && !LIST_EMPTY(&wkhd)) {
11325                         WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11326                         freefile = NULL;
11327                 }
11328         }
11329         /*
11330          * Move rolled forward dependency completions to the bufwait list
11331          * now that those that were already written have been processed.
11332          */
11333         if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11334                 panic("handle_written_inodeblock: bufwait but no changes");
11335         jwork_move(&inodedep->id_bufwait, &wkhd);
11336
11337         if (freefile != NULL) {
11338                 /*
11339                  * If the inode is goingaway it was never written.  Fake up
11340                  * the state here so free_inodedep() can succeed.
11341                  */
11342                 if (inodedep->id_state & GOINGAWAY)
11343                         inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11344                 if (free_inodedep(inodedep) == 0)
11345                         panic("handle_written_inodeblock: live inodedep %p",
11346                             inodedep);
11347                 add_to_worklist(&freefile->fx_list, 0);
11348                 return (0);
11349         }
11350
11351         /*
11352          * If no outstanding dependencies, free it.
11353          */
11354         if (free_inodedep(inodedep) ||
11355             (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11356              TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11357              TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11358              LIST_FIRST(&inodedep->id_bufwait) == 0))
11359                 return (0);
11360         return (hadchanges);
11361 }
11362
11363 static int
11364 handle_written_indirdep(indirdep, bp, bpp)
11365         struct indirdep *indirdep;
11366         struct buf *bp;
11367         struct buf **bpp;
11368 {
11369         struct allocindir *aip;
11370         struct buf *sbp;
11371         int chgs;
11372
11373         if (indirdep->ir_state & GOINGAWAY)
11374                 panic("handle_written_indirdep: indirdep gone");
11375         if ((indirdep->ir_state & IOSTARTED) == 0)
11376                 panic("handle_written_indirdep: IO not started");
11377         chgs = 0;
11378         /*
11379          * If there were rollbacks revert them here.
11380          */
11381         if (indirdep->ir_saveddata) {
11382                 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11383                 if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11384                         free(indirdep->ir_saveddata, M_INDIRDEP);
11385                         indirdep->ir_saveddata = NULL;
11386                 }
11387                 chgs = 1;
11388         }
11389         indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11390         indirdep->ir_state |= ATTACHED;
11391         /*
11392          * Move allocindirs with written pointers to the completehd if
11393          * the indirdep's pointer is not yet written.  Otherwise
11394          * free them here.
11395          */
11396         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11397                 LIST_REMOVE(aip, ai_next);
11398                 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11399                         LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11400                             ai_next);
11401                         newblk_freefrag(&aip->ai_block);
11402                         continue;
11403                 }
11404                 free_newblk(&aip->ai_block);
11405         }
11406         /*
11407          * Move allocindirs that have finished dependency processing from
11408          * the done list to the write list after updating the pointers.
11409          */
11410         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11411                 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11412                         handle_allocindir_partdone(aip);
11413                         if (aip == LIST_FIRST(&indirdep->ir_donehd))
11414                                 panic("disk_write_complete: not gone");
11415                         chgs = 1;
11416                 }
11417         }
11418         /*
11419          * Preserve the indirdep if there were any changes or if it is not
11420          * yet valid on disk.
11421          */
11422         if (chgs) {
11423                 stat_indir_blk_ptrs++;
11424                 bdirty(bp);
11425                 return (1);
11426         }
11427         /*
11428          * If there were no changes we can discard the savedbp and detach
11429          * ourselves from the buf.  We are only carrying completed pointers
11430          * in this case.
11431          */
11432         sbp = indirdep->ir_savebp;
11433         sbp->b_flags |= B_INVAL | B_NOCACHE;
11434         indirdep->ir_savebp = NULL;
11435         indirdep->ir_bp = NULL;
11436         if (*bpp != NULL)
11437                 panic("handle_written_indirdep: bp already exists.");
11438         *bpp = sbp;
11439         /*
11440          * The indirdep may not be freed until its parent points at it.
11441          */
11442         if (indirdep->ir_state & DEPCOMPLETE)
11443                 free_indirdep(indirdep);
11444
11445         return (0);
11446 }
11447
11448 /*
11449  * Process a diradd entry after its dependent inode has been written.
11450  * This routine must be called with splbio interrupts blocked.
11451  */
11452 static void
11453 diradd_inode_written(dap, inodedep)
11454         struct diradd *dap;
11455         struct inodedep *inodedep;
11456 {
11457
11458         dap->da_state |= COMPLETE;
11459         complete_diradd(dap);
11460         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11461 }
11462
11463 /*
11464  * Returns true if the bmsafemap will have rollbacks when written.  Must only
11465  * be called with the soft updates lock and the buf lock on the cg held.
11466  */
11467 static int
11468 bmsafemap_backgroundwrite(bmsafemap, bp)
11469         struct bmsafemap *bmsafemap;
11470         struct buf *bp;
11471 {
11472         int dirty;
11473
11474         LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11475         dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11476             !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11477         /*
11478          * If we're initiating a background write we need to process the
11479          * rollbacks as they exist now, not as they exist when IO starts.
11480          * No other consumers will look at the contents of the shadowed
11481          * buf so this is safe to do here.
11482          */
11483         if (bp->b_xflags & BX_BKGRDMARKER)
11484                 initiate_write_bmsafemap(bmsafemap, bp);
11485
11486         return (dirty);
11487 }
11488
11489 /*
11490  * Re-apply an allocation when a cg write is complete.
11491  */
11492 static int
11493 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11494         struct jnewblk *jnewblk;
11495         struct fs *fs;
11496         struct cg *cgp;
11497         uint8_t *blksfree;
11498 {
11499         ufs1_daddr_t fragno;
11500         ufs2_daddr_t blkno;
11501         long cgbno, bbase;
11502         int frags, blk;
11503         int i;
11504
11505         frags = 0;
11506         cgbno = dtogd(fs, jnewblk->jn_blkno);
11507         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11508                 if (isclr(blksfree, cgbno + i))
11509                         panic("jnewblk_rollforward: re-allocated fragment");
11510                 frags++;
11511         }
11512         if (frags == fs->fs_frag) {
11513                 blkno = fragstoblks(fs, cgbno);
11514                 ffs_clrblock(fs, blksfree, (long)blkno);
11515                 ffs_clusteracct(fs, cgp, blkno, -1);
11516                 cgp->cg_cs.cs_nbfree--;
11517         } else {
11518                 bbase = cgbno - fragnum(fs, cgbno);
11519                 cgbno += jnewblk->jn_oldfrags;
11520                 /* If a complete block had been reassembled, account for it. */
11521                 fragno = fragstoblks(fs, bbase);
11522                 if (ffs_isblock(fs, blksfree, fragno)) {
11523                         cgp->cg_cs.cs_nffree += fs->fs_frag;
11524                         ffs_clusteracct(fs, cgp, fragno, -1);
11525                         cgp->cg_cs.cs_nbfree--;
11526                 }
11527                 /* Decrement the old frags.  */
11528                 blk = blkmap(fs, blksfree, bbase);
11529                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11530                 /* Allocate the fragment */
11531                 for (i = 0; i < frags; i++)
11532                         clrbit(blksfree, cgbno + i);
11533                 cgp->cg_cs.cs_nffree -= frags;
11534                 /* Add back in counts associated with the new frags */
11535                 blk = blkmap(fs, blksfree, bbase);
11536                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11537         }
11538         return (frags);
11539 }
11540
11541 /*
11542  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11543  * changes if it's not a background write.  Set all written dependencies
11544  * to DEPCOMPLETE and free the structure if possible.
11545  */
11546 static int
11547 handle_written_bmsafemap(bmsafemap, bp)
11548         struct bmsafemap *bmsafemap;
11549         struct buf *bp;
11550 {
11551         struct newblk *newblk;
11552         struct inodedep *inodedep;
11553         struct jaddref *jaddref, *jatmp;
11554         struct jnewblk *jnewblk, *jntmp;
11555         struct ufsmount *ump;
11556         uint8_t *inosused;
11557         uint8_t *blksfree;
11558         struct cg *cgp;
11559         struct fs *fs;
11560         ino_t ino;
11561         int foreground;
11562         int chgs;
11563
11564         if ((bmsafemap->sm_state & IOSTARTED) == 0)
11565                 panic("initiate_write_bmsafemap: Not started\n");
11566         ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11567         chgs = 0;
11568         bmsafemap->sm_state &= ~IOSTARTED;
11569         foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11570         /*
11571          * Release journal work that was waiting on the write.
11572          */
11573         handle_jwork(&bmsafemap->sm_freewr);
11574
11575         /*
11576          * Restore unwritten inode allocation pending jaddref writes.
11577          */
11578         if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11579                 cgp = (struct cg *)bp->b_data;
11580                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11581                 inosused = cg_inosused(cgp);
11582                 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11583                     ja_bmdeps, jatmp) {
11584                         if ((jaddref->ja_state & UNDONE) == 0)
11585                                 continue;
11586                         ino = jaddref->ja_ino % fs->fs_ipg;
11587                         if (isset(inosused, ino))
11588                                 panic("handle_written_bmsafemap: "
11589                                     "re-allocated inode");
11590                         /* Do the roll-forward only if it's a real copy. */
11591                         if (foreground) {
11592                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
11593                                         cgp->cg_cs.cs_ndir++;
11594                                 cgp->cg_cs.cs_nifree--;
11595                                 setbit(inosused, ino);
11596                                 chgs = 1;
11597                         }
11598                         jaddref->ja_state &= ~UNDONE;
11599                         jaddref->ja_state |= ATTACHED;
11600                         free_jaddref(jaddref);
11601                 }
11602         }
11603         /*
11604          * Restore any block allocations which are pending journal writes.
11605          */
11606         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11607                 cgp = (struct cg *)bp->b_data;
11608                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11609                 blksfree = cg_blksfree(cgp);
11610                 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11611                     jntmp) {
11612                         if ((jnewblk->jn_state & UNDONE) == 0)
11613                                 continue;
11614                         /* Do the roll-forward only if it's a real copy. */
11615                         if (foreground &&
11616                             jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11617                                 chgs = 1;
11618                         jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11619                         jnewblk->jn_state |= ATTACHED;
11620                         free_jnewblk(jnewblk);
11621                 }
11622         }
11623         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11624                 newblk->nb_state |= DEPCOMPLETE;
11625                 newblk->nb_state &= ~ONDEPLIST;
11626                 newblk->nb_bmsafemap = NULL;
11627                 LIST_REMOVE(newblk, nb_deps);
11628                 if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11629                         handle_allocdirect_partdone(
11630                             WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11631                 else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11632                         handle_allocindir_partdone(
11633                             WK_ALLOCINDIR(&newblk->nb_list));
11634                 else if (newblk->nb_list.wk_type != D_NEWBLK)
11635                         panic("handle_written_bmsafemap: Unexpected type: %s",
11636                             TYPENAME(newblk->nb_list.wk_type));
11637         }
11638         while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11639                 inodedep->id_state |= DEPCOMPLETE;
11640                 inodedep->id_state &= ~ONDEPLIST;
11641                 LIST_REMOVE(inodedep, id_deps);
11642                 inodedep->id_bmsafemap = NULL;
11643         }
11644         LIST_REMOVE(bmsafemap, sm_next);
11645         if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11646             LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11647             LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11648             LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11649             LIST_EMPTY(&bmsafemap->sm_freehd)) {
11650                 LIST_REMOVE(bmsafemap, sm_hash);
11651                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11652                 return (0);
11653         }
11654         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11655         if (foreground)
11656                 bdirty(bp);
11657         return (1);
11658 }
11659
11660 /*
11661  * Try to free a mkdir dependency.
11662  */
11663 static void
11664 complete_mkdir(mkdir)
11665         struct mkdir *mkdir;
11666 {
11667         struct diradd *dap;
11668
11669         if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11670                 return;
11671         LIST_REMOVE(mkdir, md_mkdirs);
11672         dap = mkdir->md_diradd;
11673         dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11674         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11675                 dap->da_state |= DEPCOMPLETE;
11676                 complete_diradd(dap);
11677         }
11678         WORKITEM_FREE(mkdir, D_MKDIR);
11679 }
11680
11681 /*
11682  * Handle the completion of a mkdir dependency.
11683  */
11684 static void
11685 handle_written_mkdir(mkdir, type)
11686         struct mkdir *mkdir;
11687         int type;
11688 {
11689
11690         if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11691                 panic("handle_written_mkdir: bad type");
11692         mkdir->md_state |= COMPLETE;
11693         complete_mkdir(mkdir);
11694 }
11695
11696 static int
11697 free_pagedep(pagedep)
11698         struct pagedep *pagedep;
11699 {
11700         int i;
11701
11702         if (pagedep->pd_state & NEWBLOCK)
11703                 return (0);
11704         if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11705                 return (0);
11706         for (i = 0; i < DAHASHSZ; i++)
11707                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11708                         return (0);
11709         if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11710                 return (0);
11711         if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11712                 return (0);
11713         if (pagedep->pd_state & ONWORKLIST)
11714                 WORKLIST_REMOVE(&pagedep->pd_list);
11715         LIST_REMOVE(pagedep, pd_hash);
11716         WORKITEM_FREE(pagedep, D_PAGEDEP);
11717
11718         return (1);
11719 }
11720
11721 /*
11722  * Called from within softdep_disk_write_complete above.
11723  * A write operation was just completed. Removed inodes can
11724  * now be freed and associated block pointers may be committed.
11725  * Note that this routine is always called from interrupt level
11726  * with further splbio interrupts blocked.
11727  */
11728 static int
11729 handle_written_filepage(pagedep, bp)
11730         struct pagedep *pagedep;
11731         struct buf *bp;         /* buffer containing the written page */
11732 {
11733         struct dirrem *dirrem;
11734         struct diradd *dap, *nextdap;
11735         struct direct *ep;
11736         int i, chgs;
11737
11738         if ((pagedep->pd_state & IOSTARTED) == 0)
11739                 panic("handle_written_filepage: not started");
11740         pagedep->pd_state &= ~IOSTARTED;
11741         /*
11742          * Process any directory removals that have been committed.
11743          */
11744         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11745                 LIST_REMOVE(dirrem, dm_next);
11746                 dirrem->dm_state |= COMPLETE;
11747                 dirrem->dm_dirinum = pagedep->pd_ino;
11748                 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11749                     ("handle_written_filepage: Journal entries not written."));
11750                 add_to_worklist(&dirrem->dm_list, 0);
11751         }
11752         /*
11753          * Free any directory additions that have been committed.
11754          * If it is a newly allocated block, we have to wait until
11755          * the on-disk directory inode claims the new block.
11756          */
11757         if ((pagedep->pd_state & NEWBLOCK) == 0)
11758                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11759                         free_diradd(dap, NULL);
11760         /*
11761          * Uncommitted directory entries must be restored.
11762          */
11763         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11764                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11765                      dap = nextdap) {
11766                         nextdap = LIST_NEXT(dap, da_pdlist);
11767                         if (dap->da_state & ATTACHED)
11768                                 panic("handle_written_filepage: attached");
11769                         ep = (struct direct *)
11770                             ((char *)bp->b_data + dap->da_offset);
11771                         ep->d_ino = dap->da_newinum;
11772                         dap->da_state &= ~UNDONE;
11773                         dap->da_state |= ATTACHED;
11774                         chgs = 1;
11775                         /*
11776                          * If the inode referenced by the directory has
11777                          * been written out, then the dependency can be
11778                          * moved to the pending list.
11779                          */
11780                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11781                                 LIST_REMOVE(dap, da_pdlist);
11782                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11783                                     da_pdlist);
11784                         }
11785                 }
11786         }
11787         /*
11788          * If there were any rollbacks in the directory, then it must be
11789          * marked dirty so that its will eventually get written back in
11790          * its correct form.
11791          */
11792         if (chgs) {
11793                 if ((bp->b_flags & B_DELWRI) == 0)
11794                         stat_dir_entry++;
11795                 bdirty(bp);
11796                 return (1);
11797         }
11798         /*
11799          * If we are not waiting for a new directory block to be
11800          * claimed by its inode, then the pagedep will be freed.
11801          * Otherwise it will remain to track any new entries on
11802          * the page in case they are fsync'ed.
11803          */
11804         free_pagedep(pagedep);
11805         return (0);
11806 }
11807
11808 /*
11809  * Writing back in-core inode structures.
11810  *
11811  * The filesystem only accesses an inode's contents when it occupies an
11812  * "in-core" inode structure.  These "in-core" structures are separate from
11813  * the page frames used to cache inode blocks.  Only the latter are
11814  * transferred to/from the disk.  So, when the updated contents of the
11815  * "in-core" inode structure are copied to the corresponding in-memory inode
11816  * block, the dependencies are also transferred.  The following procedure is
11817  * called when copying a dirty "in-core" inode to a cached inode block.
11818  */
11819
11820 /*
11821  * Called when an inode is loaded from disk. If the effective link count
11822  * differed from the actual link count when it was last flushed, then we
11823  * need to ensure that the correct effective link count is put back.
11824  */
11825 void
11826 softdep_load_inodeblock(ip)
11827         struct inode *ip;       /* the "in_core" copy of the inode */
11828 {
11829         struct inodedep *inodedep;
11830
11831         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
11832             ("softdep_load_inodeblock called on non-softdep filesystem"));
11833         /*
11834          * Check for alternate nlink count.
11835          */
11836         ip->i_effnlink = ip->i_nlink;
11837         ACQUIRE_LOCK(ip->i_ump);
11838         if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
11839             &inodedep) == 0) {
11840                 FREE_LOCK(ip->i_ump);
11841                 return;
11842         }
11843         ip->i_effnlink -= inodedep->id_nlinkdelta;
11844         FREE_LOCK(ip->i_ump);
11845 }
11846
11847 /*
11848  * This routine is called just before the "in-core" inode
11849  * information is to be copied to the in-memory inode block.
11850  * Recall that an inode block contains several inodes. If
11851  * the force flag is set, then the dependencies will be
11852  * cleared so that the update can always be made. Note that
11853  * the buffer is locked when this routine is called, so we
11854  * will never be in the middle of writing the inode block
11855  * to disk.
11856  */
11857 void
11858 softdep_update_inodeblock(ip, bp, waitfor)
11859         struct inode *ip;       /* the "in_core" copy of the inode */
11860         struct buf *bp;         /* the buffer containing the inode block */
11861         int waitfor;            /* nonzero => update must be allowed */
11862 {
11863         struct inodedep *inodedep;
11864         struct inoref *inoref;
11865         struct ufsmount *ump;
11866         struct worklist *wk;
11867         struct mount *mp;
11868         struct buf *ibp;
11869         struct fs *fs;
11870         int error;
11871
11872         ump = ip->i_ump;
11873         mp = UFSTOVFS(ump);
11874         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
11875             ("softdep_update_inodeblock called on non-softdep filesystem"));
11876         fs = ip->i_fs;
11877         /*
11878          * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
11879          * does not have access to the in-core ip so must write directly into
11880          * the inode block buffer when setting freelink.
11881          */
11882         if (fs->fs_magic == FS_UFS1_MAGIC)
11883                 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
11884                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
11885         else
11886                 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
11887                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
11888         /*
11889          * If the effective link count is not equal to the actual link
11890          * count, then we must track the difference in an inodedep while
11891          * the inode is (potentially) tossed out of the cache. Otherwise,
11892          * if there is no existing inodedep, then there are no dependencies
11893          * to track.
11894          */
11895         ACQUIRE_LOCK(ump);
11896 again:
11897         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11898                 FREE_LOCK(ump);
11899                 if (ip->i_effnlink != ip->i_nlink)
11900                         panic("softdep_update_inodeblock: bad link count");
11901                 return;
11902         }
11903         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11904                 panic("softdep_update_inodeblock: bad delta");
11905         /*
11906          * If we're flushing all dependencies we must also move any waiting
11907          * for journal writes onto the bufwait list prior to I/O.
11908          */
11909         if (waitfor) {
11910                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11911                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11912                             == DEPCOMPLETE) {
11913                                 jwait(&inoref->if_list, MNT_WAIT);
11914                                 goto again;
11915                         }
11916                 }
11917         }
11918         /*
11919          * Changes have been initiated. Anything depending on these
11920          * changes cannot occur until this inode has been written.
11921          */
11922         inodedep->id_state &= ~COMPLETE;
11923         if ((inodedep->id_state & ONWORKLIST) == 0)
11924                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11925         /*
11926          * Any new dependencies associated with the incore inode must
11927          * now be moved to the list associated with the buffer holding
11928          * the in-memory copy of the inode. Once merged process any
11929          * allocdirects that are completed by the merger.
11930          */
11931         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11932         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11933                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11934                     NULL);
11935         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11936         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11937                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11938                     NULL);
11939         /*
11940          * Now that the inode has been pushed into the buffer, the
11941          * operations dependent on the inode being written to disk
11942          * can be moved to the id_bufwait so that they will be
11943          * processed when the buffer I/O completes.
11944          */
11945         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11946                 WORKLIST_REMOVE(wk);
11947                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11948         }
11949         /*
11950          * Newly allocated inodes cannot be written until the bitmap
11951          * that allocates them have been written (indicated by
11952          * DEPCOMPLETE being set in id_state). If we are doing a
11953          * forced sync (e.g., an fsync on a file), we force the bitmap
11954          * to be written so that the update can be done.
11955          */
11956         if (waitfor == 0) {
11957                 FREE_LOCK(ump);
11958                 return;
11959         }
11960 retry:
11961         if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11962                 FREE_LOCK(ump);
11963                 return;
11964         }
11965         ibp = inodedep->id_bmsafemap->sm_buf;
11966         ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
11967         if (ibp == NULL) {
11968                 /*
11969                  * If ibp came back as NULL, the dependency could have been
11970                  * freed while we slept.  Look it up again, and check to see
11971                  * that it has completed.
11972                  */
11973                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11974                         goto retry;
11975                 FREE_LOCK(ump);
11976                 return;
11977         }
11978         FREE_LOCK(ump);
11979         if ((error = bwrite(ibp)) != 0)
11980                 softdep_error("softdep_update_inodeblock: bwrite", error);
11981 }
11982
11983 /*
11984  * Merge the a new inode dependency list (such as id_newinoupdt) into an
11985  * old inode dependency list (such as id_inoupdt). This routine must be
11986  * called with splbio interrupts blocked.
11987  */
11988 static void
11989 merge_inode_lists(newlisthead, oldlisthead)
11990         struct allocdirectlst *newlisthead;
11991         struct allocdirectlst *oldlisthead;
11992 {
11993         struct allocdirect *listadp, *newadp;
11994
11995         newadp = TAILQ_FIRST(newlisthead);
11996         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
11997                 if (listadp->ad_offset < newadp->ad_offset) {
11998                         listadp = TAILQ_NEXT(listadp, ad_next);
11999                         continue;
12000                 }
12001                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12002                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12003                 if (listadp->ad_offset == newadp->ad_offset) {
12004                         allocdirect_merge(oldlisthead, newadp,
12005                             listadp);
12006                         listadp = newadp;
12007                 }
12008                 newadp = TAILQ_FIRST(newlisthead);
12009         }
12010         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12011                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12012                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12013         }
12014 }
12015
12016 /*
12017  * If we are doing an fsync, then we must ensure that any directory
12018  * entries for the inode have been written after the inode gets to disk.
12019  */
12020 int
12021 softdep_fsync(vp)
12022         struct vnode *vp;       /* the "in_core" copy of the inode */
12023 {
12024         struct inodedep *inodedep;
12025         struct pagedep *pagedep;
12026         struct inoref *inoref;
12027         struct ufsmount *ump;
12028         struct worklist *wk;
12029         struct diradd *dap;
12030         struct mount *mp;
12031         struct vnode *pvp;
12032         struct inode *ip;
12033         struct buf *bp;
12034         struct fs *fs;
12035         struct thread *td = curthread;
12036         int error, flushparent, pagedep_new_block;
12037         ino_t parentino;
12038         ufs_lbn_t lbn;
12039
12040         ip = VTOI(vp);
12041         fs = ip->i_fs;
12042         ump = ip->i_ump;
12043         mp = vp->v_mount;
12044         if (MOUNTEDSOFTDEP(mp) == 0)
12045                 return (0);
12046         ACQUIRE_LOCK(ump);
12047 restart:
12048         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12049                 FREE_LOCK(ump);
12050                 return (0);
12051         }
12052         TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12053                 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12054                     == DEPCOMPLETE) {
12055                         jwait(&inoref->if_list, MNT_WAIT);
12056                         goto restart;
12057                 }
12058         }
12059         if (!LIST_EMPTY(&inodedep->id_inowait) ||
12060             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12061             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12062             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12063             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12064                 panic("softdep_fsync: pending ops %p", inodedep);
12065         for (error = 0, flushparent = 0; ; ) {
12066                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12067                         break;
12068                 if (wk->wk_type != D_DIRADD)
12069                         panic("softdep_fsync: Unexpected type %s",
12070                             TYPENAME(wk->wk_type));
12071                 dap = WK_DIRADD(wk);
12072                 /*
12073                  * Flush our parent if this directory entry has a MKDIR_PARENT
12074                  * dependency or is contained in a newly allocated block.
12075                  */
12076                 if (dap->da_state & DIRCHG)
12077                         pagedep = dap->da_previous->dm_pagedep;
12078                 else
12079                         pagedep = dap->da_pagedep;
12080                 parentino = pagedep->pd_ino;
12081                 lbn = pagedep->pd_lbn;
12082                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12083                         panic("softdep_fsync: dirty");
12084                 if ((dap->da_state & MKDIR_PARENT) ||
12085                     (pagedep->pd_state & NEWBLOCK))
12086                         flushparent = 1;
12087                 else
12088                         flushparent = 0;
12089                 /*
12090                  * If we are being fsync'ed as part of vgone'ing this vnode,
12091                  * then we will not be able to release and recover the
12092                  * vnode below, so we just have to give up on writing its
12093                  * directory entry out. It will eventually be written, just
12094                  * not now, but then the user was not asking to have it
12095                  * written, so we are not breaking any promises.
12096                  */
12097                 if (vp->v_iflag & VI_DOOMED)
12098                         break;
12099                 /*
12100                  * We prevent deadlock by always fetching inodes from the
12101                  * root, moving down the directory tree. Thus, when fetching
12102                  * our parent directory, we first try to get the lock. If
12103                  * that fails, we must unlock ourselves before requesting
12104                  * the lock on our parent. See the comment in ufs_lookup
12105                  * for details on possible races.
12106                  */
12107                 FREE_LOCK(ump);
12108                 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12109                     FFSV_FORCEINSMQ)) {
12110                         error = vfs_busy(mp, MBF_NOWAIT);
12111                         if (error != 0) {
12112                                 vfs_ref(mp);
12113                                 VOP_UNLOCK(vp, 0);
12114                                 error = vfs_busy(mp, 0);
12115                                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12116                                 vfs_rel(mp);
12117                                 if (error != 0)
12118                                         return (ENOENT);
12119                                 if (vp->v_iflag & VI_DOOMED) {
12120                                         vfs_unbusy(mp);
12121                                         return (ENOENT);
12122                                 }
12123                         }
12124                         VOP_UNLOCK(vp, 0);
12125                         error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12126                             &pvp, FFSV_FORCEINSMQ);
12127                         vfs_unbusy(mp);
12128                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12129                         if (vp->v_iflag & VI_DOOMED) {
12130                                 if (error == 0)
12131                                         vput(pvp);
12132                                 error = ENOENT;
12133                         }
12134                         if (error != 0)
12135                                 return (error);
12136                 }
12137                 /*
12138                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12139                  * that are contained in direct blocks will be resolved by
12140                  * doing a ffs_update. Pagedeps contained in indirect blocks
12141                  * may require a complete sync'ing of the directory. So, we
12142                  * try the cheap and fast ffs_update first, and if that fails,
12143                  * then we do the slower ffs_syncvnode of the directory.
12144                  */
12145                 if (flushparent) {
12146                         int locked;
12147
12148                         if ((error = ffs_update(pvp, 1)) != 0) {
12149                                 vput(pvp);
12150                                 return (error);
12151                         }
12152                         ACQUIRE_LOCK(ump);
12153                         locked = 1;
12154                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12155                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12156                                         if (wk->wk_type != D_DIRADD)
12157                                                 panic("softdep_fsync: Unexpected type %s",
12158                                                       TYPENAME(wk->wk_type));
12159                                         dap = WK_DIRADD(wk);
12160                                         if (dap->da_state & DIRCHG)
12161                                                 pagedep = dap->da_previous->dm_pagedep;
12162                                         else
12163                                                 pagedep = dap->da_pagedep;
12164                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12165                                         FREE_LOCK(ump);
12166                                         locked = 0;
12167                                         if (pagedep_new_block && (error =
12168                                             ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12169                                                 vput(pvp);
12170                                                 return (error);
12171                                         }
12172                                 }
12173                         }
12174                         if (locked)
12175                                 FREE_LOCK(ump);
12176                 }
12177                 /*
12178                  * Flush directory page containing the inode's name.
12179                  */
12180                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12181                     &bp);
12182                 if (error == 0)
12183                         error = bwrite(bp);
12184                 else
12185                         brelse(bp);
12186                 vput(pvp);
12187                 if (error != 0)
12188                         return (error);
12189                 ACQUIRE_LOCK(ump);
12190                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12191                         break;
12192         }
12193         FREE_LOCK(ump);
12194         return (0);
12195 }
12196
12197 /*
12198  * Flush all the dirty bitmaps associated with the block device
12199  * before flushing the rest of the dirty blocks so as to reduce
12200  * the number of dependencies that will have to be rolled back.
12201  *
12202  * XXX Unused?
12203  */
12204 void
12205 softdep_fsync_mountdev(vp)
12206         struct vnode *vp;
12207 {
12208         struct buf *bp, *nbp;
12209         struct worklist *wk;
12210         struct bufobj *bo;
12211
12212         if (!vn_isdisk(vp, NULL))
12213                 panic("softdep_fsync_mountdev: vnode not a disk");
12214         bo = &vp->v_bufobj;
12215 restart:
12216         BO_LOCK(bo);
12217         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12218                 /*
12219                  * If it is already scheduled, skip to the next buffer.
12220                  */
12221                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12222                         continue;
12223
12224                 if ((bp->b_flags & B_DELWRI) == 0)
12225                         panic("softdep_fsync_mountdev: not dirty");
12226                 /*
12227                  * We are only interested in bitmaps with outstanding
12228                  * dependencies.
12229                  */
12230                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12231                     wk->wk_type != D_BMSAFEMAP ||
12232                     (bp->b_vflags & BV_BKGRDINPROG)) {
12233                         BUF_UNLOCK(bp);
12234                         continue;
12235                 }
12236                 BO_UNLOCK(bo);
12237                 bremfree(bp);
12238                 (void) bawrite(bp);
12239                 goto restart;
12240         }
12241         drain_output(vp);
12242         BO_UNLOCK(bo);
12243 }
12244
12245 /*
12246  * Sync all cylinder groups that were dirty at the time this function is
12247  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12248  * is used to flush freedep activity that may be holding up writes to a
12249  * indirect block.
12250  */
12251 static int
12252 sync_cgs(mp, waitfor)
12253         struct mount *mp;
12254         int waitfor;
12255 {
12256         struct bmsafemap *bmsafemap;
12257         struct bmsafemap *sentinel;
12258         struct ufsmount *ump;
12259         struct buf *bp;
12260         int error;
12261
12262         sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12263         sentinel->sm_cg = -1;
12264         ump = VFSTOUFS(mp);
12265         error = 0;
12266         ACQUIRE_LOCK(ump);
12267         LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12268         for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12269             bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12270                 /* Skip sentinels and cgs with no work to release. */
12271                 if (bmsafemap->sm_cg == -1 ||
12272                     (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12273                     LIST_EMPTY(&bmsafemap->sm_freewr))) {
12274                         LIST_REMOVE(sentinel, sm_next);
12275                         LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12276                         continue;
12277                 }
12278                 /*
12279                  * If we don't get the lock and we're waiting try again, if
12280                  * not move on to the next buf and try to sync it.
12281                  */
12282                 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12283                 if (bp == NULL && waitfor == MNT_WAIT)
12284                         continue;
12285                 LIST_REMOVE(sentinel, sm_next);
12286                 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12287                 if (bp == NULL)
12288                         continue;
12289                 FREE_LOCK(ump);
12290                 if (waitfor == MNT_NOWAIT)
12291                         bawrite(bp);
12292                 else
12293                         error = bwrite(bp);
12294                 ACQUIRE_LOCK(ump);
12295                 if (error)
12296                         break;
12297         }
12298         LIST_REMOVE(sentinel, sm_next);
12299         FREE_LOCK(ump);
12300         free(sentinel, M_BMSAFEMAP);
12301         return (error);
12302 }
12303
12304 /*
12305  * This routine is called when we are trying to synchronously flush a
12306  * file. This routine must eliminate any filesystem metadata dependencies
12307  * so that the syncing routine can succeed.
12308  */
12309 int
12310 softdep_sync_metadata(struct vnode *vp)
12311 {
12312         struct inode *ip;
12313         int error;
12314
12315         ip = VTOI(vp);
12316         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12317             ("softdep_sync_metadata called on non-softdep filesystem"));
12318         /*
12319          * Ensure that any direct block dependencies have been cleared,
12320          * truncations are started, and inode references are journaled.
12321          */
12322         ACQUIRE_LOCK(ip->i_ump);
12323         /*
12324          * Write all journal records to prevent rollbacks on devvp.
12325          */
12326         if (vp->v_type == VCHR)
12327                 softdep_flushjournal(vp->v_mount);
12328         error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12329         /*
12330          * Ensure that all truncates are written so we won't find deps on
12331          * indirect blocks.
12332          */
12333         process_truncates(vp);
12334         FREE_LOCK(ip->i_ump);
12335
12336         return (error);
12337 }
12338
12339 /*
12340  * This routine is called when we are attempting to sync a buf with
12341  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12342  * other IO it can but returns EBUSY if the buffer is not yet able to
12343  * be written.  Dependencies which will not cause rollbacks will always
12344  * return 0.
12345  */
12346 int
12347 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12348 {
12349         struct indirdep *indirdep;
12350         struct pagedep *pagedep;
12351         struct allocindir *aip;
12352         struct newblk *newblk;
12353         struct ufsmount *ump;
12354         struct buf *nbp;
12355         struct worklist *wk;
12356         int i, error;
12357
12358         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12359             ("softdep_sync_buf called on non-softdep filesystem"));
12360         /*
12361          * For VCHR we just don't want to force flush any dependencies that
12362          * will cause rollbacks.
12363          */
12364         if (vp->v_type == VCHR) {
12365                 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12366                         return (EBUSY);
12367                 return (0);
12368         }
12369         ump = VTOI(vp)->i_ump;
12370         ACQUIRE_LOCK(ump);
12371         /*
12372          * As we hold the buffer locked, none of its dependencies
12373          * will disappear.
12374          */
12375         error = 0;
12376 top:
12377         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12378                 switch (wk->wk_type) {
12379
12380                 case D_ALLOCDIRECT:
12381                 case D_ALLOCINDIR:
12382                         newblk = WK_NEWBLK(wk);
12383                         if (newblk->nb_jnewblk != NULL) {
12384                                 if (waitfor == MNT_NOWAIT) {
12385                                         error = EBUSY;
12386                                         goto out_unlock;
12387                                 }
12388                                 jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12389                                 goto top;
12390                         }
12391                         if (newblk->nb_state & DEPCOMPLETE ||
12392                             waitfor == MNT_NOWAIT)
12393                                 continue;
12394                         nbp = newblk->nb_bmsafemap->sm_buf;
12395                         nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12396                         if (nbp == NULL)
12397                                 goto top;
12398                         FREE_LOCK(ump);
12399                         if ((error = bwrite(nbp)) != 0)
12400                                 goto out;
12401                         ACQUIRE_LOCK(ump);
12402                         continue;
12403
12404                 case D_INDIRDEP:
12405                         indirdep = WK_INDIRDEP(wk);
12406                         if (waitfor == MNT_NOWAIT) {
12407                                 if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12408                                     !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12409                                         error = EBUSY;
12410                                         goto out_unlock;
12411                                 }
12412                         }
12413                         if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12414                                 panic("softdep_sync_buf: truncation pending.");
12415                 restart:
12416                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12417                                 newblk = (struct newblk *)aip;
12418                                 if (newblk->nb_jnewblk != NULL) {
12419                                         jwait(&newblk->nb_jnewblk->jn_list,
12420                                             waitfor);
12421                                         goto restart;
12422                                 }
12423                                 if (newblk->nb_state & DEPCOMPLETE)
12424                                         continue;
12425                                 nbp = newblk->nb_bmsafemap->sm_buf;
12426                                 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12427                                 if (nbp == NULL)
12428                                         goto restart;
12429                                 FREE_LOCK(ump);
12430                                 if ((error = bwrite(nbp)) != 0)
12431                                         goto out;
12432                                 ACQUIRE_LOCK(ump);
12433                                 goto restart;
12434                         }
12435                         continue;
12436
12437                 case D_PAGEDEP:
12438                         /*
12439                          * Only flush directory entries in synchronous passes.
12440                          */
12441                         if (waitfor != MNT_WAIT) {
12442                                 error = EBUSY;
12443                                 goto out_unlock;
12444                         }
12445                         /*
12446                          * While syncing snapshots, we must allow recursive
12447                          * lookups.
12448                          */
12449                         BUF_AREC(bp);
12450                         /*
12451                          * We are trying to sync a directory that may
12452                          * have dependencies on both its own metadata
12453                          * and/or dependencies on the inodes of any
12454                          * recently allocated files. We walk its diradd
12455                          * lists pushing out the associated inode.
12456                          */
12457                         pagedep = WK_PAGEDEP(wk);
12458                         for (i = 0; i < DAHASHSZ; i++) {
12459                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12460                                         continue;
12461                                 if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12462                                     &pagedep->pd_diraddhd[i]))) {
12463                                         BUF_NOREC(bp);
12464                                         goto out_unlock;
12465                                 }
12466                         }
12467                         BUF_NOREC(bp);
12468                         continue;
12469
12470                 case D_FREEWORK:
12471                 case D_FREEDEP:
12472                 case D_JSEGDEP:
12473                 case D_JNEWBLK:
12474                         continue;
12475
12476                 default:
12477                         panic("softdep_sync_buf: Unknown type %s",
12478                             TYPENAME(wk->wk_type));
12479                         /* NOTREACHED */
12480                 }
12481         }
12482 out_unlock:
12483         FREE_LOCK(ump);
12484 out:
12485         return (error);
12486 }
12487
12488 /*
12489  * Flush the dependencies associated with an inodedep.
12490  * Called with splbio blocked.
12491  */
12492 static int
12493 flush_inodedep_deps(vp, mp, ino)
12494         struct vnode *vp;
12495         struct mount *mp;
12496         ino_t ino;
12497 {
12498         struct inodedep *inodedep;
12499         struct inoref *inoref;
12500         struct ufsmount *ump;
12501         int error, waitfor;
12502
12503         /*
12504          * This work is done in two passes. The first pass grabs most
12505          * of the buffers and begins asynchronously writing them. The
12506          * only way to wait for these asynchronous writes is to sleep
12507          * on the filesystem vnode which may stay busy for a long time
12508          * if the filesystem is active. So, instead, we make a second
12509          * pass over the dependencies blocking on each write. In the
12510          * usual case we will be blocking against a write that we
12511          * initiated, so when it is done the dependency will have been
12512          * resolved. Thus the second pass is expected to end quickly.
12513          * We give a brief window at the top of the loop to allow
12514          * any pending I/O to complete.
12515          */
12516         ump = VFSTOUFS(mp);
12517         LOCK_OWNED(ump);
12518         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12519                 if (error)
12520                         return (error);
12521                 FREE_LOCK(ump);
12522                 ACQUIRE_LOCK(ump);
12523 restart:
12524                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12525                         return (0);
12526                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12527                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12528                             == DEPCOMPLETE) {
12529                                 jwait(&inoref->if_list, MNT_WAIT);
12530                                 goto restart;
12531                         }
12532                 }
12533                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12534                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12535                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12536                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12537                         continue;
12538                 /*
12539                  * If pass2, we are done, otherwise do pass 2.
12540                  */
12541                 if (waitfor == MNT_WAIT)
12542                         break;
12543                 waitfor = MNT_WAIT;
12544         }
12545         /*
12546          * Try freeing inodedep in case all dependencies have been removed.
12547          */
12548         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12549                 (void) free_inodedep(inodedep);
12550         return (0);
12551 }
12552
12553 /*
12554  * Flush an inode dependency list.
12555  * Called with splbio blocked.
12556  */
12557 static int
12558 flush_deplist(listhead, waitfor, errorp)
12559         struct allocdirectlst *listhead;
12560         int waitfor;
12561         int *errorp;
12562 {
12563         struct allocdirect *adp;
12564         struct newblk *newblk;
12565         struct ufsmount *ump;
12566         struct buf *bp;
12567
12568         if ((adp = TAILQ_FIRST(listhead)) == NULL)
12569                 return (0);
12570         ump = VFSTOUFS(adp->ad_list.wk_mp);
12571         LOCK_OWNED(ump);
12572         TAILQ_FOREACH(adp, listhead, ad_next) {
12573                 newblk = (struct newblk *)adp;
12574                 if (newblk->nb_jnewblk != NULL) {
12575                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12576                         return (1);
12577                 }
12578                 if (newblk->nb_state & DEPCOMPLETE)
12579                         continue;
12580                 bp = newblk->nb_bmsafemap->sm_buf;
12581                 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12582                 if (bp == NULL) {
12583                         if (waitfor == MNT_NOWAIT)
12584                                 continue;
12585                         return (1);
12586                 }
12587                 FREE_LOCK(ump);
12588                 if (waitfor == MNT_NOWAIT)
12589                         bawrite(bp);
12590                 else
12591                         *errorp = bwrite(bp);
12592                 ACQUIRE_LOCK(ump);
12593                 return (1);
12594         }
12595         return (0);
12596 }
12597
12598 /*
12599  * Flush dependencies associated with an allocdirect block.
12600  */
12601 static int
12602 flush_newblk_dep(vp, mp, lbn)
12603         struct vnode *vp;
12604         struct mount *mp;
12605         ufs_lbn_t lbn;
12606 {
12607         struct newblk *newblk;
12608         struct ufsmount *ump;
12609         struct bufobj *bo;
12610         struct inode *ip;
12611         struct buf *bp;
12612         ufs2_daddr_t blkno;
12613         int error;
12614
12615         error = 0;
12616         bo = &vp->v_bufobj;
12617         ip = VTOI(vp);
12618         blkno = DIP(ip, i_db[lbn]);
12619         if (blkno == 0)
12620                 panic("flush_newblk_dep: Missing block");
12621         ump = VFSTOUFS(mp);
12622         ACQUIRE_LOCK(ump);
12623         /*
12624          * Loop until all dependencies related to this block are satisfied.
12625          * We must be careful to restart after each sleep in case a write
12626          * completes some part of this process for us.
12627          */
12628         for (;;) {
12629                 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12630                         FREE_LOCK(ump);
12631                         break;
12632                 }
12633                 if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12634                         panic("flush_newblk_deps: Bad newblk %p", newblk);
12635                 /*
12636                  * Flush the journal.
12637                  */
12638                 if (newblk->nb_jnewblk != NULL) {
12639                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12640                         continue;
12641                 }
12642                 /*
12643                  * Write the bitmap dependency.
12644                  */
12645                 if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12646                         bp = newblk->nb_bmsafemap->sm_buf;
12647                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12648                         if (bp == NULL)
12649                                 continue;
12650                         FREE_LOCK(ump);
12651                         error = bwrite(bp);
12652                         if (error)
12653                                 break;
12654                         ACQUIRE_LOCK(ump);
12655                         continue;
12656                 }
12657                 /*
12658                  * Write the buffer.
12659                  */
12660                 FREE_LOCK(ump);
12661                 BO_LOCK(bo);
12662                 bp = gbincore(bo, lbn);
12663                 if (bp != NULL) {
12664                         error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12665                             LK_INTERLOCK, BO_LOCKPTR(bo));
12666                         if (error == ENOLCK) {
12667                                 ACQUIRE_LOCK(ump);
12668                                 continue; /* Slept, retry */
12669                         }
12670                         if (error != 0)
12671                                 break;  /* Failed */
12672                         if (bp->b_flags & B_DELWRI) {
12673                                 bremfree(bp);
12674                                 error = bwrite(bp);
12675                                 if (error)
12676                                         break;
12677                         } else
12678                                 BUF_UNLOCK(bp);
12679                 } else
12680                         BO_UNLOCK(bo);
12681                 /*
12682                  * We have to wait for the direct pointers to
12683                  * point at the newdirblk before the dependency
12684                  * will go away.
12685                  */
12686                 error = ffs_update(vp, 1);
12687                 if (error)
12688                         break;
12689                 ACQUIRE_LOCK(ump);
12690         }
12691         return (error);
12692 }
12693
12694 /*
12695  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12696  * Called with splbio blocked.
12697  */
12698 static int
12699 flush_pagedep_deps(pvp, mp, diraddhdp)
12700         struct vnode *pvp;
12701         struct mount *mp;
12702         struct diraddhd *diraddhdp;
12703 {
12704         struct inodedep *inodedep;
12705         struct inoref *inoref;
12706         struct ufsmount *ump;
12707         struct diradd *dap;
12708         struct vnode *vp;
12709         int error = 0;
12710         struct buf *bp;
12711         ino_t inum;
12712         struct diraddhd unfinished;
12713
12714         LIST_INIT(&unfinished);
12715         ump = VFSTOUFS(mp);
12716         LOCK_OWNED(ump);
12717 restart:
12718         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12719                 /*
12720                  * Flush ourselves if this directory entry
12721                  * has a MKDIR_PARENT dependency.
12722                  */
12723                 if (dap->da_state & MKDIR_PARENT) {
12724                         FREE_LOCK(ump);
12725                         if ((error = ffs_update(pvp, 1)) != 0)
12726                                 break;
12727                         ACQUIRE_LOCK(ump);
12728                         /*
12729                          * If that cleared dependencies, go on to next.
12730                          */
12731                         if (dap != LIST_FIRST(diraddhdp))
12732                                 continue;
12733                         /*
12734                          * All MKDIR_PARENT dependencies and all the
12735                          * NEWBLOCK pagedeps that are contained in direct
12736                          * blocks were resolved by doing above ffs_update.
12737                          * Pagedeps contained in indirect blocks may
12738                          * require a complete sync'ing of the directory.
12739                          * We are in the midst of doing a complete sync,
12740                          * so if they are not resolved in this pass we
12741                          * defer them for now as they will be sync'ed by
12742                          * our caller shortly.
12743                          */
12744                         LIST_REMOVE(dap, da_pdlist);
12745                         LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12746                         continue;
12747                 }
12748                 /*
12749                  * A newly allocated directory must have its "." and
12750                  * ".." entries written out before its name can be
12751                  * committed in its parent.
12752                  */
12753                 inum = dap->da_newinum;
12754                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12755                         panic("flush_pagedep_deps: lost inode1");
12756                 /*
12757                  * Wait for any pending journal adds to complete so we don't
12758                  * cause rollbacks while syncing.
12759                  */
12760                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12761                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12762                             == DEPCOMPLETE) {
12763                                 jwait(&inoref->if_list, MNT_WAIT);
12764                                 goto restart;
12765                         }
12766                 }
12767                 if (dap->da_state & MKDIR_BODY) {
12768                         FREE_LOCK(ump);
12769                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12770                             FFSV_FORCEINSMQ)))
12771                                 break;
12772                         error = flush_newblk_dep(vp, mp, 0);
12773                         /*
12774                          * If we still have the dependency we might need to
12775                          * update the vnode to sync the new link count to
12776                          * disk.
12777                          */
12778                         if (error == 0 && dap == LIST_FIRST(diraddhdp))
12779                                 error = ffs_update(vp, 1);
12780                         vput(vp);
12781                         if (error != 0)
12782                                 break;
12783                         ACQUIRE_LOCK(ump);
12784                         /*
12785                          * If that cleared dependencies, go on to next.
12786                          */
12787                         if (dap != LIST_FIRST(diraddhdp))
12788                                 continue;
12789                         if (dap->da_state & MKDIR_BODY) {
12790                                 inodedep_lookup(UFSTOVFS(ump), inum, 0,
12791                                     &inodedep);
12792                                 panic("flush_pagedep_deps: MKDIR_BODY "
12793                                     "inodedep %p dap %p vp %p",
12794                                     inodedep, dap, vp);
12795                         }
12796                 }
12797                 /*
12798                  * Flush the inode on which the directory entry depends.
12799                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12800                  * the only remaining dependency is that the updated inode
12801                  * count must get pushed to disk. The inode has already
12802                  * been pushed into its inode buffer (via VOP_UPDATE) at
12803                  * the time of the reference count change. So we need only
12804                  * locate that buffer, ensure that there will be no rollback
12805                  * caused by a bitmap dependency, then write the inode buffer.
12806                  */
12807 retry:
12808                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12809                         panic("flush_pagedep_deps: lost inode");
12810                 /*
12811                  * If the inode still has bitmap dependencies,
12812                  * push them to disk.
12813                  */
12814                 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12815                         bp = inodedep->id_bmsafemap->sm_buf;
12816                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12817                         if (bp == NULL)
12818                                 goto retry;
12819                         FREE_LOCK(ump);
12820                         if ((error = bwrite(bp)) != 0)
12821                                 break;
12822                         ACQUIRE_LOCK(ump);
12823                         if (dap != LIST_FIRST(diraddhdp))
12824                                 continue;
12825                 }
12826                 /*
12827                  * If the inode is still sitting in a buffer waiting
12828                  * to be written or waiting for the link count to be
12829                  * adjusted update it here to flush it to disk.
12830                  */
12831                 if (dap == LIST_FIRST(diraddhdp)) {
12832                         FREE_LOCK(ump);
12833                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12834                             FFSV_FORCEINSMQ)))
12835                                 break;
12836                         error = ffs_update(vp, 1);
12837                         vput(vp);
12838                         if (error)
12839                                 break;
12840                         ACQUIRE_LOCK(ump);
12841                 }
12842                 /*
12843                  * If we have failed to get rid of all the dependencies
12844                  * then something is seriously wrong.
12845                  */
12846                 if (dap == LIST_FIRST(diraddhdp)) {
12847                         inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
12848                         panic("flush_pagedep_deps: failed to flush "
12849                             "inodedep %p ino %ju dap %p",
12850                             inodedep, (uintmax_t)inum, dap);
12851                 }
12852         }
12853         if (error)
12854                 ACQUIRE_LOCK(ump);
12855         while ((dap = LIST_FIRST(&unfinished)) != NULL) {
12856                 LIST_REMOVE(dap, da_pdlist);
12857                 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
12858         }
12859         return (error);
12860 }
12861
12862 /*
12863  * A large burst of file addition or deletion activity can drive the
12864  * memory load excessively high. First attempt to slow things down
12865  * using the techniques below. If that fails, this routine requests
12866  * the offending operations to fall back to running synchronously
12867  * until the memory load returns to a reasonable level.
12868  */
12869 int
12870 softdep_slowdown(vp)
12871         struct vnode *vp;
12872 {
12873         struct ufsmount *ump;
12874         int jlow;
12875         int max_softdeps_hard;
12876
12877         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12878             ("softdep_slowdown called on non-softdep filesystem"));
12879         ump = VFSTOUFS(vp->v_mount);
12880         ACQUIRE_LOCK(ump);
12881         jlow = 0;
12882         /*
12883          * Check for journal space if needed.
12884          */
12885         if (DOINGSUJ(vp)) {
12886                 if (journal_space(ump, 0) == 0)
12887                         jlow = 1;
12888         }
12889         max_softdeps_hard = max_softdeps * 11 / 10;
12890         if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
12891             dep_current[D_INODEDEP] < max_softdeps_hard &&
12892             VFSTOUFS(vp->v_mount)->softdep_numindirdeps < maxindirdeps &&
12893             dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) {
12894                 FREE_LOCK(ump);
12895                 return (0);
12896         }
12897         if (VFSTOUFS(vp->v_mount)->softdep_numindirdeps >= maxindirdeps || jlow)
12898                 softdep_speedup();
12899         stat_sync_limit_hit += 1;
12900         FREE_LOCK(ump);
12901         if (DOINGSUJ(vp))
12902                 return (0);
12903         return (1);
12904 }
12905
12906 /*
12907  * Called by the allocation routines when they are about to fail
12908  * in the hope that we can free up the requested resource (inodes
12909  * or disk space).
12910  *
12911  * First check to see if the work list has anything on it. If it has,
12912  * clean up entries until we successfully free the requested resource.
12913  * Because this process holds inodes locked, we cannot handle any remove
12914  * requests that might block on a locked inode as that could lead to
12915  * deadlock. If the worklist yields none of the requested resource,
12916  * start syncing out vnodes to free up the needed space.
12917  */
12918 int
12919 softdep_request_cleanup(fs, vp, cred, resource)
12920         struct fs *fs;
12921         struct vnode *vp;
12922         struct ucred *cred;
12923         int resource;
12924 {
12925         struct ufsmount *ump;
12926         struct mount *mp;
12927         struct vnode *lvp, *mvp;
12928         long starttime;
12929         ufs2_daddr_t needed;
12930         int error;
12931
12932         /*
12933          * If we are being called because of a process doing a
12934          * copy-on-write, then it is not safe to process any
12935          * worklist items as we will recurse into the copyonwrite
12936          * routine.  This will result in an incoherent snapshot.
12937          * If the vnode that we hold is a snapshot, we must avoid
12938          * handling other resources that could cause deadlock.
12939          */
12940         if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
12941                 return (0);
12942
12943         if (resource == FLUSH_BLOCKS_WAIT)
12944                 stat_cleanup_blkrequests += 1;
12945         else
12946                 stat_cleanup_inorequests += 1;
12947
12948         mp = vp->v_mount;
12949         ump = VFSTOUFS(mp);
12950         mtx_assert(UFS_MTX(ump), MA_OWNED);
12951         UFS_UNLOCK(ump);
12952         error = ffs_update(vp, 1);
12953         if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
12954                 UFS_LOCK(ump);
12955                 return (0);
12956         }
12957         /*
12958          * If we are in need of resources, consider pausing for
12959          * tickdelay to give ourselves some breathing room.
12960          */
12961         ACQUIRE_LOCK(ump);
12962         process_removes(vp);
12963         process_truncates(vp);
12964         request_cleanup(UFSTOVFS(ump), resource);
12965         FREE_LOCK(ump);
12966         /*
12967          * Now clean up at least as many resources as we will need.
12968          *
12969          * When requested to clean up inodes, the number that are needed
12970          * is set by the number of simultaneous writers (mnt_writeopcount)
12971          * plus a bit of slop (2) in case some more writers show up while
12972          * we are cleaning.
12973          *
12974          * When requested to free up space, the amount of space that
12975          * we need is enough blocks to allocate a full-sized segment
12976          * (fs_contigsumsize). The number of such segments that will
12977          * be needed is set by the number of simultaneous writers
12978          * (mnt_writeopcount) plus a bit of slop (2) in case some more
12979          * writers show up while we are cleaning.
12980          *
12981          * Additionally, if we are unpriviledged and allocating space,
12982          * we need to ensure that we clean up enough blocks to get the
12983          * needed number of blocks over the threshhold of the minimum
12984          * number of blocks required to be kept free by the filesystem
12985          * (fs_minfree).
12986          */
12987         if (resource == FLUSH_INODES_WAIT) {
12988                 needed = vp->v_mount->mnt_writeopcount + 2;
12989         } else if (resource == FLUSH_BLOCKS_WAIT) {
12990                 needed = (vp->v_mount->mnt_writeopcount + 2) *
12991                     fs->fs_contigsumsize;
12992                 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
12993                         needed += fragstoblks(fs,
12994                             roundup((fs->fs_dsize * fs->fs_minfree / 100) -
12995                             fs->fs_cstotal.cs_nffree, fs->fs_frag));
12996         } else {
12997                 UFS_LOCK(ump);
12998                 printf("softdep_request_cleanup: Unknown resource type %d\n",
12999                     resource);
13000                 return (0);
13001         }
13002         starttime = time_second;
13003 retry:
13004         if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13005             fs->fs_cstotal.cs_nbfree <= needed) ||
13006             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13007             fs->fs_cstotal.cs_nifree <= needed)) {
13008                 ACQUIRE_LOCK(ump);
13009                 if (ump->softdep_on_worklist > 0 &&
13010                     process_worklist_item(UFSTOVFS(ump),
13011                     ump->softdep_on_worklist, LK_NOWAIT) != 0)
13012                         stat_worklist_push += 1;
13013                 FREE_LOCK(ump);
13014         }
13015         /*
13016          * If we still need resources and there are no more worklist
13017          * entries to process to obtain them, we have to start flushing
13018          * the dirty vnodes to force the release of additional requests
13019          * to the worklist that we can then process to reap addition
13020          * resources. We walk the vnodes associated with the mount point
13021          * until we get the needed worklist requests that we can reap.
13022          */
13023         if ((resource == FLUSH_BLOCKS_WAIT &&
13024              fs->fs_cstotal.cs_nbfree <= needed) ||
13025             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13026              fs->fs_cstotal.cs_nifree <= needed)) {
13027                 MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13028                         if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13029                                 VI_UNLOCK(lvp);
13030                                 continue;
13031                         }
13032                         if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13033                             curthread))
13034                                 continue;
13035                         if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
13036                                 vput(lvp);
13037                                 continue;
13038                         }
13039                         (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13040                         vput(lvp);
13041                 }
13042                 lvp = ump->um_devvp;
13043                 if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13044                         VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13045                         VOP_UNLOCK(lvp, 0);
13046                 }
13047                 if (ump->softdep_on_worklist > 0) {
13048                         stat_cleanup_retries += 1;
13049                         goto retry;
13050                 }
13051                 stat_cleanup_failures += 1;
13052         }
13053         if (time_second - starttime > stat_cleanup_high_delay)
13054                 stat_cleanup_high_delay = time_second - starttime;
13055         UFS_LOCK(ump);
13056         return (1);
13057 }
13058
13059 /*
13060  * If memory utilization has gotten too high, deliberately slow things
13061  * down and speed up the I/O processing.
13062  */
13063 static int
13064 request_cleanup(mp, resource)
13065         struct mount *mp;
13066         int resource;
13067 {
13068         struct thread *td = curthread;
13069         struct ufsmount *ump;
13070
13071         ump = VFSTOUFS(mp);
13072         LOCK_OWNED(ump);
13073         /*
13074          * We never hold up the filesystem syncer or buf daemon.
13075          */
13076         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13077                 return (0);
13078         /*
13079          * First check to see if the work list has gotten backlogged.
13080          * If it has, co-opt this process to help clean up two entries.
13081          * Because this process may hold inodes locked, we cannot
13082          * handle any remove requests that might block on a locked
13083          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13084          * to avoid recursively processing the worklist.
13085          */
13086         if (ump->softdep_on_worklist > max_softdeps / 10) {
13087                 td->td_pflags |= TDP_SOFTDEP;
13088                 process_worklist_item(mp, 2, LK_NOWAIT);
13089                 td->td_pflags &= ~TDP_SOFTDEP;
13090                 stat_worklist_push += 2;
13091                 return(1);
13092         }
13093         /*
13094          * Next, we attempt to speed up the syncer process. If that
13095          * is successful, then we allow the process to continue.
13096          */
13097         if (softdep_speedup() &&
13098             resource != FLUSH_BLOCKS_WAIT &&
13099             resource != FLUSH_INODES_WAIT)
13100                 return(0);
13101         /*
13102          * If we are resource constrained on inode dependencies, try
13103          * flushing some dirty inodes. Otherwise, we are constrained
13104          * by file deletions, so try accelerating flushes of directories
13105          * with removal dependencies. We would like to do the cleanup
13106          * here, but we probably hold an inode locked at this point and
13107          * that might deadlock against one that we try to clean. So,
13108          * the best that we can do is request the syncer daemon to do
13109          * the cleanup for us.
13110          */
13111         switch (resource) {
13112
13113         case FLUSH_INODES:
13114         case FLUSH_INODES_WAIT:
13115                 stat_ino_limit_push += 1;
13116                 req_clear_inodedeps += 1;
13117                 stat_countp = &stat_ino_limit_hit;
13118                 break;
13119
13120         case FLUSH_BLOCKS:
13121         case FLUSH_BLOCKS_WAIT:
13122                 stat_blk_limit_push += 1;
13123                 req_clear_remove += 1;
13124                 stat_countp = &stat_blk_limit_hit;
13125                 break;
13126
13127         default:
13128                 panic("request_cleanup: unknown type");
13129         }
13130         /*
13131          * Hopefully the syncer daemon will catch up and awaken us.
13132          * We wait at most tickdelay before proceeding in any case.
13133          */
13134         proc_waiting += 1;
13135         if (callout_pending(&softdep_callout) == FALSE)
13136                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13137                     pause_timer, 0);
13138
13139         msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13140         proc_waiting -= 1;
13141         return (1);
13142 }
13143
13144 /*
13145  * Awaken processes pausing in request_cleanup and clear proc_waiting
13146  * to indicate that there is no longer a timer running. Pause_timer
13147  * will be called with the global softdep mutex (&lk) locked.
13148  */
13149 static void
13150 pause_timer(arg)
13151         void *arg;
13152 {
13153
13154         rw_assert(&lk, RA_WLOCKED);
13155         /*
13156          * The callout_ API has acquired mtx and will hold it around this
13157          * function call.
13158          */
13159         *stat_countp += 1;
13160         wakeup_one(&proc_waiting);
13161         if (proc_waiting > 0)
13162                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13163                     pause_timer, 0);
13164 }
13165
13166 /*
13167  * If requested, try removing inode or removal dependencies.
13168  */
13169 static void
13170 check_clear_deps(mp)
13171         struct mount *mp;
13172 {
13173
13174         rw_assert(&lk, RA_WLOCKED);
13175         /*
13176          * If we are suspended, it may be because of our using
13177          * too many inodedeps, so help clear them out.
13178          */
13179         if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13180                 clear_inodedeps(mp);
13181         /*
13182          * General requests for cleanup of backed up dependencies
13183          */
13184         if (req_clear_inodedeps) {
13185                 req_clear_inodedeps -= 1;
13186                 clear_inodedeps(mp);
13187                 wakeup_one(&proc_waiting);
13188         }
13189         if (req_clear_remove) {
13190                 req_clear_remove -= 1;
13191                 clear_remove(mp);
13192                 wakeup_one(&proc_waiting);
13193         }
13194 }
13195
13196 /*
13197  * Flush out a directory with at least one removal dependency in an effort to
13198  * reduce the number of dirrem, freefile, and freeblks dependency structures.
13199  */
13200 static void
13201 clear_remove(mp)
13202         struct mount *mp;
13203 {
13204         struct pagedep_hashhead *pagedephd;
13205         struct pagedep *pagedep;
13206         struct ufsmount *ump;
13207         struct vnode *vp;
13208         struct bufobj *bo;
13209         int error, cnt;
13210         ino_t ino;
13211
13212         ump = VFSTOUFS(mp);
13213         LOCK_OWNED(ump);
13214
13215         for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13216                 pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13217                 if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13218                         ump->pagedep_nextclean = 0;
13219                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13220                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
13221                                 continue;
13222                         ino = pagedep->pd_ino;
13223                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13224                                 continue;
13225                         FREE_LOCK(ump);
13226
13227                         /*
13228                          * Let unmount clear deps
13229                          */
13230                         error = vfs_busy(mp, MBF_NOWAIT);
13231                         if (error != 0)
13232                                 goto finish_write;
13233                         error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13234                              FFSV_FORCEINSMQ);
13235                         vfs_unbusy(mp);
13236                         if (error != 0) {
13237                                 softdep_error("clear_remove: vget", error);
13238                                 goto finish_write;
13239                         }
13240                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13241                                 softdep_error("clear_remove: fsync", error);
13242                         bo = &vp->v_bufobj;
13243                         BO_LOCK(bo);
13244                         drain_output(vp);
13245                         BO_UNLOCK(bo);
13246                         vput(vp);
13247                 finish_write:
13248                         vn_finished_write(mp);
13249                         ACQUIRE_LOCK(ump);
13250                         return;
13251                 }
13252         }
13253 }
13254
13255 /*
13256  * Clear out a block of dirty inodes in an effort to reduce
13257  * the number of inodedep dependency structures.
13258  */
13259 static void
13260 clear_inodedeps(mp)
13261         struct mount *mp;
13262 {
13263         struct inodedep_hashhead *inodedephd;
13264         struct inodedep *inodedep;
13265         struct ufsmount *ump;
13266         struct vnode *vp;
13267         struct fs *fs;
13268         int error, cnt;
13269         ino_t firstino, lastino, ino;
13270
13271         ump = VFSTOUFS(mp);
13272         fs = ump->um_fs;
13273         LOCK_OWNED(ump);
13274         /*
13275          * Pick a random inode dependency to be cleared.
13276          * We will then gather up all the inodes in its block
13277          * that have dependencies and flush them out.
13278          */
13279         for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13280                 inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13281                 if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13282                         ump->inodedep_nextclean = 0;
13283                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13284                         break;
13285         }
13286         if (inodedep == NULL)
13287                 return;
13288         /*
13289          * Find the last inode in the block with dependencies.
13290          */
13291         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13292         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13293                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13294                         break;
13295         /*
13296          * Asynchronously push all but the last inode with dependencies.
13297          * Synchronously push the last inode with dependencies to ensure
13298          * that the inode block gets written to free up the inodedeps.
13299          */
13300         for (ino = firstino; ino <= lastino; ino++) {
13301                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13302                         continue;
13303                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13304                         continue;
13305                 FREE_LOCK(ump);
13306                 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13307                 if (error != 0) {
13308                         vn_finished_write(mp);
13309                         ACQUIRE_LOCK(ump);
13310                         return;
13311                 }
13312                 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13313                     FFSV_FORCEINSMQ)) != 0) {
13314                         softdep_error("clear_inodedeps: vget", error);
13315                         vfs_unbusy(mp);
13316                         vn_finished_write(mp);
13317                         ACQUIRE_LOCK(ump);
13318                         return;
13319                 }
13320                 vfs_unbusy(mp);
13321                 if (ino == lastino) {
13322                         if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13323                                 softdep_error("clear_inodedeps: fsync1", error);
13324                 } else {
13325                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13326                                 softdep_error("clear_inodedeps: fsync2", error);
13327                         BO_LOCK(&vp->v_bufobj);
13328                         drain_output(vp);
13329                         BO_UNLOCK(&vp->v_bufobj);
13330                 }
13331                 vput(vp);
13332                 vn_finished_write(mp);
13333                 ACQUIRE_LOCK(ump);
13334         }
13335 }
13336
13337 void
13338 softdep_buf_append(bp, wkhd)
13339         struct buf *bp;
13340         struct workhead *wkhd;
13341 {
13342         struct worklist *wk;
13343         struct ufsmount *ump;
13344
13345         if ((wk = LIST_FIRST(wkhd)) == NULL)
13346                 return;
13347         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13348             ("softdep_buf_append called on non-softdep filesystem"));
13349         ump = VFSTOUFS(wk->wk_mp);
13350         ACQUIRE_LOCK(ump);
13351         while ((wk = LIST_FIRST(wkhd)) != NULL) {
13352                 WORKLIST_REMOVE(wk);
13353                 WORKLIST_INSERT(&bp->b_dep, wk);
13354         }
13355         FREE_LOCK(ump);
13356
13357 }
13358
13359 void
13360 softdep_inode_append(ip, cred, wkhd)
13361         struct inode *ip;
13362         struct ucred *cred;
13363         struct workhead *wkhd;
13364 {
13365         struct buf *bp;
13366         struct fs *fs;
13367         int error;
13368
13369         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13370             ("softdep_inode_append called on non-softdep filesystem"));
13371         fs = ip->i_fs;
13372         error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13373             (int)fs->fs_bsize, cred, &bp);
13374         if (error) {
13375                 bqrelse(bp);
13376                 softdep_freework(wkhd);
13377                 return;
13378         }
13379         softdep_buf_append(bp, wkhd);
13380         bqrelse(bp);
13381 }
13382
13383 void
13384 softdep_freework(wkhd)
13385         struct workhead *wkhd;
13386 {
13387         struct worklist *wk;
13388         struct ufsmount *ump;
13389
13390         if ((wk = LIST_FIRST(wkhd)) == NULL)
13391                 return;
13392         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13393             ("softdep_freework called on non-softdep filesystem"));
13394         ump = VFSTOUFS(wk->wk_mp);
13395         ACQUIRE_LOCK(ump);
13396         handle_jwork(wkhd);
13397         FREE_LOCK(ump);
13398 }
13399
13400 /*
13401  * Function to determine if the buffer has outstanding dependencies
13402  * that will cause a roll-back if the buffer is written. If wantcount
13403  * is set, return number of dependencies, otherwise just yes or no.
13404  */
13405 static int
13406 softdep_count_dependencies(bp, wantcount)
13407         struct buf *bp;
13408         int wantcount;
13409 {
13410         struct worklist *wk;
13411         struct ufsmount *ump;
13412         struct bmsafemap *bmsafemap;
13413         struct freework *freework;
13414         struct inodedep *inodedep;
13415         struct indirdep *indirdep;
13416         struct freeblks *freeblks;
13417         struct allocindir *aip;
13418         struct pagedep *pagedep;
13419         struct dirrem *dirrem;
13420         struct newblk *newblk;
13421         struct mkdir *mkdir;
13422         struct diradd *dap;
13423         int i, retval;
13424
13425         retval = 0;
13426         if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13427                 return (0);
13428         ump = VFSTOUFS(wk->wk_mp);
13429         ACQUIRE_LOCK(ump);
13430         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13431                 switch (wk->wk_type) {
13432
13433                 case D_INODEDEP:
13434                         inodedep = WK_INODEDEP(wk);
13435                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13436                                 /* bitmap allocation dependency */
13437                                 retval += 1;
13438                                 if (!wantcount)
13439                                         goto out;
13440                         }
13441                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13442                                 /* direct block pointer dependency */
13443                                 retval += 1;
13444                                 if (!wantcount)
13445                                         goto out;
13446                         }
13447                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13448                                 /* direct block pointer dependency */
13449                                 retval += 1;
13450                                 if (!wantcount)
13451                                         goto out;
13452                         }
13453                         if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13454                                 /* Add reference dependency. */
13455                                 retval += 1;
13456                                 if (!wantcount)
13457                                         goto out;
13458                         }
13459                         continue;
13460
13461                 case D_INDIRDEP:
13462                         indirdep = WK_INDIRDEP(wk);
13463
13464                         TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13465                                 /* indirect truncation dependency */
13466                                 retval += 1;
13467                                 if (!wantcount)
13468                                         goto out;
13469                         }
13470
13471                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13472                                 /* indirect block pointer dependency */
13473                                 retval += 1;
13474                                 if (!wantcount)
13475                                         goto out;
13476                         }
13477                         continue;
13478
13479                 case D_PAGEDEP:
13480                         pagedep = WK_PAGEDEP(wk);
13481                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13482                                 if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13483                                         /* Journal remove ref dependency. */
13484                                         retval += 1;
13485                                         if (!wantcount)
13486                                                 goto out;
13487                                 }
13488                         }
13489                         for (i = 0; i < DAHASHSZ; i++) {
13490
13491                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13492                                         /* directory entry dependency */
13493                                         retval += 1;
13494                                         if (!wantcount)
13495                                                 goto out;
13496                                 }
13497                         }
13498                         continue;
13499
13500                 case D_BMSAFEMAP:
13501                         bmsafemap = WK_BMSAFEMAP(wk);
13502                         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13503                                 /* Add reference dependency. */
13504                                 retval += 1;
13505                                 if (!wantcount)
13506                                         goto out;
13507                         }
13508                         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13509                                 /* Allocate block dependency. */
13510                                 retval += 1;
13511                                 if (!wantcount)
13512                                         goto out;
13513                         }
13514                         continue;
13515
13516                 case D_FREEBLKS:
13517                         freeblks = WK_FREEBLKS(wk);
13518                         if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13519                                 /* Freeblk journal dependency. */
13520                                 retval += 1;
13521                                 if (!wantcount)
13522                                         goto out;
13523                         }
13524                         continue;
13525
13526                 case D_ALLOCDIRECT:
13527                 case D_ALLOCINDIR:
13528                         newblk = WK_NEWBLK(wk);
13529                         if (newblk->nb_jnewblk) {
13530                                 /* Journal allocate dependency. */
13531                                 retval += 1;
13532                                 if (!wantcount)
13533                                         goto out;
13534                         }
13535                         continue;
13536
13537                 case D_MKDIR:
13538                         mkdir = WK_MKDIR(wk);
13539                         if (mkdir->md_jaddref) {
13540                                 /* Journal reference dependency. */
13541                                 retval += 1;
13542                                 if (!wantcount)
13543                                         goto out;
13544                         }
13545                         continue;
13546
13547                 case D_FREEWORK:
13548                 case D_FREEDEP:
13549                 case D_JSEGDEP:
13550                 case D_JSEG:
13551                 case D_SBDEP:
13552                         /* never a dependency on these blocks */
13553                         continue;
13554
13555                 default:
13556                         panic("softdep_count_dependencies: Unexpected type %s",
13557                             TYPENAME(wk->wk_type));
13558                         /* NOTREACHED */
13559                 }
13560         }
13561 out:
13562         FREE_LOCK(ump);
13563         return retval;
13564 }
13565
13566 /*
13567  * Acquire exclusive access to a dirty (B_DELWRI) buffer.  Must be called
13568  * with "lock" write-locked; it is dropped and retaken only for MNT_WAIT.
13569  * Return bp locked and passed through bremfree(), or NULL on failure.
13570  */
13571 static struct buf *
13572 getdirtybuf(bp, lock, waitfor)
13573         struct buf *bp;
13574         struct rwlock *lock;
13575         int waitfor;
13576 {
13577         int error;
13578
13579         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13580                 if (waitfor != MNT_WAIT)
13581                         return (NULL);
13582                 error = BUF_LOCK(bp,
13583                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13584                 /*
13585                  * Even if we successfully acquire bp here, we have dropped
13586                  * lock, which may violate our guarantee; release bp and fail.
13587                  */
13588                 if (error == 0)
13589                         BUF_UNLOCK(bp);
13590                 else if (error != ENOLCK)
13591                         panic("getdirtybuf: inconsistent lock: %d", error);
13592                 rw_wlock(lock);         /* retake lock for the caller */
13593                 return (NULL);
13594         }
13595         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13596                 if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13597                         rw_wunlock(lock);
13598                         BO_LOCK(bp->b_bufobj);
13599                         BUF_UNLOCK(bp);
13600                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13601                                 bp->b_vflags |= BV_BKGRDWAIT;
13602                                 msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13603                                        PRIBIO | PDROP, "getbuf", 0);
13604                         } else
13605                                 BO_UNLOCK(bp->b_bufobj);
13606                         rw_wlock(lock); /* retake lock for the caller */
13607                         return (NULL);
13608                 }
13609                 BUF_UNLOCK(bp);
13610                 if (waitfor != MNT_WAIT)
13611                         return (NULL);
13612                 /*
13613                  * Here lock is the bufobj lock of bp (the other MNT_WAIT
13614                  * case returned above), so we can sleep on it directly.
13615                  */
13616 #ifdef  DEBUG_VFS_LOCKS
13617                 if (bp->b_vp->v_type != VCHR)
13618                         ASSERT_BO_WLOCKED(bp->b_bufobj);
13619 #endif
13620                 bp->b_vflags |= BV_BKGRDWAIT;
13621                 rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13622                 return (NULL);
13623         }
13624         if ((bp->b_flags & B_DELWRI) == 0) {
13625                 BUF_UNLOCK(bp);
13626                 return (NULL);
13627         }
13628         bremfree(bp);
13629         return (bp);
13630 }
13631
13632
13633 /*
13634  * Check if it is safe to suspend the file system now.  On entry, the
13635  * bufobj lock of devvp must be write-held; it is dropped before return.
13636  * Return 0 with the mount interlock held if the file system can be
13637  * suspended now, otherwise return EAGAIN with the mount interlock held.
13638  */
13639 int
13640 softdep_check_suspend(struct mount *mp,
13641                       struct vnode *devvp,
13642                       int softdep_depcnt,
13643                       int softdep_accdepcnt,
13644                       int secondary_writes,
13645                       int secondary_accwrites)
13646 {
13647         struct bufobj *bo;
13648         struct ufsmount *ump;
13649         int error;
13650
13651         bo = &devvp->v_bufobj;
13652         ASSERT_BO_WLOCKED(bo);
13653
13654         /*
13655          * If we are not running with soft updates, then we need only
13656          * deal with secondary writes as we try to suspend.
13657          */
13658         if (MOUNTEDSOFTDEP(mp) == 0) {
13659                 MNT_ILOCK(mp);
13660                 while (mp->mnt_secondary_writes != 0) {
13661                         BO_UNLOCK(bo);
13662                         msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
13663                             (PUSER - 1) | PDROP, "secwr", 0);
13664                         BO_LOCK(bo);
13665                         MNT_ILOCK(mp);  /* PDROP left it unlocked */
13666                 }
13667
13668                 /*
13669                  * Reasons for needing more work before suspend:
13670                  * - Dirty buffers on devvp.
13671                  * - Secondary writes occurred after start of vnode sync loop
13672                  */
13673                 error = 0;
13674                 if (bo->bo_numoutput > 0 ||
13675                     bo->bo_dirty.bv_cnt > 0 ||
13676                     secondary_writes != 0 ||
13677                     mp->mnt_secondary_writes != 0 ||
13678                     secondary_accwrites != mp->mnt_secondary_accwrites)
13679                         error = EAGAIN;
13680                 BO_UNLOCK(bo);
13681                 return (error);
13682         }
13683
13684         /*
13685          * With soft updates, also take the ump lock: never block on it
13686          * with bo held, and restart whenever we slept for secondary writes.
13687          */
13688         ump = VFSTOUFS(mp);
13689         for (;;) {
13690                 if (!TRY_ACQUIRE_LOCK(ump)) {
13691                         BO_UNLOCK(bo);
13692                         ACQUIRE_LOCK(ump);      /* wait for the holder */
13693                         FREE_LOCK(ump);         /* then retry, bo first */
13694                         BO_LOCK(bo);
13695                         continue;
13696                 }
13697                 MNT_ILOCK(mp);
13698                 if (mp->mnt_secondary_writes != 0) {
13699                         FREE_LOCK(ump);
13700                         BO_UNLOCK(bo);
13701                         msleep(&mp->mnt_secondary_writes,
13702                                MNT_MTX(mp),
13703                                (PUSER - 1) | PDROP, "secwr", 0);
13704                         BO_LOCK(bo);
13705                         continue;
13706                 }
13707                 break;
13708         }
13709
13710         /*
13711          * Reasons for needing more work before suspend:
13712          * - Dirty buffers on devvp.
13713          * - Softdep activity occurred after start of vnode sync loop
13714          * - Secondary writes occurred after start of vnode sync loop
13715          */
13716         error = 0;
13717         if (bo->bo_numoutput > 0 ||
13718             bo->bo_dirty.bv_cnt > 0 ||
13719             softdep_depcnt != 0 ||
13720             ump->softdep_deps != 0 ||
13721             softdep_accdepcnt != ump->softdep_accdeps ||
13722             secondary_writes != 0 ||
13723             mp->mnt_secondary_writes != 0 ||
13724             secondary_accwrites != mp->mnt_secondary_accwrites)
13725                 error = EAGAIN;
13726         FREE_LOCK(ump);
13727         BO_UNLOCK(bo);
13728         return (error);
13729 }
13730
13731
13732 /*
13733  * Report the soft updates dependency counters of a file system: the
13734  * number currently outstanding and the total number allocated.  The
13735  * values are used later to detect that softdep processing took place.
13736  */
13737 void
13738 softdep_get_depcounts(struct mount *mp,
13739                       int *softdep_depsp,
13740                       int *softdep_accdepsp)
13741 {
13742         struct ufsmount *ump;
13743
13744         if (MOUNTEDSOFTDEP(mp) != 0) {
13745                 ump = VFSTOUFS(mp);
13746                 ACQUIRE_LOCK(ump);
13747                 *softdep_depsp = ump->softdep_deps;
13748                 *softdep_accdepsp = ump->softdep_accdeps;
13749                 FREE_LOCK(ump);
13750                 return;
13751         }
13752         /* Not mounted with soft updates: both counters read as zero. */
13753         *softdep_depsp = *softdep_accdepsp = 0;
13754 }
13755
13756 /*
13757  * Sleep until no writes remain in progress on the vnode's buffer object.
13758  * Must be called with the vnode lock and the bufobj lock held.
13759  *
13760  * XXX: Should just be a call to bufobj_wwait().
13761  */
13762 static void
13763 drain_output(struct vnode *vp)
13764 {
13765         struct bufobj *bo;
13766
13767         ASSERT_VOP_LOCKED(vp, "drain_output");
13768         bo = &vp->v_bufobj;
13769         ASSERT_BO_WLOCKED(bo);
13770
13771         while (bo->bo_numoutput != 0) {
13772                 /* Set BO_WWAIT, then sleep on the counter's address. */
13773                 bo->bo_flag |= BO_WWAIT;
13774                 msleep((caddr_t)&bo->bo_numoutput, BO_LOCKPTR(bo),
13775                     PRIBIO + 1, "drainvp", 0);
13776         }
13777 }
13778
13779 /*
13780  * Invoked with the buffer locked when a buffer that is being invalidated or
13781  * reallocated still has dependencies; only an I/O error should cause this.
13782  */
13783 static void
13784 softdep_deallocate_dependencies(struct buf *bp)
13785 {
13786         struct mount *mp;
13787
13788         if ((bp->b_ioflags & BIO_ERROR) == 0)
13789                 panic("softdep_deallocate_dependencies: dangling deps");
13790         mp = (bp->b_vp != NULL) ? bp->b_vp->v_mount : NULL;
13791         if (mp == NULL)
13792                 printf("softdep_deallocate_dependencies: "
13793                     "got error %d while accessing filesystem\n", bp->b_error);
13794         else
13795                 softdep_error(mp->mnt_stat.f_mntonname, bp->b_error);
13796         if (bp->b_error != ENXIO)
13797                 panic("softdep_deallocate_dependencies: unrecovered I/O error");
13798 }
13799
13800 /*
13801  * Report an asynchronous write error hit while accessing the filesystem.
13802  * "func" is the prefix printed in front of the message.
13803  */
13804 static void
13805 softdep_error(char *func, int error)
13806 {
13807
13808         /* XXX should do something better! */
13809         printf("%s: got error %d while accessing filesystem\n",
13810             func, error);
13811 }
13812
13813 #ifdef DDB
13814
13815 /* DDB helper: print one inodedep; verbose != 0 adds its list heads. */
13816 static void
13817 inodedep_print(struct inodedep *inodedep, int verbose)
13818 {
13819         intmax_t inoblk;
13820
13821         inoblk = (intmax_t)fsbtodb(inodedep->id_fs,
13822             ino_to_fsba(inodedep->id_fs, inodedep->id_ino));
13823         db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
13824             " saveino %p\n", inodedep, inodedep->id_fs, inodedep->id_state,
13825             (intmax_t)inodedep->id_ino, inoblk, inodedep->id_nlinkdelta,
13826             inodedep->id_savednlink, inodedep->id_savedino1);
13827         if (verbose != 0) {
13828                 db_printf("\tpendinghd %p, bufwait %p, inowait %p, "
13829                     "inoreflst %p, mkdiradd %p\n",
13830                     LIST_FIRST(&inodedep->id_pendinghd),
13831                     LIST_FIRST(&inodedep->id_bufwait),
13832                     LIST_FIRST(&inodedep->id_inowait),
13833                     TAILQ_FIRST(&inodedep->id_inoreflst),
13834                     inodedep->id_mkdiradd);
13835                 db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, "
13836                     "newextupdt %p\n",
13837                     TAILQ_FIRST(&inodedep->id_inoupdt),
13838                     TAILQ_FIRST(&inodedep->id_newinoupdt),
13839                     TAILQ_FIRST(&inodedep->id_extupdt),
13840                     TAILQ_FIRST(&inodedep->id_newextupdt));
13841         }
13842 }
13843
13844 /* ddb "show inodedep <addr>": verbose dump of the inodedep at addr. */
13845 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
13846 {
13847
13848         if (have_addr != 0)
13849                 inodedep_print((struct inodedep *)addr, 1);
13850         else
13851                 db_printf("Address required\n");
13852 }
13853
13854 /*
13855  * ddb "show inodedeps <ump>": brief line per inodedep hashed on the mount.
13856  * NOTE(review): "<" skips the last bucket if hash_size is a mask - confirm.
13857  */
13858 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
13859 {
13860         struct inodedep *idp;
13861         struct ufsmount *ump;
13862         int bucket;
13863
13864         if (have_addr == 0) {
13865                 db_printf("Address required\n");
13866                 return;
13867         }
13868         ump = (struct ufsmount *)addr;
13869         for (bucket = 0; bucket < ump->inodedep_hash_size; bucket++)
13870                 LIST_FOREACH(idp, &ump->inodedep_hashtbl[bucket], id_hash)
13871                         inodedep_print(idp, 0);
13872 }
13873
13874 /* ddb "show worklist <addr>": print the type and state of a worklist. */
13875 DB_SHOW_COMMAND(worklist, db_show_worklist)
13876 {
13877         struct worklist *wk;
13878
13879         wk = (struct worklist *)addr;
13880         if (have_addr == 0)
13881                 db_printf("Address required\n");
13882         else    /* db_printf, not printf, as in the other ddb commands */
13883                 db_printf("worklist: %p type %s state 0x%X\n", wk,
13884                     TYPENAME(wk->wk_type), wk->wk_state);
13885 }
13886
13887 /*
13888  * ddb "show workhead <addr>": list up to 100 entries, one per line.
13889  */
13890 DB_SHOW_COMMAND(workhead, db_show_workhead)
13891 {
13892         struct worklist *wk;
13893         int i;
13894
13895         if (have_addr == 0) {
13896                 db_printf("Address required\n");
13897                 return;
13898         }
13899         wk = LIST_FIRST((struct workhead *)addr);
13900         for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
13901                 db_printf("worklist: %p type %s state 0x%X\n",
13902                     wk, TYPENAME(wk->wk_type), wk->wk_state);
13903         if (wk != NULL) /* stopped by the cap, not by the end of the list */
13904                 db_printf("workhead overflow\n");
13905 }
13906
13907
13908 /* ddb "show mkdirs <addr>": dump every mkdir on a mkdirlist head. */
13909 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
13910 {
13911         struct jaddref *jref;
13912         struct diradd *dap;
13913         struct mkdir *md;
13914
13915         if (have_addr == 0) {
13916                 db_printf("Address required\n");
13917                 return;
13918         }
13919         LIST_FOREACH(md, (struct mkdirlist *)addr, md_mkdirs) {
13920                 dap = md->md_diradd;
13921                 jref = md->md_jaddref;
13922                 db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
13923                     md, md->md_state, dap, dap->da_state);
13924                 if (jref != NULL)
13925                         db_printf(" jaddref %p jaddref state 0x%X",
13926                             jref, jref->ja_state);
13927                 db_printf("\n");
13928         }
13929 }
13930
13931 /* One-line ddb summary of a UFS mount; exported to ffs_vfsops.c. */
13932 extern void db_print_ffs(struct ufsmount *ump);
13933 void
13934 db_print_ffs(struct ufsmount *ump)
13935 {
13936         struct mount *mp = ump->um_mountp;
13937         db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
13938             mp, mp->mnt_stat.f_mntonname, ump->um_devvp, ump->um_fs,
13939             ump->softdep_on_worklist, ump->softdep_deps, ump->softdep_req);
13940 }
13941
13942 #endif /* DDB */
13943
13944 #endif /* SOFTUPDATES */